def get_verify_code(vehicle):
    """Load the landing page and the captcha, then return a VerifyCode.

    Side effects: stores the landing page's ``__VIEWSTATE`` value in the
    module-level ``view_state`` and copies the captcha response's
    ``set-cookie`` header into the shared ``headers`` dict.

    Args:
        vehicle: accepted for interface compatibility; not used here.

    Returns:
        The ``VerifyCode`` built from the captcha response, after
        ``recognize()`` has been called on it.
    """
    global view_state

    # The landing page carries the hidden __VIEWSTATE input.
    landing_resp = fetch_http(pre_request_url, "get")
    viewstate_matches = reg(
        landing_resp.content,
        r'<input.+?id="__VIEWSTATE".+?value="(.+?)"')
    view_state = viewstate_matches[0][0]
    print(view_state)

    captcha_resp = fetch_http(captcha_url, 'get', headers=headers)
    headers["Cookie"] = captcha_resp.headers['set-cookie']
    print(headers)

    code = VerifyCode(captcha_resp.raw, headers=headers)
    code.recognize()
    return code
def crawl(vehicle, verify_code=None):
    """Query the 'beijing_new' source for a vehicle's violations.

    Args:
        vehicle: object providing ``engine_num`` and ``license_plate_num``.
        verify_code: captcha object; its ``verify_code`` text is posted as
            ``yzm`` and its ``headers`` are used for the request.

    Returns:
        dict with ``name`` and ``version`` plus either ``violations`` (a
        list of dicts built from ``Violation`` objects) or ``error``
        (``'VEHICLE INFO ERROR'``).

    Raises:
        WrongVerifyCode: when no captcha text is available or the response
            page reports that the captcha was wrong.
    """
    # The parameter defaults to None, so guard the object itself as well as
    # its text instead of failing with an AttributeError.
    if verify_code is None or verify_code.verify_code is None:
        raise WrongVerifyCode()

    result = dict()
    result['name'] = 'beijing_new'
    result['version'] = 1

    # sf and cxlm are module-level values; they are only read here.
    data = {
        "sf": sf,
        "fdjhhm": vehicle.engine_num.encode("utf-8"),
        "carnono": vehicle.license_plate_num[-6:],
        "cxlm": cxlm,
        "yzm": verify_code.verify_code,
    }
    if sf == 11:
        data["chfs"] = ""
    else:
        data["hpzllb"] = "02"
    print(data)

    second_resp = fetch_http(request_url, "post", data=data,
                             headers=verify_code.headers)
    page = second_resp.content

    if u"验证码输入有误" in page:
        raise WrongVerifyCode()
    if (u"您输入的车牌号或发动机号有误" in page
            or u"您没有输入完整的车牌号和发动机号" in page):
        result['error'] = 'VEHICLE INFO ERROR'
        return result
    if (u"您没有未接受处理的违法记录" in page
            or u"您没有未接受处理的在京违法记录" in page):
        result['violations'] = []
        return result

    # Each captured row starts at the <td> holding a YYYY-MM-DD date.
    tr_list = reg(page, r"<tr[\s\S]+?(<td>\d{4}-\d{2}-\d{2}[\s\S]+?)</tr>")
    violations = []
    for tr in tr_list:
        violation = Violation()
        tds = reg(tr[0], r"<td.*?>([\s\S]+?)</td>")
        violation.fine = tds[4][0]
        violation.address = tds[1][0]
        violation.time = tds[0][0]
        violation.violation_type = reg(tds[2][0], r"<a.+?>(.+?)</a>")[0][0]
        # Only the literal status "未处理" marks a record as not handled.
        violation.handled = tds[5][0].encode("utf-8") != "未处理"
        violation.point = tds[3][0]
        violation.agency = ""
        violations.append(dict(violation))
        print(violation.time)
        print(violation.address)
        print(violation.violation_type)
        print(violation.fine)
        print(violation.point)
        print(violation.handled)
        print(" -------- ")

    result['violations'] = violations
    return result
def get_verify_code(vehicle):
    """Download the captcha via ``proxies`` and return a VerifyCode.

    Args:
        vehicle: accepted for interface compatibility; not used here.

    Returns:
        The ``VerifyCode`` built from the captcha response and the shared
        ``headers``, after ``recognize()`` has been called on it.
    """
    captcha_resp = fetch_http(
        captcha_url, 'get', headers=headers, proxies=proxies)
    code = VerifyCode(captcha_resp.raw, headers=headers)
    code.recognize()
    return code
def crawl(vehicle, verify_code=None):
    """Query the 'dongguan' source for a vehicle's violations.

    Args:
        vehicle: object providing ``license_plate_num``, ``body_num`` and
            ``engine_num``.
        verify_code: captcha object; its ``verify_code`` text is posted as
            ``validate`` and its ``headers`` are used for the request.

    Returns:
        dict with ``name`` and ``version`` plus either ``violations`` (a
        list of dicts built from ``Violation`` objects) or ``error``
        (``'VEHICLE INFO ERROR'``).

    Raises:
        WrongVerifyCode: when no captcha text is available or the response
            page reports a captcha error.
    """
    # The parameter defaults to None, so guard the object itself as well as
    # its text instead of failing with an AttributeError.
    if verify_code is None or verify_code.verify_code is None:
        raise WrongVerifyCode()

    result = dict()
    result['name'] = 'dongguan'
    result['version'] = 1

    # gg_value is a module-level value; it is only read here.
    # NOTE(review): "tele" is a hard-coded phone number sent with every query.
    data = {
        "action": "Illagel",
        "headno": vehicle.license_plate_num[:-6],
        "no": vehicle.license_plate_num[-6:],
        "back4": vehicle.body_num,
        "fdjh6": vehicle.engine_num,
        "validate": verify_code.verify_code,
        "type": "02",
        "tele": "18607325868",
        "gg": gg_value,
    }
    for k in data:
        # Single-argument form prints "key value" exactly like `print k, v`.
        print("%s %s" % (k, data[k]))

    second_resp = fetch_http(request_url, "post", data=data,
                             headers=verify_code.headers)
    page = second_resp.content

    if u"验证码错误" in page:
        raise WrongVerifyCode()
    if (u"发动机号错误" in page
            or u"车架号错误" in page
            or u"车辆信息错误" in page):
        result['error'] = 'VEHICLE INFO ERROR'
        return result
    if u"没有违章信息" in page:
        result['violations'] = []
        return result

    tr_list = reg(page, r"<tr><td>(.+?</table></td>.+?)</td></tr>")
    violations = []
    for tr in tr_list:
        violation = Violation()
        violation.fine = reg(tr[0], r"<td>(\d{1,6}?)</td><td>")[0][0]
        sub_table_content = reg(
            tr[0], r'<table class="illegal_table".+?>(.+?)</table>')[0][0]
        sub_info_list = reg(sub_table_content, r'<td[^r]*?>(.+?)</td>')
        violation.address = sub_info_list[2][0]
        # Turn the 年/月/日 时/分/秒 markers into '-' and ':' separators.
        violation.time = (
            sub_info_list[0][0]
            .replace("年", "-")
            .replace("月", "-")
            .replace("日", "")
            .replace("时", ":")
            .replace("分", ":")
            .replace("秒", "")
        )
        violation.violation_type = sub_info_list[3][0]
        violation.handled = False
        # The point value is the second character of the second cell.
        # NOTE(review): the reason "7" is reported as "12" is not visible
        # here - confirm against the page format.
        point = sub_info_list[1][0][1]
        if point == "7":
            point = "12"
        violation.point = point
        violation.agency = ""
        violations.append(dict(violation))

    result['violations'] = violations
    return result
def get_verify_code(vehicle):
    """Download the captcha, keep its cookie and return a VerifyCode.

    Side effect: the captcha response's ``set-cookie`` header is stored as
    ``Cookie`` in the shared ``headers`` dict before the VerifyCode is built.

    Args:
        vehicle: accepted for interface compatibility; not used here.

    Returns:
        The ``VerifyCode`` built from the captcha response, after
        ``recognize()`` has been called on it.
    """
    captcha_resp = fetch_http(captcha_url, 'get', headers=headers)
    session_cookie = captcha_resp.headers['set-cookie']
    headers["Cookie"] = session_cookie
    print(headers)

    code = VerifyCode(captcha_resp.raw, headers=headers)
    code.recognize()
    return code
def crawl(vehicle, verify_code=None):
    """Query the 'huizhou' source for a vehicle's violations.

    Args:
        vehicle: object providing ``license_plate_num`` and ``body_num``.
        verify_code: captcha object; its ``verify_code`` text is posted as
            ``validate`` and its ``headers`` are used for the request.

    Returns:
        dict with ``name`` and ``version`` plus either ``violations`` (a
        list of dicts built from ``Violation`` objects) or ``error``
        (``'VEHICLE INFO ERROR'``).

    Raises:
        WrongVerifyCode: when no captcha text is available or the response
            page reports a captcha error.
    """
    # The parameter defaults to None, so guard the object itself as well as
    # its text instead of failing with an AttributeError.
    if verify_code is None or verify_code.verify_code is None:
        raise WrongVerifyCode()

    result = dict()
    result["name"] = "huizhou"
    result["version"] = 1

    data = {
        "action": "Illagel",
        "headno": vehicle.license_plate_num[:-6],
        "no": vehicle.license_plate_num[-6:],
        "back4": vehicle.body_num,
        "validate": verify_code.verify_code,
        "type": "02",
    }
    second_resp = fetch_http(request_url, "post", data=data,
                             headers=verify_code.headers)
    page = second_resp.content

    if u"验证码错误!0" in page:
        raise WrongVerifyCode()
    if u"车架号错误" in page:
        result["error"] = "VEHICLE INFO ERROR"
        return result
    if u"没有违章信息" in page:
        result["violations"] = []
        return result

    tr_list = reg(page, r"<tr><td>(.+?</table></td>.+?)</td></tr>")
    violations = []
    for tr in tr_list:
        violation = Violation()
        # The fine is taken from the second all-digit <td> in the row.
        violation.fine = reg(tr[0], r"<td>(\d+?)</td>")[1][0]
        sub_table_content = reg(
            tr[0], r'<table class="illegal_table".+?>(.+?)</table>')[0][0]
        sub_info_list = reg(sub_table_content, r"<td[^3]*?>(.+?)</td>")
        violation.address = sub_info_list[2][0]
        # Turn the 年/月/日 时/分/秒 markers into '-' and ':' separators.
        violation.time = (
            sub_info_list[0][0]
            .replace("年", "-")
            .replace("月", "-")
            .replace("日", "")
            .replace("时", ":")
            .replace("分", ":")
            .replace("秒", "")
        )
        violation.violation_type = sub_info_list[3][0]
        violation.handled = False
        # The point value is the second character of the second cell.
        violation.point = sub_info_list[1][0][1]
        violation.agency = ""
        violations.append(dict(violation))

    result["violations"] = violations
    return result
def get_verify_code(vehicle):
    """Open a session, download the captcha and return a VerifyCode.

    Side effect: the first response's ``set-cookie`` header is stored as
    ``Cookie`` in the shared ``headers`` dict.

    Args:
        vehicle: accepted for interface compatibility; not used here.

    Returns:
        The ``VerifyCode`` built from the FIRST captcha response, after
        ``recognize()`` has been called on it.
    """
    session_resp = fetch_http(pre_request_url, "get")
    headers["Cookie"] = session_resp.headers["set-cookie"]
    print(session_resp.headers)

    captcha_resp = fetch_http(captcha_url, "get", headers=headers)
    code = VerifyCode(captcha_resp.raw, headers=headers)
    print(captcha_resp.headers)

    # NOTE(review): the captcha URL is requested two more times after the
    # VerifyCode was built; only the headers of those responses are printed.
    # Confirm whether these extra requests are required.
    for _ in range(2):
        extra_resp = fetch_http(captcha_url, "get", headers=headers)
        print(extra_resp.headers)

    code.recognize()
    return code
def first_request():
    """Send the preliminary query and capture the cookie and ``cxlm``.

    Reads the module-level ``vehicle``, ``sf_dic`` and ``headers``. Updates
    the module-level ``sf`` (when the first character of the plate is a key
    of ``sf_dic``) and ``cxlm`` (parsed from the response), and stores the
    response's ``set-cookie`` header as ``Cookie`` in ``headers``.

    When ``sf`` is not 11, ``vehicle.engine_num`` is overwritten with "wd"
    before the request is built.
    """
    global cxlm, chfs, sf

    province_key = unicode(vehicle.license_plate_num)[0].encode("utf-8")
    # NOTE(review): when the key is unknown, sf keeps whatever value it had
    # before this call - confirm that this is intended.
    if province_key in sf_dic:
        sf = sf_dic[province_key]

    plate_tail = vehicle.license_plate_num[-6:]
    if sf == 11:
        referer = pre_referer_for_beijing
        target_url = mid_request_url_for_beijing
        pre_data = {"sf": sf, "carno": plate_tail,
                    "fdjh": vehicle.engine_num}
    else:
        vehicle.engine_num = "wd"
        referer = pre_referer_for_other
        target_url = mid_request_url_for_other
        pre_data = {"sf": sf, "carnono": plate_tail,
                    "fdjhhm": vehicle.engine_num}

    headers["Referer"] = referer
    first_resp = fetch_http(target_url, "post", data=pre_data,
                            headers=headers)

    headers["Cookie"] = first_resp.headers['set-cookie']
    cxlm_matches = reg(first_resp.content, r'id="cxlm" value="(.*?)"')
    cxlm = cxlm_matches[0][0]
def crawl(vehicle, verify_code=None):
    """Query the 'zhengzhou' source for a vehicle's violations.

    Args:
        vehicle: object providing ``license_plate_num`` and ``body_num``.
        verify_code: captcha object; its ``verify_code`` text is posted as
            ``txtYzm`` and its ``headers`` are used for the request.

    Returns:
        dict with ``name``, ``version`` and ``violations`` (a list of dicts
        built from ``Violation`` objects; empty when the page reports no
        violations or no matching vehicle).

    Raises:
        WrongVerifyCode: when no captcha text is available or the response
            page reports that the captcha was wrong.
    """
    # The parameter defaults to None, so guard the object itself as well as
    # its text instead of failing with an AttributeError.
    if verify_code is None or verify_code.verify_code is None:
        raise WrongVerifyCode()

    result = dict()
    result['name'] = 'zhengzhou'
    result['version'] = 1

    # view_state is a module-level value; it is only read here.
    data = {
        "__VIEWSTATE": view_state,
        "txtHphm": vehicle.license_plate_num,
        "txtClsbdh": vehicle.body_num,
        "ddlHpzl": "02",
        "txtYzm": verify_code.verify_code,
        "Button1": " 查 询 ",
    }
    second_resp = fetch_http(request_url, "post", data=data,
                             headers=verify_code.headers)
    page = second_resp.content

    if u"验证码填写有误" in page:
        raise WrongVerifyCode()
    if (u"恭喜您,没有您的违法信息" in page
            or u"没有找到相关的车辆信息" in page):
        result['violations'] = []
        return result

    # Rows of interest start with a centred cell holding a YYYY-MM-DD date.
    tr_list = reg(page, r"<tr>(<td align='center'>\d{4}-\d{2}-\d{2}.+?)</tr>")
    violations = []
    for tr in tr_list:
        violation = Violation()
        td_infos = reg(tr[0], r"<td.*?>(.+?)</td>")
        violation.fine = td_infos[4][0].strip()
        violation.address = td_infos[1][0].strip()
        violation.time = td_infos[0][0].strip()
        violation.violation_type = td_infos[2][0].strip()
        # Only the literal status "未交款" marks a record as not handled.
        violation.handled = td_infos[5][0].strip() != "未交款"
        violation.agency = ""
        violations.append(dict(violation))

    result['violations'] = violations
    return result
def crawl(vehicle, verify_code=None):
    """Query the 'jilin' source for a vehicle's violations.

    Args:
        vehicle: object providing ``license_plate_num`` and ``body_num``
            (the last four characters of ``body_num`` are posted as
            ``engine``).
        verify_code: captcha object; its stripped ``verify_code`` text is
            posted as ``yzm`` and its ``headers`` are used for the request.

    Returns:
        dict with ``name`` and ``version`` plus either ``violations`` (a
        list of dicts built from ``Violation`` objects) or ``error``
        (``'VEHICLE INFO ERROR'``).

    Raises:
        WrongVerifyCode: when no captcha text is available or the response
            page reports a captcha error.
    """
    # The parameter defaults to None, so guard the object itself as well as
    # its text instead of failing with an AttributeError.
    if verify_code is None or verify_code.verify_code is None:
        raise WrongVerifyCode()

    result = dict()
    result['name'] = 'jilin'
    result['version'] = 1

    data = {
        "province": vehicle.license_plate_num[:-6],
        "hphm": vehicle.license_plate_num[-6:],
        "engine": vehicle.body_num[-4:],
        "yzm": verify_code.verify_code.strip(),
        "hpzl": "02",
    }
    second_resp = fetch_http(request_url, "post", data=data,
                             headers=verify_code.headers)
    page = second_resp.content

    if u"验证码错误" in page:
        raise WrongVerifyCode()
    if u"车辆识别代号后四位输入有误" in page:
        result['error'] = 'VEHICLE INFO ERROR'
        return result

    # NOTE(review): there is no explicit "no violations" check; this relies
    # on the wzjl_table element being present in every other response.
    result_table = reg(
        page, r'<table id="wzjl_table"[\s\S]+?>([\s\S]+?)</table>')
    tr_list = reg(result_table[0][0], r"<tr>([\s\S]+?)</tr>")
    violations = []
    for tr in tr_list:
        violation = Violation()
        tds = reg(tr[0], r"<td[\s\S]+?>([\s\S]+?)</td>")
        violation.fine = tds[4][0].strip()
        violation.address = tds[3][0].strip()
        violation.time = tds[2][0].strip()
        violation.violation_type = tds[6][0].strip()
        violation.handled = False
        violation.point = tds[5][0].strip()
        violation.agency = ""
        violations.append(dict(violation))

    result['violations'] = violations
    return result
def crawl(vehicle, verify_code=None):
    """Query the 'guangdong' source for a vehicle's violations.

    The result page lists one row per violation; for each row a detail page
    is fetched and the violation fields are parsed from it.

    Args:
        vehicle: object providing ``license_plate_num`` and ``body_num``.
        verify_code: captcha object; its ``verify_code`` text is posted as
            ``mofei`` and its ``headers`` are used for every request.

    Returns:
        dict with ``name`` and ``version`` plus either ``violations`` (a
        list of dicts built from ``Violation`` objects) or ``error``
        (``'NETWORK ERROR'`` when the page reports too many queries or a
        busy system).

    Raises:
        WrongVerifyCode: when no captcha text is available.
    """
    # The parameter defaults to None, so guard the object itself as well as
    # its text instead of failing with an AttributeError.
    if verify_code is None or verify_code.verify_code is None:
        raise WrongVerifyCode()

    result = dict()
    result['name'] = 'guangdong'
    result['version'] = 1

    # NOTE(review): "CJHM" and "fdjh" are both filled from body_num - confirm
    # that "fdjh" is not meant to carry the engine number.
    data = {
        "province": vehicle.license_plate_num[:-6].encode("gb2312"),
        "hphm": vehicle.license_plate_num[-6:],
        "CJHM": vehicle.body_num[-6:],
        "fdjh": vehicle.body_num[-6:],
        "mofei": verify_code.verify_code,
        "hpzl": "02",
        "x": random.randint(10, 99),
        "y": random.randint(10, 99),
    }
    print(data)

    second_resp = fetch_http(request_url, "post", data=data,
                             headers=verify_code.headers, proxies=proxies)
    page = second_resp.content
    print(page)

    if u"车辆目前无未处理的违章记录" in page:
        result['violations'] = []
        return result
    if (u"您查询的次数过多" in page
            or u"系统繁忙,请等待30秒后再查" in page):
        result['error'] = 'NETWORK ERROR'
        return result

    tr_list = reg(
        page,
        r'<tr.*?>[\s\S]+?(<td.+\d{4}-\d{2}-\d{2} \d{2}:\d{2}[\s\S]+?)</tr>')
    violations = []
    further_request_url_prefix = "http://www.ttdaiban.com/"
    for tr in tr_list:
        tds = reg(tr[0], r'<td.*?>(.*?)</td>')
        # The second cell links to the detail page via a '../' relative path.
        further_request_url_suffix = reg(tds[1][0], r"<a.+'../(.+?)'")[0][0]
        further_resp = fetch_http(
            further_request_url_prefix + further_request_url_suffix,
            "get", headers=verify_code.headers, proxies=proxies)
        # Detail cells look like "<label>:<value>"; only the value is kept.
        sec_tds = reg(further_resp.content.encode("utf-8"),
                      r"<td.*?>.+?:(.+?)</td>")

        violation = Violation()
        violation.fine = sec_tds[6][0].strip()
        violation.address = reg(sec_tds[3][0], r".*】([\S]+)")[0][0].strip()
        violation.time = sec_tds[2][0].strip()
        violation.violation_type = sec_tds[4][0].strip()
        violation.handled = False
        violation.point = reg(
            sec_tds[5][0], r"\[.*?(\d+?).*?分\]")[0][0].strip()
        violation.agency = ""
        violations.append(dict(violation))
        print(violation.time)
        print(violation.address)
        print(violation.violation_type)
        print(violation.fine)
        print(violation.point)
        print(" -------- ")

    result['violations'] = violations
    return result
def first_request():
    """Fetch the landing page via ``proxies`` and remember its cookie.

    Side effect: the response's ``set-cookie`` header is stored as
    ``Cookie`` in the shared ``headers`` dict.
    """
    landing_resp = fetch_http(pre_request_url, "get", proxies=proxies)
    landing_headers = landing_resp.headers
    print(landing_headers)
    headers["Cookie"] = landing_headers['set-cookie']
def crawl(vehicle):
    """Query the 'jiangxi' source for a vehicle's violations.

    A preliminary GET supplies the cookie and the ``g`` token; the query is
    then posted up to five times while the page reports "not found" or a
    timeout.

    Args:
        vehicle: object providing ``license_plate_num`` and ``body_num``.

    Returns:
        dict with ``name`` and ``version`` plus either ``violations`` (a
        list of dicts built from ``Violation`` objects) or ``error``
        (``'NET ERROR'`` or ``'VEHICLE INFO ERROR'``).
    """
    result = dict()
    result['name'] = 'jiangxi'
    result['version'] = 1

    car_type = "02"
    car_no = vehicle.license_plate_num.encode("utf-8")
    body_no = vehicle.body_num
    # NOTE(review): the key "x" appears twice in this literal, so the later
    # value (car_no) is the one that is sent. Several keys and values embed
    # "=" fragments; they are reproduced verbatim.
    data = {
        "a": car_type, "b": car_no, "c": body_no,
        "x": car_type, "y": car_no, "z": body_no,
        "w": car_type, "e": car_no, "s": body_no,
        "v": car_type, "x": car_no, "q": body_no,
        "j": car_type, "k": car_no, "l": body_no+"3a="+car_type,
        "b4": car_no, "c="+body_no+"4a": car_type,
        "gb": car_no, "gz": car_no, "cc": body_no+"2a="+car_type,
        "cb": car_no, "c1": body_no+"vb="+car_type,
        "av": car_no, "ab": body_no,
        "pageRecords": "100", "currPage": "1",
    }

    pre_response = fetch_http(url=pre_request_url, method='get',
                              proxies=proxies)
    token_matches = reg(pre_response.content, r"g\s+=\s*'(\w+)'")
    headers["Cookie"] = pre_response.headers['Set-Cookie']
    data["g"] = token_matches[0][0]

    # Hand-built "key=value&..." body (no URL encoding is applied).
    payload = "&".join(key + "=" + data[key] for key in data)
    headers['X-Requested-With'] = 'XMLHttpRequest'

    for _attempt in range(5):
        base_response = fetch_http(
            url=post_url, method='post', data=payload,
            headers=headers, proxies=proxies)
        print(base_response.content)
        retry_needed = (u"该信息不存在" in base_response.content
                        or u"查询超时" in base_response.content)
        if not retry_needed:
            break
    else:
        # All five attempts asked for a retry.
        result['error'] = 'NET ERROR'
        return result

    content = base_response.content
    if u'很抱歉,您的计算机已超过今天最大查询次数,请明天再使用' in content:
        result['error'] = 'NET ERROR'
        return result
    if (u'您输入车牌号码或机动车类型不正确' in content
            or u'您输入车架号后6位不正确' in content):
        # vehicle info error
        result['error'] = 'VEHICLE INFO ERROR'
        return result
    if u'该车辆暂无违法信息' in content:
        # vehicle has no violation
        result['violations'] = []
        return result

    # Keep only the timestamps that occur exactly once, newest first.
    occurrences = {}
    for time_match in reg(content, r"\d{4}-\d{2}-\d{2}\s{1}\d{2}:\d{2}"):
        stamp = time_match[0]
        occurrences[stamp] = occurrences.get(stamp, 0) + 1
    time_list = sorted(
        (stamp for stamp in occurrences if occurrences[stamp] == 1),
        reverse=True)

    address_reg_result = reg(
        content,
        r"' style=\"padding-left:5px\" align='left'>(.+?)</td>")
    code_reg_result = reg(content, r"<span title=''>(.+?)</span>")
    agency_reg_result = reg(
        content, r'style="padding-left:5px" align="left">(.+?)</td>')

    # The i-th timestamp is paired with the i-th match of each other pattern.
    violations = []
    for idx, stamp in enumerate(time_list):
        violation = Violation()
        violation.time = stamp
        violation.violation_type = code_reg_result[idx][0]
        violation.address = address_reg_result[idx][0]
        violation.agency = agency_reg_result[idx][0]
        violation.handled = False
        violations.append(dict(violation))

    result['violations'] = violations
    return result
def crawl(vehicle, verify_code=None):
    """Query the 'guangzhou' source for a vehicle's violations.

    Three steps, each tried up to five times: confirm the captcha, post the
    vehicle data to obtain a ``key`` and a total count, then fetch the
    violation list as JSON.

    Args:
        vehicle: object providing ``license_plate_num``, ``engine_num`` and
            ``body_num``.
        verify_code: captcha object; its ``verify_code`` text and
            ``headers`` are used for every request.

    Returns:
        dict with ``name`` and ``version`` plus either ``violations`` (a
        list of dicts built from ``Violation`` objects) or ``error``
        (``'VEHICLE INFO ERROR'``).

    Raises:
        WrongVerifyCode: when no captcha text is available, or the captcha
            confirmation answered "fail" on all five attempts.
    """
    # The parameter defaults to None, so guard the object itself as well as
    # its text instead of failing with an AttributeError.
    if verify_code is None or verify_code.verify_code is None:
        raise WrongVerifyCode()

    result = dict()
    result['name'] = 'guangzhou'
    result['version'] = 1

    # Step 1: confirm the captcha.
    captcha_data = {"captchaId": verify_code.verify_code}
    for _attempt in range(5):
        captcha_resp = fetch_http(captcha_confirm_url, "post",
                                  data=captcha_data,
                                  headers=verify_code.headers)
        # Single-argument form prints exactly what `print a, b` printed.
        print("%s %s" % ("captcha_content : ", captcha_resp.content))
        if "fail" in captcha_resp.content:
            print("captcha retry ~!")
            continue
        break
    else:
        raise WrongVerifyCode()

    # Step 2: post the vehicle data; retry while the body is empty.
    pre_data = {
        "hpzl": "02",
        "hphm": vehicle.license_plate_num,
        "fdjh": vehicle.engine_num,
        "clsbdh": vehicle.body_num,
        "captcha": verify_code.verify_code,
    }
    for _attempt in range(5):
        pre_resp = fetch_http(pre_request_url, "post", data=pre_data,
                              headers=verify_code.headers)
        print("%s %s" % ("pre_content", pre_resp.content))
        if pre_resp.content != "":
            break
    # NOTE(review): if all five responses were empty, execution continues
    # and the key lookup below fails on the empty match list.

    if "_error" in pre_resp.content:
        result['error'] = 'VEHICLE INFO ERROR'
        return result

    # The response carries "key:<key>$<total>".
    key = reg(pre_resp.content, r"(?<=key:)(.+)(?=\$\d+)")[0][0]
    total = reg(pre_resp.content, r"(?<=\$)(\d+)")[0][0]
    if total == "0":
        result['violations'] = []
        return result

    # Step 3: fetch the violation list; retry while "data" is empty.
    data = {
        "platenumtype": "02",
        "platenum": vehicle.license_plate_num,
        "engineno": vehicle.engine_num,
        "vehicleidnum": vehicle.body_num,
        "key": key,
    }
    for _attempt in range(5):
        main_resp = fetch_http(request_url, "post", data=data,
                               headers=verify_code.headers)
        print("%s %s" % ("main_content : ", main_resp.content))
        if '"data":"",' in main_resp.content:
            print("retry ~!")
            continue
        break
    # NOTE(review): if "data" was still empty after five attempts, the last
    # response is parsed anyway and yields an empty violation list.

    vio_data = json.loads(main_resp.content)
    violations = []
    for elem in vio_data["data"]:
        violation = Violation()
        violation.fine = elem["FKJE"]
        violation.point = elem["WFJFS"]
        violation.address = elem["WFDZ"]
        violation.time = elem["WFSJ"]
        violation.violation_type = elem["WFXWMC"]
        violation.handled = False
        violation.agency = ""
        violations.append(dict(violation))

    result['violations'] = violations
    return result
def first_request():
    """Fetch the landing page; keep its cookie and the ``gg`` token.

    Side effects: stores the response's ``set-cookie`` header as ``Cookie``
    in the shared ``headers`` dict and assigns the first ``this,'<word>'``
    match from the page to the module-level ``gg_value``.
    """
    global gg_value

    landing_resp = fetch_http(pre_request_url, "get")
    headers["Cookie"] = landing_resp.headers['set-cookie']

    token_matches = reg(landing_resp.content, r"this,'(\w+?)'")
    gg_value = token_matches[0][0]