def __init__(self):
    """Set up proxies, the Spider base class, output savers and client identities."""
    self._can_use_proxy_num = 0
    self.is_debug = False

    # Debug mode passes 1 to Spider.__init__; otherwise the count is the
    # length of proxies_dict after read_proxy() has run.
    spider_count = 1
    if not self.is_debug:
        self.proxies_dict = []
        self.read_proxy("proxy_032512.txt")
        spider_count = len(self.proxies_dict)
    Spider.__init__(self, spider_count)

    self._aes_ = CCIQ_AES()

    # Successful queries.
    self.query_success = FileSaver("c_query_detail.txt")
    # Failed queries.
    self.query_failure = FileSaver("c_query_detail_failure.txt")
    # Entries that have already been crawled.
    self.already_cname_list = FileSaver("c_already_detail.txt")
    # Load the companies that were crawled previously.
    self.init_cname()

    self.bloom = set()

    # Pre-encrypted extJson payloads; user_agents has the same number of entries.
    ext_json_first = "Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr/uapICH92P/Crryt63u28aP4QP665AzcT/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4="
    ext_json_second = "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4="
    ext_json_third = "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49/aDwt3NZNp4TGa5iBFpYLm69F/6PPFoXIR/Aw5p48//8OgZFpddDUwQ="
    self.extJsons = [ext_json_first, ext_json_second, ext_json_third]

    self.user_agents = [
        "=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)",
        "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)",
        "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)",
    ]

    self.is_first = True
    self.init_time = 0
def __init__(self):
    """Load proxies, initialise the Spider base class and open all output savers."""
    self.proxies_dict = []
    self.read_proxy("proxy_20160218.txt")
    proxy_total = len(self.proxies_dict)
    Spider.__init__(self, proxy_total)

    self.num_count = 0
    self._aes_ = CCIQ_AES()

    # Full records of the company list returned for a company-name query.
    self.query_company_info = FileSaver("query_company_info.txt")
    # Partial records of the company list returned for a company-name query.
    self.query_company_info_part = FileSaver("query_company_info_part.txt")
    # Company-name queries that failed.
    self.query_company_info_failure = FileSaver(
        "query_company_info_failure.txt")
    # Company names that have already been crawled.
    self.already_cname = FileSaver("already_cname.txt")
    # Load the companies that were crawled previously.
    self.init_cname()
    # Company names whose detail query failed.
    self.detail_failure = FileSaver("detail_failure1.txt")
    # Everything the app exposes for a company, including shareholder info.
    self.detail_company = FileSaver("detail_company.txt")

    # extJson is the AES-encrypted serialisation of this client descriptor.
    client_info = {
        "cl_screenSize": "640x960",
        "cl_cookieId": "16923697-D73E-485A-BDCF-68FAD456AC02",
        "Org_iOS_Version": "2.0.1"
    }
    serialized_info = spider.util.utf8str(client_info)
    self.extJson = self._aes_.encrypt(serialized_info)

    self.select_user_agent("=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)")
def __init__(self):
    """Set up proxies, the Spider base class, output savers and client identities."""
    self.is_debug = False
    self._can_use_proxy_num = 0

    # Debug mode passes 1 to Spider.__init__; otherwise the count is the
    # length of proxies_dict after read_proxy() has run.
    spider_count = 1
    if not self.is_debug:
        self.proxies_dict = []
        self.read_proxy("../../_ct_proxy/proxy_all_filter.txt")
        spider_count = len(self.proxies_dict)
    Spider.__init__(self, spider_count)

    self.error_cnt = 0
    self._aes_ = CCIQ_AES()

    # Full records of the company list returned for a company-name query.
    self.query_company_list = FileSaver("all_company_list.txt")
    # Company names that have already been crawled.
    self.already_cname_list = FileSaver("all_company_list_already.txt")
    # Error type recorded for each crawled name that failed.
    self.already_error_type = FileSaver("all_already_error_type.txt")
    # Load the companies that were crawled previously.
    self.init_cname()

    # Pre-encrypted extJson payloads; user_agents has the same number of entries.
    ext_json_first = "Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr/uapICH92P/Crryt63u28aP4QP665AzcT/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4="
    ext_json_second = "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4="
    ext_json_third = "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49/aDwt3NZNp4TGa5iBFpYLm69F/6PPFoXIR/Aw5p48//8OgZFpddDUwQ="
    self.extJsons = [ext_json_first, ext_json_second, ext_json_third]

    self.user_agents = [
        "=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)",
        "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)",
        "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)",
    ]

    self.bloom = set()
def __init__(self):
    """Initialise the Spider base class with a fixed count of 1 and open the output savers."""
    # Proxy loading is disabled in this variant, so no proxies_dict is built.
    spider_count = 1
    Spider.__init__(self, spider_count)

    self.num_count = 0
    self._aes_ = CCIQ_AES()

    # Companies for which the app returned the full record.
    self.save_success = FileSaver("exist_company.txt")
    # Companies for which the app returned only a partial record.
    self.part_success = FileSaver("part_company.txt")
    # Company names whose query failed.
    self.fail_name = FileSaver("fail_name.txt")
def get_detail(self, cname, code, area): url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/detail" headers = {"Content-Type": "application/json"} encryptedJson = { "bl_oc_code": code, #"71526726X" "v1": "QZOrgV004", "isDirect": "1", "bl_oc_name": cname, #"腾讯科技" "bl_oc_area": area #"4403" } param = { "encryptedJson": self._aes_.encrypt(spider.util.utf8str(encryptedJson)), "extJson": self.extJson } param = spider.util.utf8str(param) res = self.request_url(url, headers=headers, data=param, proxies=self.proxies_dict[self.get_tid()]) if res is None: print 'res is none -- encryptedJson -->', str(encryptedJson) self.detail_failure.append(cname + "|" + str(code) + "|" + str(area)) return elif res.code == 404: print "404 ------ ", code self.detail_failure.append(cname + "|" + str(code) + "|" + str(area)) return elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504: print res.code, '------', code time.sleep(0.5) self.get_detail(cname, code, area) return elif res.code == 200: c = eval(res.text)['c'] if len(c) == 0: print '-----------------------------code ', code, ' res.text is null----------------------------' self.detail_failure.append(cname + "|" + str(code) + "|" + str(area)) return result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c) detail = eval(result) listGD = self.get_gd(area, code) if listGD is not None: detail['listGD'] = listGD['listGD'] print 'detail=================================', spider.util.utf8str( detail) self.detail_company.append(spider.util.utf8str(detail)) return else: print "cname %s #######################################UNKNOWN ERROR############################################# [ %d ]" % ( cname, res.code)
def get_branch(self,cname, now_page=1, list_branch=[], retry=0): """ 查询分支机构 """ url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/branch/select/page" encryptedJson = { "companyName" : cname, "v1" : "QZOrgV005", "page" : now_page, "pagesize" : "10" } res = self.req_all(url, encryptedJson) if res is None: return None if res.code == 200: try: c = eval(res.text)['c'] if len(c) == 0: print "get_branch --- cname=%s --- retry=%d --- reason:len(c)=0" % (cname, retry) return None result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c) temp = eval(result) if temp is not None: for t in temp['Branch']: list_branch.append(t) if len(temp['Branch']) == 10: if now_page > 3: return list_branch now_page += 1 print cname, "翻页 -----------------------------------> now_page", now_page return self.get_branch(cname, now_page=now_page, list_branch=list_branch, retry=retry) else: return list_branch else: print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d --- Branch is NULL" % (cname, retry, now_page) return None except Exception as err: print "get_branch --- cname=%s --- retry=%d --- reason:%s" % (cname, retry, err) if retry < 5: retry += 1 time.sleep(retry*1.5) return self.get_branch(cname, now_page=now_page, list_branch=list_branch, retry=retry) else: return None else: print "get_branch --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res.code) if retry < 5: retry += 1 time.sleep(retry*1.5) return self.get_branch(cname, now_page=now_page, list_branch=list_branch, retry=retry) else: return None
def get_branch(self, cname, now_page, list_branch): """ 查询分支机构 """ url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/branch/select/page" encryptedJson = { "companyName": cname, "v1": "QZOrgV004", "page": now_page, "pagesize": "10" } param = spider.util.utf8str({ "encryptedJson": self._aes_.encrypt(spider.util.utf8str(encryptedJson)), "extJson": self.extJson }) res = self.request_url(url, headers=self.headers, data=param, proxies=self.proxies_dict[self.get_tid()]) if res is None: print 'get_branch ------ res is none ---->', cname, now_page return None elif res.code == 404: print "get_branch ------ 404 --- ", cname, now_page return None elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504: print 'get_branch ------ ', res.code, cname, now_page time.sleep(0.5) return self.get_branch(cname, now_page) elif res.code == 200: c = eval(res.text)['c'] if len(c) == 0: print 'get_branch------res.text is null----------------------------', cname, now_page return None result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c) temp = eval(result) if temp is not None: for t in temp['Branch']: list_branch['Branch'].append(t) if len(temp['Branch']) == 10: now_page += 1 return self.get_branch(cname, now_page, list_branch) else: return list_branch else: print 'get_branch------Branch is null----------------------------', cname, now_page return None else: print cname, "######## get_branch ################ UNKNOWN ERROR ######################", res.code return None
def __init__(self):
    """Initialise the Spider base class with a count of 20 and prepare client identities."""
    spider_count = 20
    Spider.__init__(self, spider_count)
    self._aes_ = CCIQ_AES()

    self.proxy_filter = FileSaver("proxy_filter_030309_detail1.txt")

    # Pre-encrypted extJson payloads. Each literal already carries its own
    # double quotes and backslash-escaped slashes.
    ext_json_first = '"Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr\/uapICH92P\/Crryt63u28aP4QP665AzcT\/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4="'
    ext_json_second = '"ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ\/kgBkJt\/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a\/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4="'
    ext_json_third = '"ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ\/kgBkJt\/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49\/aDwt3NZNp4TGa5iBFpYLm69F\/6PPFoXIR\/Aw5p48\/\/8OgZFpddDUwQ="'
    self.extJsons = [ext_json_first, ext_json_second, ext_json_third]

    self.user_agents = [
        "=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)",
        "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)",
        "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)",
    ]
def get_gd(self, area, code): """ 获取股东信息 """ url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/gd/detail" encryptedJson = { "bl_oc_area": area, #4107 "v1": "QZOrgV004", "bl_oc_code": code #672867774 } param = spider.util.utf8str({ "encryptedJson": self._aes_.encrypt(spider.util.utf8str(encryptedJson)), "extJson": self.extJson }) res = self.request_url(url, headers=self.headers, data=param, proxies=self.proxies_dict[self.get_tid()]) if res is None: print 'get_gd ------ res is none -- get_gd code is -->', code return None elif res.code == 404: print "get_gd ------ 404 ------ ", code return None elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504: print 'get_gd ------ ', res.code, code time.sleep(0.5) return self.get_gd(area, code) elif res.code == 200: c = eval(res.text)['c'] if len(c) == 0: print 'get_gd ------', code, ' res.text is null----------------------------' return None result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c) list_gd = eval(result) return list_gd else: print code, "#######################################UNKNOWN ERROR#############################################", res.code return None
def get_inversted(self, url, encryptedJson): """ 通用请求方法 """ param = spider.util.utf8str({ "encryptedJson": self._aes_.encrypt(spider.util.utf8str(encryptedJson)), "extJson": self.extJson }) res = self.request_url(url, headers={"Content-Type": "application/json"}, data=param, proxies=self.proxies_dict[self.get_tid()]) if res is None: print 'res is none -- search gd code is -->', code return None elif res.code == 404: print "404 ------ ", code return None elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504: print res.code, '------', code time.sleep(0.5) return self.get_gd(area, code) elif res.code == 200: c = eval(res.text)['c'] if len(c) == 0: print '-----------------------------gd code', code, ' res.text is null----------------------------' % cname return None result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c) list_gd = eval(result) #print 'gd infos =======================',spider.util.utf8str(list_gd) return list_gd else: print code, "#######################################UNKNOWN ERROR#############################################", res.code return None
def get_inversted(self, cname): """ 查询投资信息 """ url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/map/invesment" headers = {"Content-Type": "application/json"} encryptedJson = {"input": cname, "v1": "QZOrgV004"} param = { "encryptedJson": self._aes_.encrypt(spider.util.utf8str(encryptedJson)), "extJson": self.extJson } param = spider.util.utf8str(param) res = self.request_url(url, headers=headers, data=param, proxies=self.proxies_dict[self.get_tid()]) if res is None: print 'get_inversted ------ res is none --', cname return None elif res.code == 404: print "get_inversted ------ 404 --- ", cname return None elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504: print 'get_inversted ------ ', res.code, cname time.sleep(0.5) return self.get_inversted(cname) elif res.code == 200: c = eval(res.text)['c'] if len(c) == 0: print 'get_inversted ------ ', cname, ' res.text is null----------------------------' return None result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c) list_inversted = eval(result) return list_inversted else: print cname, "############## get_inversted ############ UNKNOWN ERROR #################", res.code return None
def get_gd(self, code, retry=0): """ 获取股东信息 """ url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/gd/detail" encryptedJson = { "bl_oc_area": "", "v1": "QZOrgV005", "bl_oc_code": code } res = self.req_all(url, encryptedJson) if res is None: return None if res.code == 200: try: c = eval(res.text)['c'] if len(c) == 0: print "get_gd --- retry=%d --- reason:len(c)=0" % retry return None result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt( c) #print "获取股东信息结果:", spider.util.utf8str(result) return eval(result) except Exception as err: print "get_gd --- retry=%d --- reason:%s" % (retry, err) if retry < 5: retry += 1 time.sleep(retry * 1.5) return self.get_gd(code, retry=retry) else: return None else: print "get_gd --- retry=%d --- res.code=%d" % (retry, res.code) if retry < 5: retry += 1 time.sleep(retry * 1.5) return self.get_gd(code, retry=retry) else: return None
def get_inversted(self, cname, retry=0): """ 查询投资信息 """ url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/map/invesment" encryptedJson = {"input": cname, "v1": "QZOrgV005"} res = self.req_all(url, encryptedJson) if res is None: return None if res.code == 200: try: c = eval(res.text)['c'] if len(c) == 0: print "get_inversted --- cname=%s --- retry=%d --- reason:len(c)=0" % ( cname, retry) return None result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt( c) return eval(result) except Exception as err: print "get_inversted --- cname=%s --- retry=%d --- reason:%s" % ( cname, retry, err) if retry < 5: retry += 1 time.sleep(retry * 1.5) return self.get_inversted(cname, retry=retry) else: return None else: print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % ( cname, retry, res.code) if retry < 5: retry += 1 time.sleep(retry * 1.5) return self.get_inversted(cname, retry=retry) else: return None
def get_branch(self,cname, now_page, list_branch, retry): """ 查询分支机构 """ url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/branch/select/page" encryptedJson = { "companyName" : cname, "v1" : "QZOrgV005", "page" : now_page, "pagesize" : "10" } res = self.req_all(url, encryptedJson) res_code = 0 if res is None or (res.code >= 400 and res.code < 500): if res is not None: res_code = res.code print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d" % (cname, retry, now_page , res_code) if retry < 5: time.sleep(0.1) return self.get_branch(cname,now_page, list_branch, (retry+1)) else: return None res_code = res.code if res_code >= 500: print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d" % (cname, retry, now_page , res_code) time.sleep(1) return self.get_branch(cname, now_page, list_branch, (retry+1)) elif res_code == 200: try: c = eval(res.text)['c'] except Exception as err: print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d" % (cname, retry, now_page , res_code) print "get_branch --- exception res.text:\n", res.text if retry < 5: time.sleep(0.1) return self.get_branch(cname, now_page, list_branch, (retry+1)) else: return None if len(c) == 0: print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d --- len(c)=0" % (cname, retry, now_page , res_code) if retry < 5: time.sleep(0.1) return self.get_branch(cname, now_page, list_branch, (retry+1)) else: return None result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c) temp = eval(result) if temp is not None: for t in temp['Branch']: list_branch['Branch'].append(t) if len(temp['Branch']) == 10: now_page += 1 # if now_page >= 10: # return list_branch return self.get_branch(cname, now_page, list_branch, 0) else: return list_branch else: print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d --- Branch is NULL" % (cname, retry, now_page , res_code) return None else: print "get_branch --- cname=%s 
--- retry=%d --- now_page=%d --- res.code=%d --- UNKNOW ERROR" % (cname, retry, now_page , res_code) if retry < 5: time.sleep(1) return self.get_branch(cname, now_page, list_branch, (retry+1)) else: return None
def flip_over(self, now_page, cname, cnt, retry):
    """Query the company list for a company name, one page at a time.

    Every returned company is appended to ``self.query_company_list`` with an
    extra ``query_name`` key. A full page of 20 entries triggers a recursive
    call for the next page, up to page 100. Failures either re-queue the job
    through ``re_add_job`` or record an error type in
    ``self.already_error_type`` and mark the name as crawled via
    ``record_spider``.

    NOTE(review): the block nesting below was reconstructed from flattened
    text; confirm the placement of the final ``else`` branch, of
    ``self._can_use_proxy_num -= 1`` and of the ``"******"`` print.

    :param now_page: page number to request.
    :param cname: company name being searched.
    :param cnt: caller-supplied counter, carried through jobs and log lines.
    :param retry: retry count carried in the job dict.
    :raises AccountErrors.NoAccountError: when the per-thread failure counter
        for missing responses reaches 10, or the one for 4xx/202 reaches 5.
    """
    tid = self.get_tid()
    # Original description string, kept as a no-op expression statement.
    """ 根据公司名查询公司列表,翻页 """
    encryptedJson = {
        "pagesize": "20",
        "page": now_page,
        "od_orderBy": "0",
        "sh_searchType": "一般搜索",
        "sh_oc_areaName": "",
        "od_statusFilter": "0",
        "v1": "QZOrgV005",
        "oc_name": cname,
        "sh_u_uid": "",
        "sh_u_name": ""
    }
    # Error record; a "type" key is added on most failure paths.
    r_result = {"cname": cname}
    res = self.req_all(encryptedJson)
    res_code = 0
    if res is None:
        # No response: re-queue the job until the counter reaches 10.
        if self.get_fail_cnt('failcount-none', 1) < 10:
            self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d" % (
                tid, cnt, cname, retry, res_code, now_page)
            return
        else:
            # Disabled alternative (kept for reference):
            # if retry > 5:
            #     r_result["type"] = "None"
            #     self.already_error_type.append(spider.util.utf8str(r_result))
            #     self.record_spider(cname)
            #     print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d" % (tid, cnt, cname, retry, res_code, now_page)
            # else:
            #     self.re_add_job({'cname':cname,'cnt':cnt, 'retry':(retry+1)})
            self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
            raise AccountErrors.NoAccountError(
                "Maybe the proxy invalid,failcount-none = [ %d ],tid=[ %d ]" %
                (self.get_fail_cnt('failcount-none', 0), tid))
    else:
        # A response arrived: reset the missing-response counter.
        setattr(self._curltls, 'failcount-none', 0)
    res_code = res.code
    if (res_code >= 400 and res_code < 500) or res_code == 202:
        # 4xx and 202 share one counter; below 5 the job is simply re-queued.
        if self.get_fail_cnt('failcount-400', 1) < 5:
            self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d" % (
                tid, cnt, cname, retry, res_code, now_page)
            return
        else:
            if retry > 5:
                # Give up on this name and record the error type.
                r_result["type"] = "400+"
                self.already_error_type.append(
                    spider.util.utf8str(r_result))
                self.record_spider(cname)
            else:
                self.re_add_job({
                    'cname': cname,
                    'cnt': cnt,
                    'retry': (retry + 1)
                })
            self._can_use_proxy_num -= 1
            raise AccountErrors.NoAccountError(
                "Maybe the proxy invalid,failcount-400 = [ %d ],tid=[ %d ]" %
                (self.get_fail_cnt('failcount-400', 0), tid))
    else:
        setattr(self._curltls, 'failcount-400', 0)
    if res_code >= 500:
        # Disabled alternative (kept for reference):
        # if retry > 2:
        #     r_result["type"]="500"
        #     self.already_error_type.append(spider.util.utf8str(r_result))
        #     self.record_spider(cname)
        # else:
        # Server error: re-queue with the same retry count, then pause 1-9 seconds.
        self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
        print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d " % (
            tid, cnt, cname, retry, res_code, now_page)
        time.sleep(random.randrange(1, 10, 1))
        return
    elif res_code == 200:
        try:
            # NOTE(review): eval() runs on data received from the remote server.
            c = eval(res.text)['c']
        except Exception as err:
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception res.text - %s" % (
                tid, cnt, cname, retry, res_code, now_page, err)
            # Disabled alternative (kept for reference):
            # r_result["type"] = "res_error"
            # self.already_error_type.append(spider.util.utf8str(r_result))
            # self.record_spider(cname)
            # self.error_cnt += 1
            self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
            return
        if len(c) == 0:
            # Empty payload: record the error and mark the name as crawled.
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception 'C' IS NULL" % (
                tid, cnt, cname, retry, res_code, now_page)
            r_result["type"] = "c=0"
            self.already_error_type.append(spider.util.utf8str(r_result))
            self.record_spider(cname)
            self.error_cnt += 1
            return
        result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
        try:
            dic = eval(result)
        except Exception as err:
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception result:%s" % (
                tid, cnt, cname, retry, res_code, now_page, result)
            r_result["type"] = "result_error"
            self.already_error_type.append(spider.util.utf8str(r_result))
            self.record_spider(cname)
            self.error_cnt += 1
            return
        # NOTE(review): this local name shadows the builtin ``list``.
        list = dic['list']
        if len(list) == 0:
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception len(list)=0" % (
                tid, cnt, cname, retry, res_code, now_page)
            r_result["type"] = "list=0"
            self.already_error_type.append(spider.util.utf8str(r_result))
            self.record_spider(cname)
            self.error_cnt += 1
            return
        #print "tid=%d ### cnt=%d ### cname=%s ### retry=%d ### res.code=%d ### now_page:%d ### success:len(list):%d " % (tid, cnt, cname, retry, res_code, now_page, len(list))
        for l in list:
            # Copy each company record and tag it with the name that was queried.
            aa = {"query_name": cname}
            for k, v in l.items():
                aa[k] = v
            self.query_company_list.append(spider.util.utf8str(aa))
        print "******", len(list), spider.util.utf8str(list)
        if len(list) < 20:
            # A short page is the last page: mark the name as crawled.
            # r_result["type"] = "success"
            # self.already_error_type.append(spider.util.utf8str(r_result))
            self.record_spider(cname)
            return
        elif len(list) == 20:
            # A full page: fetch the next one unless the page cap is passed.
            if now_page > 100:
                # r_result carries no "type" key on this path.
                self.already_error_type.append(
                    spider.util.utf8str(r_result))
                self.record_spider(cname)
                return
            now_page += 1
            self.flip_over(now_page, cname, cnt, retry)
    else:
        # Any other status: re-queue up to 3 times, then record an unknown error.
        print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception UNKNOW ERROR" % (
            tid, cnt, cname, retry, res_code, now_page)
        if retry < 3:
            self.re_add_job({
                'cname': cname,
                'cnt': cnt,
                'retry': (retry + 1)
            })
            return
        r_result["type"] = "unknown_error"
        self.already_error_type.append(spider.util.utf8str(r_result))
        self.record_spider(cname)
        return
def get_gd(self, area, code, cname, retry): """ 获取股东信息 """ url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/gd/detail" encryptedJson = { "bl_oc_area" : area, #4107 "v1" : "QZOrgV005", "bl_oc_code" : code #672867774 } res = self.req_all(url, encryptedJson) res_code = 0 if res is None or (res.code >= 400 and res.code < 500): if res is not None: res_code = res.code print "get_gd --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code) if retry < 5: time.sleep(0.1) return self.get_gd(area, code, cname, (retry+1)) else: return None res_code = res.code if res_code >= 500: print "get_gd --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code) time.sleep(1) return self.get_gd(area, code, cname, retry) elif res_code == 200: try: c = eval(res.text)['c'] except Exception as err: print "get_gd --- cname=%s --- retry=%d --- res.code=%d " % (cname, retry, res_code) print "get_gd --- exception res.text:\n", res.text if retry < 5: time.sleep(0.1) return self.get_gd(area, code, cname, (retry+1)) else: return None if len(c) == 0: print "get_gd --- cname=%s --- retry=%d --- res.code=%d len(c)=0" % (cname, retry, res_code) if retry < 5: time.sleep(0.1) return self.get_gd(area, code, cname, (retry+1)) else: return None result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c) try: list_gd = eval(result) except Exception as err: print "get_gd --- cname=%s --- retry=%d --- res.code=%d " % (cname, retry, res_code) print 'get_gd --- eval(result) exception , result:\n',result if retry < 5: time.sleep(0.1) return self.get_gd(area, code, cname, (retry+1)) else: return None return list_gd else: print "get_gd --- cname=%s --- retry=%d --- res.code=%d ---UNKNOW ERROR" % (cname, retry, res_code) if retry < 5: time.sleep(0.1) return self.get_gd(area, code, cname, (retry+1)) else: return None
def get_inversted(self, cname, retry): """ 查询投资信息 """ url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/map/invesment" encryptedJson = { "input" : cname, "v1" : "QZOrgV005" } res = self.req_all(url, encryptedJson) res_code = 0 if res is None or (res.code >= 400 and res.code < 500): if res is not None: res_code = res.code print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code) if retry < 5: return self.get_inversted(cname, (retry+1)) else: return None res_code = res.code if res_code >= 400 and res_code < 500: print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code) if retry < 5: return self.get_inversted(cname, (retry+1)) else: return None elif res_code >= 500: print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code) time.sleep(1) return self.get_inversted(cname, retry) elif res.code == 200: try: c = eval(res.text)['c'] except Exception as err: print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code) print "get_inversted --- exception res.text:\n", res.text if retry < 5: time.sleep(0.1) return self.get_inversted(cname, (retry+1)) else: return None if len(c) == 0: print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code) if retry < 5: time.sleep(0.1) return self.get_inversted(cname, (retry+1)) else: return None result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c) try: list_inversted = eval(result) except Exception as err: print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code) print 'get_inversted --- eval(result) exception , result:\n', result if retry < 5: time.sleep(0.1) return self.get_inversted(cname, (retry+1)) else: return None return list_inversted else: print "get_inversted --- cname=%s --- retry=%d --- res.code=%d ---UNKNOW ERROR" % (cname, retry, res_code) if retry < 5: time.sleep(0.1) return self.get_inversted(cname, (retry+1)) 
else: return None
class QycxbDetail(Spider):
    """Spider that fetches company details from the qianzhan app service.

    Every input line describes one company (as written by the list
    crawler).  For each company the detail endpoint is queried and the
    result is enriched with shareholder, investment and branch data.

    Proxy file lines are expected as ``host:port`` or
    ``host:port:user:password``.
    """

    def __init__(self):
        self._can_use_proxy_num = 0
        self.is_debug = False
        # Always defined so that code paths that log the proxy do not fail
        # with AttributeError in debug mode.
        self.proxies_dict = []
        if self.is_debug:
            Spider.__init__(self, 1)
        else:
            self.read_proxy("proxy_032512.txt")
            # One worker per usable proxy.
            Spider.__init__(self, len(self.proxies_dict))
        self._aes_ = CCIQ_AES()
        # Successfully fetched details.
        self.query_success = FileSaver("c_query_detail.txt")
        # Input lines whose detail query failed.
        self.query_failure = FileSaver("c_query_detail_failure.txt")
        # Input lines that were already processed (success or failure).
        self.already_cname_list = FileSaver("c_already_detail.txt")
        # Load the already-processed lines into the module-level filter.
        self.init_cname()
        # Company names handled during this run.
        self.bloom = set()
        # Pre-encrypted "extJson" payloads; extJsons[i] is sent together
        # with user_agents[i].
        self.extJsons = [
            "Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr/uapICH92P/Crryt63u28aP4QP665AzcT/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4=",
            "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4=",
            "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49/aDwt3NZNp4TGa5iBFpYLm69F/6PPFoXIR/Aw5p48//8OgZFpddDUwQ=",
        ]
        self.user_agents = [
            "=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)",
            "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)",
            "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)",
        ]
        self.is_first = True
        self.init_time = 0

    def req_all(self, url, encryptedJson):
        """POST ``encryptedJson`` (AES-encrypted) to ``url``.

        A user agent / extJson pair is picked at random for every request.
        Returns whatever ``self.request_url`` returns.
        """
        number = random.randrange(len(self.user_agents))
        self.select_user_agent(self.user_agents[number])
        param = spider.util.utf8str({
            "encryptedJson": self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
            "extJson": self.extJsons[number],
        })
        # The request body carries forward slashes escaped as "\/".
        param = param.replace('/', "\\/")
        if self.is_first:
            self.init_time = time.time()
            print('初始化时间 %s' % (self.init_time,))
            self.is_first = False
        headers = {"Content-Type": "application/json"}
        if self.is_debug:
            proxies = {'http': 'http://*****:*****@121.41.79.4:18889',
                       'https': 'https://*****:*****@121.41.79.4:18889'}
        else:
            proxies = self.proxies_dict[self.get_tid()]
        return self.request_url(url, headers=headers, data=param, proxies=proxies)

    def init_cname(self):
        """Load already-processed lines into the global ``filter_name`` set."""
        with open("c_already_detail.txt", "r") as f:
            for line in f:
                filter_name.add(line.strip())

    def wait_q_breakable(self):
        """Wait for the job queues to drain.

        Returns True when nothing was running on two checks less than a
        second apart, False when no worker is left.
        """
        lt = 0
        while True:
            if not self.job_queue.empty() or not self.job_queue2.empty() or not self.job_queue3.empty():
                time.sleep(5)
            if time.time() < lt + 1 and self._running_count == 0:
                return True
            time.sleep(2)
            lt = time.time()
            if self._worker_count == 0:
                return False

    def dispatch(self):
        """Queue one job per not-yet-processed line of the input file."""
        with open("un_spider_queries.txt", "r") as f:
            for cnt, raw in enumerate(f, 1):
                line = raw.strip()
                if line in filter_name:
                    print("%s already spider!!!" % (cnt,))
                    continue
                self.add_job({"line": line, "cnt": cnt, "retry": 0}, True)
        self.wait_q_breakable()
        self.add_job(None, True)

    def record_spider(self, line, cname):
        """Mark ``line`` / ``cname`` as processed, whether it succeeded or not."""
        filter_name.add(line)
        self.already_cname_list.append(line)
        self.bloom.add(cname)

    def run_job(self, jobid):
        """Worker entry point: unpack the job dict and fetch the detail."""
        self.get_detail(jobid.get("line"), jobid.get("cnt"), jobid.get("retry"))

    def get_detail(self, line, cnt, retry):
        """Fetch, enrich and store the detail record of one company.

        Args:
            line: input line; evaluated into a dict that provides
                ``oc_name``, ``query_name``, ``oc_code`` and ``oc_area``.
            cnt: line number, used for logging only.
            retry: retry counter carried by the job.

        Raises:
            AccountErrors.NoAccountError: after repeated missing or 4xx/202
                responses.
        """
        tid = self.get_tid()
        try:
            # NOTE(security): eval() on file content; a literal-only parser
            # would be safer.
            param = eval(line)
        except Exception:
            print('tid=%d --- cnt=%d --- data is not json, return' % (tid, cnt))
            self.record_spider(line, 'UNKNOW')
            return

        cname = param['oc_name']
        if cname in self.bloom:
            # Fall back to the name the list query was made with.
            cname = param['query_name']
            if cname in self.bloom:
                print('query_name:%s aleready crawler...' % (cname,))
                return
        ccode = param['oc_code']
        carea = param['oc_area']

        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/detail"
        encryptedJson = {
            "bl_oc_code": ccode,
            "v1": "QZOrgV005",
            "isDirect": "0",
            "bl_oc_name": cname,
            "bl_oc_area": carea,
        }
        res = self.req_all(url, encryptedJson)
        log_fmt = "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d"

        if res is None:
            if self.get_fail_cnt(1, 'failcount-none') < 10:
                self.re_add_job({'line': line, 'cnt': cnt, 'retry': retry})
                print((log_fmt + " ") % (tid, cnt, cname, retry, 0))
                return
            self.re_add_job({'line': line, 'cnt': cnt, 'retry': (retry + 1)})
            self._can_use_proxy_num -= 1
            raise AccountErrors.NoAccountError(
                "Maybe the proxy invalid,failcount-none = [ %d ]"
                % self.get_fail_cnt(0, 'failcount-none'))
        setattr(self._curltls, 'failcount-none', 0)

        res_code = res.code
        prefix = log_fmt % (tid, cnt, cname, retry, res_code)

        if 400 <= res_code < 500 or res_code == 202:
            self.re_add_job({'line': line, 'cnt': cnt, 'retry': (retry + 1)})
            print(prefix + " ")
            if self.get_fail_cnt(1, 'failcount-400') > 30:
                self._can_use_proxy_num -= 1
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy invalid,failcount-400 = [ %d ]"
                    % self.get_fail_cnt(0, 'failcount-400'))
            return
        setattr(self._curltls, 'failcount-400', 0)

        if res_code >= 500:
            self.re_add_job({'line': line, 'cnt': cnt, 'retry': (retry + 1)})
            print(prefix + " ")
            time.sleep(2)
            return

        if res_code != 200:
            self.query_failure.append(line)
            self.record_spider(line, cname)
            print(prefix + " --- exception UNKNOW ERROR")
            return

        try:
            # NOTE(security): eval() on a network response.
            c = eval(res.text)['c']
        except Exception:
            print(prefix + " exception res.text ")
            self.query_failure.append(line)
            self.record_spider(line, cname)
            return
        if len(c) == 0:
            print(prefix + " --- exception 'C' IS NULL")
            self.query_failure.append(line)
            self.record_spider(line, cname)
            return

        result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
        try:
            detail = eval(result)
        except Exception:
            print(prefix + " --- exception result:%s" % (result,))
            self.query_failure.append(line)
            self.record_spider(line, cname)
            return

        # Shareholder information.
        listGD = self.get_gd(carea, ccode, cname, 0)
        if listGD is not None:
            detail['listGD'] = listGD['listGD']
        # Investment information.
        list_inversted = self.get_inversted(cname, 0)
        if list_inversted is not None:
            detail['inversted'] = list_inversted['inversted']
        # Branch offices.
        list_branch = self.get_branch(cname, 1, {"Branch": []}, 0)
        if list_branch is not None:
            detail['Branch'] = list_branch['Branch']

        self.query_success.append(spider.util.utf8str(detail))
        self.record_spider(line, cname)
        # The record is already saved; logging must not fail when no proxy
        # list exists (debug mode) or the thread id is out of range.
        proxies = getattr(self, 'proxies_dict', None) or []
        proxy = proxies[tid] if 0 <= tid < len(proxies) else None
        print("tid=%d --- proxy=%s --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- success:\n %s"
              % (tid, proxy, cnt, cname, retry, res_code, spider.util.utf8str(detail)))

    def get_gd(self, area, code, cname, retry, max_retry=5,
               max_server_retry=30, server_delay=1):
        """Fetch the shareholder information of a company.

        Retries in a loop.  5xx responses used to recurse without touching
        ``retry`` (unbounded); they now have their own bounded budget.

        Returns the evaluated decrypted payload, or None on failure.
        """
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/gd/detail"
        encryptedJson = {
            "bl_oc_area": area,
            "v1": "QZOrgV005",
            "bl_oc_code": code,
        }
        server_errors = 0
        while True:
            res = self.req_all(url, encryptedJson)
            res_code = 0 if res is None else res.code
            prefix = "get_gd --- cname=%s --- retry=%d --- res.code=%d" % (
                cname, retry, res_code)

            if res is None or 400 <= res_code < 500:
                print(prefix)
            elif res_code >= 500:
                print(prefix)
                server_errors += 1
                if server_errors > max_server_retry:
                    return None
                time.sleep(server_delay)
                continue
            elif res_code == 200:
                try:
                    # NOTE(security): eval() on a network response.
                    c = eval(res.text)['c']
                except Exception:
                    print(prefix + " ")
                    print("get_gd --- exception res.text:\n%s" % (res.text,))
                else:
                    if len(c) == 0:
                        print(prefix + " len(c)=0")
                    else:
                        result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
                        try:
                            return eval(result)
                        except Exception:
                            print(prefix + " ")
                            print('get_gd --- eval(result) exception , result:\n%s' % (result,))
            else:
                print(prefix + " ---UNKNOW ERROR")

            if retry >= max_retry:
                return None
            time.sleep(0.1)
            retry += 1

    def get_inversted(self, cname, retry, max_retry=5, max_server_retry=30,
                      server_delay=1):
        """Fetch the investment ("inversted") information of a company.

        Same retry policy as ``get_gd``, except that a missing or 4xx
        response is retried without sleeping.

        Returns the evaluated decrypted payload, or None on failure.
        """
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/map/invesment"
        encryptedJson = {
            "input": cname,
            "v1": "QZOrgV005",
        }
        server_errors = 0
        while True:
            res = self.req_all(url, encryptedJson)
            res_code = 0 if res is None else res.code
            prefix = "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (
                cname, retry, res_code)
            pause = 0.1

            if res is None or 400 <= res_code < 500:
                print(prefix)
                pause = 0
            elif res_code >= 500:
                print(prefix)
                server_errors += 1
                if server_errors > max_server_retry:
                    return None
                time.sleep(server_delay)
                continue
            elif res_code == 200:
                try:
                    # NOTE(security): eval() on a network response.
                    c = eval(res.text)['c']
                except Exception:
                    print(prefix)
                    print("get_inversted --- exception res.text:\n%s" % (res.text,))
                else:
                    if len(c) == 0:
                        print(prefix)
                    else:
                        result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
                        try:
                            return eval(result)
                        except Exception:
                            print(prefix)
                            print('get_inversted --- eval(result) exception , result:\n%s' % (result,))
            else:
                print(prefix + " ---UNKNOW ERROR")

            if retry >= max_retry:
                return None
            if pause:
                time.sleep(pause)
            retry += 1

    def get_branch(self, cname, now_page, list_branch, retry):
        """Collect all branch offices of a company, page by page.

        Args:
            cname: company name.
            now_page: first page to request.
            list_branch: dict with a ``'Branch'`` list that is extended in
                place and returned.
            retry: retries already consumed for the current page.

        Returns:
            ``list_branch`` once a page with fewer than ``page_size``
            entries was read, or None when a page could not be fetched.
        """
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/branch/select/page"
        page_size = 10
        while True:
            encryptedJson = {
                "companyName": cname,
                "v1": "QZOrgV005",
                "page": now_page,
                "pagesize": str(page_size),
            }
            res = self.req_all(url, encryptedJson)
            res_code = 0 if res is None else res.code
            prefix = "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d" % (
                cname, retry, now_page, res_code)
            pause = 0.1

            if res is None or 400 <= res_code < 500:
                print(prefix)
            elif res_code >= 500:
                # Counts against the retry budget; this branch used to
                # recurse without ever checking the limit.
                print(prefix)
                pause = 1
            elif res_code == 200:
                try:
                    # NOTE(security): eval() on a network response.
                    c = eval(res.text)['c']
                except Exception:
                    print(prefix)
                    print("get_branch --- exception res.text:\n%s" % (res.text,))
                else:
                    if len(c) == 0:
                        print(prefix + " --- len(c)=0")
                    else:
                        result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
                        try:
                            temp = eval(result)
                        except Exception:
                            print(prefix)
                            print('get_branch --- eval(result) exception , result:\n%s' % (result,))
                        else:
                            if temp is None:
                                print(prefix + " --- Branch is NULL")
                                return None
                            page_items = temp['Branch']
                            list_branch['Branch'].extend(page_items)
                            if len(page_items) != page_size:
                                return list_branch
                            # Full page: there may be more, start the next
                            # page with a fresh retry budget.
                            now_page += 1
                            retry = 0
                            continue
            else:
                print(prefix + " --- UNKNOW ERROR")
                pause = 1

            if retry >= 5:
                return None
            time.sleep(pause)
            retry += 1

    def get_fail_cnt(self, addv, type):
        """Add ``addv`` to the per-thread counter named ``type`` and return it.

        ``type`` shadows the builtin but is kept for existing callers.
        """
        fc = getattr(self._curltls, type, 0)
        if addv:
            fc += addv
            setattr(self._curltls, type, fc)
        return fc

    def event_handler(self, evt, msg, **kwargs):
        """Send a notification mail when the spider reports ``DONE``."""
        if evt == 'DONE':
            msg += '企业查询宝APP公司详情detail查询已经停止...'
            spider.util.sendmail('*****@*****.**', '%s DONE' % sys.argv[0], msg)

    def read_proxy(self, fn):
        """Load proxies from ``fn`` into ``self.proxies_dict``.

        Blank lines and lines starting with ``#`` are skipped; they used to
        be turned into unusable proxy entries.
        """
        with open(fn, 'r') as f:
            for raw in f:
                line = raw.strip()
                if not line or line.startswith('#'):
                    continue
                self._match_proxy(line)
        print(" loaded [ %d ] proxis " % len(self.proxies_dict))

    def _match_proxy(self, line):
        """Convert one proxy file line into a proxies dict and store it."""
        m = re.match(r'([0-9.]+):(\d+):([a-z0-9]+):([a-z0-9._-]+)$', line, re.I)
        m1 = re.match(r'([0-9.]+):(\d+):([a-z0-9]+)$', line, re.I)
        if m:
            # host:port:user:password -> user:password@host:port
            prstr = '%s:%s@%s:%s' % (m.group(3), m.group(4), m.group(1), m.group(2))
        elif m1:
            # host:port:<extra> -> host:port
            prstr = '%s:%s' % (m1.group(1), m1.group(2))
        else:
            prstr = line
        self.proxies_dict.append({'http': 'http://' + prstr, 'https': 'https://' + prstr})
class QycxbQuery(Spider):
    """Spider that searches the qianzhan app service by company name.

    For every name of the input file the search endpoint is queried and
    every returned company is stored together with the query name.
    """

    def __init__(self):
        self.is_debug = True
        self._can_use_proxy_num = 0
        self.proxies_dict = []
        if self.is_debug:
            Spider.__init__(self, 80)
        else:
            self.read_proxy("../../_ct_proxy/proxy_all_filter.txt")
            # One worker per usable proxy.
            Spider.__init__(self, len(self.proxies_dict))
        self.error_cnt = 0
        self._aes_ = CCIQ_AES()
        # Every company returned by the name search.
        self.query_company_list = FileSaver("all_company_list.txt")
        # Company names that were already queried.
        self.already_cname_list = FileSaver("all_company_list_already.txt")
        # Queried names that ended in an error, with the error type.
        self.already_error_type = FileSaver("all_already_error_type.txt")
        # Queries that still had full pages after the page limit.
        self.need_flip_page_data = FileSaver("beijing_need_flip_page_data.txt")
        # Load the already-queried names into the module-level filter.
        self.init_cname()
        # Pre-encrypted "extJson" payloads; extJsons[i] is sent together
        # with user_agents[i].
        self.extJsons = [
            "Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr/uapICH92P/Crryt63u28aP4QP665AzcT/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4=",
            "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4=",
            "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49/aDwt3NZNp4TGa5iBFpYLm69F/6PPFoXIR/Aw5p48//8OgZFpddDUwQ=",
        ]
        self.user_agents = [
            "=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)",
            "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)",
            "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)",
        ]
        self.bloom = set()
        self.proxy_error_cnt = 0
        self.lock = threading.Lock()

    def req_all(self, encryptedJson, retry=0, cname=None):
        """POST an encrypted search request, retrying on failure.

        A missing or non-200 response is retried up to 10 times after a
        random 1-4 second pause.  ``cname`` is used for logging only; it is
        now kept across retries (it used to be dropped).

        Returns the last response, which may be None or non-200 when all
        retries are used up.
        """
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/search"
        headers = {"Content-Type": "application/json", "Accept-Language": "zh-Hans-CN;q=1"}
        while True:
            number = random.randrange(len(self.user_agents))
            self.select_user_agent(self.user_agents[number])
            param = spider.util.utf8str({
                "encryptedJson": self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
                "extJson": self.extJsons[number],
            })
            # The request body carries forward slashes escaped as "\/".
            param = param.replace('/', "\\/")
            if self.is_debug:
                res = self.request_url(
                    url, headers=headers, timeout=20, data=param,
                    proxies={'http': 'http://*****:*****@192.168.1.39:3428',
                             'https': 'https://*****:*****@192.168.1.39:3428'})
            else:
                res = self.request_url(url, headers=headers, data=param,
                                       proxies=self.proxies_dict[self.get_tid()])
            if res is not None and res.code == 200:
                return res

            status = "res is none" if res is None else "res.code=%d" % (res.code)
            print("访问错误 %s %s" % (cname, status))
            self.error_add()
            if retry >= 10:
                return res
            time.sleep(random.randrange(1, 5, 1))
            retry += 1

    def init_cname(self):
        """Load already-queried names into the global ``filter_name`` set."""
        i = 0
        with open("all_company_list_already.txt", "r") as f:
            for line in f:
                i += 1
                filter_name.add(line.strip())
        print("init already query company name finish... %s" % (i,))

    def error_add(self):
        """Hook called on every failed request; intentionally a no-op."""
        pass

    def restart_jb(self):
        """Re-dial the proxy host once 200 proxy errors have accumulated."""
        if self.proxy_error_cnt < 200:
            return
        self.proxy_error_cnt = 0
        print("=============================重新启动拨号脚本=================================")
        # NOTE(security): the password is hard-coded in the shell command.
        os.system("sshpass -p 'helloipin' ssh [email protected] /home/ipin/bin/redial")
        time.sleep(10)
        os.system("sshpass -p 'helloipin' ssh [email protected] /home/ipin/bin/getip")
        print("=============================重新启动拨号脚本成功==============================")

    def wait_q_breakable(self):
        """Wait for the job queues to drain.

        Returns True when nothing was running on two checks less than a
        second apart, False when no worker is left.
        """
        lt = 0
        while True:
            if not self.job_queue.empty() or not self.job_queue2.empty() or not self.job_queue3.empty():
                time.sleep(5)
            if time.time() < lt + 1 and self._running_count == 0:
                return True
            time.sleep(2)
            lt = time.time()
            if self._worker_count == 0:
                return False

    def dispatch(self):
        """Queue one job per not-yet-queried company name."""
        with open("beijing_cname.txt", "r") as f:
            for cnt, raw in enumerate(f, 1):
                line = raw.strip()
                if line in filter_name:
                    continue
                self.add_job({"cname": line, "cnt": cnt, "retry": 0}, True)
        self.wait_q_breakable()
        self.add_job(None, True)

    def record_spider(self, line):
        """Mark ``line`` as queried, whether it succeeded or not."""
        filter_name.add(line)
        self.already_cname_list.append(line)
        self.proxy_error_cnt = 0

    def run_job(self, job):
        """Worker entry point: search for the company name of ``job``."""
        cname = job.get("cname")
        cnt = job.get("cnt")
        retry = job.get("retry")
        if cname is None:
            print('cname =  %s  is None ,break~' % (cnt,))
            return
        self.flip_over(1, cname, cnt, retry)

    def _record_error(self, cname, error_type):
        """Store ``cname`` with its error type and mark it as queried."""
        self.already_error_type.append(
            spider.util.utf8str({"cname": cname, "type": error_type}))
        self.record_spider(cname)

    def flip_over(self, now_page, cname, cnt, retry):
        """Query the company list for ``cname`` and walk through its pages.

        Pages are requested starting at ``now_page``.  Paging stops at the
        first page that is not full, or once a full page beyond page 3 was
        read (the query is then saved to ``need_flip_page_data``).

        Args:
            now_page: first page to request.
            cname: company name to search for.
            cnt: input line number, used for logging only.
            retry: retry counter carried by the job.
        """
        tid = self.get_tid()
        page_size = 20
        while True:
            encryptedJson = {
                "pagesize": str(page_size),
                "page": now_page,
                "od_orderBy": "0",
                "sh_searchType": "一般搜索",
                "sh_oc_areaName": "",
                "od_statusFilter": "0",
                "v1": "QZOrgV005",
                "oc_name": cname,
                "sh_u_uid": "",
                "sh_u_name": "",
            }
            res = self.req_all(encryptedJson, cname=cname)
            if res is None:
                self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
                return
            # Previously never assigned, so every log line showed 0.
            res_code = res.code
            prefix = "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d" % (
                tid, cnt, cname, retry, res_code, now_page)

            if u"处理请求时服务器遇到错误。有关详细信息,请参见服务器日志" in res.text:
                print("处理请求时服务器遇到错误。有关详细信息,请参见服务器日志...")
                if retry < 3:
                    self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': (retry + 1)})
                else:
                    self._record_error(cname, "request-server-error")
                return

            if u"服务不可用。" in res.text or u"Unauthorized!" in res.text:
                self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
                # Response text is converted first, like the other log
                # lines, so the message is a single string.
                print("系统不可用... %s %s" % (cname, spider.util.utf8str(res.text)))
                return

            try:
                # NOTE(security): eval() on a network response.
                c = eval(res.text)['c']
            except Exception as err:
                print(prefix + " --- exception:%s --- res.text: %s " % (
                    err, spider.util.utf8str(res.text)))
                if retry < 3:
                    self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': (retry + 1)})
                else:
                    self._record_error(cname, "res.text=invalid")
                return

            if len(c) == 0:
                print(prefix + " --- exception 'C' IS NULL")
                self._record_error(cname, "c=0")
                self.error_cnt += 1
                return

            result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
            try:
                dic = eval(result)
            except Exception:
                print(prefix + " --- exception result:%s" % (result,))
                self._record_error(cname, "result_error")
                self.error_cnt += 1
                return

            rows = dic['list']
            if len(rows) == 0:
                print(prefix + " --- exception len(list)=0")
                self._record_error(cname, "list=0")
                self.error_cnt += 1
                return

            for row in rows:
                entry = {"query_name": cname}
                entry.update(row)
                self.query_company_list.append(spider.util.utf8str(entry))
            print("%s ****** %s %s" % (cnt, len(rows), spider.util.utf8str(rows)))

            if len(rows) < page_size:
                self.record_spider(cname)
                return
            if now_page > 3:
                # Too many results: remember the query instead of paging on.
                self.need_flip_page_data.append(spider.util.utf8str(encryptedJson))
                self.record_spider(cname)
                return
            now_page += 1

    def get_fail_cnt(self, type_key, addv):
        """Add ``addv`` to the per-thread counter ``type_key`` and return it."""
        fc = getattr(self._curltls, type_key, 0)
        if addv:
            fc += addv
            setattr(self._curltls, type_key, fc)
        return fc

    def event_handler(self, evt, msg, **kwargs):
        """Send a notification mail when the spider reports ``DONE``."""
        if evt == 'DONE':
            msg += '企业查询宝APP公司列表查询已经停止...错误数:' + str(self.error_cnt)
            spider.util.sendmail('*****@*****.**', '%s DONE' % sys.argv[0], msg)

    def read_proxy(self, fn):
        """Load proxies from ``fn`` into ``self.proxies_dict``.

        Blank lines and lines starting with ``#`` are skipped; they used to
        be turned into unusable proxy entries.
        """
        with open(fn, 'r') as f:
            for raw in f:
                line = raw.strip()
                if not line or line.startswith('#'):
                    continue
                self._match_proxy(line)
        self._can_use_proxy_num = len(self.proxies_dict)
        print(" loaded [ %d ] proxis " % self._can_use_proxy_num)

    def _match_proxy(self, line):
        """Convert one proxy file line into a proxies dict and store it."""
        m = re.match(r'([0-9.]+):(\d+):([a-z0-9]+):([a-z0-9._-]+)$', line, re.I)
        m1 = re.match(r'([0-9.]+):(\d+):([a-z0-9]+)$', line, re.I)
        if m:
            # host:port:user:password -> user:password@host:port
            prstr = '%s:%s@%s:%s' % (m.group(3), m.group(4), m.group(1), m.group(2))
        elif m1:
            # host:port:<extra> -> host:port
            prstr = '%s:%s' % (m1.group(1), m1.group(2))
        else:
            prstr = line
        self.proxies_dict.append({'http': 'http://' + prstr, 'https': 'https://' + prstr})
def flip_over(self, now_page, cname):
    """Search companies by name and store every result page.

    Starting at ``now_page``, the search endpoint is requested page after
    page until a page comes back with an empty payload or an empty result
    list, or a request fails.  Each company is written in full to
    ``self.save_success`` and as a ``|``-separated summary to
    ``self.part_success``.

    Paging is done in a loop; the previous version called itself once per
    page, so the recursion depth grew with the number of result pages.

    Args:
        now_page: first page to request.
        cname: company name to search for.
    """
    url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/search"
    headers = {"Content-Type": "application/json"}
    extJson = {
        "cl_screenSize": "640x960",
        "cl_cookieId": "16923697-D73E-485A-BDCF-68FAD456AC02",
        "Org_iOS_Version": "2.0.1",
    }
    while True:
        encryptedJson = {
            "pagesize": "20",
            "page": now_page,
            "od_orderBy": "0",
            "sh_searchType": "一般搜索",
            "od_statusFilter": "0",
            "v1": "QZOrgV004",
            "oc_name": cname,
            "sh_u_uid": "",
            "sh_u_name": "",
        }
        param = spider.util.utf8str({
            "encryptedJson": self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
            "extJson": self._aes_.encrypt(spider.util.utf8str(extJson)),
        })
        res = self.request_url(url, headers=headers, data=param)

        if res is None:
            print('res is none -- search company name is --> %s' % (cname,))
            self.fail_name.append(cname)
            return
        if res.code == 404:
            print("%s ------ 404" % (cname,))
            return
        if res.code in (500, 502, 503, 504):
            print("%s ------ %d " % (cname, res.code))
            # Queue the name again; the new job starts over at page 1.
            self.add_job({'cname': cname})
            time.sleep(0.5)
            return
        if res.code != 200:
            print("cname %s #######################################UNKNOWN ERROR############################################# [ %d ]" % (
                cname, res.code))
            return

        # NOTE(security): eval() on a network response.
        c = eval(res.text)['c']
        if len(c) == 0:
            print('-----------------------------cname %s res.text is null----------------------------' % (cname,))
            return
        result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c)
        dic = eval(result)
        companies = dic['list']
        if len(companies) == 0:
            print('cname %s result list length = 0 ' % (cname,))
            return
        print('cname %s result ################### now get list length is %d' % (
            cname, len(companies)))

        for company in companies:
            self.save_success.append(spider.util.utf8str(dict(company)))
            summary = cname + "|" + company['oc_name'] + "|" + str(
                company['oc_area']) + "|" + str(company['oc_code']) + "|" + str(
                company['oc_number'])
            self.part_success.append(summary)

        print("-------------------------------------------cname %s page %d finish-----------------------------------" % (
            cname, now_page))
        rowcount = dic['rowcount']
        print("==============cname %s=======page %d=========rowcount %d===========" % (
            cname, now_page, rowcount))

        now_page += 1
        time.sleep(0.1)
def get_detail(self, line, cnt, retry):
    """Fetch, enrich and store the detail record of one company.

    ``line`` is evaluated into a dict that provides ``oc_name``,
    ``query_name``, ``oc_code`` and ``oc_area``.  The detail endpoint is
    queried through ``self.req_all``; on success the decrypted record is
    extended with shareholder (``get_gd``), investment (``get_inversted``)
    and branch (``get_branch``) data and written to ``self.query_success``.

    Outcomes:
        * no response, 4xx/202 or 5xx: the job is re-queued;
        * unparseable or empty payload, or any other status code: the line
          goes to ``self.query_failure`` and is marked as processed.

    Args:
        line: raw input line describing the company.
        cnt: line number, used for logging only.
        retry: retry counter carried by the job.

    Raises:
        AccountErrors.NoAccountError: after repeated missing responses or
            repeated 4xx/202 responses.
    """
    tid = self.get_tid()
    try:
        # NOTE(security): eval() on file content; a literal-only parser
        # would be safer.
        param = eval(line)
    except Exception:
        print('tid=%d --- cnt=%d --- data is not json, return' % (tid, cnt))
        self.record_spider(line, 'UNKNOW')
        return

    cname = param['oc_name']
    if cname in self.bloom:
        # Fall back to the name the list query was made with.
        cname = param['query_name']
        if cname in self.bloom:
            print('query_name:%s aleready crawler...' % (cname,))
            return
    ccode = param['oc_code']
    carea = param['oc_area']

    url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/detail"
    encryptedJson = {
        "bl_oc_code": ccode,
        "v1": "QZOrgV005",
        "isDirect": "0",
        "bl_oc_name": cname,
        "bl_oc_area": carea,
    }
    res = self.req_all(url, encryptedJson)
    log_fmt = "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d"

    if res is None:
        if self.get_fail_cnt(1, 'failcount-none') < 10:
            self.re_add_job({'line': line, 'cnt': cnt, 'retry': retry})
            print((log_fmt + " ") % (tid, cnt, cname, retry, 0))
            return
        self.re_add_job({'line': line, 'cnt': cnt, 'retry': (retry + 1)})
        self._can_use_proxy_num -= 1
        raise AccountErrors.NoAccountError(
            "Maybe the proxy invalid,failcount-none = [ %d ]"
            % self.get_fail_cnt(0, 'failcount-none'))
    setattr(self._curltls, 'failcount-none', 0)

    res_code = res.code
    prefix = log_fmt % (tid, cnt, cname, retry, res_code)

    if 400 <= res_code < 500 or res_code == 202:
        self.re_add_job({'line': line, 'cnt': cnt, 'retry': (retry + 1)})
        print(prefix + " ")
        if self.get_fail_cnt(1, 'failcount-400') > 30:
            self._can_use_proxy_num -= 1
            raise AccountErrors.NoAccountError(
                "Maybe the proxy invalid,failcount-400 = [ %d ]"
                % self.get_fail_cnt(0, 'failcount-400'))
        return
    setattr(self._curltls, 'failcount-400', 0)

    if res_code >= 500:
        self.re_add_job({'line': line, 'cnt': cnt, 'retry': (retry + 1)})
        print(prefix + " ")
        time.sleep(2)
        return

    if res_code != 200:
        self.query_failure.append(line)
        self.record_spider(line, cname)
        print(prefix + " --- exception UNKNOW ERROR")
        return

    try:
        # NOTE(security): eval() on a network response.
        c = eval(res.text)['c']
    except Exception:
        print(prefix + " exception res.text ")
        self.query_failure.append(line)
        self.record_spider(line, cname)
        return
    if len(c) == 0:
        print(prefix + " --- exception 'C' IS NULL")
        self.query_failure.append(line)
        self.record_spider(line, cname)
        return

    result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
    try:
        detail = eval(result)
    except Exception:
        print(prefix + " --- exception result:%s" % (result,))
        self.query_failure.append(line)
        self.record_spider(line, cname)
        return

    # Shareholder information.
    listGD = self.get_gd(carea, ccode, cname, 0)
    if listGD is not None:
        detail['listGD'] = listGD['listGD']
    # Investment information.
    list_inversted = self.get_inversted(cname, 0)
    if list_inversted is not None:
        detail['inversted'] = list_inversted['inversted']
    # Branch offices.
    list_branch = self.get_branch(cname, 1, {"Branch": []}, 0)
    if list_branch is not None:
        detail['Branch'] = list_branch['Branch']

    self.query_success.append(spider.util.utf8str(detail))
    self.record_spider(line, cname)
    # The record is already saved at this point; the log line must not
    # raise when no proxy list exists (debug mode) or the thread id is out
    # of range.
    proxies = getattr(self, 'proxies_dict', None) or []
    proxy = proxies[tid] if 0 <= tid < len(proxies) else None
    print("tid=%d --- proxy=%s --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- success:\n %s"
          % (tid, proxy, cnt, cname, retry, res_code, spider.util.utf8str(detail)))
class QycxbApp(Spider): """ 根据企业名称 访问接口 获得公司是否存在,存在则拿出其注册号和公司名称保存,不存在则忽略 """ def __init__(self): #self.proxies_dict = [] #self.read_proxy("../spider/proxy/proxy.txt") #Spider.__init__(self, len(self.proxies_dict)) Spider.__init__(self, 1) self.num_count = 0 self._aes_ = CCIQ_AES() #APP可以拿到的公司全部信息 self.save_success = FileSaver("exist_company.txt") #APP可以拿到的公司局部信息 self.part_success = FileSaver("part_company.txt") #查询失败的公司名 self.fail_name = FileSaver("fail_name.txt") def wait_q_breakable(self): lt = 0 while True: if not self.job_queue.empty() or not self.job_queue2.empty( ) or not self.job_queue3.empty(): time.sleep(5) if time.time() < lt + 1 and self._running_count == 0: return True time.sleep(2) lt = time.time() if self._worker_count == 0: return False def dispatch(self): with open("old-company.txt", "r") as f: while True: line = f.readline().strip() ary = line.split(" ") if len(ary) == 3: #print 'read company name is ', ary[2] job = {"cname": ary[2]} self.add_job(job, True) self.wait_q_breakable() self.add_job(None, True) def get_fail_cnt(self, addv): fc = getattr(self._curltls, 'failcount', 0) if (addv): fc += addv setattr(self._curltls, 'failcount', fc) return fc def run_job(self, jobid): self.select_user_agent("=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)") cname = jobid.get("cname") self.flip_over(1, cname) def flip_over(self, now_page, cname): url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/search" headers = {"Content-Type": "application/json"} encryptedJson = { "pagesize": "20", "page": now_page, "od_orderBy": "0", "sh_searchType": "一般搜索", "od_statusFilter": "0", "v1": "QZOrgV004", "oc_name": cname, "sh_u_uid": "", "sh_u_name": "" } extJson = { "cl_screenSize": "640x960", "cl_cookieId": "16923697-D73E-485A-BDCF-68FAD456AC02", "Org_iOS_Version": "2.0.1" } param = { "encryptedJson": self._aes_.encrypt(spider.util.utf8str(encryptedJson)), "extJson": self._aes_.encrypt(spider.util.utf8str(extJson)) } param = spider.util.utf8str(param) res = 
self.request_url(url, headers=headers, data=param) if res is None: print 'res is none -- search company name is -->', cname self.fail_name.append(cname) return elif res.code == 404: print "%s ------ 404" % cname return elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504: print "%s ------ %d " % (cname, res.code) self.add_job({'cname': cname}) time.sleep(0.5) return elif res.code == 200: c = eval(res.text)['c'] if len(c) == 0: print '-----------------------------cname %s res.text is null----------------------------' % cname return result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c) dic = eval(result) list = dic['list'] if len(list) == 0: print 'cname %s result list length = 0 ' % cname return print 'cname %s result ################### now get list length is %d' % ( cname, len(list)) for l in list: aa = {} for k, v in l.items(): aa[k] = v self.save_success.append(spider.util.utf8str(aa)) x = cname + "|" + l['oc_name'] + "|" + str( l['oc_area']) + "|" + str(l['oc_code']) + "|" + str( l['oc_number']) self.part_success.append(x) print "-------------------------------------------cname %s page %d finish-----------------------------------" % ( cname, now_page) rowcount = dic['rowcount'] print "==============cname %s=======page %d=========rowcount %d===========" % ( cname, now_page, rowcount) # page_count = rowcount/20 if rowcount%20==0 else (rowcount/20+1) # if now_page < page_count: # now_page += 1 # self.flip_over(now_page,cname) # time.sleep(0.1) now_page += 1 time.sleep(0.1) self.flip_over(now_page, cname) return else: print "cname %s #######################################UNKNOWN ERROR############################################# [ %d ]" % ( cname, res.code) def event_handler(self, evt, msg, **kwargs): if evt == 'DONE': msg += '企业查询宝APP爬取已经停止...' 
spider.util.sendmail('*****@*****.**', '%s DONE' % sys.argv[0], msg) def read_proxy(self, fn): with open(fn, 'r') as f: for line in f: line = line.strip() m = re.match(r'(\d+\.\d+\.\d+\.\d+:\d+)', line, re.I) if m: prstr = m.group(1) proxies = { 'http': 'http://' + prstr + "/", 'https': 'https://' + prstr + "/" } self.proxies_dict.append(proxies) elif re.match('\s*#', line): continue print " loaded [ %d ] proxis " % len(self.proxies_dict)
def flip_over(self, now_page, cname, line, cnt, retry): url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/search" headers = {"Content-Type": "application/json"} encryptedJson = { "pagesize": "20", "page": now_page, "od_orderBy": "0", "sh_searchType": "一般搜索", "od_statusFilter": "0", "v1": "QZOrgV004", "oc_name": cname, "sh_u_uid": "", "sh_u_name": "" } param = { "encryptedJson": self._aes_.encrypt(spider.util.utf8str(encryptedJson)), "extJson": self.extJson } param = spider.util.utf8str(param) res = self.request_url(url, headers=headers, data=param, proxies=self.proxies_dict[self.get_tid()]) if res is None: if self.get_fail_cnt(1) < 10: print "%d-----%s ------ res is None" % (cnt, cname) self.add_job({'line': line, 'cnt': cnt}) return False else: print "id is [ %s ] thread and [ %s ] proxy will be close and drop." % ( self.get_tid(), self.proxies_dict[self.get_tid()]) #self.query_company_info_failure.append(line) self.add_job({'line': line, 'cnt': cnt}) raise AccountErrors.NoAccountError( "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" % (self.proxies_dict[self.get_tid()], self.get_fail_cnt(0))) elif res.code == 404 or res.code == 403: if self.get_fail_cnt(1) < 20: print "%d-----%s ------ %d" % (cnt, cname, res.code) self.add_job({'line': line, 'cnt': cnt}) return False else: print "id is [ %s ] thread and [ %s ] proxy will be close and drop." 
% ( self.get_tid(), self.proxies_dict[self.get_tid()]) self.add_job({'line': line, 'cnt': cnt}) raise AccountErrors.NoAccountError( "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" % (self.proxies_dict[self.get_tid()], self.get_fail_cnt(0))) elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504: print "%d------%s ------ %d " % (cnt, cname, res.code) self.add_job({'line': line, 'cnt': cnt}) time.sleep(1) return False elif res.code == 200: c = eval(res.text)['c'] if len(c) == 0: print '-----------------------------cname %s res.text is null----------------------------' % cname self.query_company_info_failure.append(line) return True result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c) dic = eval(result) list = dic['list'] if len(list) == 0: print 'cname %s result list length = 0 ' % cname self.query_company_info_failure.append(line) return True print 'cname %s result ################### list length ------ %d' % ( cname, len(list)) for l in list: aa = {} for k, v in l.items(): aa[k] = v self.query_company_info.append(spider.util.utf8str(aa)) part = cname + "|" + l['oc_name'] + "|" + str( l['oc_area']) + "|" + str(l['oc_code']) + "|" + str( l['oc_number']) self.query_company_info_part.append(part) self.get_detail(l['oc_name'], l['oc_code'], l['oc_area']) if len(list) < 20: return True elif len(list) == 20: now_page += 1 self.flip_over(now_page, cname, line, cnt) else: print "cname %s #######################################UNKNOWN ERROR############################################# [ %d ]" % ( cname, res.code) self.query_company_info_failure.append(line) return True
class QycxbApp(Spider): """ 根据企业名称 查询公司列表(可能多个,可能不存在) , 存在则根据每条数据内容查询详情detail,查询完详情再请求股东信息,跟详情拼接在一起存储到文件detail_company.txt """ def __init__(self): self.proxies_dict = [] self.read_proxy("proxy_20160218.txt") Spider.__init__(self, len(self.proxies_dict)) self.num_count = 0 #self.filter_name = [] self._aes_ = CCIQ_AES() #根据公司名字查询到的公司列表全部信息 self.query_company_info = FileSaver("query_company_info.txt") #根据公司名字查询到的公司列表局部信息 self.query_company_info_part = FileSaver("query_company_info_part.txt") #根据公司名字查询到的公司列表信息失败的 self.query_company_info_failure = FileSaver( "query_company_info_failure.txt") #已经爬取过的公司名 self.already_cname = FileSaver("already_cname.txt") #初始化已经爬过的公司 self.init_cname() #查询详情失败的公司名 self.detail_failure = FileSaver("detail_failure1.txt") #APP可以拿到的公司全部信息 包含股东信息 self.detail_company = FileSaver("detail_company.txt") self.extJson = self._aes_.encrypt( spider.util.utf8str({ "cl_screenSize": "640x960", "cl_cookieId": "16923697-D73E-485A-BDCF-68FAD456AC02", "Org_iOS_Version": "2.0.1" })) self.select_user_agent("=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)") def init_cname(self): with open("already_cname.txt", "r") as f: for line in f: filter_name.add(line.strip()) def wait_q_breakable(self): lt = 0 while True: if not self.job_queue.empty() or not self.job_queue2.empty( ) or not self.job_queue3.empty(): time.sleep(5) if time.time() < lt + 1 and self._running_count == 0: return True time.sleep(2) lt = time.time() if self._worker_count == 0: return False def dispatch(self): with open("corp_name.txt", "r") as f: cnt = 0 while True: line = f.readline().strip() cnt += 1 if line is None: break if line in filter_name: print line, " already spider~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" continue job = {"line": line, "cnt": cnt, "retry": 0} self.add_job(job, True) self.wait_q_breakable() self.add_job(None, True) def get_fail_cnt(self, addv): fc = getattr(self._curltls, 'failcount', 0) if (addv): fc += addv setattr(self._curltls, 'failcount', fc) return fc def run_job(self, jobid): line = 
jobid.get("line") cnt = jobid.get("cnt") retry = jobid.get("retry") if line is None: print 'line = ', cnt, ' is None ,break~' return ary = line.split(" ") if len(ary) == 4: cname = ary[3] flag = self.flip_over(1, cname, line, cnt, retry) #爬取结束,加入到set并写入文件 if flag: filter_name.add(line) self.already_cname.append(line) print cnt, ' execute perfect~~~~~~~~~~~~~~~~~~~~~~~' else: print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ company data line is error @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@', cnt #根据公司名查询公司列表,翻页 def flip_over(self, now_page, cname, line, cnt, retry): url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/search" headers = {"Content-Type": "application/json"} encryptedJson = { "pagesize": "20", "page": now_page, "od_orderBy": "0", "sh_searchType": "一般搜索", "od_statusFilter": "0", "v1": "QZOrgV004", "oc_name": cname, "sh_u_uid": "", "sh_u_name": "" } param = { "encryptedJson": self._aes_.encrypt(spider.util.utf8str(encryptedJson)), "extJson": self.extJson } param = spider.util.utf8str(param) res = self.request_url(url, headers=headers, data=param, proxies=self.proxies_dict[self.get_tid()]) if res is None: if self.get_fail_cnt(1) < 10: print "%d-----%s ------ res is None" % (cnt, cname) self.add_job({'line': line, 'cnt': cnt}) return False else: print "id is [ %s ] thread and [ %s ] proxy will be close and drop." % ( self.get_tid(), self.proxies_dict[self.get_tid()]) #self.query_company_info_failure.append(line) self.add_job({'line': line, 'cnt': cnt}) raise AccountErrors.NoAccountError( "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" % (self.proxies_dict[self.get_tid()], self.get_fail_cnt(0))) elif res.code == 404 or res.code == 403: if self.get_fail_cnt(1) < 20: print "%d-----%s ------ %d" % (cnt, cname, res.code) self.add_job({'line': line, 'cnt': cnt}) return False else: print "id is [ %s ] thread and [ %s ] proxy will be close and drop." 
% ( self.get_tid(), self.proxies_dict[self.get_tid()]) self.add_job({'line': line, 'cnt': cnt}) raise AccountErrors.NoAccountError( "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" % (self.proxies_dict[self.get_tid()], self.get_fail_cnt(0))) elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504: print "%d------%s ------ %d " % (cnt, cname, res.code) self.add_job({'line': line, 'cnt': cnt}) time.sleep(1) return False elif res.code == 200: c = eval(res.text)['c'] if len(c) == 0: print '-----------------------------cname %s res.text is null----------------------------' % cname self.query_company_info_failure.append(line) return True result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c) dic = eval(result) list = dic['list'] if len(list) == 0: print 'cname %s result list length = 0 ' % cname self.query_company_info_failure.append(line) return True print 'cname %s result ################### list length ------ %d' % ( cname, len(list)) for l in list: aa = {} for k, v in l.items(): aa[k] = v self.query_company_info.append(spider.util.utf8str(aa)) part = cname + "|" + l['oc_name'] + "|" + str( l['oc_area']) + "|" + str(l['oc_code']) + "|" + str( l['oc_number']) self.query_company_info_part.append(part) self.get_detail(l['oc_name'], l['oc_code'], l['oc_area']) if len(list) < 20: return True elif len(list) == 20: now_page += 1 self.flip_over(now_page, cname, line, cnt) else: print "cname %s #######################################UNKNOWN ERROR############################################# [ %d ]" % ( cname, res.code) self.query_company_info_failure.append(line) return True #查询详细信息 def get_detail(self, cname, code, area): url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/detail" headers = {"Content-Type": "application/json"} encryptedJson = { "bl_oc_code": code, #"71526726X" "v1": "QZOrgV004", "isDirect": "1", "bl_oc_name": cname, #"腾讯科技" "bl_oc_area": area #"4403" } param = { "encryptedJson": 
self._aes_.encrypt(spider.util.utf8str(encryptedJson)), "extJson": self.extJson } param = spider.util.utf8str(param) res = self.request_url(url, headers=headers, data=param, proxies=self.proxies_dict[self.get_tid()]) if res is None: print 'res is none -- encryptedJson -->', str(encryptedJson) self.detail_failure.append(cname + "|" + str(code) + "|" + str(area)) return elif res.code == 404: print "404 ------ ", code self.detail_failure.append(cname + "|" + str(code) + "|" + str(area)) return elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504: print res.code, '------', code time.sleep(0.5) self.get_detail(cname, code, area) return elif res.code == 200: c = eval(res.text)['c'] if len(c) == 0: print '-----------------------------code ', code, ' res.text is null----------------------------' self.detail_failure.append(cname + "|" + str(code) + "|" + str(area)) return result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c) detail = eval(result) listGD = self.get_gd(area, code) if listGD is not None: detail['listGD'] = listGD['listGD'] print 'detail=================================', spider.util.utf8str( detail) self.detail_company.append(spider.util.utf8str(detail)) return else: print "cname %s #######################################UNKNOWN ERROR############################################# [ %d ]" % ( cname, res.code) #获取股东信息 def get_gd(self, area, code): url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/gd/detail" headers = {"Content-Type": "application/json"} encryptedJson = { "bl_oc_area": area, #4107 "v1": "QZOrgV004", "bl_oc_code": code #672867774 } param = { "encryptedJson": self._aes_.encrypt(spider.util.utf8str(encryptedJson)), "extJson": self.extJson } param = spider.util.utf8str(param) res = self.request_url(url, headers=headers, data=param, proxies=self.proxies_dict[self.get_tid()]) if res is None: print 'res is none -- search gd code is -->', code return None elif res.code == 404: print "404 ------ ", code 
return None elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504: print res.code, '------', code time.sleep(0.5) return self.get_gd(area, code) elif res.code == 200: c = eval(res.text)['c'] if len(c) == 0: print '-----------------------------gd code', code, ' res.text is null----------------------------' % cname return None result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c) list_gd = eval(result) #print 'gd infos =======================',spider.util.utf8str(list_gd) return list_gd else: print code, "#######################################UNKNOWN ERROR#############################################", res.code return None def get_inversted(self, url, encryptedJson): """ 通用请求方法 """ param = spider.util.utf8str({ "encryptedJson": self._aes_.encrypt(spider.util.utf8str(encryptedJson)), "extJson": self.extJson }) res = self.request_url(url, headers={"Content-Type": "application/json"}, data=param, proxies=self.proxies_dict[self.get_tid()]) if res is None: print 'res is none -- search gd code is -->', code return None elif res.code == 404: print "404 ------ ", code return None elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504: print res.code, '------', code time.sleep(0.5) return self.get_gd(area, code) elif res.code == 200: c = eval(res.text)['c'] if len(c) == 0: print '-----------------------------gd code', code, ' res.text is null----------------------------' % cname return None result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c) list_gd = eval(result) #print 'gd infos =======================',spider.util.utf8str(list_gd) return list_gd else: print code, "#######################################UNKNOWN ERROR#############################################", res.code return None def event_handler(self, evt, msg, **kwargs): if evt == 'DONE': msg += '企业查询宝APP[公司名]和[组织机构代码]爬取已经停止...' 
spider.util.sendmail('*****@*****.**', '%s DONE' % sys.argv[0], msg) def read_proxy(self, fn): with open(fn, 'r') as f: for line in f: line = line.strip() m = re.match(r'(\d+\.\d+\.\d+\.\d+:\d+)', line, re.I) if m: prstr = m.group(1) proxies = { 'http': 'http://' + prstr + "/", 'https': 'https://' + prstr + "/" } self.proxies_dict.append(proxies) elif re.match('\s*#', line): continue print " loaded [ %d ] proxis " % len(self.proxies_dict)
class QycxbSpider(Spider): """ 测试只使用9位数组织机构代码去获取详情 121.40.186.237:18889:ipin:helloipin """ def __init__(self): self._can_use_proxy_num = 0 self.is_debug = "multiADSL" self.proxies = {} if self.is_debug == "singleADSL": #单一代理ADSL模式 Spider.__init__(self, 200) self.proxy_error_cnt = 0 elif self.is_debug == "kuaidaili": #快代理模式 self.proxies_dict = [] self.read_proxy("../../_ct_proxy/proxy_all_filter.txt") Spider.__init__(self, len(self.proxies_dict)) elif self.is_debug == "multiADSL": #多代理ADSL模式 #proxies1 = {'http': 'http://*****:*****@183.56.160.174:50001', 'https': 'https://*****:*****@183.56.160.174:50001'} #proxies2 = {'http': 'http://*****:*****@183.56.160.174:50001', 'https': 'https://*****:*****@183.56.160.174:50001'} proxies1 = { 'http': 'http://*****:*****@121.40.186.237:50001', 'https': 'https://*****:*****@121.40.186.237:50001' } proxies2 = { 'http': 'http://*****:*****@121.40.186.237:50001', 'https': 'https://*****:*****@121.40.186.237:50001' } proxies3 = { 'http': 'http://*****:*****@192.168.1.39:3428', 'https': 'https://*****:*****@192.168.1.39:3428' } proxies4 = { 'http': 'http://*****:*****@192.168.1.39:3428', 'https': 'https://*****:*****@192.168.1.39:3428' } proxies5 = { 'http': 'http://*****:*****@192.168.1.39:3428', 'https': 'https://*****:*****@192.168.1.39:3428' } self.proxies_dict = [proxies1, proxies2, proxies3, proxies4] #, proxies5] Spider.__init__(self, 400) self._aes_ = CCIQ_AES() #成功拿到的详情 self.query_success = FileSaver("成功拿到的详情900.txt") #失败的 self.query_failure = FileSaver("获取失败的机构代码和原因900.txt") #已经爬取过的列表 self.already_cname_list = FileSaver("已经爬过机构代码900.txt") #结果http 为400的code self.result400 = FileSaver("结果http=400的机构代码900.txt") #初始化已经爬过的公司 self.init_cname() self.extJsons = [ "Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr/uapICH92P/Crryt63u28aP4QP665AzcT/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4=", 
"ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4=", "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49/aDwt3NZNp4TGa5iBFpYLm69F/6PPFoXIR/Aw5p48//8OgZFpddDUwQ=" ] self.user_agents = [ "=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)", "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)", "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)" ] self.is_first = True self.init_time = 0 self.lock = threading.Lock() self.req_cnt = 0 def req_all(self, url, encryptedJson, retry=0): number = random.randrange(0, 3, 1) self.select_user_agent(self.user_agents[number]) param = spider.util.utf8str({ "encryptedJson": self._aes_.encrypt(spider.util.utf8str(encryptedJson)), "extJson": self.extJsons[number] }) param = param.replace('/', "\/") res = None if self.is_first: self.init_time = time.time() print '初始化时间', self.init_time self.is_first = False if self.is_debug == "singleADSL": #res = self.request_url(url, headers={"Content-Type": "application/json"}, timeout=20, data=param) self.proxies = { 'http': 'http://*****:*****@192.168.1.39:3428', 'https': 'https://*****:*****@192.168.1.39:3428' } #self.proxies = {'http': 'http://*****:*****@183.56.160.174:50001', 'https': 'https://*****:*****@183.56.160.174:50001'} #self.proxies = {'http': 'http://*****:*****@121.40.186.237:50001', 'https': 'https://*****:*****@121.40.186.237:50001'} #self.proxies = {'http': 'http://*****:*****@106.75.134.190:18889', 'https': 'https://*****:*****@106.75.134.190:18889'} res = self.request_url( url, headers={"Content-Type": "application/json"}, timeout=20, data=param, proxies=self.proxies) time.sleep(5) #res = self.request_url(url, headers={"Content-Type": "application/json", "Accept-Language": "zh-Hans-CN;q=1"}, data=param, proxies={'http': 'http://*****:*****@106.75.134.191:18889', 'https': 'https://*****:*****@106.75.134.191:18889'}) 
elif self.is_debug == "kuaidaili": self.proxies = self.proxies_dict[self.get_tid()] res = self.request_url( url, headers={"Content-Type": "application/json"}, data=param, proxies=self.proxies) elif self.is_debug == "multiADSL": num = self.get_tid() % len(self.proxies_dict) self.proxies = self.proxies_dict[num] res = self.request_url( url, headers={"Content-Type": "application/json"}, data=param, proxies=self.proxies_dict[num], timeout=20) if res is None or res.code != 200: print "访问错误", "res is none" if res is None else "res.code=%d" % ( res.code), self.proxies if res is not None and res.code == 400 and retry > 2: return res #self.error_add() if retry < 7: time.sleep(random.randrange(1, 5, 1)) return self.req_all(url, encryptedJson, retry=(retry + 1)) return res def init_cname(self): cnt = 0 with open("已经爬过机构代码900.txt", "r") as f: for line in f: cnt += 1 filter_name.add(line.strip()) print "初始化结束...", cnt with open("结果http=400的机构代码900.txt", "r") as f: for line in f: cnt += 1 filter_name.add(line.strip()) print "初始化结束...", cnt def count_proxy_error(self, error_type): pass def wait_q_breakable(self): lt = 0 while True: if not self.job_queue.empty() or not self.job_queue2.empty( ) or not self.job_queue3.empty(): time.sleep(5) if time.time() < lt + 1 and self._running_count == 0: return True time.sleep(2) lt = time.time() if self._worker_count == 0: return False def dispatch(self): # i = 0 # while i < 100000000: # i += 1 # code = self.bu0(i) # if len(code) == 9 and code not in filter_name: # job = {"code": code, "retry": 0} # self.add_job(job, True) # else: # print "已爬过 或 代码错误:", code with open("推测组织机构代码.txt", "r") as f: for line in f: line = line.strip() if line in filter_name: #print "already query...", line continue else: job = {"code": line, "retry": 0} self.add_job(job, True) self.wait_q_breakable() self.add_job(None, True) # def bu0(self, code): # code = str(code) # if len(code) != 8: # sub = 8 - len(code) # while sub != 0: # code = "0" + code # sub -= 1 # code = 
self.compute_code(code) # return code # # def compute_code(self, code): # code = code.strip() # assert len(code) == 8 # vs = [3, 7, 9, 10, 5, 8, 4, 2] # v = 0 # for i in range(0, 8): # if '0' <= code[i] <= '9': # v += (ord(code[i]) - ord('0')) * vs[i] # elif 'A' <= code[i] <= 'Z': # v += (ord(code[i]) - ord('A') + 10) * vs[i] # elif 'a' <= code[i] <= 'z': # v += (ord(code[i]) - ord('a') + 10) * vs[i] # else: # raise RuntimeError("invalid code") # v = (11 - v % 11) % 11 # return code + '0123456789X'[v] def record_spider(self, code): """ 已经爬过的,无论成功失败都算爬过. """ filter_name.add(code) self.already_cname_list.append(code) self.proxy_error_cnt = 0 self.req_cnt += 1 print "speed ======================>", self.req_cnt / (time.time() - self.init_time) def error_add(self): pass # with self.lock: # self.proxy_error_cnt += 1 # if self.proxy_error_cnt > 200: # self.restart_jb() def restart_jb(self): if self.proxy_error_cnt < 200: return self.proxy_error_cnt = 0 print "=============================重新启动拨号脚本=================================" os.system( "sshpass -p 'helloipin' ssh [email protected] /home/ipin/bin/redial" ) time.sleep(10) os.system( "sshpass -p 'helloipin' ssh [email protected] /home/ipin/bin/getip" ) print "=============================重新启动拨号脚本成功==============================" def run_job(self, jobid): code = jobid.get("code") retry = jobid.get("retry") tid = self.get_tid() url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/detail" encryptedJson = { "bl_oc_code": code, #code, #"71526726X" "v1": "QZOrgV005", "isDirect": "0", "bl_oc_name": "腾讯科技", #cname, #"腾讯科技" "bl_oc_area": "" #area #"4403" } detail = {} res = self.req_all(url, encryptedJson) res_code = 0 if res is None: print code, "get detail res is None !!" return res_code = res.code if res_code == 400: self.result400.append(code) self.req_cnt += 1 return try: if u"服务不可用。" in res.text or u"Unauthorized!" 
in res.text: # or u"处理请求时服务器遇到错误。有关详细信息,请参见服务器日志。" in res.text: self.re_add_job({'cname': code, 'retry': retry}) print "系统不可用...", code, res.text return c = eval(res.text)['c'] except Exception as err: print "tid=%d --- retry=%d --- res.code=%d exception " % ( tid, retry, res_code), err #res.text=%s#, spider.util.utf8str(res.text) self.re_add_job({'cname': code, 'retry': retry}) return if len(c) == 0: print "tid=%d --- retry=%d --- res.code=%d --- exception 'C' IS NULL" % ( tid, retry, res_code) self.query_failure.append(code + ",c=0") self.record_spider(code) return result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c) try: detail = eval(result) except Exception as err: print "tid=%d --- retry=%d --- res.code=%d --- exception result:%s" % ( tid, retry, res_code, result) self.query_failure.append(code + ",result_error") self.record_spider(code) return cname = None try: basic = detail["list"] if basic is None or len(basic) == 0: print code, " 此码无效...", spider.util.utf8str(detail) self.query_failure.append(code + ",list=0") self.record_spider(code) return cname = basic[0]["oc_name"] except Exception as err: print code, "获取基本详情错误,拿不到oc_name,detail : ", spider.util.utf8str( detail) return #股东信息 # listGD = self.get_gd(code) # if listGD is not None: # #print "tid=", tid, " listGD=", spider.util.utf8str(listGD) # detail['listGD'] = listGD['listGD'] #投资信息 # list_inversted = self.get_inversted(cname) # if list_inversted is not None: # #print "tid=", tid, " list_inversted=", spider.util.utf8str(list_inversted) # detail['inversted'] = list_inversted['inversted'] # #获取分支机构信息 # branch = [] # list_branch = self.get_branch(cname, list_branch=branch) # if list_branch is not None: # #print "tid=", tid, " list_branch=", spider.util.utf8str(list_branch) # detail['Branch'] = list_branch #['Branch'] self.query_success.append(spider.util.utf8str(detail)) self.record_spider(code) print "tid=%d --- retry=%d --- res.code=%d @@@ success: %s \n " % ( tid, retry, res_code, 
spider.util.utf8str( self.proxies)), spider.util.utf8str(detail) def get_gd(self, code, retry=0): """ 获取股东信息 """ url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/gd/detail" encryptedJson = { "bl_oc_area": "", "v1": "QZOrgV005", "bl_oc_code": code } res = self.req_all(url, encryptedJson) if res is None: return None if res.code == 200: try: c = eval(res.text)['c'] if len(c) == 0: print "get_gd --- retry=%d --- reason:len(c)=0" % retry return None result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt( c) #print "获取股东信息结果:", spider.util.utf8str(result) return eval(result) except Exception as err: print "get_gd --- retry=%d --- reason:%s" % (retry, err) if retry < 5: retry += 1 time.sleep(retry * 1.5) return self.get_gd(code, retry=retry) else: return None else: print "get_gd --- retry=%d --- res.code=%d" % (retry, res.code) if retry < 5: retry += 1 time.sleep(retry * 1.5) return self.get_gd(code, retry=retry) else: return None def get_inversted(self, cname, retry=0): """ 查询投资信息 """ url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/map/invesment" encryptedJson = {"input": cname, "v1": "QZOrgV005"} res = self.req_all(url, encryptedJson) if res is None: return None if res.code == 200: try: c = eval(res.text)['c'] if len(c) == 0: print "get_inversted --- cname=%s --- retry=%d --- reason:len(c)=0" % ( cname, retry) return None result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt( c) return eval(result) except Exception as err: print "get_inversted --- cname=%s --- retry=%d --- reason:%s" % ( cname, retry, err) if retry < 5: retry += 1 time.sleep(retry * 1.5) return self.get_inversted(cname, retry=retry) else: return None else: print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % ( cname, retry, res.code) if retry < 5: retry += 1 time.sleep(retry * 1.5) return self.get_inversted(cname, retry=retry) else: return None def get_branch(self, cname, now_page=1, list_branch=[], retry=0): """ 查询分支机构 """ url = 
"http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/branch/select/page" encryptedJson = { "companyName": cname, "v1": "QZOrgV005", "page": now_page, "pagesize": "10" } res = self.req_all(url, encryptedJson) if res is None: return None if res.code == 200: try: c = eval(res.text)['c'] if len(c) == 0: print "get_branch --- cname=%s --- retry=%d --- reason:len(c)=0" % ( cname, retry) return None result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt( c) temp = eval(result) if temp is not None: for t in temp['Branch']: list_branch.append(t) if len(temp['Branch']) == 10: if now_page > 3: return list_branch now_page += 1 print cname, "翻页 -----------------------------------> now_page", now_page return self.get_branch(cname, now_page=now_page, list_branch=list_branch, retry=retry) else: return list_branch else: print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d --- Branch is NULL" % ( cname, retry, now_page) return None except Exception as err: print "get_branch --- cname=%s --- retry=%d --- reason:%s" % ( cname, retry, err) if retry < 5: retry += 1 time.sleep(retry * 1.5) return self.get_branch(cname, now_page=now_page, list_branch=list_branch, retry=retry) else: return None else: print "get_branch --- cname=%s --- retry=%d --- res.code=%d" % ( cname, retry, res.code) if retry < 5: retry += 1 time.sleep(retry * 1.5) return self.get_branch(cname, now_page=now_page, list_branch=list_branch, retry=retry) else: return None def get_fail_cnt(self, addv, type): fc = getattr(self._curltls, type, 0) if (addv): fc += addv setattr(self._curltls, type, fc) return fc def event_handler(self, evt, msg, **kwargs): if evt == 'DONE': msg += '企业查询宝APP公司详情detail查询已经停止...' 
spider.util.sendmail('*****@*****.**', '%s DONE' % sys.argv[0], msg) def read_proxy(self, fn): with open(fn, 'r') as f: for line in f: line = line.strip() self._match_proxy(line) print " loaded [ %d ] proxis " % len(self.proxies_dict) def _match_proxy(self, line): m = re.match('([0-9.]+):(\d+):([a-z0-9]+):([a-z0-9._-]+)$', line, re.I) m1 = re.match('([0-9.]+):(\d+):([a-z0-9]+)$', line, re.I) if m: prstr = '%s:%s@%s:%s' % (m.group(3), m.group(4), m.group(1), m.group(2)) proxies = {'http': 'http://' + prstr, 'https': 'https://' + prstr} elif m1: prstr = '%s:%s' % (m1.group(1), m1.group(2)) proxies = {'http': 'http://' + prstr, 'https': 'https://' + prstr} else: proxies = {'http': 'http://' + line, 'https': 'https://' + line} self.proxies_dict.append(proxies)
def __init__(self):
    """Set up proxies, output files and request fingerprints for the spider.

    The proxy mode is selected by the string stored in ``self.is_debug``
    (currently fixed to ``"multiADSL"``).  Each mode calls
    ``Spider.__init__`` with a different integer argument -- presumably
    the worker/thread count; confirm against ``Spider``.
    """
    self._can_use_proxy_num = 0
    # NOTE: despite the name this attribute holds the proxy mode string.
    mode = "multiADSL"
    self.is_debug = mode
    self.proxies = {}

    if mode == "multiADSL":
        # Multiple ADSL proxies: two remote slots and two LAN slots.
        # (A fifth LAN slot existed in the original but was never enabled.)
        remote_adsl = {
            'http': 'http://*****:*****@121.40.186.237:50001',
            'https': 'https://*****:*****@121.40.186.237:50001'
        }
        lan_adsl = {
            'http': 'http://*****:*****@192.168.1.39:3428',
            'https': 'https://*****:*****@192.168.1.39:3428'
        }
        # Every slot gets its own dict object, as in the original.
        self.proxies_dict = [
            dict(template)
            for template in (remote_adsl, remote_adsl, lan_adsl, lan_adsl)
        ]
        Spider.__init__(self, 400)
    elif mode == "kuaidaili":
        # "kuaidaili" mode: proxies are loaded from a file.
        self.proxies_dict = []
        self.read_proxy("../../_ct_proxy/proxy_all_filter.txt")
        Spider.__init__(self, len(self.proxies_dict))
    elif mode == "singleADSL":
        # Single ADSL proxy mode.
        Spider.__init__(self, 200)
        self.proxy_error_cnt = 0

    self._aes_ = CCIQ_AES()

    # Successfully fetched details.
    self.query_success = FileSaver("成功拿到的详情900.txt")
    # Failed organisation codes together with the failure reason.
    self.query_failure = FileSaver("获取失败的机构代码和原因900.txt")
    # Organisation codes that have already been crawled.
    self.already_cname_list = FileSaver("已经爬过机构代码900.txt")
    # Organisation codes whose request came back with HTTP 400.
    self.result400 = FileSaver("结果http=400的机构代码900.txt")

    # Load the already-crawled entries (must run after the savers exist).
    self.init_cname()

    # Pre-encrypted "extJson" payloads; index-aligned with user_agents.
    self.extJsons = [
        "Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr/uapICH92P/Crryt63u28aP4QP665AzcT/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4=",
        "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4=",
        "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49/aDwt3NZNp4TGa5iBFpYLm69F/6PPFoXIR/Aw5p48//8OgZFpddDUwQ="
    ]
    self.user_agents = [
        "=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)",
        "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)",
        "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)"
    ]

    self.is_first = True
    self.init_time = 0
    self.lock = threading.Lock()
    self.req_cnt = 0
class QycxbQuery(Spider): """ 根据企业名称.查询企业列表------针对900多万的公司名查询 """ def __init__(self): self.is_debug = False self._can_use_proxy_num = 0 if self.is_debug: Spider.__init__(self, 1) else: self.proxies_dict = [] self.read_proxy("../../_ct_proxy/proxy_all_filter.txt") Spider.__init__(self, len(self.proxies_dict)) self.error_cnt = 0 self._aes_ = CCIQ_AES() #根据公司名字查询到的公司列表全部信息 self.query_company_list = FileSaver("all_company_list.txt") #已经爬取过的公司名 self.already_cname_list = FileSaver("all_company_list_already.txt") #爬过的 错误类型 self.already_error_type = FileSaver("all_already_error_type.txt") #初始化已经爬过的公司 self.init_cname() self.extJsons = [ "Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr/uapICH92P/Crryt63u28aP4QP665AzcT/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4=", "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4=", "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49/aDwt3NZNp4TGa5iBFpYLm69F/6PPFoXIR/Aw5p48//8OgZFpddDUwQ=" ] self.user_agents = [ "=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)", "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)", "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)" ] self.bloom = set() def req_all(self, encryptedJson): url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/search" number = random.randrange(0, 3, 1) self.select_user_agent(self.user_agents[number]) param = spider.util.utf8str({ "encryptedJson": self._aes_.encrypt(spider.util.utf8str(encryptedJson)), "extJson": self.extJsons[number] }) param = param.replace('/', "\/") try: if self.is_debug: res = self.request_url( url, headers={ "Content-Type": "application/json", "Accept-Language": "zh-Hans-CN;q=1" }, data=param, proxies={ 'http': 'http://*****:*****@121.41.79.4:18889', 'https': 'https://*****:*****@121.41.79.4:18889' }) #res = 
self.request_url(url, headers={"Content-Type": "application/json"}, data=param, proxies={'http': 'http://104.236.48.178:8080', 'https': 'https://104.236.48.178:8080'}) else: res = self.request_url( url, headers={ "Content-Type": "application/json", "Accept-Language": "zh-Hans-CN;q=1" }, data=param, proxies=self.proxies_dict[self.get_tid()]) if res is not None and res.code == 200: time.sleep(random.randrange(30, 50, 1)) else: time.sleep(5) return res except Exception as err: proxies = self.proxies_dict[self.get_tid()] print proxies[ 'http'], "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n error = ", err def init_cname(self): with open("all_company_list_already.txt", "r") as f: for line in f: filter_name.add(line.strip()) def wait_q_breakable(self): lt = 0 while True: if not self.job_queue.empty() or not self.job_queue2.empty( ) or not self.job_queue3.empty(): time.sleep(5) if time.time() < lt + 1 and self._running_count == 0: return True time.sleep(2) lt = time.time() if self._worker_count == 0: return False def dispatch(self): with open("beijing_cname.txt", "r") as f: cnt = 0 for line in f: line = line.strip() cnt += 1 if line in filter_name: #print cnt, line, "already spider!!!" continue job = {"cname": line, "cnt": cnt, "retry": 0} self.add_job(job, True) self.wait_q_breakable() self.add_job(None, True) def record_spider(self, line): """ 已经爬过的,无论成功失败都算爬过. 
""" filter_name.add(line) self.already_cname_list.append(line) def run_job(self, job): cname = job.get("cname") cnt = job.get("cnt") retry = job.get("retry") if cname is None: print 'cname = ', cnt, ' is None ,break~' return self.flip_over(1, cname, cnt, retry) def flip_over(self, now_page, cname, cnt, retry): tid = self.get_tid() """ 根据公司名查询公司列表,翻页 """ encryptedJson = { "pagesize": "20", "page": now_page, "od_orderBy": "0", "sh_searchType": "一般搜索", "sh_oc_areaName": "", "od_statusFilter": "0", "v1": "QZOrgV005", "oc_name": cname, "sh_u_uid": "", "sh_u_name": "" } r_result = {"cname": cname} res = self.req_all(encryptedJson) res_code = 0 if res is None: if self.get_fail_cnt('failcount-none', 1) < 10: self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry}) print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d" % ( tid, cnt, cname, retry, res_code, now_page) return else: # if retry > 5: # r_result["type"] = "None" # self.already_error_type.append(spider.util.utf8str(r_result)) # self.record_spider(cname) # print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d" % (tid, cnt, cname, retry, res_code, now_page) # else: # self.re_add_job({'cname':cname,'cnt':cnt, 'retry':(retry+1)}) self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry}) raise AccountErrors.NoAccountError( "Maybe the proxy invalid,failcount-none = [ %d ],tid=[ %d ]" % (self.get_fail_cnt('failcount-none', 0), tid)) else: setattr(self._curltls, 'failcount-none', 0) res_code = res.code if (res_code >= 400 and res_code < 500) or res_code == 202: if self.get_fail_cnt('failcount-400', 1) < 5: self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry}) print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d" % ( tid, cnt, cname, retry, res_code, now_page) return else: if retry > 5: r_result["type"] = "400+" self.already_error_type.append( spider.util.utf8str(r_result)) self.record_spider(cname) else: 
self.re_add_job({ 'cname': cname, 'cnt': cnt, 'retry': (retry + 1) }) self._can_use_proxy_num -= 1 raise AccountErrors.NoAccountError( "Maybe the proxy invalid,failcount-400 = [ %d ],tid=[ %d ]" % (self.get_fail_cnt('failcount-400', 0), tid)) else: setattr(self._curltls, 'failcount-400', 0) if res_code >= 500: # if retry > 2: # r_result["type"]="500" # self.already_error_type.append(spider.util.utf8str(r_result)) # self.record_spider(cname) # else: self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry}) print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d " % ( tid, cnt, cname, retry, res_code, now_page) time.sleep(random.randrange(1, 10, 1)) return elif res_code == 200: try: c = eval(res.text)['c'] except Exception as err: print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception res.text - %s" % ( tid, cnt, cname, retry, res_code, now_page, err) # r_result["type"] = "res_error" # self.already_error_type.append(spider.util.utf8str(r_result)) # self.record_spider(cname) # self.error_cnt += 1 self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry}) return if len(c) == 0: print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception 'C' IS NULL" % ( tid, cnt, cname, retry, res_code, now_page) r_result["type"] = "c=0" self.already_error_type.append(spider.util.utf8str(r_result)) self.record_spider(cname) self.error_cnt += 1 return result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c) try: dic = eval(result) except Exception as err: print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception result:%s" % ( tid, cnt, cname, retry, res_code, now_page, result) r_result["type"] = "result_error" self.already_error_type.append(spider.util.utf8str(r_result)) self.record_spider(cname) self.error_cnt += 1 return list = dic['list'] if len(list) == 0: print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d 
--- now_page:%d --- exception len(list)=0" % ( tid, cnt, cname, retry, res_code, now_page) r_result["type"] = "list=0" self.already_error_type.append(spider.util.utf8str(r_result)) self.record_spider(cname) self.error_cnt += 1 return #print "tid=%d ### cnt=%d ### cname=%s ### retry=%d ### res.code=%d ### now_page:%d ### success:len(list):%d " % (tid, cnt, cname, retry, res_code, now_page, len(list)) for l in list: aa = {"query_name": cname} for k, v in l.items(): aa[k] = v self.query_company_list.append(spider.util.utf8str(aa)) print "******", len(list), spider.util.utf8str(list) if len(list) < 20: # r_result["type"] = "success" # self.already_error_type.append(spider.util.utf8str(r_result)) self.record_spider(cname) return elif len(list) == 20: if now_page > 100: self.already_error_type.append( spider.util.utf8str(r_result)) self.record_spider(cname) return now_page += 1 self.flip_over(now_page, cname, cnt, retry) else: print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception UNKNOW ERROR" % ( tid, cnt, cname, retry, res_code, now_page) if retry < 3: self.re_add_job({ 'cname': cname, 'cnt': cnt, 'retry': (retry + 1) }) return r_result["type"] = "unknown_error" self.already_error_type.append(spider.util.utf8str(r_result)) self.record_spider(cname) return def get_fail_cnt(self, type_key, addv): fc = getattr(self._curltls, type_key, 0) if (addv): fc += addv setattr(self._curltls, type_key, fc) return fc def event_handler(self, evt, msg, **kwargs): if evt == 'DONE': msg += '企业查询宝APP公司列表查询已经停止...错误数:' + str(self.error_cnt) spider.util.sendmail('*****@*****.**', '%s DONE' % sys.argv[0], msg) def read_proxy(self, fn): with open(fn, 'r') as f: for line in f: line = line.strip() self._match_proxy(line) self._can_use_proxy_num = len(self.proxies_dict) print " loaded [ %d ] proxis " % self._can_use_proxy_num def _match_proxy(self, line): m = re.match('([0-9.]+):(\d+):([a-z0-9]+):([a-z0-9._-]+)$', line, re.I) m1 = 
re.match('([0-9.]+):(\d+):([a-z0-9]+)$', line, re.I) if m: prstr = '%s:%s@%s:%s' % (m.group(3), m.group(4), m.group(1), m.group(2)) proxies = {'http': 'http://' + prstr, 'https': 'https://' + prstr} elif m1: prstr = '%s:%s' % (m1.group(1), m1.group(2)) proxies = {'http': 'http://' + prstr, 'https': 'https://' + prstr} else: proxies = {'http': 'http://' + line, 'https': 'https://' + line} self.proxies_dict.append(proxies)
def run_job(self, jobid): code = jobid.get("code") retry = jobid.get("retry") tid = self.get_tid() url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/detail" encryptedJson = { "bl_oc_code": code, #code, #"71526726X" "v1": "QZOrgV005", "isDirect": "0", "bl_oc_name": "腾讯科技", #cname, #"腾讯科技" "bl_oc_area": "" #area #"4403" } detail = {} res = self.req_all(url, encryptedJson) res_code = 0 if res is None: print code, "get detail res is None !!" return res_code = res.code if res_code == 400: self.result400.append(code) self.req_cnt += 1 return try: if u"服务不可用。" in res.text or u"Unauthorized!" in res.text: # or u"处理请求时服务器遇到错误。有关详细信息,请参见服务器日志。" in res.text: self.re_add_job({'cname': code, 'retry': retry}) print "系统不可用...", code, res.text return c = eval(res.text)['c'] except Exception as err: print "tid=%d --- retry=%d --- res.code=%d exception " % ( tid, retry, res_code), err #res.text=%s#, spider.util.utf8str(res.text) self.re_add_job({'cname': code, 'retry': retry}) return if len(c) == 0: print "tid=%d --- retry=%d --- res.code=%d --- exception 'C' IS NULL" % ( tid, retry, res_code) self.query_failure.append(code + ",c=0") self.record_spider(code) return result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c) try: detail = eval(result) except Exception as err: print "tid=%d --- retry=%d --- res.code=%d --- exception result:%s" % ( tid, retry, res_code, result) self.query_failure.append(code + ",result_error") self.record_spider(code) return cname = None try: basic = detail["list"] if basic is None or len(basic) == 0: print code, " 此码无效...", spider.util.utf8str(detail) self.query_failure.append(code + ",list=0") self.record_spider(code) return cname = basic[0]["oc_name"] except Exception as err: print code, "获取基本详情错误,拿不到oc_name,detail : ", spider.util.utf8str( detail) return #股东信息 # listGD = self.get_gd(code) # if listGD is not None: # #print "tid=", tid, " listGD=", spider.util.utf8str(listGD) # detail['listGD'] = listGD['listGD'] #投资信息 # 
list_inversted = self.get_inversted(cname) # if list_inversted is not None: # #print "tid=", tid, " list_inversted=", spider.util.utf8str(list_inversted) # detail['inversted'] = list_inversted['inversted'] # #获取分支机构信息 # branch = [] # list_branch = self.get_branch(cname, list_branch=branch) # if list_branch is not None: # #print "tid=", tid, " list_branch=", spider.util.utf8str(list_branch) # detail['Branch'] = list_branch #['Branch'] self.query_success.append(spider.util.utf8str(detail)) self.record_spider(code) print "tid=%d --- retry=%d --- res.code=%d @@@ success: %s \n " % ( tid, retry, res_code, spider.util.utf8str( self.proxies)), spider.util.utf8str(detail)
def flip_over(self, now_page, cname, cnt, retry): tid = self.get_tid() """ 根据公司名查询公司列表,翻页 """ encryptedJson = { "pagesize": "20", "page": now_page, "od_orderBy": "0", "sh_searchType": "一般搜索", "sh_oc_areaName": "", "od_statusFilter": "0", "v1": "QZOrgV005", "oc_name": cname, "sh_u_uid": "", "sh_u_name": "" } r_result = {"cname": cname} res = self.req_all(encryptedJson, cname=cname) res_code = 0 if res is None: self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry}) return if u"处理请求时服务器遇到错误。有关详细信息,请参见服务器日志" in res.text: print "处理请求时服务器遇到错误。有关详细信息,请参见服务器日志..." if retry < 3: self.re_add_job({ 'cname': cname, 'cnt': cnt, 'retry': (retry + 1) }) return else: r_result["type"] = "request-server-error" self.already_error_type.append(spider.util.utf8str(r_result)) self.record_spider(cname) return try: c = eval(res.text)['c'] except Exception as err: print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception res.text = %s" % ( tid, cnt, cname, retry, res_code, now_page, spider.util.utf8str(res.text)) if retry < 3: self.re_add_job({ 'cname': cname, 'cnt': cnt, 'retry': (retry + 1) }) else: r_result["type"] = "res.text=invalid" self.already_error_type.append(spider.util.utf8str(r_result)) self.record_spider(cname) return if len(c) == 0: print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception 'C' IS NULL" % ( tid, cnt, cname, retry, res_code, now_page) r_result["type"] = "c=0" self.already_error_type.append(spider.util.utf8str(r_result)) self.record_spider(cname) self.error_cnt += 1 return result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c) try: dic = eval(result) except Exception as err: print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception result:%s" % ( tid, cnt, cname, retry, res_code, now_page, result) r_result["type"] = "result_error" self.already_error_type.append(spider.util.utf8str(r_result)) self.record_spider(cname) 
self.error_cnt += 1 return list = dic['list'] if len(list) == 0: print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception len(list)=0" % ( tid, cnt, cname, retry, res_code, now_page) r_result["type"] = "list=0" self.already_error_type.append(spider.util.utf8str(r_result)) self.record_spider(cname) self.error_cnt += 1 return for l in list: aa = {"query_name": cname} for k, v in l.items(): aa[k] = v self.query_company_list.append(spider.util.utf8str(aa)) print "******", len(list), spider.util.utf8str(list) if len(list) < 20: self.record_spider(cname) return elif len(list) == 20: if now_page > 2: self.already_error_type.append(spider.util.utf8str(r_result)) self.record_spider(cname) return now_page += 1 self.flip_over(now_page, cname, cnt, retry)
def get_detail(self, cname, code, area): """ 查询某公司详细信息 """ url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/detail" encryptedJson = { "bl_oc_code": code, #"71526726X" "v1": "QZOrgV004", "isDirect": "1", "bl_oc_name": cname, #"腾讯科技" "bl_oc_area": area #"4403" } param = spider.util.utf8str({ "encryptedJson": self._aes_.encrypt(spider.util.utf8str(encryptedJson)), "extJson": self.extJson }) res = self.request_url(url, headers=self.headers, data=param, proxies=self.proxies_dict[self.get_tid()]) if res is None: print 'get_detail ------ res is none ,---->cname=', cname self.detail_failure.append(cname + "|" + str(code) + "|" + str(area)) return elif res.code == 404: print "get_detail ------ 404 ------ ", cname, code self.detail_failure.append(cname + "|" + str(code) + "|" + str(area)) return elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504: print 'get_detail ------ ', res.code, cname, code time.sleep(0.5) self.get_detail(cname, code, area) return elif res.code == 200: c = eval(res.text)['c'] if len(c) == 0: print '-----------------------------code ', code, ' res.text is null----------------------------' self.detail_failure.append(cname + "|" + str(code) + "|" + str(area)) return result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c) detail = eval(result) #获取股东信息 listGD = self.get_gd(area, code) if listGD is not None: detail['listGD'] = listGD['listGD'] #获取投资信息 list_inversted = self.get_inversted(cname) if list_inversted is not None: detail['inversted'] = list_inversted['inversted'] #获取分支机构信息 list_branch = self.get_branch(cname, 1, {"Branch": []}) if list_branch is not None: detail['Branch'] = list_branch['Branch'] print 'detail=================================', spider.util.utf8str( detail) self.detail_company.append(spider.util.utf8str(detail)) return else: print "cname %s #######################################UNKNOWN ERROR############################################# [ %d ]" % ( cname, res.code)