def save_info(self, job, jsonobj):
    with self.locker:
        if job["type"] == "QuerySummary":
            name = jsonobj.get("ENTNAME", "-")
            regNo = jsonobj.get("REGNO", "-")
            ent_id = jsonobj.get("ID", "-")  # renamed from `id` to avoid shadowing the builtin
            self.namefile.write(job["line"] + " " + name.encode("utf-8") + " " +
                                regNo.encode("utf-8") + " " + ent_id.encode("utf-8") + "\n")
            self.namefile.flush()
            self.binsaver.append(name.encode("utf-8") + "_" + regNo.encode("utf-8"),
                                 json.dumps(jsonobj))
            spider.runtime.Log.info("%s:%s=========>saved." % (job["Q"], name))
        elif job["type"] == "QueryAutoName":
            if "ERRCODE" in jsonobj:
                if not self.re_add_job(job):
                    self.save_fail_info(job)
                Log.error("ErrCode, proxy down.")
                raise AccountErrors.NoAccountError()
            for name in jsonobj:
                self.namefile.write(job["line"] + " " + name.encode("utf-8") + "\n")
                self.namefile.flush()
                spider.runtime.Log.info("%s:%s=========>saved." % (job["Q"], name))
def run_job(self, jobid):
    self.num_count += 1
    url = "http://www.jobui.com/job/%d/" % jobid
    res = self.request_url(url)
    # Check for a failed request before touching res.code.
    if res is None:
        print "%d failed, sleeping 5 secs." % jobid
        time.sleep(5)
        self.add_job(jobid)
        return
    print "id:{} , Page status: {}".format(jobid, res.code)
    if res.code == 404:
        time.sleep(3)
        return
    elif res.code == 503:
        print "maybe speed too fast..."
        time.sleep(5)
        self.add_job(jobid)
        return
    elif res.code == 200:
        print "saving %d ..." % jobid
        with self._savelock:
            with open("jobid.txt", "a+b") as f:
                f.write("%s\n" % jobid)
        time.sleep(5)
    else:
        Log.error("unknown status code %s" % res.code)
        Log.errorbin("%s" % jobid, res.text)
        raise AccountErrors.NoAccountError('fatal error')
def run_job(self, jobid):
    gsweb = getattr(self._curltls, "gsweb", None)
    if gsweb is None:
        gsweb = self.init_obj()
    cname = jobid.get("name")
    cnt = jobid.get("cnt")
    out = gsweb.search_company(cname)
    if out is None:
        # Return after re-queuing, otherwise len(out) below would fail on None.
        self.job_retry(jobid)
        if self.get_fail_cnt("failcnt", 1) > 10:
            raise AccountErrors.NoAccountError(
                "Maybe the proxy invalid, failcount-none = [ %d ]" % self.get_fail_cnt("failcnt", 0))
        return
    setattr(self._curltls, "failcnt", 0)
    if len(out) == 0:
        print cnt, "--->", cname, "---> query list result length = 0"
        self.result_null.append(cname)
        filter_name.add(cname)
    else:
        for oi in out:
            self.success.append(spider.util.utf8str(oi))
        self.record_spider(cname)
    if time.time() - self.run_time > 20:
        print "speed------>", self.cnt / (time.time() - self.run_time), "t/s"
        self.run_time = time.time()
        self.cnt = 1
    time.sleep(random.randrange(1, 6, 1))
def get_fail_cnt(self, addv):
    # Thread-local consecutive-failure counter. addv > 0 counts one more
    # failure, addv == 0 just reads; callers compare the returned value.
    fc = getattr(self._curltls, "failcnt", 0)
    if addv:
        fc += addv
        setattr(self._curltls, "failcnt", fc)
    if fc > 10:
        raise AccountErrors.NoAccountError("Maybe the proxy invalid, failcnt = [ %d ]" % fc)
    return fc
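# The failcnt pattern above recurs throughout these spiders. A minimal
# self-contained sketch of the same idea outside any class; the names
# bump_fail_cnt/reset_fail_cnt and the limit of 10 are assumptions for
# illustration, not part of the original framework:
import threading

_tls = threading.local()

def bump_fail_cnt(addv, limit=10):
    # addv > 0 counts one more consecutive failure; addv == 0 just reads.
    cnt = getattr(_tls, "failcnt", 0) + addv
    _tls.failcnt = cnt
    if cnt > limit:
        raise RuntimeError("proxy looks dead after %d consecutive failures" % cnt)
    return cnt

def reset_fail_cnt():
    # Call after any successful request so only consecutive failures count.
    _tls.failcnt = 0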
def run_job(self, jobid):
    jobid_int = jobid.get("id")
    url = "http://www.jobui.com/job/%d/" % jobid_int
    tid = self.get_tid()
    proxies = self.proxies_dict[tid]
    res = self.request_url(url, proxies=proxies)
    self.num_count += 1
    if res is None:
        if self.get_fail_cnt(1) < 10:
            self.add_job(jobid)
        else:
            print "id is [ %s ] thread and [ %s ] proxy will be closed and dropped." % (tid, proxies)
            self.__fail_ids.append(str(jobid_int))
            raise AccountErrors.NoAccountError(
                "Maybe the proxy [ %s ] is invalid, failcount = [ %d ]" % (proxies, self.get_fail_cnt(0)))
        return
    setattr(self._curltls, 'failcount', 0)
    if res.code == 404:
        print "%d ------ 404" % jobid_int
        return
    elif res.code in (500, 502, 503, 504):
        print "%d ------ %d" % (jobid_int, res.code)
        self.add_job(jobid)
        time.sleep(0.8)
        return
    elif res.code == 200:
        print "%d ------ saving" % jobid_int
        fn = 'jobui_job.%d.%d' % (jobid_int, int(time.time()))
        self.bs.append(fn, res.text)
        if self.bs.getsize() >= 8 * 1024 * 1024 * 1024:
            raise AccountErrors.NoAccountError('file too large')
    else:
        print "UNKNOWN ERROR [ %d ]" % res.code
        Log.error("unknown error...")
        Log.errorbin("%s" % jobid_int, res.text)
        raise AccountErrors.NoAccountError('fatal error')
def count_proxy_error(self, error_type):
    # A truthy error_type resets the counter; otherwise count one more
    # proxy error and give up once more than 10 accumulate.
    cnt = getattr(self._proxy_error, "proxy_error_cnt", 0)
    if error_type:
        setattr(self._proxy_error, "proxy_error_cnt", 0)
    elif cnt > 10:
        raise AccountErrors.NoAccountError("THE PROXY IS INVALID ! ! !")
    else:
        setattr(self._proxy_error, "proxy_error_cnt", cnt + 1)
def job_retry(self, job, addv):
    retry = job.get("retry")
    job.update({"retry": retry + 1})
    self.re_add_job(job)
    if self.get_fail_cnt("failcnt", addv) > 15:
        if not self.is_debug:
            raise AccountErrors.NoAccountError(
                "Maybe the proxy invalid, failcount-none = [ %d ]" % self.get_fail_cnt("failcnt", 0))
def run_job(self, jobid): url = jobid.get("url") print "url == ",url tid = self.get_tid() res = self.sessionReq.request_url(url) self.num_count += 1 if res is None: if self.get_fail_cnt(1) < 10: self.add_job(jobid) else: self.__fail_urls.append(url) raise AccountErrors.NoAccountError("failcount = [ %d ]" % (self.get_fail_cnt(0))) return else: setattr(self._curltls,'failcount',0) if res.code == 404: print "%s ------ 404" % url return elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504: print "%s ------ %d " % (url,res.code) self.add_job(jobid) time.sleep(0.8) return elif res.code == 200: print "%s ------ saving " % url self.parse(res.content) con = [] type = {"key":con} str1 = json.dumps(type) self.savefile.append(str1) #print "content======\n",res.content #self.bs.append(fn, res.text) else: print "#######################################UNKNOWN ERROR############################################# [ %d ]" % res.code #Log.error("unknown error...") #Log.errorbin("%s" %url, res.text) raise AccountErrors.NoAccountError('fatal error')
def run_job(self, job):
    Log.info("Running job:" + spider.util.utf8str(job))
    # If thread_check fails, raise NoAccountError so the spider sets
    # end_this_thread and re-adds the job. See Spider._job_runner.
    if not self.thread_check():
        raise AccountErrors.NoAccountError()
    if job["type"] == "QuerySummary":
        self.get_summary(job)
    elif job["type"] == "QueryDetail":
        self.get_detail(job)
    elif job["type"] == "QueryAutoName":
        self.get_autoname(job)
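# The if/elif chain on job["type"] above (and in save_info) could also be
# table-driven. A sketch assuming the same handler methods exist;
# run_job_dispatch is a hypothetical name, not part of the original code:
def run_job_dispatch(self, job):
    handlers = {
        "QuerySummary": self.get_summary,
        "QueryDetail": self.get_detail,
        "QueryAutoName": self.get_autoname,
    }
    handler = handlers.get(job["type"])
    if handler is None:
        Log.error("unknown job type: %s" % job["type"])
        return
    handler(job)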
def run_job(self, jobid): url = jobid.get("url") tid = self.get_tid() proxies = self.proxies_dict[tid] res = self.request_url(url, proxies=self.proxies_dict[self.get_tid()]) self.request_count += 1 if res is None: if self.get_fail_cnt(1) < 3: self.add_job(jobid) else: print "id is [ %s ] thread and [ %s ] proxy will be close and drop." % ( tid, proxies) self.__fail_add_url.append(url) raise AccountErrors.NoAccountError( "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" % (proxies, self.get_fail_cnt(0))) return else: setattr(self._curltls, 'failcount', 0) if res.code == 404: print "%s ======》 404" % url return elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504: print "%s ------> %d " % (url, res.code) self.add_job(jobid) time.sleep(1) return elif res.code == 200: print "%s ————> will be into database......." % url #http://www.jobui.com/job/92336088/ m = re.search(ur'http://www.jobui.com/job/(\d+)/', url) if m: jid = m.group(1) self.page_store.save(int(time.time()), jid, url, res.text) self.success_count += 1 self.parseDomain(res.text) else: print "#######################################UNKNOWN ERROR############################################# [ %d ]" % res.code self.__fail_add_url.append(url) #raise AccountErrors.NoAccountError('fatal error') #if self.request_count % 10000 == range(0,9): print "request_count:{},success_count:{},request_speed:{}".format( self.request_count, self.success_count, self.request_count / (time.time() - self.start_time))
def run_job(self, job):
    gsweb = self.init_obj()
    kw = job.get("kw")
    retry = job.get("retry")
    cnt = job.get("cnt")
    out = gsweb.search_company(kw)
    if out is None:
        self.job_retry(job)
        return
    if len(out) != 0 and out[0] == "stop":
        self.job_retry(job)
        raise AccountErrors.NoAccountError("The proxy invalid, IP stop !!!")
    all_cnt = len(out)  # renamed from `all` to avoid shadowing the builtin
    scs_cnt = 0
    for oi in out:
        cname = oi["name"]
        url = oi["url"]
        regcode = oi["regcode"]
        s = cname + "," + str(regcode)
        if s in filter_queries:
            # Already crawled; skip it.
            all_cnt -= 1
            continue
        retry2 = 0
        while True:
            flag = gsweb.get_detail(url, cname, regcode)
            if flag:
                self.record_spider_queries(s)
                scs_cnt += 1
                break
            retry2 += 1
            if retry2 > 5:
                break
            time.sleep(random.randrange(3, 8, 1))
    if scs_cnt == all_cnt:
        self.record_spider_kw(kw)
    else:
        self.job_retry(job)
    if time.time() - self.run_time > 20:
        print "speed------>", self.cnt / (time.time() - self.run_time), "t/s"
        self.run_time = time.time()
        self.cnt = 1
def run_job(self, job): if job["type"] is "u1": key = job["kw"] page = str(job["page"]) url = "http://qichacha.com/search?key=" + key + "&index=name&" + "p=" + page # con = self.qcclogin.request_url(url) con = self.qcc_acc_manager.el_request(url) res = con.text if res.strip() == "": time.sleep(10) self.add_job(job) return elif re.search(u'小查还没找到数据', res): Log.error("key=" + key + ", page=" + page + ", no data!\n") else: Log.error("searching %s" % key) urls = self._match( res, r'<h3 class="site-list-title"><a href="(.*?)"') if len(urls) == 0: Log.errorbin("%s %s" % (key, url), con.text) raise AccountErrors.NoAccountError(key) for u in urls: job2 = {"url": u, "type": "u2", "retrycnt": "0"} self.add_job(job2) # catch page 1 only # if page is '1': # corp_count = int(self._match(res, r'<span class="search-key">(.*?)</span>')[0]) # pg_count = (corp_count + 9)/10 # #not vip limit in 10 pages # if pg_count >= 10: # pg_count = 10 # for i in range(2, pg_count+1): # job3 = {"kw": key, "page": str(i), "type": "u1"} # self.add_job(job3) elif job["type"] is "u2": url = "http://qichacha.com" + job["url"] cpid = job["url"][1:] if self.pagestore.check_should_fetch(cpid): con = self.request_url(url) if con is None or self.retry(con, job): return self.pagestore.save(int(time.time()), cpid, url, con.text) else: Log.warning("skip ", cpid)
def run_job(self, jobid):
    jobid_int = jobid.get("id")
    url = "http://www.jobui.com/job/%d/" % jobid_int
    tid = self.get_tid()
    proxies = self.proxies_dict[tid]
    res = self.request_url(url, proxies=proxies)
    self.request_count += 1
    if res is None:
        if self.get_fail_cnt(1) < 10:
            self.add_job(jobid)
        else:
            print "id is [ %s ] thread and [ %s ] proxy will be closed and dropped." % (tid, proxies)
            self.__fail_ids.append(str(jobid_int))
            raise AccountErrors.NoAccountError(
                "Maybe the proxy [ %s ] is invalid, failcount = [ %d ]" % (proxies, self.get_fail_cnt(0)))
        return
    setattr(self._curltls, 'failcount', 0)
    if res.code == 404:
        print "%d ------ 404" % jobid_int
        return
    elif res.code in (500, 502, 503, 504):
        print "%d ------> %d" % (jobid_int, res.code)
        self.add_job(jobid)
        time.sleep(1)
        return
    elif res.code == 200:
        print "%d ------> will be saved into the database..." % jobid_int
        self.page_store.save(int(time.time()), str(jobid_int), url, res.text)
        self.success_count += 1
    else:
        print "UNKNOWN ERROR [ %d ]" % res.code
        self.__fail_ids.append(str(jobid_int))
    print "request_count:{},success_count:{},request_speed:{}".format(
        self.request_count, self.success_count,
        self.request_count / (time.time() - self.start_time))
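# The status-code ladder above repeats across several run_job variants. A
# sketch of factoring the decision into one helper; classify_response and
# the outcome constants are hypothetical names for illustration only:
RETRY, DROP, SAVE, FAIL = "retry", "drop", "save", "fail"

def classify_response(res):
    # res is None when the request itself failed; the caller should count
    # that against the proxy before re-queuing the job.
    if res is None:
        return RETRY
    if res.code == 200:
        return SAVE
    if res.code == 404:
        return DROP
    if res.code in (500, 502, 503, 504):
        return RETRY
    return FAIL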
def flip_over(self, now_page, cname, line, cnt, retry):
    url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/search"
    headers = {"Content-Type": "application/json"}
    encryptedJson = {
        "pagesize": "20",
        "page": now_page,
        "od_orderBy": "0",
        "sh_searchType": "一般搜索",
        "od_statusFilter": "0",
        "v1": "QZOrgV004",
        "oc_name": cname,
        "sh_u_uid": "",
        "sh_u_name": ""
    }
    param = {
        "encryptedJson": self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
        "extJson": self.extJson
    }
    param = spider.util.utf8str(param)
    res = self.request_url(url, headers=headers, data=param,
                           proxies=self.proxies_dict[self.get_tid()])
    if res is None:
        if self.get_fail_cnt(1) < 10:
            print "%d-----%s ------ res is None" % (cnt, cname)
            self.add_job({'line': line, 'cnt': cnt})
            return False
        else:
            print "id is [ %s ] thread and [ %s ] proxy will be closed and dropped." % (
                self.get_tid(), self.proxies_dict[self.get_tid()])
            self.add_job({'line': line, 'cnt': cnt})
            raise AccountErrors.NoAccountError(
                "Maybe the proxy [ %s ] is invalid, failcount = [ %d ]" %
                (self.proxies_dict[self.get_tid()], self.get_fail_cnt(0)))
    elif res.code == 404 or res.code == 403:
        if self.get_fail_cnt(1) < 20:
            print "%d-----%s ------ %d" % (cnt, cname, res.code)
            self.add_job({'line': line, 'cnt': cnt})
            return False
        else:
            print "id is [ %s ] thread and [ %s ] proxy will be closed and dropped." % (
                self.get_tid(), self.proxies_dict[self.get_tid()])
            self.add_job({'line': line, 'cnt': cnt})
            raise AccountErrors.NoAccountError(
                "Maybe the proxy [ %s ] is invalid, failcount = [ %d ]" %
                (self.proxies_dict[self.get_tid()], self.get_fail_cnt(0)))
    elif res.code in (500, 502, 503, 504):
        print "%d------%s ------ %d" % (cnt, cname, res.code)
        self.add_job({'line': line, 'cnt': cnt})
        time.sleep(1)
        return False
    elif res.code == 200:
        c = eval(res.text)['c']
        if len(c) == 0:
            print '----- cname %s res.text is null -----' % cname
            self.query_company_info_failure.append(line)
            return True
        result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c)
        dic = eval(result)
        company_list = dic['list']  # renamed from `list` to avoid shadowing the builtin
        if len(company_list) == 0:
            print 'cname %s result list length = 0' % cname
            self.query_company_info_failure.append(line)
            return True
        print 'cname %s result list length ------ %d' % (cname, len(company_list))
        for l in company_list:
            self.query_company_info.append(spider.util.utf8str(dict(l)))
            part = cname + "|" + l['oc_name'] + "|" + str(l['oc_area']) + "|" + \
                   str(l['oc_code']) + "|" + str(l['oc_number'])
            self.query_company_info_part.append(part)
            self.get_detail(l['oc_name'], l['oc_code'], l['oc_area'])
        if len(company_list) < 20:
            return True
        # A full page of 20 means there may be more; fetch the next page.
        return self.flip_over(now_page + 1, cname, line, cnt, retry)
    else:
        print "cname %s UNKNOWN ERROR [ %d ]" % (cname, res.code)
        self.query_company_info_failure.append(line)
        return True
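# flip_over above pages recursively, which can hit Python's recursion limit
# on very deep result sets. A sketch of the same 20-per-page contract as a
# loop; fetch_page is a hypothetical stand-in for the encrypted search call:
def crawl_all_pages(fetch_page, cname, pagesize=20, max_pages=100):
    results = []
    page = 1
    while page <= max_pages:
        batch = fetch_page(cname, page)
        results.extend(batch)
        if len(batch) < pagesize:
            break  # a short page means the listing is exhausted
        page += 1
    return results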
def run_job(self, jobid):
    tid = self.get_tid()
    gsweb = getattr(self._curltls, "gsweb", None)
    if gsweb is None:
        gsweb = self.init_obj()
    tp = jobid["type"]
    cnt = jobid.get("cnt")
    if tp == "query":
        qname = jobid.get("qname")
        if qname in filter_name:
            print cnt, "already queried:", qname
            return
        out = gsweb.search_company(qname)
        if out is None:
            self.job_retry(jobid, 1)
            return
        setattr(self._curltls, "failcnt", 0)
        if "PROXY-ERROR" in out:
            raise AccountErrors.NoAccountError("Maybe the proxy invalid - PROXY-ERROR")
        elif len(out) == 0:
            print cnt, qname, 'company list query returned empty...'
            self.record_spider(qname)
            return
        else:
            for oi in out:
                cname = oi["name"]
                if cname in filter_name:
                    print cnt, "already crawled:", cname
                    return
                self.add_main_job({"oi": oi, "type": "detail", "cnt": cnt, "retry": 0})
                self.un_spider_name.append(cname)
            self.record_spider(qname)
    elif tp == "detail":
        oi = jobid["oi"]
        cname = oi["name"]
        url = oi["url"]
        regist_code = oi["regcode"]
        gd = "gsxt.gdgs.gov.cn/aiccips/GSpublicity/GSpublicityList.html"
        sz = "szcredit.com.cn/web/GSZJGSPT/QyxyDetail.aspx"
        gz1 = "gsxt.gzaic.gov.cn/search/search!entityShow"
        gz2 = "gsxt.gzaic.gov.cn/aiccips/GSpublicity/GSpublicityList.html"
        flg = None
        if gd in url:
            flg = gsweb.get_GSpublicityList(cnt, cname, url, regist_code)
        elif sz in url:
            flg = gsweb.get_QyxyDetail(cnt, cname, url, regist_code, tid=tid)
        elif gz1 in url:
            flg = gsweb.get_entityShow(cnt, cname, url, regist_code)
            # This link type requires re-initializing the session object afterwards.
            self.init_obj()
        elif gz2 in url:
            flg = gsweb.get_guangzhou(cnt, cname, url, regist_code)
        else:
            print "unknown link type --->", url
            Log.error("UNKNOWN LINK TYPE," + url)
            return
        if flg == "success":
            self.record_spider(cname)
        elif flg == "proxy_error":
            self.job_retry(jobid, 1)
        elif flg == "notdisplay":
            oi["error"] = "notdisplay"
            self.not_show_save.append(spider.util.utf8str(oi))
        elif flg == "return_error":
            oi["error"] = "return_page_error"
            self.not_show_save.append(spider.util.utf8str(oi))
        else:
            self.job_retry(jobid, 0)
    if time.time() - self.run_time > 20:
        print cnt, "====== speed ======", self.cnt / (time.time() - self.run_time), "t/s"
        self.run_time = time.time()
        self.cnt = 1
def get_detail(self, line, cnt, retry):
    tid = self.get_tid()
    try:
        param = eval(line)
    except Exception:
        print 'tid=%d --- cnt=%d --- data is not json, return' % (tid, cnt)
        self.record_spider(line, 'UNKNOW')
        return
    cname = param['oc_name']
    if cname in self.bloom:
        cname = param['query_name']
        if cname in self.bloom:
            print 'query_name:%s already crawled...' % cname
            return
    ccode = param['oc_code']
    carea = param['oc_area']
    url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/detail"
    encryptedJson = {
        "bl_oc_code": ccode,    # e.g. "71526726X"
        "v1": "QZOrgV005",
        "isDirect": "0",
        "bl_oc_name": cname,    # e.g. "腾讯科技"
        "bl_oc_area": carea     # e.g. "4403"
    }
    res = self.req_all(url, encryptedJson)
    res_code = 0
    if res is None:
        if self.get_fail_cnt(1, 'failcount-none') < 10:
            self.re_add_job({'line': line, 'cnt': cnt, 'retry': retry})
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d" % (tid, cnt, cname, retry, res_code)
            return
        else:
            self.re_add_job({'line': line, 'cnt': cnt, 'retry': retry + 1})
            self._can_use_proxy_num -= 1
            raise AccountErrors.NoAccountError(
                "Maybe the proxy invalid, failcount-none = [ %d ]" % self.get_fail_cnt(0, 'failcount-none'))
    else:
        setattr(self._curltls, 'failcount-none', 0)
    res_code = res.code
    if (400 <= res_code < 500) or res_code == 202:
        self.re_add_job({'line': line, 'cnt': cnt, 'retry': retry + 1})
        print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d" % (tid, cnt, cname, retry, res_code)
        if self.get_fail_cnt(1, 'failcount-400') > 30:
            self._can_use_proxy_num -= 1
            raise AccountErrors.NoAccountError(
                "Maybe the proxy invalid, failcount-400 = [ %d ]" % self.get_fail_cnt(0, 'failcount-400'))
        return
    else:
        setattr(self._curltls, 'failcount-400', 0)
    if res_code >= 500:
        self.re_add_job({'line': line, 'cnt': cnt, 'retry': retry + 1})
        print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d" % (tid, cnt, cname, retry, res_code)
        time.sleep(2)
        return
    elif res_code == 200:
        try:
            c = eval(res.text)['c']
        except Exception:
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- exception res.text" % (tid, cnt, cname, retry, res_code)
            self.query_failure.append(line)
            self.record_spider(line, cname)
            return
        if len(c) == 0:
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- exception 'c' is null" % (tid, cnt, cname, retry, res_code)
            self.query_failure.append(line)
            self.record_spider(line, cname)
            return
        result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
        try:
            detail = eval(result)
        except Exception:
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- exception result:%s" % (tid, cnt, cname, retry, res_code, result)
            self.query_failure.append(line)
            self.record_spider(line, cname)
            return
        # Shareholder info
        listGD = self.get_gd(carea, ccode, cname, 0)
        if listGD is not None:
            detail['listGD'] = listGD['listGD']
        # Investment info
        list_inversted = self.get_inversted(cname, 0)
        if list_inversted is not None:
            detail['inversted'] = list_inversted['inversted']
        # Branch info
        list_branch = self.get_branch(cname, 1, {"Branch": []}, 0)
        if list_branch is not None:
            detail['Branch'] = list_branch['Branch']
        self.query_success.append(spider.util.utf8str(detail))
        self.record_spider(line, cname)
        print "tid=%d --- proxy=%s --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- success:\n %s" % (
            tid, self.proxies_dict[tid], cnt, cname, retry, res_code, spider.util.utf8str(detail))
    else:
        self.query_failure.append(line)
        self.record_spider(line, cname)
        print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- exception UNKNOWN ERROR" % (tid, cnt, cname, retry, res_code)
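# get_detail parses service responses with eval(). When the payload is
# standard JSON, json.loads is the safer equivalent; a sketch under that
# assumption (eval is only needed if the text uses Python literals):
import json

def parse_payload(text):
    # Returns the decoded object, or None if the text is not valid JSON.
    try:
        return json.loads(text)
    except ValueError:
        return None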
def run_job(self, jobid): jobid_int = jobid.get("id") retry = jobid.get("retry") url = "http://www.jobui.com/job/%d/" % (jobid_int) tid = self.get_tid() proxies = self.proxies_dict[tid] res = self.request_url(url, proxies=self.proxies_dict[self.get_tid()]) self.request_count += 1 self.serial_num += 1 if res is None: if self.get_fail_cnt(1, 'failcount-none') < 10: self.re_add_job(jobid) else: print "id is [ %s ] thread and [ %s ] proxy will be close and drop." % ( tid, proxies) self.__fail_ids.append(str(jobid_int)) raise AccountErrors.NoAccountError( "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" % (proxies, self.get_fail_cnt(0, 'failcount-none'))) #return else: setattr(self._curltls, 'failcount-none', 0) if res.code == 407: if self.get_fail_cnt(1, 'failcount-407') < 10: self.re_add_job(jobid) else: print "id is [ %s ] thread and [ %s ] proxy will be close and drop." % ( tid, proxies) self.__fail_ids.append(str(jobid_int)) raise AccountErrors.NoAccountError( "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" % (proxies, self.get_fail_cnt(0, 'failcount-407'))) #return else: setattr(self._curltls, 'failcount-407', 0) if res.code == 404: print "%d ======》 404 ---> retry:%d" % (jobid_int, retry) if retry < 3: self.re_add_job({"id": jobid_int, "retry": (retry + 1)}) else: self.__fail_ids.append(str(jobid_int)) #return elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504: print "%d ------> %d " % (jobid_int, res.code) self.re_add_job(jobid) time.sleep(random.randrange(1, 3, 1)) #return elif res.code == 200: self.serial_num = 0 print "%d ————> will be into database......." % jobid_int self.page_store.save(int(time.time()), str(jobid_int), url, res.text) self.success_count += 1 else: print "#######################################UNKNOWN ERROR############################################# [ %d ]" % res.code if retry < 3: self.re_add_job({"id": jobid_int, "retry": (retry + 1)}) else: self.__fail_ids.append(str(jobid_int)) print "serial_number:{},request_count:{},success_count:{},request_speed:{}".format( self.serial_num, self.request_count, self.success_count, self.request_count / (time.time() - self.start_time))
def flip_over(self, now_page, cname, cnt, retry):
    """Query the company list by name and page through the results."""
    tid = self.get_tid()
    encryptedJson = {
        "pagesize": "20",
        "page": now_page,
        "od_orderBy": "0",
        "sh_searchType": "一般搜索",
        "sh_oc_areaName": "",
        "od_statusFilter": "0",
        "v1": "QZOrgV005",
        "oc_name": cname,
        "sh_u_uid": "",
        "sh_u_name": ""
    }
    r_result = {"cname": cname}
    res = self.req_all(encryptedJson)
    res_code = 0
    if res is None:
        if self.get_fail_cnt('failcount-none', 1) < 10:
            self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d" % (tid, cnt, cname, retry, res_code, now_page)
            return
        else:
            self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
            raise AccountErrors.NoAccountError(
                "Maybe the proxy invalid, failcount-none = [ %d ], tid=[ %d ]" % (self.get_fail_cnt('failcount-none', 0), tid))
    else:
        setattr(self._curltls, 'failcount-none', 0)
    res_code = res.code
    if (400 <= res_code < 500) or res_code == 202:
        if self.get_fail_cnt('failcount-400', 1) < 5:
            self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d" % (tid, cnt, cname, retry, res_code, now_page)
            return
        else:
            if retry > 5:
                r_result["type"] = "400+"
                self.already_error_type.append(spider.util.utf8str(r_result))
                self.record_spider(cname)
            else:
                self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry + 1})
            self._can_use_proxy_num -= 1
            raise AccountErrors.NoAccountError(
                "Maybe the proxy invalid, failcount-400 = [ %d ], tid=[ %d ]" % (self.get_fail_cnt('failcount-400', 0), tid))
    else:
        setattr(self._curltls, 'failcount-400', 0)
    if res_code >= 500:
        self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
        print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d" % (tid, cnt, cname, retry, res_code, now_page)
        time.sleep(random.randrange(1, 10, 1))
        return
    elif res_code == 200:
        try:
            c = eval(res.text)['c']
        except Exception as err:
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception res.text - %s" % (tid, cnt, cname, retry, res_code, now_page, err)
            self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
            return
        if len(c) == 0:
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception 'c' is null" % (tid, cnt, cname, retry, res_code, now_page)
            r_result["type"] = "c=0"
            self.already_error_type.append(spider.util.utf8str(r_result))
            self.record_spider(cname)
            self.error_cnt += 1
            return
        result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
        try:
            dic = eval(result)
        except Exception:
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception result:%s" % (tid, cnt, cname, retry, res_code, now_page, result)
            r_result["type"] = "result_error"
            self.already_error_type.append(spider.util.utf8str(r_result))
            self.record_spider(cname)
            self.error_cnt += 1
            return
        company_list = dic['list']  # renamed from `list` to avoid shadowing the builtin
        if len(company_list) == 0:
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception len(list)=0" % (tid, cnt, cname, retry, res_code, now_page)
            r_result["type"] = "list=0"
            self.already_error_type.append(spider.util.utf8str(r_result))
            self.record_spider(cname)
            self.error_cnt += 1
            return
        for l in company_list:
            aa = {"query_name": cname}
            aa.update(l)
            self.query_company_list.append(spider.util.utf8str(aa))
        print "******", len(company_list), spider.util.utf8str(company_list)
        if len(company_list) < 20:
            self.record_spider(cname)
            return
        # A full page of 20 means there may be more; stop after 100 pages.
        if now_page > 100:
            self.already_error_type.append(spider.util.utf8str(r_result))
            self.record_spider(cname)
            return
        self.flip_over(now_page + 1, cname, cnt, retry)
    else:
        print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception UNKNOWN ERROR" % (tid, cnt, cname, retry, res_code, now_page)
        if retry < 3:
            self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry + 1})
            return
        r_result["type"] = "unknown_error"
        self.already_error_type.append(spider.util.utf8str(r_result))
        self.record_spider(cname)
        return
def run_job(self, jobid): url = jobid.get("url") retry = jobid.get("retry") tid = self.get_tid() proxies = self.proxies_dict[tid] res = self.request_url(url, proxies=self.proxies_dict[self.get_tid()]) self.request_count += 1 if res is None: if self.get_fail_cnt(1, 'failcount-none') < 10: self.re_add_job(jobid) else: print "id is [ %s ] thread and [ %s ] proxy will be close and drop." % ( tid, proxies) #self.__fail_urls.append(url) self.re_add_job(jobid) raise AccountErrors.NoAccountError( "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" % (proxies, self.get_fail_cnt(0, 'failcount-none'))) return else: setattr(self._curltls, 'failcount', 0) if res.code == 407: if self.get_fail_cnt(1, 'failcount-407') < 10: print "%s ======》 407 , retry:%d" % (url, retry) self.re_add_job(jobid) else: print "id is [ %s ] thread and [ %s ] proxy will be close and drop." % ( tid, proxies) self.re_add_job(jobid) #self.__fail_urls.append(url) raise AccountErrors.NoAccountError( "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" % (proxies, self.get_fail_cnt(0, 'failcount-407'))) return else: setattr(self._curltls, 'failcount-407', 0) if res.code == 404: print "%s ======》 404 , retry:%d" % (url, retry) if retry < 3: self.re_add_job({"id": jobid_int, "retry": (retry + 1)}) else: self.__fail_urls.append(url) return elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504: print "%s ------> %d " % (url, res.code) self.add_job(jobid) time.sleep(random.randrange(1, 3, 1)) return elif res.code == 200: print "%s ————> will be into database......." % url m = re.search(ur'http://www.jobui.com/job/(\d+)/', url) if m: jid = m.group(1) self.page_store.save(int(time.time()), jid, url, res.text) self.success_count += 1 else: print "#######################################UNKNOWN ERROR############################################# [ %d ] retry:%d" % ( res.code, retry) if retry < 3: self.re_add_job({"id": jobid_int, "retry": (retry + 1)}) else: self.__fail_urls.append(url) #raise AccountErrors.NoAccountError('fatal error') print "request_count:{},success_count:{},request_speed:{}".format( self.request_count, self.success_count, self.request_count / (time.time() - self.start_time))