class Job51Spider(Spider): def dispatch(self): self.bs = BinSaver("job51.bin") for i in range(45000000, 75000000): self.add_main_job(i) self.wait_q() self.add_main_job(None) def run_job(self, jobid): print "job is ", jobid url = "http://search.51job.com/job/%d,c.html" % jobid res = self.request_url(url, []) if re.search(u'您选择的职位目前已经暂停招聘', res.text): print jobid, "match nothing" else: print "saving %d ..." % jobid self.bs.append('51job.%d' % jobid, res.text)
class CourtCAPParser(CAPParser): def __init__(self, channel, dist_file, name, parser): CAPParser.__init__(self, channel, name) self.bin_writer = BinSaver(dist_file) self.parser = parser def parse(self, page): res = self.parser.parse(page['indexUrl'], page['content'][1]) if res: return [res] return [] def pre_save(self, saver): pass def on_save(self, items): for item in items: print 'saving', item['name'] self.bin_writer.append(item['name'], item['value'])
class ZGcpwswSpider2(Spider2): def __init__(self, thcnt, need_srl=True, qf_name=None): Spider2.__init__(self, thcnt) # self.ce_fs = FileSaver("court_queries/check_error") self.docbin_fs = BinSaver("ws_data/ws.%d.bin"% os.getpid()) self.log_fs = FileSaver("log") # self.qf_name = qf_name self._name = "%s"% self.qf_name.split("/")[1] self.srl = {} self.need_srl = need_srl pass def init_jobs(self): with open(self.qf_name) as fs: for line in fs: job = eval(line.strip()) count = job.get("count") if count > ZGcpwswData.total_max_record: for i in ZGcpwswData.data_order: for j in ZGcpwswData.order_direction: for k in range(ZGcpwswData.page_max_index): copy_job = copy.deepcopy(job) copy_job["jobid"]["data"]["Index"] = k + 1 copy_job["jobid"]["data"]["Page"] = ZGcpwswData.page_max_count copy_job["jobid"]["data"]["Direction"] = ZGcpwswData.order_direction[j] copy_job["jobid"]["data"]["Order"] = ZGcpwswData.data_order[i] self.add_job(copy_job) elif ZGcpwswData.total_core_record < count <= ZGcpwswData.total_max_record: for j in ZGcpwswData.order_direction: for k in range(ZGcpwswData.page_max_index): copy_job = copy.deepcopy(job) copy_job["jobid"]["data"]["Index"] = k + 1 copy_job["jobid"]["data"]["Page"] = ZGcpwswData.page_max_count copy_job["jobid"]["data"]["Direction"] = ZGcpwswData.order_direction[j] self.add_job(copy_job) elif 0 < count <= ZGcpwswData.total_core_record: for k in range(ZGcpwswData.page_max_index): copy_job = copy.deepcopy(job) copy_job["jobid"]["data"]["Index"] = k + 1 copy_job["jobid"]["data"]["Page"] = ZGcpwswData.page_max_count self.add_job(copy_job) print "=======finish loading job======" def run_job(self, jobid): time.sleep(0.1) if isinstance(jobid, dict): url = jobid.get("jobid").get("url") data = jobid.get("jobid").get("data") headers = jobid.get("jobid").get("headers") reg_count = int(jobid.get("count")) resp = None try: if self.need_srl: nr = self.srl.get(getattr(self._tls, 'tid', 0)) else: nr = self.get_session_request() self.set_cookie_passport(nr) 
# 由于文书网系统升级,所以每次请求前需要再请求两次,用于获取cookie passport resp = nr.request_url(url, data=data, headers=headers) if isinstance(resp, CurlReq.Response) and resp and resp.content: result_list = json.loads(json.loads(resp.content)) if result_list: # for record ZGcpwswData.set_doc_count(data, len(result_list) - 1, self.log_fs) # for record for result in result_list: if result.get("Count"): new_count = int(result.get("Count")) if new_count > reg_count: jobid["check_count"] = new_count self.ce_fs.append(json.dumps(jobid, ensure_ascii=False)) else: name = '%s.%d' % (result.get(ZGcpwswData.doc_id), int(time.time()) ) self.docbin_fs.append(name, json.dumps(result, ensure_ascii=False)) else: pass else: # owing to network, return None, add to job pass except Exception, e: # print "%s-%s"%(resp.text, data) pass time.sleep(1) self.re_add_job(jobid)
class JobuiSpider(Spider): def __init__(self): self.proxies_dict = [] self.read_proxy("../spider/proxy/proxy.txt") Spider.__init__(self, len(self.proxies_dict)) self.num_count = 0 self.__fail_ids = FileSaver("fail_ids.txt") def wait_q_breakable(self): lt = 0 while True: if not self.job_queue.empty() or not self.job_queue2.empty( ) or not self.job_queue3.empty(): time.sleep(5) if time.time() < lt + 1 and self._running_count == 0: return True time.sleep(2) lt = time.time() if self._worker_count == 0: return False def dispatch(self): self.bs = BinSaver("jobui_job.bin") i = 133002626 while i > 130000000: #131127307 131207901 job = {"id": i, "retry_none": 0, "retry_500": 0} self.add_job(job, True) i -= 1 self.wait_q_breakable() self.add_job(None, True) def get_fail_cnt(self, addv): fc = getattr(self._curltls, 'failcount', 0) if (addv): fc += addv setattr(self._curltls, 'failcount', fc) return fc def run_job(self, jobid): jobid_int = jobid.get("id") url = "http://www.jobui.com/job/%d/" % (jobid_int) tid = self.get_tid() proxies = self.proxies_dict[tid] #print "this request tid = [ %s ] proxies = [ %s ]" % (tid,proxies) res = self.request_url(url, proxies=self.proxies_dict[self.get_tid()]) self.num_count += 1 #print "id : %d ------------- response code : %s " % (jobid_int, "Response Is None" if res is None else str(res.code)) if res is None: if self.get_fail_cnt(1) < 10: self.add_job(jobid) else: print "id is [ %s ] thread and [ %s ] proxy will be close and drop." 
% ( tid, proxies) self.__fail_ids.append(str(jobid_int)) raise AccountErrors.NoAccountError( "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" % (proxies, self.get_fail_cnt(0))) return else: setattr(self._curltls, 'failcount', 0) if res.code == 404: print "%d ------ 404" % jobid_int return elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504: print "%d ------ %d " % (jobid_int, res.code) self.add_job(jobid) time.sleep(0.8) return elif res.code == 200: print "%d ------ saving " % jobid_int fn = 'jobui_job.%d.%d' % (jobid_int, int(time.time())) self.bs.append(fn, res.text) if self.bs.getsize() >= 8 * 1024 * 1024 * 1024: raise AccountErrors.NoAccountError('file too large') else: print "#######################################UNKNOWN ERROR############################################# [ %d ]" % res.code Log.error("unknown error...") Log.errorbin("%s" % jobid_int, res.text) raise AccountErrors.NoAccountError('fatal error') def event_handler(self, evt, msg, **kwargs): if evt == 'DONE': spider.util.sendmail('*****@*****.**', '%s DONE' % sys.argv[0], msg) def read_proxy(self, fn): with open(fn, 'r') as f: for line in f: line = line.strip() m = re.match(r'(\d+\.\d+\.\d+\.\d+:\d+)', line, re.I) if m: prstr = m.group(1) proxies = { 'http': 'http://' + prstr + "/", 'https': 'https://' + prstr + "/" } self.proxies_dict.append(proxies) elif re.match('\s*#', line): continue print " loaded [ %d ] proxis " % len(self.proxies_dict)
class PageStoreBase(PageStoreDB): class CurDoc(object): def __init__(self, content, getime, jdid, real_url): self.cur_content = content self.cur_getime = getime self.cur_jdid = jdid self.cur_url = real_url def __init__(self, channel, dburl=None): super(PageStoreBase, self).__init__(channel, dburl) self.testmode = False opath = self.getopath() t = time.localtime() folder = "%s/%s/%d" % (opath, self.channel, t.tm_year) fnbase = "%s_%d%02d" % (self.channel, t.tm_year, t.tm_mon) os.system("mkdir -m 777 -p " + folder) self._ofn = "%s/%s.bin" % (folder, fnbase) self._ofnlog = "%s/%s_update.log" % (folder, fnbase) self.fssaver = BinSaver(self._ofn) self._hashcheck = spider.util.LocalHashChecker() self._docobjtls = threading.local() self.saved_count = 0 def getopath(self): dirs = ['/data/crawler/_files3_', '/opt/_test_store_'] for di in dirs: if os.path.isdir(di) and os.access(di, os.W_OK): return di raise RuntimeError("no dir to write files.") def get_cur_doc(self): return getattr(self._docobjtls, 'doc', None) def set_cur_doc(self, content, getime, jdid, real_url): doc = PageStoreBase.CurDoc(content, getime, jdid, real_url) setattr(self._docobjtls, 'doc', doc) @staticmethod def mktime(year=2015, m=1, d=1, hour=0, minute=0, second=0): arr = [year, m, d, hour, minute, second, 0, 0, 0] for i in range(0, len(arr)): arr[i] = int(arr[i]) return time.mktime(arr) def extract_content(self): raise NotImplementedError('virtual function called') def page_time(self): raise NotImplementedError('virtual function called') def check_should_fetch(self, jdid): indexUrl = "%s://%s" % (self.channel, jdid) return not self.find_new(indexUrl) def save_time_log(self, indexUrl, cur_tm): oldtime = self.get_page_time(indexUrl) if oldtime == cur_tm: return logstr = "%s %ld => %ld\n" % (indexUrl, oldtime, cur_tm) cutil.mp_append_log(self._ofnlog, logstr) def save(self, getime, jdid, real_url, content, fnpath=None, offset=None): global MIN_TIME_MSEC if getime > MIN_TIME_MSEC: raise RuntimeError("get 
time muse be in seconds.") if self._hashcheck.query(jdid) > 0: return True self.set_cur_doc(content, getime, jdid, real_url) try: pageDesc = self.extract_content() if not pageDesc: print "jdid: %s, pageDesc empty" % self.get_cur_doc().cur_jdid return False elif self.testmode: print pageDesc pageTime = self.page_time() if pageTime is None or pageTime < MIN_TIME_MSEC: raise RuntimeError("page time must be in msec") if isinstance(pageTime, float): pageTime = int(pageTime) if isinstance(pageDesc, unicode): pageDesc = pageDesc.encode('utf-8') contentSign = hashlib.md5(pageDesc).hexdigest() indexUrl = "%s://%s" % (self.channel, jdid) self.save_time_log(indexUrl, pageTime) # if there is an entry with this contentSign, update it with no need to save webpage in binfile. # otherwise update by indexUrl. if self.find_item(indexUrl, contentSign): Log.warning("%s exists in db, skip" % jdid) self.update_time(indexUrl, contentSign, int(getime) * 1000, pageTime) return True print "saving", indexUrl odoc = { 'contentSign': contentSign, 'indexUrl': indexUrl, 'realUrl': real_url, 'createTimeFlag': 1, 'owner': self.channel, 'createTimeTimeStamp': pageTime, 'crawlerUpdateTime': int(getime) * 1000, 'updateTime': pageTime, 'status': 0, 'isUpdated': 0, 'isExpired': 0, } if self.testmode: pprint.pprint(odoc) return True else: if self.do_save(odoc, content, fnpath, offset): print indexUrl, "saved" self.saved_count += 1 self._hashcheck.add(jdid) return True return False except Exception as e: print e traceback.print_exc() Log.error("failed to save %s %s" % (self.channel, jdid)) time.sleep(5) return False def do_save(self, odoc, content, fnpath=None, offset=None): if isinstance(content, unicode): content = content.encode('utf-8') filepos = self.fssaver.append( "%s.%s.%d" % (self.channel, self.get_cur_doc().cur_jdid, self.get_cur_doc().cur_getime), content) odoc.update({'pageContentPath': "binf::%s::%d" % (self._ofn, filepos)}) return self.upsert_doc(odoc['indexUrl'], odoc)
class QycxbSpider(Spider):
    """Crawls qiye.qianzhan.com company data: u1 jobs search a company name,
    u2 jobs fetch the detail record via a nodejs-evaluated verify code."""

    def __init__(self, threadcnt):
        Spider.__init__(self, threadcnt)
        # Per-thread warmed-up request sessions, keyed by thread ident.
        self.sqs = {}
        self.binsaver = BinSaver("Qycxb" + str(time.time()).split(".")[0] + ".bin")

    def init_req(self):
        """Create and warm up a per-thread QycxbReq session."""
        with self.locker:
            threadident = str(threading.currentThread().ident)
            sq = QycxbReq()
            # sq.load_proxy("../../_zhilian/curproxy0")
            # sq.load_proxy("../_zhilian/curproxy")
            # sq.select_user_agent("firefox")
            sq.default_headers = {"Connection": "keep-alive",
                                  "Accept": r"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                                  "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                                  "Accept-Encoding": "gzip, deflate",
                                  "Referer": "http://qiye.qianzhan.com/",
                                  "X-Requested-With": "XMLHttpRequest",
                                  "User-Agent": r"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:42.0) Gecko/20100101 Firefox/42.0",
                                  "Pragma": "no-cache",
                                  "Cache-Control": "no-cache",
                                  "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8"}
            # Warm-up search request primes cookies for subsequent API calls.
            con1 = sq.request_url(r"http://qiye.qianzhan.com/orgcompany/searchList",
                                  data=r"oc_name=%E5%B9%BF%E5%B7%9E%E5%B8%82%E5%8D%97%E6%B2%99%E5%8D%8E%E5%B7%A5%E7%A0%94%E7%A9%B6%E9%99%A2&oc_area=&sh_searchType=1&od_orderby=0&page=1&pageSize=10")
            self.sqs[threadident] = sq
            setattr(self._curltls, "sq", sq)
            return sq

    def dispatch(self):
        """Queue one u1 (search) job per input line within [skip, endline)."""
        # FIX: close the input file when done (handle was leaked before).
        with open("/home/peiyuan/r1.txt", "rb") as f:
            currline = 0
            skip = 0
            endline = 1000
            while currline < skip:
                f.readline()
                currline += 1
            while currline < endline:
                line = f.readline()
                key = line.strip().split(" ")[-1].strip()
                job = {"key": key, "type": "u1", "lineno": currline}
                self.add_main_job(job)
                currline += 1
        self.wait_q()
        self.add_main_job(None)

    def run_job(self, job):
        """Run one u1 (search) or u2 (detail) job with retry-on-failure."""
        time.sleep(5)  # heavy per-request throttle to avoid bans
        threadident = str(threading.currentThread().ident)
        sq = getattr(self._curltls, "sq", None)
        if sq is None:
            sq = self.init_req()
        Log.info("Running job:" + util.utf8str(job.__str__()))
        if job["type"] == "u1":
            Log.info("Searching line %d" % job["lineno"])
            con = sq.request_url(r"http://qiye.qianzhan.com/orgcompany/searchList",
                                 data={"oc_name": job["key"], "od_orderby": 0,
                                       "page": 1, "pageSize": 10,
                                       "oc_area": "", "sh_searchType": 1})
            if con is None or con.text.strip() == "" or con.code != 200:
                Log.error("[u1]Bad connect or empty content return.. JOB=>" + util.utf8str(job.__str__()))
                self.re_add_job(job)
                return
            else:
                jsonobj = ""
                try:
                    jsonobj = json.loads(con.text.strip())
                except ValueError as e:
                    Log.error("Json decode error. String is %s" % con.text)
                    return
                if not jsonobj["succ"]:
                    Log.warning(jsonobj.__str__())
                    time.sleep(1)
                    Log.error("[u1]Request fail, succ flag is False. JOB=>" + util.utf8str(job.__str__()))
                    # status '4' means the proxy is burnt -- rotate it.
                    if 'status' in jsonobj and jsonobj['status'] == '4':
                        Log.error("Remove current proxy...Used %d times....." % sq._proxy_use_times[sq.curproxy])
                        sq.remove_curproxy()
                    self.re_add_job(job)
                else:
                    corplist = jsonobj["lst"]
                    if len(corplist) == 0:
                        Log.error("Search return nothing. %d:%s, no data." % (job["lineno"], job["key"]))
                        return
                    else:
                        # Fan out one detail job per matched company.
                        for corp in corplist:
                            jobb = {"type": "u2", "orgCode": corp["oc_orgCode"],
                                    "name": corp["oc_name"]}
                            self.add_job(jobb)
        if job["type"] == "u2":
            Log.info("Getting detail info about %s" % job["name"])
            timestr = "%f" % time.time()
            con0 = sq.request_url(r"http://qiye.qianzhan.com/orgcompany/GetJsVerfyCode?t=0.%s&_=%s" % (
                timestr.split(".")[1], timestr.split(".")[0]))
            if con0 is None or con0.text.strip() == "" or con0.code != 200:
                Log.error("[u2]Bad connect or empty content return.. JOB=>" + util.utf8str(job.__str__()))
                self.re_add_job(job)
                return
            if not os.path.exists(threadident):
                os.mkdir(threadident)
            # The verify code is a JS challenge: evaluate it with nodejs and
            # capture the result from stdout.
            # FIX: close the generated js file and the mcfcode output file
            # (both handles were leaked in the original).
            with open(threadident + "/qycxb.js", "w+b") as f:
                f.write(r'var window = {document : {cookie :"qznewsite.uid=' +
                        sq.get_cookie("qznewsite.uid").strip() +
                        '"}}; ' + con0.text + "console.log(window.__qzmcf())")
                f.flush()
            os.system("nodejs " + threadident + "/qycxb.js > " + threadident + "/mcfcode.txt")
            with open(threadident + "/mcfcode.txt", "rb") as mf:
                mcfcode = mf.read().strip()
            con1 = sq.request_url("http://qiye.qianzhan.com/orgcompany/SearchItemDtl",
                                  data={"mcfCode": mcfcode, "orgCode": job["orgCode"]})
            if con1 is None or con1.text.strip() == "" or con1.code != 200:
                Log.error("[u2]Bad connect or empty content return.. JOB=>" + util.utf8str(job.__str__()))
                self.re_add_job(job)
                return
            else:
                jsonobj = json.loads(con1.text.strip())
                if not jsonobj["succ"]:
                    Log.warning(jsonobj.__str__())
                    time.sleep(1)
                    Log.error(
                        "[u2]Request fail, succ flag is False.Check the orgcode and mcfcode. JOB=>" + util.utf8str(
                            job.__str__()))
                    if 'status' in jsonobj and jsonobj['status'] == '4':
                        Log.error("Remove current proxy...Used %d times....." % sq._proxy_use_times[sq.curproxy])
                        sq.remove_curproxy()
                    self.re_add_job(job)
                else:
                    self.binsaver.append(job["name"] + job["orgCode"], con1.text.strip())
                    Log.info("%s,%s,saved." % (job["name"], job["orgCode"]))
        return
class EWTSpider(Spider): def __init__(self, thcnt): Spider.__init__(self, thcnt) self._user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:42.0) Gecko/20100101 Firefox/42.0' self.baseurl = 'http://www.ewt360.com/LNLQXX/SearchResult?act=mark' self.headers = {'Referer': self.baseurl} #scores = range(450,750+1) + range(449, 0, -1) + [0] scores = range(750, 0, -1) + [0] self.possmap = { 'Years': range(2009, 2014 + 1), 'WL': ['l', 'w'], 'BZ': ['b', 'z'], 'PiCi': 0, 'Score': scores, 'ProvinceCode': 0, 'page': 1 } self.bs = BinSaver("fo.bin") self.racer = RaceValueByKey() ##stacktracer.trace_start("trace.html") def doLogin(self): self.cookies = {} self.request_url( 'http://passport.ewt360.com/login/prelogin?callback=cb&sid=2&username=BOBOYI&password=1122333&fromurl=%2F&code=&isremember=1' ) print self.cookies return len(self.cookies.keys()) def dispatch(self): self.racer.getValue('login', lambda v: self.doLogin()) ## load saved list. savedlist = {} try: with open("ks") as f: for lines in f: savedlist[lines.strip()] = 1 except Exception as e: pass a = AllPossibilities(self.possmap, ['WL', 'BZ', 'Years', 'Score']) for i in a.all(): keys = ['WL', 'BZ', 'Years', 'Score', 'page'] ss = "%s.%s.%s.%s.%s" % (i[keys[0]], i[keys[1]], i[keys[2]], i[keys[3]], i[keys[4]]) if ss not in savedlist: self.add_job({'tp': 'mj', 'v': i}, True) self.wait_q() self.add_job(None, True) def dispatch2(self): with open('errlog.txt') as f: for lines in f: jm = json.loads(lines.strip()) self.add_job({'tp': 'mi', 'v': jm}, True) self.wait_q() self.add_job(None, True) def run_job(self, jobid): if isinstance(jobid, dict): self.dump_jobid(jobid) xxvalue = self.racer.oldValue('login') con = self.request_url(self.baseurl, params=jobid['v'], headers=self.headers) if con is None: return self.run_job(jobid) if re.search(u'<title>登录系统</title>', con.text) or re.search( u'您的账号未登陆或超时,请重新登', con.text): self.racer.delValueChecked('login', xxvalue) self.racer.sleepAlign(10) print 
"=======================relogin===================" self.racer.getValue('login', lambda v: self.doLogin()) return self.run_job(jobid) #raise RuntimeError("need login") if jobid['tp'] == 'mj': m = re.search(ur'page=(\d+)[^<>]*>尾页', con.text) if m: lp = int(m.group(1)) for page in range(2, lp + 1): v2 = copy.deepcopy(jobid['v']) v2['page'] = page self.add_job({'tp': 'mi', 'v': v2}) if jobid['tp'] == 'mj' or jobid['tp'] == 'mi': key = json.dumps(jobid['v'], ensure_ascii=0).encode('utf-8') self.bs.append(key, con.text)
class ParseBin: def __init__(self): self.nfs = BinSaver("parsed.bin") self.nks = {} self.errlog = FileSaver("errlog.txt") def get_nkey(self, jn): keys = ['WL', 'BZ', 'Years', 'Score', 'page'] ss = "%s.%s.%s.%s.%s" % (jn[keys[0]], jn[keys[1]], jn[keys[2]], jn[keys[3]], jn[keys[4]]) return ss def save(self, k, v): if k in self.nks: return True self.nks[k] = 1 self.nfs.append(k, v) def isRed(self, col): hcode = html.tostring(col) m = re.search(ur'color\s*:\s*Red', hcode, re.I) if m: return True return False def go_(self, fr): while True: n, v = fr.readone() if n is None: return jn = json.loads(n) nkey = self.get_nkey(jn) print nkey if '系统检索不到您所查询的相关信息' in v: self.save(nkey, 'None') continue try: doc = html.fromstring(v) tbl = doc.xpath("//table[@id='tablecloth']")[0] otbl = [] rowno = 0 for rows in list(tbl): rowno += 1 if rowno == 1: continue currow = [] colid = 0 for cols in rows: colid += 1 t = re.sub(ur'\s+', u' ', cols.text_content().strip()) if colid == 4 and self.isRed(cols): t += ".red" currow.append(t) otbl.append(currow) #print nkey, json.dumps(otbl, ensure_ascii=0).encode('utf8') self.save(nkey, json.dumps(otbl, ensure_ascii=0).encode('utf8')) except Exception as e: print v raise def go(self): fns = ['fo.bin'] for fn in fns: fr = BinReader(fn) self.go_(fr)
class GSSPider(Spider): def __init__(self, tc): Spider.__init__(self, tc) self._logport = 5556 # self.channel = 'gsid' # self.job_queue = 'gsid' self.savebin = BinSaver("gongshang.bin") self.faillog = open("fail_list.txt", "w+b") def loadhans(self, fn): c = '' with open(fn) as f: c = f.read().decode('utf-8') all = {} for i in c: if ord(i) > 0x400: all[i] = 1 cc = all.keys() if len(cc) == 0: raise RuntimeError("no hans loaded") # print json.dumps(cc, ensure_ascii=False).encode('utf-8') return cc def query_summary(self, areacode, qword, page=1): # hashStr = hashlib.md5(str(time.time())).hexdigest() # print hashStr # CookieStr = hashStr[0:8]+"-"+hashStr[9:13]+"-"+hashStr[14:18]+"-"+hashStr[19:23]+"-"+hashStr[24:32] # print "COOKIE:%s" % CookieStr CookieStr = "E1F3418D-BDC7-468D-9F43-6EA13A642356" headers = { 'User-Agent': 'Mozilla/5.0 (iPhone;8.0.2;iPhone;iPhone);Version/1.1;ISN_GSXT', "Cookie": CookieStr } data = {'AreaCode': areacode, 'Limit': 50, 'Page': page, 'Q': qword} # print data con = self.request_url('https://120.52.121.75:8443/QuerySummary', headers=headers, data=data, verify=False) # try: if con is None: Log.error("query %s, connect failed! " % qword) raise ConnectFailError("query %s, connect failed! " % qword) if "502 Bad Gateway" in con.text: Log.error("query %s, 502! " % qword) raise BadGatewayError("query %s, 502! " % qword) j = json.loads(con.text) if j.get('ERRCODE') == '0': rs = j["RESULT"] if len(rs) is 0: Log.error("query %s, no data!" % qword) return rs else: Log.error("query %s, request error! Response: %s" % (qword, j)) raise RequestError("query %s, request error! 
" % qword) # except Exception as e: # print e # return None def query_info(self, areacode, regNo, page=1): CookieStr = "E1F3418D-BDC7-468D-9F43-6EA13A642356" headers = { 'User-Agent': 'Mozilla/5.0 (iPhone;8.0.2;iPhone;iPhone);Version/1.1;ISN_GSXT', "Cookie": CookieStr } data = { 'AreaCode': areacode, 'Limit': 50, 'Page': page, 'Q': regNo, 'EndNo': regNo } # print data con = self.request_url('https://120.52.121.75:8443/QuerySummary', headers=headers, data=data, verify=False) # try: if con is None: Log.error("query %s, connect failed! " % regNo) raise ConnectFailError("query %s, connect failed! " % regNo) if "502 Bad Gateway" in con.text: Log.error("query %s, 502! " % regNo) raise BadGatewayError("query %s, 502! " % regNo) j = json.loads(con.text) if j.get('ERRCODE') == '0': rs = j["RESULT"] if len(rs) is 0: Log.error("query %s, no data!" % regNo) return rs else: Log.error("query %s, request error! Response: %s" % (regNo, j)) raise RequestError("query %s, request error! " % regNo) def save_qs(self, qs, qid, areacode, qw): for t in qs: print json.dumps(t, ensure_ascii=0, indent=4).encode('utf-8') t['qinfo'] = {'qid': qid, 'areacode': areacode, 'qw': qw} # self.col.update({'REGNO':t['REGNO'], 'ENTNAME':t['ENTNAME']}, t, True) def dispatch_hans1(self): self.hans = self.loadhans('hans') for i in self.hans: for k in area_data: # self.job_queue.put({'qw':i, 'areacode':k[0], 'tp':'w'}) pass def dispatch(self): with open("r1k.txt", 'r') as f: for lines in f: m = re.match(u'\d+\s+(\d+)\s+(.*)', lines) if m: code = int(m.group(1)) l = m.group(2) if code == 0: for k in area_data: self.add_main_job({ 'qw': l, 'areacode': k[0], 'tp': 'w' }) else: self.add_main_job({ 'qw': l, 'areacode': code, 'tp': 'w' }) self.wait_q() self.job_queue.put(None) def run_job(self, job): self.faillog.write("%s failed.\n" % job) if isinstance(job, dict): tp = job.get('tp') areacode = job.get('areacode') qid = -1 if tp == 'w': qw = job['qw'] if tp == 'w2': qid = job.get('qid') xxx = qid / 
len(self.hans) yyy = qid % len(self.hans) if xxx >= yyy: return True qw = self.hans[xxx] + self.hans[yyy] if tp == 'w' or tp == 'w2': page = job.get('page', 1) print qw, areacode, page # self.savebin.append(qw) try: qs = self.query_summary(areacode, qw, page) except HttpError: readd_count = job.get("readd_count", "0").strip() if int(readd_count) < 10: job["readd_count"] = str(int(readd_count) + 1) Log.warning("readd job %s" % qw) self.re_add_job(job) else: Log.error("job %s has run 30 times. " % qw) self.faillog.write("%s failed.\n" % job) time.sleep(1) return self.savebin.append(qw, qs.__str__()) Log.warning("%s saved" % qw) time.sleep(1)