class LiepinSpider(Spider2):
    def __init__(self, thcnt):
        Spider2.__init__(self, thcnt)
        self._name = 'jd_liepin'
        self.bs = BinSaver("liepin.%d.bin" % os.getpid())

    def init_jobs(self):
        self.add_main_job_range({}, 1, 9999999)

    def run_job(self, job):
        print "job is ", job
        # url = "http://m.liepin.com/hjob/%d/" % (job['value'])
        value = job['value']
        url = "http://job.liepin.com/%03d_%d/" % (int(value) / 10000, int(value))
        res = self.request_url(url)
        if re.search(u'您访问的页面不存在或已删除', res.text):  # "the page does not exist or has been deleted"
            print job, "match nothing"
        elif re.search(u'该职位已结束', res.text):  # "this position has closed"
            print job, "match ending"
        elif re.search(u'您查看的职位已过期', res.text):  # "the position you viewed has expired"
            print job, "match timeout"
        else:
            print "saving %d ..." % job['value']
            name = '%s.%d.%d' % (self._name, job['value'], int(time.time()))
            self.bs.append(name, res.text)
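# A minimal sketch of the URL scheme used above, with an illustrative job id
# (not taken from the source): the path prefix is value / 10000, zero-padded
# to three digits.
value = 1234567
url = "http://job.liepin.com/%03d_%d/" % (int(value) / 10000, int(value))
assert url == "http://job.liepin.com/123_1234567/"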
def __init__(self, tc):
    Spider.__init__(self, tc)
    self._logport = 5556
    # self.channel = 'gsid'
    # self.job_queue = 'gsid'
    self.savebin = BinSaver("gongshang.bin")
    self.faillog = open("fail_list.txt", "w+b")
def main():
    try:
        opts, args = getopt.gnu_getopt(sys.argv[1:], 'o:m:pi:')
    except getopt.GetoptError as e:
        showusage()
        return 1
    outfile = None
    matchstr = ''
    printout = False
    index = -1
    for (n, v) in opts:
        if n == '-o':
            outfile = v
        if n == '-m':
            matchstr = v
        if n == '-p':
            printout = True
        if n == '-i':
            index = int(v)
    if len(args) == 0:
        showusage()
        return 1
    if outfile:
        fo = BinSaver(outfile)
        for fn in args:
            r = BinReader(fn)
            while True:
                (n, v) = r.readone()
                if n is None:
                    break
                if matchstr in v:
                    fo.append(n, v)
    else:
        for fn in args:
            if printout or index != -1:
                r = BinReader(fn)
            else:
                r = BinReader1(fn)
            findex = 0
            while True:
                (n, v) = r.readone()
                if n is None:
                    break
                if index != -1:
                    if findex == index:
                        if printout:
                            print v
                        else:
                            print n
                    elif findex > index:
                        break
                elif printout:
                    print n, v
                else:
                    print n
                findex += 1
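# A hedged sketch of how this CLI filter might be wired up and invoked,
# assuming the module is saved as bintool.py and showusage() plus the
# BinSaver/BinReader/BinReader1 imports are defined alongside it:
#
#   python bintool.py -m keyword -o matched.bin in1.bin in2.bin  # keep matching records
#   python bintool.py -p in.bin                                  # dump every name and value
#   python bintool.py -i 5 in.bin                                # print the name of record 5
if __name__ == '__main__':
    sys.exit(main())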
def __init__(self): self.bs = BinSaver("gsinfo_Guangdong_html.bin") self.pic = BinSaver("gsinfo_Guangdong_pic.bin") self.fs_QyxyDetail = FileSaver("gsinfo_guangdong_QyxyDetail.txt") self.fs_GSpublicityList = FileSaver( "gsinfo_guangdong_GSpublicityList.txt") self.fs_entityShow = FileSaver("gsinfo_guangdong_entityShow.txt") self.fs_guangzhou = FileSaver("gsinfo_guangdong_guangzhou.txt")
def dispatch(self): self.bs = BinSaver("jobui_job.bin") i = 133002626 while i > 130000000: #131127307 131207901 job = {"id": i, "retry_none": 0, "retry_500": 0} self.add_job(job, True) i -= 1 self.wait_q_breakable() self.add_job(None, True)
def __init__(self, thcnt, need_srl=True, qf_name=None):
    Spider2.__init__(self, thcnt)
    # self.ce_fs = FileSaver("court_queries/check_error")
    self.docbin_fs = BinSaver("ws_data/ws.%d.bin" % os.getpid())
    self.log_fs = FileSaver("log")
    self.qf_name = qf_name  # must be assigned: self._name is derived from it below
    self._name = "%s" % self.qf_name.split("/")[1]
    self.srl = {}
    self.need_srl = need_srl
def __init__(self, channel, dburl=None):
    super(PageStoreBase, self).__init__(channel, dburl)
    self.testmode = False
    opath = self.getopath()
    t = time.localtime()
    folder = "%s/%s/%d" % (opath, self.channel, t.tm_year)
    fnbase = "%s_%d%02d" % (self.channel, t.tm_year, t.tm_mon)
    os.system("mkdir -m 777 -p " + folder)
    self._ofn = "%s/%s.bin" % (folder, fnbase)
    self._ofnlog = "%s/%s_update.log" % (folder, fnbase)
    self.fssaver = BinSaver(self._ofn)
    self._hashcheck = spider.util.LocalHashChecker()
    self._docobjtls = threading.local()
    self.saved_count = 0
class CourtParser(CWPParser):
    def __init__(self, channel, dist_file, name, parser):
        CWPParser.__init__(self, channel, name)
        self.bin_writer = BinSaver(dist_file)
        self.parser = parser

    def process_child_item(self, item):
        print 'saving', item['name']
        self.bin_writer.append(item['name'], item['value'])

    def parse_item(self, page):
        res = self.parser.parse(page['indexUrl'], page['content'][1])
        if res:
            return [res]
        return []
class Job51Spider(Spider):
    def dispatch(self):
        self.bs = BinSaver("job51.bin")
        for i in range(45000000, 75000000):
            self.add_main_job(i)
        self.wait_q()
        self.add_main_job(None)

    def run_job(self, jobid):
        print "job is ", jobid
        url = "http://search.51job.com/job/%d,c.html" % jobid
        res = self.request_url(url, [])
        if re.search(u'您选择的职位目前已经暂停招聘', res.text):  # "this position has paused hiring"
            print jobid, "match nothing"
        else:
            print "saving %d ..." % jobid
            self.bs.append('51job.%d' % jobid, res.text)
def __init__(self, thcnt):
    Spider.__init__(self, thcnt)
    self._user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:42.0) Gecko/20100101 Firefox/42.0'
    self.baseurl = 'http://www.ewt360.com/LNLQXX/SearchResult?act=mark'
    self.headers = {'Referer': self.baseurl}
    # scores = range(450, 750 + 1) + range(449, 0, -1) + [0]
    scores = range(750, 0, -1) + [0]
    self.possmap = {
        'Years': range(2009, 2014 + 1),
        'WL': ['l', 'w'],
        'BZ': ['b', 'z'],
        'PiCi': 0,
        'Score': scores,
        'ProvinceCode': 0,
        'page': 1
    }
    self.bs = BinSaver("fo.bin")
    self.racer = RaceValueByKey()
class CourtCAPParser(CAPParser):
    def __init__(self, channel, dist_file, name, parser):
        CAPParser.__init__(self, channel, name)
        self.bin_writer = BinSaver(dist_file)
        self.parser = parser

    def parse(self, page):
        res = self.parser.parse(page['indexUrl'], page['content'][1])
        if res:
            return [res]
        return []

    def pre_save(self, saver):
        pass

    def on_save(self, items):
        for item in items:
            print 'saving', item['name']
            self.bin_writer.append(item['name'], item['value'])
def __init__(self, *proxyfile):
    threadcnt = self.prepare_proxy(*proxyfile)
    Spider.__init__(self, threadcnt)
    if not os.path.exists("data1"):
        os.makedirs("data1")
    self.namefile = open("data1/corpname." + str(time.time()).split(".")[0] + ".txt", "w+b")
    self.failfile = open("data1/fail." + str(time.time()).split(".")[0] + ".txt", "w+b")
    self.binsaver = BinSaver("data1/gsinfo" + str(time.time()).split(".")[0] + ".bin")
def dispatch(self):
    self.bs = BinSaver('joblagou.bin')
    for query in q:
        try:
            for jobid in self.getIds(query):
                if isinstance(jobid, int):
                    jobid = str(jobid)
                self.add_main_job(jobid)
        except Exception as e:
            continue
    self.wait_q()
    self.add_main_job(None)
def dispatch(self): self.bs = BinSaver("youzy_job.bin") f = open("url_cfraction918-.txt", "r") while True : line = f.readline() if line.strip(): job = {"url":line.strip()} self.add_job(job, True) else: break f.close() self.wait_q_breakable() self.add_job(None, True)
def __init__(self, thcnt):
    self.proxy_mode = 2
    # Proxy modes:
    # 0: fixed proxies, one proxy per thread
    # 1: a single ADSL line that redials; thread count is free to choose
    # 2: several ADSL lines that redial; thread count is free to choose
    # 3: proxies pulled from the Kuaidaili API into a queue; all threads share
    #    one proxy, and a failing proxy is removed from the queue and replaced
    #    (proxy switching is not implemented yet)
    if self.proxy_mode == 0:
        self.proxies_dict = [
            {'http': 'http://*****:*****@106.75.134.189:18889', 'https': 'https://*****:*****@106.75.134.189:18889'},
            {'http': 'http://*****:*****@106.75.134.190:18889', 'https': 'https://*****:*****@106.75.134.190:18889'},
            {'http': 'http://*****:*****@106.75.134.191:18889', 'https': 'https://*****:*****@106.75.134.191:18889'},
            {'http': 'http://*****:*****@106.75.134.192:18889', 'https': 'https://*****:*****@106.75.134.192:18889'},
            {'http': 'http://*****:*****@106.75.134.193:18889', 'https': 'https://*****:*****@106.75.134.193:18889'},
        ]
        Spider.__init__(self, 100)
    elif self.proxy_mode == 1:
        self.proxies = {'http': 'http://*****:*****@121.40.186.237:50001', 'https': 'https://*****:*****@121.40.186.237:50001'}
        # self.proxies = {'http': 'http://*****:*****@183.56.160.174:50001', 'https': 'https://*****:*****@183.56.160.174:50001'}
        # self.proxies = {'http': 'http://*****:*****@192.168.1.39:3428', 'https': 'https://*****:*****@192.168.1.39:3428'}
        # self.proxies = {'http': 'http://*****:*****@106.75.134.189:18889', 'https': 'https://*****:*****@106.75.134.189:18889'}
        Spider.__init__(self, 100)
    elif self.proxy_mode == 2:
        self.proxies_dict = [
            # {'http': 'http://*****:*****@183.56.160.174:50001', 'https': 'https://*****:*****@183.56.160.174:50001'},
            {'http': 'http://*****:*****@121.40.186.237:50001', 'https': 'https://*****:*****@121.40.186.237:50001'},
            {'http': 'http://*****:*****@192.168.1.39:3428', 'https': 'https://*****:*****@192.168.1.39:3428'},
            {'http': 'http://*****:*****@192.168.1.39:3428', 'https': 'https://*****:*****@192.168.1.39:3428'},
        ]
        Spider.__init__(self, 200)
    elif self.proxy_mode == 3:
        self.proxies_dict = []
        self.proxies = {}
        self.get_kuaidaili()
    elif self.proxy_mode == 4:
        Spider.__init__(self, 1)
        self.proxies_dict = []
    self._curltls = threading.local()
    self.shoudong_img = False  # enter the captcha manually
    self.saver = FileSaver("nacao_traversal_info_l.txt")
    self.already = FileSaver("nacao_traversal_info_already_l.txt")
    self.queries0 = FileSaver("nacao_traversal_queies_0_l.txt")
    self.bin_saver = BinSaver("nacao_captcha_image.bin")
    self.init_already()
    self.time_record = time.time()
    self.scs_record = 0
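# In mode 0 the proxy list is meant to line up with the worker threads, so each
# worker can be pinned to one proxy by thread id. A minimal sketch of that
# pattern, assuming get_tid()/request_url come from the Spider base class used
# throughout these snippets (the modulo guard is an added precaution):
#
#   proxies = self.proxies_dict[self.get_tid() % len(self.proxies_dict)]
#   res = self.request_url(url, proxies=proxies)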
def __init__(self): self.bs = BinSaver("gsinfo_Sichuan_html.bin") self.fs = FileSaver("gsinfo_sichuan.txt")
def __init__(self): self.bs = BinSaver("gsinfo_Hunan_html.bin") self.fs = FileSaver("gsinfo_hunan.txt")
def __init__(self, thcnt):
    Spider2.__init__(self, thcnt)
    self._name = 'jd_liepin'
    self.bs = BinSaver("liepin.%d.bin" % os.getpid())
class ZGcpwswSpider2(Spider2):
    def __init__(self, thcnt, need_srl=True, qf_name=None):
        Spider2.__init__(self, thcnt)
        self.ce_fs = FileSaver("court_queries/check_error")  # used in run_job below
        self.docbin_fs = BinSaver("ws_data/ws.%d.bin" % os.getpid())
        self.log_fs = FileSaver("log")
        self.qf_name = qf_name  # must be assigned: self._name is derived from it
        self._name = "%s" % self.qf_name.split("/")[1]
        self.srl = {}
        self.need_srl = need_srl

    def init_jobs(self):
        with open(self.qf_name) as fs:
            for line in fs:
                job = eval(line.strip())
                count = job.get("count")
                if count > ZGcpwswData.total_max_record:
                    for i in ZGcpwswData.data_order:
                        for j in ZGcpwswData.order_direction:
                            for k in range(ZGcpwswData.page_max_index):
                                copy_job = copy.deepcopy(job)
                                copy_job["jobid"]["data"]["Index"] = k + 1
                                copy_job["jobid"]["data"]["Page"] = ZGcpwswData.page_max_count
                                copy_job["jobid"]["data"]["Direction"] = ZGcpwswData.order_direction[j]
                                copy_job["jobid"]["data"]["Order"] = ZGcpwswData.data_order[i]
                                self.add_job(copy_job)
                elif ZGcpwswData.total_core_record < count <= ZGcpwswData.total_max_record:
                    for j in ZGcpwswData.order_direction:
                        for k in range(ZGcpwswData.page_max_index):
                            copy_job = copy.deepcopy(job)
                            copy_job["jobid"]["data"]["Index"] = k + 1
                            copy_job["jobid"]["data"]["Page"] = ZGcpwswData.page_max_count
                            copy_job["jobid"]["data"]["Direction"] = ZGcpwswData.order_direction[j]
                            self.add_job(copy_job)
                elif 0 < count <= ZGcpwswData.total_core_record:
                    for k in range(ZGcpwswData.page_max_index):
                        copy_job = copy.deepcopy(job)
                        copy_job["jobid"]["data"]["Index"] = k + 1
                        copy_job["jobid"]["data"]["Page"] = ZGcpwswData.page_max_count
                        self.add_job(copy_job)
        print "=======finish loading job======"

    def run_job(self, jobid):
        time.sleep(0.1)
        if isinstance(jobid, dict):
            url = jobid.get("jobid").get("url")
            data = jobid.get("jobid").get("data")
            headers = jobid.get("jobid").get("headers")
            reg_count = int(jobid.get("count"))
            resp = None
            try:
                if self.need_srl:
                    nr = self.srl.get(getattr(self._tls, 'tid', 0))
                else:
                    nr = self.get_session_request()
                # After the wenshu system upgrade, two extra requests are needed
                # before each real one to obtain the cookie passport.
                self.set_cookie_passport(nr)
                resp = nr.request_url(url, data=data, headers=headers)
                if isinstance(resp, CurlReq.Response) and resp and resp.content:
                    result_list = json.loads(json.loads(resp.content))
                    if result_list:
                        # for record
                        ZGcpwswData.set_doc_count(data, len(result_list) - 1, self.log_fs)
                        for result in result_list:
                            if result.get("Count"):
                                new_count = int(result.get("Count"))
                                if new_count > reg_count:
                                    jobid["check_count"] = new_count
                                    self.ce_fs.append(json.dumps(jobid, ensure_ascii=False))
                            else:
                                name = '%s.%d' % (result.get(ZGcpwswData.doc_id), int(time.time()))
                                self.docbin_fs.append(name, json.dumps(result, ensure_ascii=False))
                    else:
                        pass
                else:
                    # owing to network problems the response is None; the job is re-added below
                    pass
            except Exception as e:
                # print "%s-%s" % (resp.text, data)
                pass
            time.sleep(1)
            self.re_add_job(jobid)
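# Rough fan-out arithmetic for init_jobs() above, with illustrative values for
# the ZGcpwswData constants (the real values live in that class): a query whose
# count exceeds total_max_record expands into one page job per combination of
# sort field, direction, and page index.
data_order_fields = 5   # assumed number of sortable fields
directions = 2          # ascending / descending
page_max_index = 10     # assumed pages fetched per ordering
print data_order_fields * directions * page_max_index  # 100 jobs for one large query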
def __init__(self): self.bs = BinSaver("gsinfo_Gansu_html.bin") self.fs = FileSaver("gsinfo_Gansu.txt")
def dispatch(self):
    self.bs = BinSaver('joblagou.bin')
    for i in xrange(0, 1500000):
        self.add_main_job(str(i))
    self.wait_q()
    self.add_main_job(None)
def __init__(self): self.bs = BinSaver("gsinfo_Tianjin_html.bin") self.fs = FileSaver("gsinfo_tianjin.txt")
def __init__(self, channel, dist_file, name, parser):
    CAPParser.__init__(self, channel, name)
    self.bin_writer = BinSaver(dist_file)
    self.parser = parser
class QycxbSpider(Spider):
    def __init__(self, threadcnt):
        Spider.__init__(self, threadcnt)
        self.sqs = {}
        self.binsaver = BinSaver("Qycxb" + str(time.time()).split(".")[0] + ".bin")

    def init_req(self):
        with self.locker:
            threadident = str(threading.currentThread().ident)
            sq = QycxbReq()
            # sq.load_proxy("../../_zhilian/curproxy0")
            # sq.load_proxy("../_zhilian/curproxy")
            # sq.select_user_agent("firefox")
            sq.default_headers = {
                "Connection": "keep-alive",
                "Accept": r"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                "Accept-Encoding": "gzip, deflate",
                "Referer": "http://qiye.qianzhan.com/",
                "X-Requested-With": "XMLHttpRequest",
                "User-Agent": r"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:42.0) Gecko/20100101 Firefox/42.0",
                "Pragma": "no-cache",
                "Cache-Control": "no-cache",
                "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            }
            # con = sq.request_url("http://qiye.qianzhan.com/")
            con1 = sq.request_url(
                r"http://qiye.qianzhan.com/orgcompany/searchList",
                data=r"oc_name=%E5%B9%BF%E5%B7%9E%E5%B8%82%E5%8D%97%E6%B2%99%E5%8D%8E%E5%B7%A5%E7%A0%94%E7%A9%B6%E9%99%A2&oc_area=&sh_searchType=1&od_orderby=0&page=1&pageSize=10")
            self.sqs[threadident] = sq
            setattr(self._curltls, "sq", sq)
            return sq

    def dispatch(self):
        f = open("/home/peiyuan/r1.txt", "rb")
        currline = 0
        skip = 0
        endline = 1000
        while currline < skip:
            line = f.readline()
            currline += 1
        while currline < endline:
            line = f.readline()
            key = line.strip().split(" ")[-1].strip()
            job = {"key": key, "type": "u1", "lineno": currline}
            self.add_main_job(job)
            currline += 1
        self.wait_q()
        self.add_main_job(None)

    def run_job(self, job):
        time.sleep(5)
        threadident = str(threading.currentThread().ident)
        sq = getattr(self._curltls, "sq", None)
        if sq is None:
            sq = self.init_req()
        Log.info("Running job:" + util.utf8str(job.__str__()))
        if job["type"] == "u1":
            Log.info("Searching line %d" % job["lineno"])
            con = sq.request_url(
                r"http://qiye.qianzhan.com/orgcompany/searchList",
                data={"oc_name": job["key"], "od_orderby": 0, "page": 1,
                      "pageSize": 10, "oc_area": "", "sh_searchType": 1})
            if con is None or con.text.strip() == "" or con.code != 200:
                Log.error("[u1]Bad connect or empty content return.. JOB=>" + util.utf8str(job.__str__()))
                self.re_add_job(job)
                return
            else:
                jsonobj = ""
                try:
                    jsonobj = json.loads(con.text.strip())
                except ValueError as e:
                    Log.error("Json decode error. String is %s" % con.text)
                    return
                if not jsonobj["succ"]:
                    Log.warning(jsonobj.__str__())
                    time.sleep(1)
                    Log.error("[u1]Request fail, succ flag is False. JOB=>" + util.utf8str(job.__str__()))
                    if 'status' in jsonobj and jsonobj['status'] == '4':
                        Log.error("Remove current proxy...Used %d times....." % sq._proxy_use_times[sq.curproxy])
                        sq.remove_curproxy()
                    self.re_add_job(job)
                else:
                    corplist = jsonobj["lst"]
                    if len(corplist) == 0:
                        Log.error("Search return nothing. %d:%s, no data." % (job["lineno"], job["key"]))
                        return
                    else:
                        for corp in corplist:
                            jobb = {"type": "u2", "orgCode": corp["oc_orgCode"], "name": corp["oc_name"]}
                            self.add_job(jobb)
        if job["type"] == "u2":
            Log.info("Getting detail info about %s" % job["name"])
            timestr = "%f" % time.time()
            con0 = sq.request_url(
                r"http://qiye.qianzhan.com/orgcompany/GetJsVerfyCode?t=0.%s&_=%s"
                % (timestr.split(".")[1], timestr.split(".")[0]))
            if con0 is None or con0.text.strip() == "" or con0.code != 200:
                Log.error("[u2]Bad connect or empty content return.. JOB=>" + util.utf8str(job.__str__()))
                self.re_add_job(job)
                return
            if not os.path.exists(threadident):
                os.mkdir(threadident)
            f = open(threadident + "/qycxb.js", "w+b")
            f.write(r'var window = {document : {cookie :"qznewsite.uid='
                    + sq.get_cookie("qznewsite.uid").strip()
                    + '"}}; ' + con0.text + "console.log(window.__qzmcf())")
            f.flush()
            os.system("nodejs " + threadident + "/qycxb.js > " + threadident + "/mcfcode.txt")
            mcfcode = open(threadident + "/mcfcode.txt", "rb").read().strip()
            con1 = sq.request_url("http://qiye.qianzhan.com/orgcompany/SearchItemDtl",
                                  data={"mcfCode": mcfcode, "orgCode": job["orgCode"]})
            if con1 is None or con1.text.strip() == "" or con1.code != 200:
                Log.error("[u2]Bad connect or empty content return.. JOB=>" + util.utf8str(job.__str__()))
                self.re_add_job(job)
                return
            else:
                jsonobj = json.loads(con1.text.strip())
                if not jsonobj["succ"]:
                    Log.warning(jsonobj.__str__())
                    time.sleep(1)
                    Log.error("[u2]Request fail, succ flag is False. Check the orgcode and mcfcode. JOB=>"
                              + util.utf8str(job.__str__()))
                    if 'status' in jsonobj and jsonobj['status'] == '4':
                        Log.error("Remove current proxy...Used %d times....." % sq._proxy_use_times[sq.curproxy])
                        sq.remove_curproxy()
                    self.re_add_job(job)
                else:
                    self.binsaver.append(job["name"] + job["orgCode"], con1.text.strip())
                    Log.info("%s,%s,saved." % (job["name"], job["orgCode"]))
            return
def __init__(self, threadcnt):
    Spider.__init__(self, threadcnt)
    self.sqs = {}
    self.binsaver = BinSaver("Qycxb" + str(time.time()).split(".")[0] + ".bin")
def dispatch(self): self.bs = BinSaver("qichacha.bin") self.add_job("墨麟") self.add_job("爱拼") self.wait_q() self.add_job(None, True)
class PageStoreBase(PageStoreDB):
    class CurDoc(object):
        def __init__(self, content, getime, jdid, real_url):
            self.cur_content = content
            self.cur_getime = getime
            self.cur_jdid = jdid
            self.cur_url = real_url

    def __init__(self, channel, dburl=None):
        super(PageStoreBase, self).__init__(channel, dburl)
        self.testmode = False
        opath = self.getopath()
        t = time.localtime()
        folder = "%s/%s/%d" % (opath, self.channel, t.tm_year)
        fnbase = "%s_%d%02d" % (self.channel, t.tm_year, t.tm_mon)
        os.system("mkdir -m 777 -p " + folder)
        self._ofn = "%s/%s.bin" % (folder, fnbase)
        self._ofnlog = "%s/%s_update.log" % (folder, fnbase)
        self.fssaver = BinSaver(self._ofn)
        self._hashcheck = spider.util.LocalHashChecker()
        self._docobjtls = threading.local()
        self.saved_count = 0

    def getopath(self):
        dirs = ['/data/crawler/_files3_', '/opt/_test_store_']
        for di in dirs:
            if os.path.isdir(di) and os.access(di, os.W_OK):
                return di
        raise RuntimeError("no dir to write files.")

    def get_cur_doc(self):
        return getattr(self._docobjtls, 'doc', None)

    def set_cur_doc(self, content, getime, jdid, real_url):
        doc = PageStoreBase.CurDoc(content, getime, jdid, real_url)
        setattr(self._docobjtls, 'doc', doc)

    @staticmethod
    def mktime(year=2015, m=1, d=1, hour=0, minute=0, second=0):
        arr = [year, m, d, hour, minute, second, 0, 0, 0]
        for i in range(0, len(arr)):
            arr[i] = int(arr[i])
        return time.mktime(arr)

    def extract_content(self):
        raise NotImplementedError('virtual function called')

    def page_time(self):
        raise NotImplementedError('virtual function called')

    def check_should_fetch(self, jdid):
        indexUrl = "%s://%s" % (self.channel, jdid)
        return not self.find_new(indexUrl)

    def save_time_log(self, indexUrl, cur_tm):
        oldtime = self.get_page_time(indexUrl)
        if oldtime == cur_tm:
            return
        logstr = "%s %ld => %ld\n" % (indexUrl, oldtime, cur_tm)
        cutil.mp_append_log(self._ofnlog, logstr)

    def save(self, getime, jdid, real_url, content, fnpath=None, offset=None):
        global MIN_TIME_MSEC
        if getime > MIN_TIME_MSEC:
            raise RuntimeError("get time must be in seconds.")
        if self._hashcheck.query(jdid) > 0:
            return True
        self.set_cur_doc(content, getime, jdid, real_url)
        try:
            pageDesc = self.extract_content()
            if not pageDesc:
                print "jdid: %s, pageDesc empty" % self.get_cur_doc().cur_jdid
                return False
            elif self.testmode:
                print pageDesc
            pageTime = self.page_time()
            if pageTime is None or pageTime < MIN_TIME_MSEC:
                raise RuntimeError("page time must be in msec")
            if isinstance(pageTime, float):
                pageTime = int(pageTime)
            if isinstance(pageDesc, unicode):
                pageDesc = pageDesc.encode('utf-8')
            contentSign = hashlib.md5(pageDesc).hexdigest()
            indexUrl = "%s://%s" % (self.channel, jdid)
            self.save_time_log(indexUrl, pageTime)
            # If an entry with this contentSign already exists, update it without
            # saving the webpage to the bin file again; otherwise upsert by indexUrl.
            if self.find_item(indexUrl, contentSign):
                Log.warning("%s exists in db, skip" % jdid)
                self.update_time(indexUrl, contentSign, int(getime) * 1000, pageTime)
                return True
            print "saving", indexUrl
            odoc = {
                'contentSign': contentSign,
                'indexUrl': indexUrl,
                'realUrl': real_url,
                'createTimeFlag': 1,
                'owner': self.channel,
                'createTimeTimeStamp': pageTime,
                'crawlerUpdateTime': int(getime) * 1000,
                'updateTime': pageTime,
                'status': 0,
                'isUpdated': 0,
                'isExpired': 0,
            }
            if self.testmode:
                pprint.pprint(odoc)
                return True
            else:
                if self.do_save(odoc, content, fnpath, offset):
                    print indexUrl, "saved"
                    self.saved_count += 1
                    self._hashcheck.add(jdid)
                    return True
                return False
        except Exception as e:
            print e
            traceback.print_exc()
            Log.error("failed to save %s %s" % (self.channel, jdid))
            time.sleep(5)
            return False

    def do_save(self, odoc, content, fnpath=None, offset=None):
        if isinstance(content, unicode):
            content = content.encode('utf-8')
        filepos = self.fssaver.append(
            "%s.%s.%d" % (self.channel, self.get_cur_doc().cur_jdid, self.get_cur_doc().cur_getime),
            content)
        odoc.update({'pageContentPath': "binf::%s::%d" % (self._ofn, filepos)})
        return self.upsert_doc(odoc['indexUrl'], odoc)
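# A small helper sketch for the pointer format written by do_save() above;
# "binf::<bin file>::<record offset>" is how pageContentPath is encoded:
def parse_page_content_path(page_content_path):
    scheme, path, offset = page_content_path.split("::")
    assert scheme == "binf"
    return path, int(offset)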
def dispatch(self): self.bs = BinSaver("jobui_job.bin") for i in range(133002626, 133002636, 1): self.add_job(i, True) self.wait_q() self.add_job(None, True)
def dispatch(self): self.bs = BinSaver("job51.bin") for i in range(45000000, 75000000): self.add_main_job(i) self.wait_q() self.add_main_job(None)
class JobuiSpider(Spider):
    def __init__(self):
        self.proxies_dict = []
        self.read_proxy("../spider/proxy/proxy.txt")
        Spider.__init__(self, len(self.proxies_dict))
        self.num_count = 0
        self.__fail_ids = FileSaver("fail_ids.txt")

    def wait_q_breakable(self):
        lt = 0
        while True:
            if not self.job_queue.empty() or not self.job_queue2.empty() or not self.job_queue3.empty():
                time.sleep(5)
            if time.time() < lt + 1 and self._running_count == 0:
                return True
            time.sleep(2)
            lt = time.time()
            if self._worker_count == 0:
                return False

    def dispatch(self):
        self.bs = BinSaver("jobui_job.bin")
        i = 133002626
        while i > 130000000:  # 131127307 131207901
            job = {"id": i, "retry_none": 0, "retry_500": 0}
            self.add_job(job, True)
            i -= 1
        self.wait_q_breakable()
        self.add_job(None, True)

    def get_fail_cnt(self, addv):
        fc = getattr(self._curltls, 'failcount', 0)
        if addv:
            fc += addv
            setattr(self._curltls, 'failcount', fc)
        return fc

    def run_job(self, jobid):
        jobid_int = jobid.get("id")
        url = "http://www.jobui.com/job/%d/" % jobid_int
        tid = self.get_tid()
        proxies = self.proxies_dict[tid]
        # print "this request tid = [ %s ] proxies = [ %s ]" % (tid, proxies)
        res = self.request_url(url, proxies=self.proxies_dict[self.get_tid()])
        self.num_count += 1
        # print "id : %d ------------- response code : %s " % (jobid_int, "Response Is None" if res is None else str(res.code))
        if res is None:
            if self.get_fail_cnt(1) < 10:
                self.add_job(jobid)
            else:
                print "id is [ %s ] thread and [ %s ] proxy will be close and drop." % (tid, proxies)
                self.__fail_ids.append(str(jobid_int))
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy[ %s ] invalid, failcount = [ %d ]" % (proxies, self.get_fail_cnt(0)))
            return
        else:
            setattr(self._curltls, 'failcount', 0)
        if res.code == 404:
            print "%d ------ 404" % jobid_int
            return
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print "%d ------ %d " % (jobid_int, res.code)
            self.add_job(jobid)
            time.sleep(0.8)
            return
        elif res.code == 200:
            print "%d ------ saving " % jobid_int
            fn = 'jobui_job.%d.%d' % (jobid_int, int(time.time()))
            self.bs.append(fn, res.text)
            if self.bs.getsize() >= 8 * 1024 * 1024 * 1024:
                raise AccountErrors.NoAccountError('file too large')
        else:
            print "#######################################UNKNOWN ERROR############################################# [ %d ]" % res.code
            Log.error("unknown error...")
            Log.errorbin("%s" % jobid_int, res.text)
            raise AccountErrors.NoAccountError('fatal error')

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            spider.util.sendmail('*****@*****.**', '%s DONE' % sys.argv[0], msg)

    def read_proxy(self, fn):
        with open(fn, 'r') as f:
            for line in f:
                line = line.strip()
                m = re.match(r'(\d+\.\d+\.\d+\.\d+:\d+)', line, re.I)
                if m:
                    prstr = m.group(1)
                    proxies = {'http': 'http://' + prstr + "/",
                               'https': 'https://' + prstr + "/"}
                    self.proxies_dict.append(proxies)
                elif re.match(r'\s*#', line):
                    continue
        print " loaded [ %d ] proxies " % len(self.proxies_dict)
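# Expected proxy file format for read_proxy() above: one "ip:port" per line,
# with '#' comment lines skipped. An illustrative proxy.txt (addresses are
# made up for the example):
#
#   # datacenter proxies
#   106.75.134.189:18889
#   106.75.134.190:18889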