def run_job(self, jobid):
    spider.genquery.GenQueries.run_job(self, jobid)
    jobtype = self.get_job_type(jobid)
    if jobtype == 'loadpage':
        url = CData.get_url(jobid.get('u'), jobid.get('p'))
        print url
        con = self.request_url(url)
        if con is not None:
            self.parse_html(con.text)
        else:
            self.re_add_job(jobid)
    elif jobtype == 'jdurl':
        url = jobid['u']
        m = re.search(r'/(\d+)\.html', url)
        if m:
            if self.pagestore.check_should_fetch(m.group(1)):
                con = self.request_url(url)
                if con is not None:
                    self.pagestore.save(int(time.time()), m.group(1), url, con.text)
                else:
                    self.re_add_job(jobid)
                    Log.error("failed get url", url)
                    # self.re_add_job(jobid)
            else:
                # Log.warning("skip fetch url:", url)
                pass

def run_job(self, job):
    jobtype = self.get_job_type(job)
    if jobtype == 'co':
        url = 'http://www.lagou.com/gongsi/j%s.html' % job['u']
        print url
        con = self.request_url(url)
        if con is not None:
            self.parse_list(con.text, job['u'])
        else:
            self.re_add_job(job)
    elif jobtype == 'list':
        url = job['base'] + job['u']
        print url
        con = self.request_url(url)
        if con is not None:
            self.parse_html(con.text)
        else:
            self.re_add_job(job)
    elif jobtype == 'jd':
        url = 'http://www.lagou.com/jobs/%s.html' % job['u']
        if self.page_store.check_should_fetch(job['u']):
            con = self.request_url(url)
            if con is not None:
                self.page_store.save(int(time.time()), job['u'], url, con.text)
            else:
                self.re_add_job(job)
                Log.error("failed get url", url)
        else:
            pass

def save(self, url, id, content, getime):
    if self._hashchecker.query(id) > 0:
        return True
    if isinstance(content, unicode):
        content = content.encode('utf-8')
    url = url_standardize(url)
    sign = hashlib.md5(content).hexdigest()
    try:
        if self.has_any_with_sign(id, sign):
            Log.warning("%s exists in db, skip" % id)
            return True
        print 'saving', id
        ctime = int(time.time()) * 1000
        indexUrl = '%s://%s' % (self.channel, id)
        doc = {
            'sign': sign,
            'indexUrl': indexUrl,
            'realUrl': url,
            'createTimeFlag': 1,
            'owner': self.channel,
            'createTimeTimeStamp': ctime,
            'crawlerUpdateTime': int(getime) * 1000,
            'updateTime': ctime,
            'content': content,
            'status': 0,
            'isUpdated': 0,
            'isExpired': 0,
        }
        if self.__collection.insert_one(doc):
            print id, 'saved'
            with self.__lock:
                self.saved_count += 1
            self._hashchecker.add(id)
    except Exception as e:
        print e
        traceback.print_exc()
        Log.error("failed to save %s %s" % (self.channel, id))
        time.sleep(5)
        return False

def save_info(self, job, jsonobj):
    with self.locker:
        if job["type"] == "QuerySummary":
            name = jsonobj.get("ENTNAME", "-")
            regNo = jsonobj.get("REGNO", "-")
            id = jsonobj.get("ID", "-")
            self.namefile.write(job["line"] + " " + name.encode("utf-8") + " " +
                                regNo.encode("utf-8") + " " + id.encode("utf-8") + "\n")
            self.namefile.flush()
            self.binsaver.append(name.encode("utf-8") + "_" + regNo.encode("utf-8"),
                                 json.dumps(jsonobj))
            spider.runtime.Log.info("%s:%s=========>saved." % (job["Q"], name))
        elif job["type"] == "QueryAutoName":
            if "ERRCODE" in jsonobj:
                if not self.re_add_job(job):
                    self.save_fail_info(job)
                Log.error("ErrCode, proxy down.")
                raise AccountErrors.NoAccountError()
            for name in jsonobj:
                self.namefile.write(job["line"] + " " + name.encode("utf-8") + "\n")
                self.namefile.flush()
                spider.runtime.Log.info("%s:%s=========>saved." % (job["Q"], name))

def run_job(self, jobid):
    if self.get_job_type(jobid) != 'cvurl':
        return
    # http://ehire.51job.com/Candidate/ResumeView.aspx?hidUserID=2801&hidEvents=23&hidKey=b4c9f030c69853ed26b3b5a92a20fb45
    url = jobid['url']
    m = re.search(r'hidUserID=(\d+)', url)
    if m is None:
        return
    jdid = m.group(1)
    with self.locker:
        spider.util.FS.dbg_append_file('ooo.txt', jdid)
    if self.hasher.query(jdid) > 0:
        print "%s duplicated" % jdid
        return
    if self.pagestore.check_should_fetch(jdid):
        con = self.cv51nm.el_request(url)
        if con is None:
            self.re_add_job(jobid)
            return
        else:
            getime = int(time.time())
            if u'此人简历保密' in con.text:  # "this resume is confidential"
                Log.warning(jdid, "此人简历保密")
                self.hasher.add(jdid)
            else:
                self.pagestore.save(getime, jdid, url, con.text)
    else:
        print "skip %s" % jdid

def run_job(self, job):
    jt = self.get_job_type(job)
    if jt == 'main':
        joburl = CVZhilianUtil.get_search_url(json.loads(job['line']))
        # if this account can't search, then give up.
        con = self.zlm.el_request(joburl, headers=self.headers, hint='search',
                                  prechecker=self.search_cnt_checker)
        if con is not None and con.code == 404:
            con = None
        if con is None:
            Log.warning('请求搜索页失败', joburl)  # failed to request the search page
            self.add_job(job)
            return
        for su in CVZhilianUtil.sub_pages(joburl, con):
            self.add_job({'type': 'search', 'url': su})
        self.parse_page(joburl, con)
    elif jt == 'search':
        joburl = job['url']
        # if self.crawler_range:
        #     joburl = CVZhilianUtil.get_count()
        con = self.zlm.el_request(joburl, headers=self.headers, hint='search')
        if con is not None and con.code == 404:
            con = None
        if con is None:
            Log.warning('请求搜索页失败', joburl)  # failed to request the search page
            self.add_job(job)
            return
        self.parse_page(joburl, con)

def get_cookie(self):
    cookiepair = getattr(self._tls, "cookie", None)
    # The cookie is rotated every once in a while.
    if cookiepair is None:
        cookiestr = hashlib.md5(str(time.time())).hexdigest().upper()
        cookiepair = [
            cookiestr[:8] + '-' + cookiestr[8:12] + '-' + cookiestr[12:16] + '-' +
            cookiestr[16:20] + '-' + cookiestr[20:],
            time.time()
        ]
        setattr(self._tls, "cookie", cookiepair)
    else:
        cookietime = cookiepair[1]
        if (time.time() - cookietime) > 60 * 10:
            cookiestr = hashlib.md5(str(time.time())).hexdigest().upper()
            cookiepair = [
                cookiestr[:8] + '-' + cookiestr[8:12] + '-' + cookiestr[12:16] + '-' +
                cookiestr[16:20] + '-' + cookiestr[20:],
                time.time()
            ]
            setattr(self._tls, "cookie", cookiepair)
            Log.info("Cookie changed, sleep 10s")
            time.sleep(10)
    return cookiepair[0]

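# A minimal standalone sketch (not part of the original class) of the token built
# in get_cookie above: an MD5 of the current timestamp reformatted into the
# 8-4-4-4-12 GUID-style layout. The helper name `make_guid_cookie` is hypothetical.
import hashlib
import time

def make_guid_cookie():
    h = hashlib.md5(str(time.time())).hexdigest().upper()
    # Reassemble the 32 hex chars into the 8-4-4-4-12 GUID layout.
    return '-'.join([h[:8], h[8:12], h[12:16], h[16:20], h[20:]])
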
def run_job(self, job):
    jobtype = self.get_job_type(job)
    if jobtype == 'co':
        url = 'http://sou.zhaopin.com/jobs/companysearch.ashx?CompID=%s' % job['u']
        print url
        con = self.request_url(url)
        if con is not None:
            self.parse_list(con.text, url)
            self.parse_html(con.text)
        else:
            self.re_add_job(job)
    elif jobtype == 'list':
        url = job['base'] + job['u']
        print url
        con = self.request_url(url)
        if con is not None:
            self.parse_html(con.text)
        else:
            self.re_add_job(job)
    elif jobtype == 'jd':
        url = job['u']
        m = re.search(r'.*?(\d+).htm', url)
        print url
        if m:
            if self.pagestore.check_should_fetch(m.group(1)):
                con = self.request_url(url)
                if con is not None:
                    self.pagestore.save(int(time.time()), m.group(1), url, con.text)
                else:
                    self.re_add_job(job)
                    Log.error("failed get url", url)
            else:
                pass

def get_image(self, dbgdata=None):
    headers = {'Referer': self.info['url']}
    con = None
    while True:
        imgurl = ImgUrlProcess().process(self.info['imgurl'])
        self.select_user_agent(
            "=Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko LBBROWSER"
        )
        con = self._con = self.request_url(imgurl, headers=headers, timeout=30)
        if con is not None and con.code == 200:
            if dbgdata is not None and isinstance(dbgdata, dict):
                dbgdata["imgUrl"] = imgurl
            break
        time.sleep(random.randrange(1, 5, 1))
        # "error fetching captcha, http-code: %d, retrying..."
        print "获取验证码出错,http-code:%d , 正在重新获取......" % (0 if con is None else con.code)
        time.sleep(0.5)
        # if con is None or con.code != 200:
        #     time.sleep(random.randrange(1, 5, 1))
        #     print "获取验证码出错,http-code:%d" % (con.code if con is None else 0)
        #     continue
        #     # return None
    imgtype = imghdr.what(None, con.content)
    if imgtype in ['gif', 'jpeg', 'jpg', 'png', 'bmp']:
        # TODO: comment this out for production runs
        spider.util.FS.dbg_save_file("pic." + imgtype, con.content)
        return con.content
    else:
        if con.content[0:1] != '<':
            Log.error("invalid image type")
        print "request captcha code error:content=%s" % con.content
        return None

def report_job_one_minute(self):
    if self.store is not None:
        count = self.store.saved_count - self.prev_count
        self.prev_count = self.store.saved_count
        Log.errinfo(time.strftime('%Y-%m-%d %H:%M:%S') + ' ==> %s' % count)
    else:
        Log.errinfo(time.strftime('%Y-%m-%d %H:%M:%S'))

def report(self):
    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    while True:
        time.sleep(1)  # sleep until the next report
        if int(time.time()) % 60 == 0:
            Log.errinfo(time.strftime('%Y-%m-%d %H:%M:%S'))
        prog = "mj:%d/%s,aj:%d/(%d,%d,%d)" % (self._mjob_count, self._mjob_all,
                                              self._job_count, self.job_queue.qsize(),
                                              self.job_queue2.qsize(), self.job_queue3.qsize())
        if isinstance(self.curjobid, dict) and 'url' in self.curjobid:
            cjstr = spider.util.utf8str(self.curjobid['url'])
        else:
            cjstr = spider.util.utf8str(self.curjobid.__str__())
        cjstr = re.sub(r'\r|\n', '', cjstr)
        if len(cjstr) > 100:
            cjstr = cjstr[0:100]
        message = "[pid=%d]job:%s prog:%s\n" % (os.getpid(), cjstr, prog)
        try:
            sock.sendto(message, ("127.0.0.1", self._logport))
        except Exception as e:
            pass
        if self._end_mark:
            message = "[pid=%d] DONE\n" % (os.getpid())
            try:
                sock.sendto(message, ("127.0.0.1", self._logport))
            except:
                pass
            return

def extract_content(self):
    doc = self.get_cur_doc()
    hf = spider.util.htmlfind(doc.cur_content, 'id="resumeContentBody"', 0)
    dom = html.fromstring(doc.cur_content)
    contact_info = self.extract_info(dom)
    name = contact_info.get("name", "")
    email = contact_info.get("email", "")
    telephone = contact_info.get("telephone", "")
    if not (name and (email or telephone)):
        self.log.info("fail id: %s, extract contact information fail" % self.get_cur_doc().cur_jdid)
        return None
    try:
        detail = hf.get_text()
    except:
        Log.errorbin("invalid cv content %s" % doc.cur_url, doc.cur_content)
        return None
    return utf8str(contact_info) + utf8str(detail)

def extract_info(dom):
    if isinstance(dom, (str, unicode)):
        dom = html.fromstring(dom)
    user_name = ''
    user_field = dom.xpath('//div[@id="userName"]/@alt')
    if not user_field:
        Log.warning("find zhilian user field exception")
    else:
        user_name = user_field[0]
    contact_info = dom.xpath("//div[@class='feedbackD']//em")
    user_email = ''
    user_telephone = ''
    # guard against both an empty result and a single-element result
    if not contact_info or len(contact_info) < 2:
        Log.warning("find contact info exception")
    else:
        user_telephone = contact_info[0].text_content()
        user_email = contact_info[1].text_content()
    return {
        'name': user_name,
        'telephone': user_telephone,
        'email': user_email
    }

class GenLPQuery(GenQueries):
    def __init__(self, thcnt=20):
        GenQueries.__init__(self, thcnt)
        self._name = "lp_qiye_queries"

    def init_conditions(self):
        self.baseurl = "http://www.liepin.com/zhaopin/?pubTime=3&salary=*&searchType=1&clean_condition=&jobKind=2&isAnalysis=&init=1&searchField=1&key=&industries=&jobTitles=&dqs=&compscale=000&compkind=000"
        self.cond = ['dqs', 'compscale', 'compkind', 'jobTitles']
        self.conddata = [dqs, compscale, compkind, jobTitles]

    def need_split(self, url, level, islast):
        con = self.request_url(url)
        if con is not None:
            # "found N positions for you"
            m = re.search(ur"共为您找到\s*<strong>([0-9+]*)</strong>\s*职位", con.text)
            if m:
                found = m.group(1)
                count = parseInt(found)
                print "[%d] %s ==> %s %s" % (level, url, found, 'failed' if (count >= 4000) else '')
                if parseInt(found) >= 3000:
                    return True
            # "last page" pagination marker
            m1 = re.search(ur'curPage=(\d+)" title="末页"', con.text)
            if m1:
                if int(m1.group(1)) >= 100:
                    print "===failed==="
                    return True
            if m or m1:
                return False
            # "no positions match your search criteria"
            if re.search(ur"没有找到符合您搜索条件的相关职位", con.text):
                return False
            Log.error("unknown page for", url)
            Log.errorbin(url, con.text)
        time.sleep(1)
        return self.need_split(url, level, islast)

def get_cv(self, url):
    # http://rd.zhaopin.com/resumepreview/resume/viewone/2/JM622670859R90250000000_1_1?searchresume=1
    con = self.zlm.el_request(url)
    if con is None:
        return None
    if u"您需要输入验证码才能继续后续的操作" in con.text:  # "you must enter a captcha to continue"
        self.imgcnt += 1
        if self.imgcnt > 10:
            self.imgcnt = 0
            self.zlm.set_nologin()
            return None
        for i in range(0, 5):
            code = OnlineOCR('zhilian2').resolve(lambda dbgdata=None: self._get_image(url))
            purl = "http://rd.zhaopin.com/resumePreview/resume/_CheackValidatingCode?validatingCode=" + code
            con = self.zlm.el_request(purl, data={'validatingCode': code}, headers={'Referer': url})
            if con is not None:
                if re.search('true', con.text, re.I):
                    time.sleep(5)
                    return None
            Log.warning('验证码输入失败')  # captcha submission failed
            time.sleep(2)
        # Five consecutive failures: switch accounts.
        self.zlm.set_nologin()
        self.imgcnt = 0
        return None
    return con

def get_detail(self, gsweb, oi, cnt, qname=None):
    # time.sleep(2)
    tid = self.get_tid()
    cname = oi["name"]
    url = oi["url"]
    regist_code = oi["regcode"]
    gd = "gsxt.gdgs.gov.cn/aiccips/GSpublicity/GSpublicityList.html"
    sz = "szcredit.com.cn/web/GSZJGSPT/QyxyDetail.aspx"
    gz1 = "gsxt.gzaic.gov.cn/search/search!entityShow"
    gz2 = "gsxt.gzaic.gov.cn/aiccips/GSpublicity/GSpublicityList.html"
    flg = None
    if gd in url:
        # return True
        flg = gsweb.get_GSpublicityList(cnt, cname, url, regist_code, qname=qname)
    elif sz in url:
        # return True
        flg = gsweb.get_QyxyDetail(cnt, cname, url, regist_code, tid=tid)
    elif gz1 in url:
        # return True
        flg = gsweb.get_entityShow(cnt, cname, url, regist_code)
        # this link type requires re-initializing the object after each run
        self.init_obj()
    elif gz2 in url:
        flg = gsweb.get_guangzhou(cnt, cname, url, regist_code, qname=qname)
    else:
        print "未知的链接类型--->", url  # unknown link type
        Log.error("UNKNOWN LINK TYPE," + url)
        return True  # unknown link type: skip for now and count it as success
    if flg == "success":
        self.record_spider_query(cname.strip())
        return True
    elif flg == "proxy_error":
        self.error_add()
        time.sleep(5)
        return self.get_detail(gsweb, oi, cnt, qname=qname)
    elif flg == "notdisplay":
        oi["error"] = "notdisplay"
        self.not_show_save.append(spider.util.utf8str(oi))
        return True
    elif flg == "return_error":
        oi["error"] = "return_page_error"
        self.not_show_save.append(spider.util.utf8str(oi))
        return True
    elif flg == "page_error":
        time.sleep(10)
        return self.get_detail(gsweb, oi, cnt, qname=qname)
    elif flg == "521":
        gsweb = self.init_obj()
        return self.get_detail(gsweb, oi, cnt, qname=qname)
    else:
        return False

def dispatch(self):
    seeds = self.source.export_seeds()
    print 'load %d seeds' % len(seeds)
    for seed in seeds:
        date = seed['indexUrl'].split('://')[1]
        eval_str = seed['content'][1:-1].replace('\\"', '"')
        res = eval(eval_str)
        try:
            if (isinstance(res, tuple) or isinstance(res, list)) and len(res) > 0:
                self.add_main_job({
                    'type': 'main',
                    'date': date.encode('utf-8'),
                    'count': int(res[0]['Count'])
                })
            else:
                print 'invalid seed', seed
        except KeyError as e:
            Log.error('KeyError %s' % e.message)
            traceback.print_exc()
            print seed
            print eval_str
            time.sleep(2)
    self.wait_q()
    self.add_job(None)

def _replace_proxy(self, kwargs, memo):
    with self.locker:
        if not isinstance(self.sp_proxies, dict) or len(self.sp_proxies.keys()) == 0:
            return False
        if self._auto_change_proxy:
            oldproxy = memo.get('proxy')
            if oldproxy in self.sp_proxies:
                self.sp_proxies[oldproxy] += 1
            prs = self.sp_proxies.keys()
            prs.sort()
            for i in range(0, len(prs)):
                self._cur_proxy_index = (self._cur_proxy_index + 1) % len(prs)
                selproxy = prs[self._cur_proxy_index]
                if (self.sp_proxies.get(selproxy, 0) <= 10
                        and self._proxy_use_times.get(selproxy, 0) <= self.proxy_limit):
                    memo['proxy'] = selproxy
                    self._set_proxy(kwargs, selproxy)
                    self.curproxy = selproxy
                    return True
                elif self._proxy_use_times.get(selproxy, 0) > self.proxy_limit:
                    Log.error("This proxy has run too many times.==>" + self.curproxy)
                else:
                    Log.error("This proxy has made too many errors.==>" + self.curproxy)
        elif self._cur_proxy_index < 0:
            # don't auto change proxy, and the index < 0, no proxy is used.
            # but don't report an error.
            return True
        else:
            prs = self.sp_proxies.keys()
            prs.sort()
            selproxy = prs[self._cur_proxy_index % len(prs)]
            self._set_proxy(kwargs, selproxy)
            return True
        return False

def _print_banner(self, port):
    hostname = 'localhost'
    conn_info = os.getenv('SSH_CONNECTION') or ''
    m = re.search(r'[0-9.]+ \d+ ([0-9.]+) ', conn_info)
    if m:
        hostname = m.group(1)
    Log.error("[pid:%d]Enter image code at http://%s:%d/ " % (os.getpid(), hostname, port))

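# Illustrative check of the SSH_CONNECTION parsing above (assumes the standard
# OpenSSH layout "<client_ip> <client_port> <server_ip> <server_port>"); the regex
# captures the server-side address, which is what the operator can reach the banner on.
import re
_m = re.search(r'[0-9.]+ \d+ ([0-9.]+) ', "10.0.0.5 52311 192.168.1.20 22")
assert _m and _m.group(1) == "192.168.1.20"
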
def extract_content(self):
    content = spider.util.htmlfind(self.get_cur_doc().cur_content, 'class="job-description"', 0)
    try:
        content = content.get_text()
    except:
        Log.errorbin("invalid jd content %s" % self.get_cur_doc().cur_url,
                     self.get_cur_doc().cur_content)
        return None
    return content

def thread_check(self):
    # Kill the thread after several consecutive proxy errors.
    basicreq = getattr(self._tls, "req")
    proxy_error_cnt = getattr(basicreq._curltls, "proxy_error_cnt", 0)
    if proxy_error_cnt > 3:
        Log.error("Proxy %s and its thread is going down." % (basicreq.sp_proxies.items()[0][0]))
        return False
    return True

def _get_image(self, refurl):
    imgurl = ("http://rd2.zhaopin.com/s/loginmgr/monitorvalidatingcode.asp?t=" +
              str(int(time.time()) * 1000))
    con = self.zlm.el_request(imgurl, headers={'Referer': refurl})
    if con is None:
        Log.warning("fetch image failed, sleep 1s")
        time.sleep(1)
        return self._get_image(refurl)
    return con.content

def extract_content(self):
    m = re.search(
        r'<div class="detail-info">(.*?)<div class="wrap_style mb15 pd5" id="comment" name="comment">',
        self.get_cur_doc().cur_content, re.S)
    if m:
        a = re.sub(ur'<[a-zA-Z/!][^<>]*>', '', m.group(1))
        return a.strip()
    Log.error(self.get_cur_doc().cur_url, "no content")
    return None

def page_time(self):
    tag = spider.util.htmlfind(self.get_cur_doc().cur_content, 'class="publish-time"', 0)
    try:
        tag = tag.get_text()
    except:
        Log.errorbin("invalid jd content %s" % self.get_cur_doc().cur_url,
                     self.get_cur_doc().cur_content)
        return None
    return TimeHandler.fmt_time(tag)

def thread_init(self, tid):
    # self.proxyq is thread-safe
    proxy = self.proxyq.get(True)
    basicreq = BasicRequests()
    basicreq.sp_proxies[proxy] = 0
    basicreq._cur_proxy_index = 0
    basicreq._auto_change_proxy = False
    setattr(self._tls, "req", basicreq)
    with self.locker:
        Log.info("Thread %d's request prepared. Proxy: %s" % (tid, proxy))

def extract_content(self):
    doc = self.get_cur_doc()
    assert isinstance(doc, PageStoreBase.CurDoc)
    hf = spider.util.htmlfind(doc.cur_content, 'id="resumeContentBody"', 0)
    try:
        return hf.get_text()
    except:
        Log.errorbin("invalid cv content %s" % doc.cur_url, doc.cur_content)
        raise

def extract_content(self):
    content = spider.util.htmlfind(self.get_cur_doc().cur_content,
                                   'class="hasVist cfix sbox fs16"', 0)
    try:
        content = content.get_text()
    except:
        Log.errorbin("invalid jd content %s" % self.get_cur_doc().cur_url,
                     self.get_cur_doc().cur_content)
        return None
    return content

def extract_content(self):
    # <td style="padding-top:10px;" id="divInfo">
    dom = html.fromstring(self.get_cur_doc().cur_content)
    xx = dom.xpath("//td[@id='divInfo']")
    if xx is not None and len(xx) > 0:
        return xx[0].text_content()
    Log.errorbin(self.get_cur_doc().cur_url, self.get_cur_doc().cur_content)
    Log.error("get cv failed", self.get_cur_doc().cur_url)
    time.sleep(5)
    return None

def eval_var(self, var):
    if var == 'TIME':
        return str(int(time.time() * 1000))
    elif var == 'RND':
        return str(random.random())
    elif re.match(r'^RND\d+$', var):
        v = int(var[3:])
        return str(random.randint(v / 10, v - 1))
    else:
        Log.error("unknown variable", var)
        return ""

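# A minimal usage sketch for eval_var above. The {NAME} template syntax and the
# expand_template helper are assumptions for illustration, not taken from the
# original source: they show how TIME/RND/RNDnnn variables could be substituted
# into a request URL.
import re

def expand_template(obj, template):
    # e.g. "...code.asp?t={TIME}&r={RND1000}" -> "...code.asp?t=1458000000000&r=523"
    return re.sub(r'\{([A-Z0-9]+)\}', lambda m: obj.eval_var(m.group(1)), template)
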
def parse_list(self, text, url):
    if isinstance(text, unicode):
        text = text.encode('utf-8')
    # "N positions in total match the criteria"
    m = re.search(r'共<em>(\d+)</em>个职位满足条件', text)
    if m:
        pagecnt = (int(m.group(1)) + 59) / 60
        for i in range(2, pagecnt + 1):
            self.add_job({'type': 'list', 'u': '&p=%s' % i, 'base': url})
        if pagecnt == 0:
            # no record found.
            Log.error("%s => NO_PAGES!" % url)
    return

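# Worked example of the page-count arithmetic above (60 results per list page,
# rounded up): in Python 2, (total + 59) / 60 is integer ceiling division.
#   total = 1   -> (1 + 59) / 60   = 1 page
#   total = 60  -> (60 + 59) / 60  = 1 page
#   total = 61  -> (61 + 59) / 60  = 2 pages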