def __init__(self, a, b, c):
    AioRunner.__init__(self, a, b, c)
    self.baset = time.time()
    self.dstfile = open("corp_name.txt", "a+b")
    self.failfile = open("corp_fail.txt", "a+b")
    self.__jsl_clearance = None
    self.__jsluid = None
def prepare_req(self, job, curl, proxies):
    pr = AioRunner.prepare_req(self, job, curl, proxies)
    if pr is not None:
        return pr
    url = job['url']  # assumption: the job carries the request URL directly
    headers = {}
    curl.prepare_req(url, headers=headers, proxies=proxies)
    return True
def on_result(self, curl, resp): AioRunner.on_result(self, curl, resp) con = resp if con is None or con.text.strip() == "": spider.runtime.Log.error("Request return nothing! Readd...." + self.job.__str__()) self.master.re_add_job(self.job) return else: corp_name_list = re.findall( r'<h3 class="site-list-title">(.*?)<small', con.text, re.S) if len(corp_name_list) == 0: spider.runtime.Log.warning("line " + str(self.job["lineno"]) + ", key:" + self.job["key"] + ", no data...") self.failfile.write(self.job["line"].strip() + " no data.\n") self.failfile.flush() return else: self.save_name(self.job, corp_name_list) print resp.request.url, resp.code
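# Illustration (not part of the original runner): a quick self-contained check of the
# corp-name regex used in on_result above. The sample HTML fragment is an assumption
# modelled on the pattern, not captured from the live site.
def _regex_selftest():
    import re
    sample = '<h3 class="site-list-title">Example Corp <small>detail</small></h3>'
    names = re.findall(r'<h3 class="site-list-title">(.*?)<small', sample, re.S)
    print names  # expected: ['Example Corp ']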
def on_result(self, curl, resp): AioRunner.on_result(self, curl, resp) con = resp if con is None or con.text.strip() == "": spider.runtime.Log.error("Request return nothing! Readd...." + self.job.__str__()) self.master.re_add_job(self.job) return elif con.code == 521: f = open("login.js", "w+b") f.write( con.text.replace("<script>", "").replace("</script>", "").replace( "document.cookie=dc", "console.log(dc)")) f.close() os.system("nodejs login.js > cookiestr.txt") f = open("cookiestr.txt", "r+b") self.__jsl_clearance = re.findall(r"__jsl_clearance=(.*?);", f.read(), re.S)[0] if "Set-Cookie:" in con.headers: setcookie = re.findall(r"Set-Cookie:(.*?)path", con.headers, re.S)[0] self.__jsluid = re.findall(r"__jsluid=(.*?);", setcookie, re.S)[0] self.master.re_add_job(self.job) else: corp_name_list = re.findall( r'class="search-result-title"><em>(.*?)</a>', con.text, re.S) if len(corp_name_list) == 0: spider.runtime.Log.warning("line " + str(self.job["lineno"]) + ", key:" + self.job["key"] + ", no data...") self.failfile.write(self.job["line"].strip() + " no data.\n") self.failfile.flush() return else: self.save_name(self.job, corp_name_list) print resp.request.url, resp.code
def prepare_req(self, job, curl, proxies):
    self.dbg('prepare')
    pa = AioRunner.prepare_req(self, job, curl, proxies)
    if pa is not None:
        return pa
    if 'value' in job:
        url = "https://www.linkedin.com/jobs2/view/%d" % job['value']
    else:
        url = job['url']
    print "[%d] prepare %s proxies=" % (self.idx, url), proxies
    headers = {}
    if 'ip.cn' in url:
        headers['User-Agent'] = 'curl/7.20.1'
    curl.prepare_req(url, headers=headers, proxies=proxies)
    return True
def prepare_req(self, job, curl, proxies): self.dbg('prepare') pa = AioRunner.prepare_req(self, job, curl, proxies) if pa is not None: return pa if 'key' in job: key = spider.util.utf8str(job['key']) url = "http://qichacha.com/search?key=" + quote(key) + "&sType=0" else: Log.error("Invalid job.===>" + job.__str__()) print "[%d] prepare %s proxies=" % (self.idx, url), proxies headers = {} if 'ip.cn' in url: headers['User-Agent'] = 'curl/7.20.1' curl.prepare_req(url, headers=headers, proxies=proxies) return True
def prepare_req(self, job, curl, proxies): self.dbg('prepare') pa = AioRunner.prepare_req(self, job, curl, proxies) if pa is not None: return pa if 'key' in job: key = spider.util.utf8str(job['key']) url = r"http://www.qixin007.com/search/?key=" + quote( key) + "&type=enterprise&source=&isGlobal=Y" # url = "http://qichacha.com/search?key=" + quote(key) + "&sType=0" else: Log.error("Invalid job.===>" + job.__str__()) print "[%d] prepare %s proxies=" % (self.idx, url), proxies headers = {} if 'ip.cn' in url: headers['User-Agent'] = 'curl/7.20.1' if self.__jsl_clearance: headers["Cookie"] = "__jsl_clearance=" + self.__jsl_clearance + ";" if self.__jsluid: headers["Cookie"] += "__jsluid=" + self.__jsluid curl.prepare_req(url, headers=headers, proxies=proxies) return True
def on_error(self, curl, errcode, errmsg):
    AioRunner.on_error(self, curl, errcode, errmsg)
    print "[%d] error, proxy_errcnt=%d" % (self.idx, self.proxyerr)
    print "with: code=%d msg=%s" % (errcode, errmsg)
def __init__(self, a, b, c):
    AioRunner.__init__(self, a, b, c)
    self.baset = time.time()
    self.dstfile = open("corp_name.txt", "a+b")
    self.failfile = open("corp_fail.txt", "a+b")
def __init__(self, curl, selproxy, idx):
    AioRunner.__init__(self, curl, selproxy, idx)
def on_result(self, curl, resp):
    self.dbg('result')
    AioRunner.on_result(self, curl, resp)
    print resp.request.url, resp.code
def __init__(self, a, b, c):
    AioRunner.__init__(self, a, b, c)
    self.baset = time.time()