def process_request(self, request, spider):
    # Refresh the proxy pool every 200 requests.
    if self.count % 200 == 0:
        print('fetching proxies ..........')
        self.PROXIES = fetch_all()
    proxy = random.choice(self.PROXIES)
    request.meta['proxy'] = "http://%s" % proxy
    self.count += 1
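# A middleware like the one above only takes effect once it is registered in the
# Scrapy project settings. A minimal sketch, assuming the class is named
# RandomProxyMiddleware and lives in myproject/middlewares.py (both names are
# hypothetical):

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomProxyMiddleware': 543,  # hypothetical path; the number is the middleware order
}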
def fetch_new_proxyes(self):
    logger.info("extending proxyes using fetch_free_proxyes.py")
    new_proxyes = fetch_free_proxyes.fetch_all()
    logger.info("new proxyes: %s" % new_proxyes)
    for np in new_proxyes:
        # Only register proxies we have not seen before.
        if np not in self.proxyes:
            self.proxyes[np] = {'count': 0, 'valid': True, 'last': 0}
def process_request(self, request, spider):
    curr_time = time.time()
    local_proxy = '127.0.0.1:80'
    # Refresh the proxy list when it has never been fetched or has expired.
    if not MyGlobals.last_fetch_time or curr_time > MyGlobals.last_fetch_time + FETCH_PROXY_INTERVAL:
        log.msg('start fetch proxy ips...', level=log.INFO)
        MyGlobals.proxy_list = fetch_all()
        MyGlobals.proxy_list.append(local_proxy)  # keep a local fallback in the pool
        MyGlobals.last_fetch_time = curr_time
    proxy = random.choice(MyGlobals.proxy_list)
    if local_proxy == proxy:
        log.msg('*******current proxy is:local', level=log.INFO)
    else:
        log.msg('*******current proxy is:{0}'.format(proxy), level=log.INFO)
    request.meta['proxy'] = "http://%s" % proxy
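# The snippet above relies on a MyGlobals holder and a FETCH_PROXY_INTERVAL
# constant that are not shown. A minimal sketch of what they might look like
# (the interval value is an assumption):

FETCH_PROXY_INTERVAL = 600  # seconds between proxy refreshes (assumed value)

class MyGlobals(object):
    # Shared state across all requests handled by the middleware.
    proxy_list = []      # proxies from fetch_all(), plus the local fallback
    last_fetch_time = 0  # epoch seconds of the last fetch; 0 forces an initial fetch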
def fetch_new_proxyes(self): """ 从网上抓取新的代理添加到代理列表中 """ logger.info("extending proxyes using fetch_free_proxyes.py") new_proxyes = fetch_free_proxyes.fetch_all(https=self.use_https) logger.info("new proxyes: %s" % new_proxyes) self.last_fetch_proxy_time = datetime.now() for np in new_proxyes: if self.url_in_proxyes(np): continue else: self.proxyes.append({"proxy": np, "valid": True, "count": 0}) if self.len_valid_proxy( ) < self.extend_proxy_threshold: # 如果发现抓不到什么新的代理了, 缩小threshold以避免白费功夫 self.extend_proxy_threshold -= 1
def fetch_new_proxyes(self): """ 从网上抓取新的代理添加到代理列表中 """ logger.info("extending proxyes using fetch_free_proxyes.py") new_proxyes = fetch_free_proxyes.fetch_all() logger.info("new proxyes: %s" % new_proxyes) self.last_fetch_proxy_time = datetime.now() for np in new_proxyes: if self.url_in_proxyes("http://" + np): continue else: self.proxyes.append({"proxy": "http://" + np, "valid": True, "count": 0}) if self.len_valid_proxy() < self.extend_proxy_threshold: # 如果发现抓不到什么新的代理了, 缩小threshold以避免白费功夫 self.extend_proxy_threshold -= 1
def fetch_new_proxyes(self): """ 从网上抓取新的代理添加到代理列表中 """ self.logger.info("extending proxyes using fetch_free_proxyes.py") new_proxyes = fetch_free_proxyes.fetch_all(log=self.logger) self.logger.info("new proxyes: %s" % new_proxyes) self.last_fetch_proxy_time = datetime.now() next_index = None for np in new_proxyes: if self.url_in_proxyes("http://" + np): continue else: new_one = {"proxy": "http://" + np, "valid": True, "count": 0} self.proxyes.append(new_one) if not next_index: next_index = self.proxyes.index(new_one) if next_index: self.proxy_index = next_index if self.len_valid_proxy( ) < self.extend_proxy_threshold: # 如果发现抓不到什么新的代理了, 缩小threshold以避免白费功夫 self.extend_proxy_threshold -= 1
def updateProxyAddr(self):
    # Pair every fetched proxy address with a value from self._yield()
    # and push the resulting mapping into the pool.
    self.pushall(dict(zip(fetch_all(log=self.logger), self._yield())))
def click(url, content, socketio=None, proxy=False):
    proxies = [None]
    if proxy:
        if socketio:
            socketio.sleep(1)
            socketio.emit('my_response',
                          {'data': 'Fetching free proxies...\nThis may take a few minutes, please wait...'},
                          namespace='/patent')
        proxies = fproxy.fetch_all()
        proxies = [{'http': 'http://' + x} for x in proxies]
        if socketio:
            socketio.sleep(1)
            socketio.emit('my_response',
                          {'data': 'Fetching free proxies...\nThis may take a few minutes, please wait...'},
                          namespace='/patent')
    form = form_produce(content)
    num = get_page_nums(url, form)
    logger.info(num)
    mongo = mongoConnection.mongoConnection(db='patent', collection='patentinfo')
    i = 1
    if not num and socketio:
        socketio.emit('my_response', {'data': 'Failed to reach the target site, please retry later!'},
                      namespace='/patent')
        socketio.emit('disconnect', {'data': 'disconnect'}, namespace='/patent')
        return
    while i <= num:
        failed_tag = 0
        attempt = 0
        form = form_produce(content, i)
        proxie = random.choice(proxies)
        patents = get_patent(url, form, proxie)
        # Retry with a different proxy while the fetch keeps failing.
        while patents is None:
            logger.debug('failure count: ' + str(attempt + 1) + str(failed_tag))
            failed_tag += 1
            attempt += 1
            if attempt % 3 == 0:
                attempt = 0
                break
            if failed_tag % 10 == 0:
                logger.info('Fetching new proxies, please wait')
                if socketio:
                    socketio.sleep(1)
                    socketio.emit('my_response', {'data': 'Fetching new proxies, please wait'},
                                  namespace='/patent')
                proxies = fproxy.fetch_all()
                proxies = [{'http': 'http://' + x} for x in proxies]
            proxie = random.choice(proxies)
            # print('switched to new proxy:', proxie)
            patents = get_patent(url, form, proxie)
        failed_tag = 0
        if patents != -1:
            try:
                for x in patents['titles']:
                    logger.info('title:' + x)
                    if socketio:
                        socketio.sleep(1)
                        socketio.emit('my_response', {'data': 'title:' + x}, namespace='/patent')
                store(patents, str(content['_id']))
            except Exception as e:
                logger.debug(e)
                logger.debug('Failed to insert into the database...')
        i += 50
def click(url, socketio=None, proxy=False):
    logger.info('Fetching free proxies...\nThis may take a few minutes, please wait...')
    if socketio:
        socketio.emit('my_response',
                      {'data': 'Fetching free proxies...\nThis may take a few minutes, please wait...'},
                      namespace='/paper')
        socketio.sleep(1)
    proxies = [None]
    if proxy:
        proxies = fproxy.fetch_all()
        proxies = [{'http': 'http://' + x} for x in proxies]
        logger.info('Free proxies fetched, %d in total.' % len(proxies))
        if socketio:
            socketio.sleep(1)
            socketio.emit('my_response', {'data': 'Free proxies fetched, %d in total.' % len(proxies)},
                          namespace='/paper')
    num = get_page_nums(url)
    logger.info(num)
    mongo = mongoConnection.mongoConnection(db='wanFang', collection='paperinfo')
    # proxies = [None]
    # i = page_index(PAGEINDEX, 'r')
    i = 0
    while i <= (num + 9) // 10:
        failed_tag = 0
        new_url = url + '&p=' + str(i)
        proxie = random.choice(proxies)
        item = get_url(new_url, proxie)
        if item is not None and item['paper_urls'] != []:
            for paper_url, quote in zip(item['paper_urls'], item['quotes']):
                attempt = 0
                papers = get_paper(paper_url, quote, proxie)
                # Retry with a different proxy while the fetch keeps failing.
                while papers is None:
                    failed_tag += 1
                    attempt += 1
                    if attempt % 3 == 0:
                        attempt = 0
                        break
                    if failed_tag % 10 == 0:
                        logger.info('Fetching new proxies, please wait')
                        proxies = fproxy.fetch_all()
                        proxies = [{'http': 'http://' + x} for x in proxies]
                    proxie = random.choice(proxies)
                    # print('switched to new proxy:', proxie)
                    papers = get_paper(paper_url, quote, proxie)
                if papers is not None and papers != -1:
                    failed_tag = 0
                    logger.info('papers:' + papers['title'])
                    if socketio:
                        socketio.emit('my_response', {'data': 'papers:' + papers['title']},
                                      namespace='/paper')
                        socketio.sleep(1)
                    try:
                        papers['url'] = url
                        papers = clean.clean(papers)  # normalize into the required format
                        mongo.collection.insert(papers)
                    except Exception as e:
                        logger.debug(e)
                        logger.debug(papers)
                        logger.debug('Failed to insert into the database...')
                    # fwrite(FNAME, papers)
            # This page succeeded, move on to the next; otherwise pick a new proxy and retry.
            i += 1
            page_index(PAGEINDEX, 'w', str(i + 1))
        else:
            failed_tag += 1
            if failed_tag % 3 == 0:
                proxies = fproxy.fetch_all()
                proxies = [{'http': 'http://' + x} for x in proxies]
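# In the last two snippets the fetched addresses are wrapped as
# {'http': 'http://' + x}, which is the proxies mapping format the requests
# library expects. A minimal usage sketch (the URL and timeout are placeholders):

import random
import requests
import fetch_free_proxyes as fproxy

proxies = [{'http': 'http://' + x} for x in fproxy.fetch_all()]
resp = requests.get('http://example.com',           # placeholder URL
                    proxies=random.choice(proxies),  # one {'http': ...} mapping per request
                    timeout=10)
print(resp.status_code)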