def get_page(url, options=None):
    # Merge per-request headers into the shared base headers.
    headers = dict(base_header, **(options or {}))
    Logger.log_normal('Getting %s' % url)
    try:
        r = requests.get(url, headers=headers)
        Logger.log_high('Getting result %s %s' % (url, r.status_code))
        if r.status_code == 200:
            return r.text
    except requests.exceptions.ConnectionError:
        Logger.log_fail('Crawling Failed %s' % url)
        return None
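# Illustrative only: a hypothetical caller of get_page. The URL and the extra
# header below are placeholders, not one of the repository's crawl sources.
def example_fetch():
    html = get_page('http://www.example.com/free-proxy-list.html',
                    options={'Referer': 'http://www.example.com'})
    if html is not None:
        # A real crawl_* method would parse 'ip:port' strings out of html here.
        print('Fetched %d characters' % len(html))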
def valid_proxy(cycle=VALID_CHECK_CYCLE):
    """Periodically fetch half of the proxies from Redis and re-test them."""
    conn = RedisClient()
    tester = VaildityTester()
    while True:
        Logger.log_high('Refreshing ip')
        # Take half of the current pool for re-validation.
        count = int(0.5 * conn.queue_len)
        if count == 0:
            Logger.log_normal('Waiting for adding')
            time.sleep(cycle)
            continue
        raw_proxies = conn.get(count)
        tester.set_raw_proxies(raw_proxies)
        tester.test()
        time.sleep(cycle)
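# A minimal sketch of the RedisClient interface used above (queue_len, get,
# put), assuming the pool is stored as a Redis list and the redis-py package
# is available. The key name and class layout are assumptions, not the
# repository's actual implementation.
import redis

class RedisClientSketch:
    def __init__(self, host='localhost', port=6379, key='proxies'):
        self._db = redis.StrictRedis(host=host, port=port)
        self._key = key

    @property
    def queue_len(self):
        # Current number of proxies in the pool.
        return self._db.llen(self._key)

    def get(self, count=1):
        # Pop `count` proxies from the head of the list for re-testing.
        proxies = self._db.lrange(self._key, 0, count - 1)
        self._db.ltrim(self._key, count, -1)
        return proxies

    def put(self, proxy):
        # Append a validated proxy to the tail of the list.
        self._db.rpush(self._key, proxy)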
async def test_single_proxy(self, proxy):
    """Test a single proxy; if it is usable, put it into the pool via self._conn."""
    async with aiohttp.ClientSession() as session:
        try:
            if isinstance(proxy, bytes):
                proxy = proxy.decode('utf-8')
            real_proxy = 'http://' + proxy
            Logger.log_normal('Testing %s' % proxy)
            async with session.get(self.test_api, proxy=real_proxy, timeout=15) as resp:
                if resp.status == 200:
                    self._conn.put(proxy)
                    Logger.log_high('Valid proxy %s' % proxy)
        except Exception:
            # Any network error or timeout means the proxy is unusable; discard it silently.
            pass
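# A minimal, self-contained sketch (an assumption, not this project's actual
# driver) of how the test_single_proxy coroutine above could be fanned out
# over a batch of raw proxies with asyncio; `tester` is any object exposing
# test_single_proxy.
import asyncio

def run_validity_tests(tester, raw_proxies):
    loop = asyncio.get_event_loop()
    tasks = [tester.test_single_proxy(proxy) for proxy in raw_proxies]
    loop.run_until_complete(asyncio.gather(*tasks))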
def add_to_queue(self):
    """Tell the crawlers to fetch a batch of untested proxies, test them,
    and add the ones that pass to the proxy pool.
    """
    Logger.log_normal('PoolAdder is working')
    proxy_count = 0
    while not self.is_over_threshold():
        # Iterate over every registered crawl function by index.
        for callback_label in range(self._crawler.__CrawlFuncCount__):
            callback = self._crawler.__CrawlFunc__[callback_label]
            raw_proxies = self._crawler.get_raw_proxies(callback)
            # Test crawled proxies before they enter the pool.
            self._tester.set_raw_proxies(raw_proxies)
            self._tester.test()
            proxy_count += len(raw_proxies)
            if self.is_over_threshold():
                Logger.log_high('IP is enough, waiting to be used')
                break
        if proxy_count == 0:
            raise ResourceDepletionError
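# The __CrawlFunc__ and __CrawlFuncCount__ attributes used above suggest the
# crawler class enumerates its crawl methods at class-creation time. One
# common way to do that is a metaclass like the sketch below; this is an
# assumption about the mechanism, not necessarily this repository's code.
class CrawlerMeta(type):
    def __new__(mcs, name, bases, attrs):
        # Collect every method whose name starts with 'crawl_' so PoolAdder
        # can iterate over them by index.
        crawl_funcs = [key for key in attrs if key.startswith('crawl_')]
        attrs['__CrawlFunc__'] = crawl_funcs
        attrs['__CrawlFuncCount__'] = len(crawl_funcs)
        return type.__new__(mcs, name, bases, attrs)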
def run(self):
    Logger.log_high('Ip processing running')
    valid_process = Process(target=Schedule.valid_proxy)
    check_process = Process(target=Schedule.check_pool)
    valid_process.start()
    check_process.start()
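# Hypothetical entry point (assumes Schedule takes no constructor arguments);
# calling run() spawns the validation and pool-check worker processes above.
if __name__ == '__main__':
    schedule = Schedule()
    schedule.run()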