def extend_proxy_pool(): """Check proxies count if need to extend proxy pool. """ conn = rc() loop = asyncio.get_event_loop() flag = asyncio.Event() proxies = asyncio.Queue() crawler = ProxyCrawler(proxies) validator = ProxyValidator(conn) while 1: if conn.count > lower_limit: time.sleep(check_cycle_time) continue logger.debug('extend proxy pool started') try: loop.run_until_complete( asyncio.gather( ProxyPool.crawler_start(crawler, validator, proxies, flag), ProxyPool.crawler_stop(crawler, conn, flag))) except Exception as e: logger.error(e, exc_info=True) logger.debug('extend proxy pool finished') time.sleep(check_interval_time) flag.clear() crawler.reset() # clear flags
def extend_proxy_pool(): """Check proxies count if need to extend proxy pool.""" loop = asyncio.get_event_loop() proxies = asyncio.Queue() crawler = ProxyCrawler(proxies) validator = ProxyValidator() while 1: if conn.count > LOWER_LIMIT: time.sleep(CHECK_CYCLE_TIME) continue logger.debug('extend proxy pool started') flag = asyncio.Event() try: loop.run_until_complete(asyncio.gather( ProxyPool.crawler_start(crawler, validator, proxies, flag), ProxyPool.crawler_stop(crawler, flag) )) except Exception: logger.error(traceback.format_exc()) logger.debug('extend proxy pool finished') time.sleep(CHECK_INTERVAL_TIME) crawler.reset() # create new flag
async def _crawler(self, rule):
    logger.debug('{0} crawler started'.format(rule.__rule_name__))
    parser = asyncio.ensure_future(self._parser(rule.page_count))
    await self._downloader(rule)
    await self._pages.join()
    parser.cancel()
    logger.debug('{0} crawler finished'.format(rule.__rule_name__))
def proxy_validator_run():
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    validator = ProxyValidator()
    while 1:
        logger.debug('regular validator started')
        try:
            loop.run_until_complete(validator.start())
        except Exception:
            logger.error(traceback.format_exc())
        logger.debug('regular validator finished')
        time.sleep(VALIDATE_CYCLE_TIME)
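# A hypothetical launcher, assuming extend_proxy_pool and proxy_validator_run are meant
# to run side by side: one process keeps topping up the pool while the other re-checks
# proxies already stored in redis. The function and process names here are illustrative.
from multiprocessing import Process


def run_proxy_pool():
    extender = Process(target=extend_proxy_pool, name='extend_proxy_pool')
    validator = Process(target=proxy_validator_run, name='proxy_validator_run')
    extender.start()
    validator.start()
    extender.join()
    validator.join()


if __name__ == '__main__':
    run_proxy_pool()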
async def validate(self, proxies):
    logger.debug('validator started')
    while 1:
        proxy = await proxies.get()
        async with aiohttp.ClientSession() as session:
            try:
                real_proxy = 'http://' + proxy
                async with session.get(self.validate_url, proxy=real_proxy,
                                       timeout=validate_timeout) as resp:
                    self._conn.put(proxy)  # request went through the proxy, keep it
            except Exception as e:
                logger.error(e)
        proxies.task_done()  # mark the item done whether or not validation succeeded
async def regular_validate(self):
    count = min(ceil(self._conn.count * validate_ratio), validate_upper_limit)
    old_proxies = self._conn.get_list(count)  # TODO: set an upper limit
    valid_proxies = []
    logger.debug('regular validator started, {0} to validate'.format(len(old_proxies)))
    async with aiohttp.ClientSession() as session:
        for proxy in old_proxies:
            try:
                real_proxy = 'http://' + proxy.decode('utf-8')  # proxies fetched from redis are bytes
                async with session.get(self.validate_url, proxy=real_proxy,
                                       timeout=validate_timeout) as resp:
                    valid_proxies.append(proxy)
            except asyncio.TimeoutError:
                continue
            except Exception as e:
                logger.error(e)
    logger.debug('regular validator finished, {0} passed'.format(len(valid_proxies)))
    self._conn.put_list(valid_proxies)
async def crawler_start(crawler, validator, proxies, flag):
    """Start the proxy crawler and validator.

    Args:
        crawler: ProxyCrawler object.
        validator: ProxyValidator object.
        proxies: asyncio.Queue object; the crawler puts proxies and the validator gets them.
        flag: asyncio.Event object, stop flag for the 'crawler_stop' function.
    """
    logger.debug('proxy crawler started')
    valid = asyncio.ensure_future(validator.start(proxies))
    await crawler.start()
    await proxies.join()
    valid.cancel()  # cancel the validator once the queue is empty, otherwise it blocks on Queue.get
    flag.set()
    logger.debug('proxy crawler finished')
async def crawler_stop(crawler, flag):
    """Stop the proxy crawler once the proxy count is high enough.

    Args:
        crawler: ProxyCrawler object.
        flag: asyncio.Event object, stop flag.
    """
    while 1:
        if conn.count > int(UPPER_LIMIT * UPPER_LIMIT_RATIO):
            logger.warning('proxies count approached the upper limit')
            crawler.stop()
            break
        if flag.is_set():  # stop checking once the crawler and validator have finished
            break
        logger.debug('checked proxies count in redis')
        await asyncio.sleep(200 * random())
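# The ProxyPool.crawler_start(...) / ProxyPool.crawler_stop(...) calls in extend_proxy_pool
# suggest the two coroutines above are grouped on a ProxyPool class. A minimal sketch of
# that assumed layout (bodies as defined above):
class ProxyPool:

    @staticmethod
    async def crawler_start(crawler, validator, proxies, flag):
        ...

    @staticmethod
    async def crawler_stop(crawler, flag):
        ...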
async def start(self):
    for rule in self._rules:
        parser = asyncio.ensure_future(self._parse_page(rule))
        logger.debug('{0} crawler started'.format(rule.__rule_name__))

        if not rule.use_phantomjs:
            await page_download(ProxyCrawler._url_generator(rule), self._pages, self._stop_flag)
        else:
            await page_download_phantomjs(
                ProxyCrawler._url_generator(rule), self._pages,
                rule.phantomjs_load_flag, self._stop_flag)

        await self._pages.join()
        parser.cancel()  # cancel the parser once the queue is empty, otherwise it blocks on Queue.get
        logger.debug('{0} crawler finished'.format(rule.__rule_name__))
async def crawler_stop(crawler, conn, flag):
    """Stop the proxy crawler once the proxy count is high enough.

    Args:
        crawler: ProxyCrawler object.
        conn: redis connection.
        flag: asyncio.Event object, stop flag.
    """
    # await asyncio.sleep(10)  # TODO
    while 1:
        if conn.count > int(upper_limit * upper_limit_ratio):
            logger.warning('proxies count approached the upper limit')
            crawler.stop()
            break
        if flag.is_set():  # stop checking once the crawler and validator have finished
            break
        logger.debug('checked proxies count in redis')
        await asyncio.sleep(200 * random())
def reset(self):
    self._stop_flag = asyncio.Event()  # once it has been set, create a new Event object
    logger.debug('proxy crawler reset')
def reset(self):
    self._stop_flag.clear()  # clear the crawler's stop flag instead of creating a new Event
    logger.debug('proxy crawler reset')