Example #1
    def extend_proxy_pool():
        """Check proxies count if need to extend proxy pool.
        """
        conn = rc()
        loop = asyncio.get_event_loop()
        flag = asyncio.Event()
        proxies = asyncio.Queue()
        crawler = ProxyCrawler(proxies)
        validator = ProxyValidator(conn)
        while 1:
            if conn.count > lower_limit:
                time.sleep(check_cycle_time)
                continue

            logger.debug('extend proxy pool started')

            try:
                loop.run_until_complete(asyncio.gather(
                    ProxyPool.crawler_start(crawler, validator, proxies, flag),
                    ProxyPool.crawler_stop(crawler, conn, flag)
                ))
            except Exception as e:
                logger.error(e, exc_info=True)

            logger.debug('extend proxy pool finished')
            time.sleep(check_interval_time)
            flag.clear()
            crawler.reset()  # clear flags
Example #2
    def extend_proxy_pool():
        """Check proxies count if need to extend proxy pool."""

        loop = asyncio.get_event_loop()
        proxies = asyncio.Queue()
        crawler = ProxyCrawler(proxies)
        validator = ProxyValidator()
        while 1:
            if conn.count > LOWER_LIMIT:
                time.sleep(CHECK_CYCLE_TIME)
                continue

            logger.debug('extend proxy pool started')

            flag = asyncio.Event()
            try:
                loop.run_until_complete(asyncio.gather(
                    ProxyPool.crawler_start(crawler, validator, proxies, flag),
                    ProxyPool.crawler_stop(crawler, flag)
                ))
            except Exception:
                logger.error(traceback.format_exc())

            logger.debug('extend proxy pool finished')
            time.sleep(CHECK_INTERVAL_TIME)
            crawler.reset() # create new flag
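
Both versions of extend_proxy_pool above lean on module-level names that are not shown in the snippets: a redis connection (conn / rc()), a logger, and tuning constants such as LOWER_LIMIT / lower_limit, CHECK_CYCLE_TIME and CHECK_INTERVAL_TIME. A minimal sketch of such a settings module, with purely illustrative values (the project's real numbers are not shown here), could look like this:

    # hypothetical settings module -- names taken from the examples above,
    # values are placeholders, not the project's real configuration
    LOWER_LIMIT = 200          # refill the pool when fewer proxies than this remain
    UPPER_LIMIT = 500          # target pool size; crawling stops near this point
    UPPER_LIMIT_RATIO = 0.8    # crawler_stop fires at UPPER_LIMIT * UPPER_LIMIT_RATIO
    CHECK_CYCLE_TIME = 60      # seconds between pool-size checks while the pool is full
    CHECK_INTERVAL_TIME = 300  # seconds to wait after a crawl round finishes
    VALIDATE_CYCLE_TIME = 600  # seconds between regular re-validation rounds (Example #4)
    VALIDATE_TIMEOUT = 10      # per-request timeout used by the validators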
Example #3
    async def _crawler(self, rule):
        logger.debug('{0} crawler started'.format(rule.__rule_name__))

        parser = asyncio.ensure_future(self._parser(rule.page_count))
        await self._downloader(rule)
        await self._pages.join()
        parser.cancel()

        logger.debug('{0} crawler finished'.format(rule.__rule_name__))
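
Example #3 shows the producer/consumer shape used throughout these snippets: schedule the parser with ensure_future, let the downloader fill the page queue, wait on Queue.join(), then cancel the parser so it does not block forever on Queue.get(). A self-contained sketch of that pattern, with hypothetical names rather than the project's classes:

    import asyncio

    async def downloader(queue):
        for page in range(5):
            await queue.put(page)        # producer: feed work into the queue

    async def parser(queue):
        while True:
            page = await queue.get()     # consumer: blocks until work arrives
            print('parsed page', page)
            queue.task_done()            # lets queue.join() complete

    async def crawl():
        pages = asyncio.Queue()
        parser_task = asyncio.ensure_future(parser(pages))
        await downloader(pages)
        await pages.join()               # wait until every page is marked done
        parser_task.cancel()             # otherwise the parser waits on get() forever

    asyncio.run(crawl())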
Example #4
def proxy_validator_run():
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    validator = ProxyValidator()
    while 1:
        logger.debug('regular validator started')
        try:
            loop.run_until_complete(validator.start())
        except Exception:
            logger.error(traceback.format_exc())
        logger.debug('regular validator finished')
        time.sleep(VALIDATE_CYCLE_TIME)
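
proxy_validator_run blocks forever and builds its own event loop with new_event_loop() / set_event_loop(), the usual shape for code that runs off the main thread. How it is actually launched is not shown in these snippets; one plausible setup, assuming a background thread, would be:

    import threading

    # assumption: proxy_validator_run is the blocking loop from Example #4
    validator_thread = threading.Thread(target=proxy_validator_run, daemon=True)
    validator_thread.start()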
Example #5
    async def validate(self, proxies):
        logger.debug('validator started')
        while 1:
            proxy = await proxies.get()
            async with aiohttp.ClientSession() as session:
                try:
                    real_proxy = 'http://' + proxy
                    async with session.get(self.validate_url, proxy=real_proxy, timeout=validate_timeout) as resp:
                        self._conn.put(proxy)  # request succeeded, keep this proxy
                except Exception as e:
                    logger.error(e)

            proxies.task_done()
Example #6
    async def regular_validate(self):
        count = min(ceil(self._conn.count * validate_ratio), validate_upper_limit)
        old_proxies = self._conn.get_list(count) # TODO: set an upper limit
        valid_proxies = []
        logger.debug('regular validator started, {0} to validate'.format(len(old_proxies)))
        async with aiohttp.ClientSession() as session:
            for proxy in old_proxies:
                try:
                    real_proxy = 'http://' + proxy.decode('utf-8') # proxy from redis was bytes type
                    async with session.get(self.validate_url, proxy=real_proxy, timeout=validate_timeout) as resp:
                        valid_proxies.append(proxy)
                except asyncio.TimeoutError:
                    continue
                except Exception as e:
                    logger.error(e)

        logger.debug('regular validator finished, {0} passed'.format(len(valid_proxies)))
        self._conn.put_list(valid_proxies)
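
Examples #5 and #6 talk to storage through a small wrapper: conn.put(proxy), self._conn.get_list(count), self._conn.put_list(valid_proxies) and a count property, with values coming back from redis as bytes. The wrapper itself is not shown; a hypothetical redis-py version with that interface (the key name and the use of a set are assumptions) might look like:

    import redis

    class RedisProxyStore:
        """Hypothetical stand-in for the redis wrapper used in these examples."""

        def __init__(self, key='proxies', **redis_kwargs):
            self._db = redis.StrictRedis(**redis_kwargs)
            self._key = key

        @property
        def count(self):
            return self._db.scard(self._key)      # size of the proxy set

        def put(self, proxy):
            self._db.sadd(self._key, proxy)

        def put_list(self, proxies):
            if proxies:
                self._db.sadd(self._key, *proxies)

        def get_list(self, count):
            # random sample of `count` members; redis returns bytes,
            # hence the proxy.decode('utf-8') in Example #6
            return self._db.srandmember(self._key, count)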
Example #7
    async def crawler_start(crawler, validator, proxies, flag):
        """ Start proxy crawler and validator.

        Args:
            crawler: ProxyCrawler object.
            validator: ProxyValidator object.
            proxies: asyncio.Queue object, crawler put proxy and validator get proxy.
            flag: asyncio.Event object, stop flag for 'crawler_stop' function.
        """
        logger.debug('proxy crawler started')

        valid = asyncio.ensure_future(validator.start(proxies))
        await crawler.start()
        await proxies.join()
        # cancel the validator task once the queue is drained; otherwise it
        # would block forever on Queue.get
        valid.cancel()

        flag.set()
        logger.debug('proxy crawler finished')
Example #8
    async def crawler_stop(crawler, flag):
        """Check proxies count if enough to stop proxy crawler.

        Args:
            crawler: ProxyCrawler object.
            flag: asyncio.Event object, stop flag.
        """

        while 1:
            if conn.count > int(UPPER_LIMIT * UPPER_LIMIT_RATIO):
                logger.warning('proxies count approached the upper limit')
                crawler.stop()
                break
            if flag.is_set():  # stop checking once crawler and validator have finished
                break

            logger.debug('checked proxies count in redis')
            await asyncio.sleep(200 * random())
Example #9
    async def start(self):
        for rule in self._rules:
            parser = asyncio.ensure_future(self._parse_page(rule))
            logger.debug('{0} crawler started'.format(rule.__rule_name__))

            if not rule.use_phantomjs:
                await page_download(ProxyCrawler._url_generator(rule),
                                    self._pages, self._stop_flag)
            else:
                await page_download_phantomjs(
                    ProxyCrawler._url_generator(rule), self._pages,
                    rule.phantomjs_load_flag, self._stop_flag)

            await self._pages.join()

            # cancel the parser task once the queue is drained; otherwise it
            # would block forever on Queue.get
            parser.cancel()

            logger.debug('{0} crawler finished'.format(rule.__rule_name__))
Example #10
    async def crawler_stop(crawler, conn, flag):
        """Check proxies count if enough to stop proxy crawler.

        Args:
            crawler: ProxyCrawler object.
            conn: redis connection.
            flag: asyncio.Event object, stop flag.
        """
        # await asyncio.sleep(10) # TODO
        while 1:
            if conn.count > int(upper_limit * upper_limit_ratio):
                logger.warning('proxies count approached the upper limit')
                crawler.stop()
                break
            if flag.is_set():  # stop checking once crawler and validator have finished
                break

            logger.debug('checked proxies count in redis')
            await asyncio.sleep(200 * random())
Example #11
 def reset(self):
     # once the old stop flag has been set, create a fresh Event object
     self._stop_flag = asyncio.Event()
     logger.debug('proxy crawler reset')
Example #12
 def reset(self):
     self._stop_flag.clear()  # clear crawler's stop flag
     logger.debug('proxy crawler reset')
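
Examples #11 and #12 are two versions of the same reset() method: one replaces the stop flag with a brand-new asyncio.Event, the other clears the existing one. Clearing is sufficient when the event is only polled through is_set(), as in the crawler loops above; creating a new object is the safer choice when another coroutine might still hold a reference to the event that was set. A tiny illustration of the difference:

    import asyncio

    flag = asyncio.Event()
    flag.set()

    flag.clear()                      # Example #12 style: same object, flag lowered
    assert not flag.is_set()

    old_flag = flag
    flag = asyncio.Event()            # Example #11 style: fresh object...
    old_flag.set()
    assert not flag.is_set()          # ...so the new flag is unaffected, but any code
                                      # still holding old_flag sees a set event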