Example #1
    async def _parse_proxy(self, rule, page):
        ips = page.xpath(rule.ip_xpath)
        ports = page.xpath(rule.port_xpath)

        if not ips or not ports:
            logger.warning(
                '{0} crawler could not get ip(len={1}) or port(len={2}), please check the xpaths or network'
                .format(rule.__rule_name__, len(ips), len(ports)))
            return

        # map() with a two-argument lambda pairs ips[i] with ports[i], like zip
        proxies = map(
            lambda x, y: '{0}:{1}'.format(x.text.strip(), y.text.strip()), ips,
            ports)

        if rule.filters:  # filter proxies
            filters = []
            for i, ft in enumerate(rule.filters_xpath):
                field = page.xpath(ft)
                if not field:
                    logger.warning(
                        '{0} crawler could not get {1} field, please check the filter xpath'
                        .format(rule.__rule_name__, rule.filters[i]))
                    continue
                filters.append(map(lambda x: x.text.strip(), field))

            filters = zip(*filters)  # one tuple of field values per row
            expected = tuple(rule.filters)  # zip yields tuples, so compare as a tuple
            selector = map(lambda x: x == expected, filters)
            proxies = compress(proxies, selector)  # itertools.compress keeps rows where selector is True

        for proxy in proxies:
            await self._proxies.put(proxy)  # put proxies in Queue to validate
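
The filter step above pairs zip, map, and itertools.compress to keep only the rows whose extra fields match rule.filters. A minimal, self-contained sketch of that pipeline, with made-up field values standing in for what the XPath queries would return:

    from itertools import compress

    # hypothetical scraped columns: proxies plus one extra "anonymity" field per row
    proxies = ['1.2.3.4:80', '5.6.7.8:8080', '9.9.9.9:3128']
    anonymity = ['high', 'transparent', 'high']

    filters = zip(anonymity)                  # one tuple of field values per row
    expected = ('high',)                      # the rule's required field values
    selector = map(lambda x: x == expected, filters)

    print(list(compress(proxies, selector)))  # ['1.2.3.4:80', '9.9.9.9:3128']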
Example #2
    async def crawler_stop(crawler, conn, flag):
        """Check whether the proxy count is high enough to stop the proxy crawler.

        Args:
            crawler: ProxyCrawler object.
            conn: redis connection.
            flag: asyncio.Event object, stop flag.
        """

        while True:

            if conn.count > int(UPPER_LIMIT * UPPER_LIMIT_RATIO):
                logger.warning('proxies count approached the upper limit')
                crawler.stop()
                break
            if flag.is_set():  # stop checking once crawler and validator finish
                break

            logger.debug('checked proxies count in redis')
            await asyncio.sleep(200 * random())  # re-check at a randomized interval (0-200s)
Example #3
    async def crawler_stop(crawler, conn, flag):
        """Check proxies count if enough to stop proxy crawler.

        Args:
            crawler: ProxyCrawler object.
            conn: redis connection.
            flag: asyncio.Event object, stop flag.
        """
        # await asyncio.sleep(10) # TODO
        while True:

            if conn.count > int(upper_limit * upper_limit_ratio):
                logger.warning('proxies count approached the upper limit')
                crawler.stop()
                break
            if flag.is_set():  # stop checking once crawler and validator finish
                break

            logger.debug('checked proxies count in redis')
            await asyncio.sleep(200 * random())  # re-check at a randomized interval (0-200s)
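
A minimal, self-contained driver for crawler_stop, with stub objects in place of the real redis connection and ProxyCrawler, and assumed values for the module-level upper_limit / upper_limit_ratio settings:

    import asyncio
    import logging
    from random import random

    logger = logging.getLogger(__name__)
    upper_limit, upper_limit_ratio = 500, 0.8  # assumed config values

    class FakeConn:
        """Stand-in for the redis connection; count mimics the stored-proxy total."""
        count = 0

    class FakeCrawler:
        """Stand-in for ProxyCrawler with just the stop() hook crawler_stop needs."""
        def stop(self):
            print('crawler stopped')

    async def demo():
        conn, flag = FakeConn(), asyncio.Event()
        checker = asyncio.ensure_future(crawler_stop(FakeCrawler(), conn, flag))
        conn.count = 1000  # simulate the validator pushing redis past the limit
        await checker      # crawler_stop sees the count, stops the crawler, returns

    asyncio.run(demo())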
Example #4
    def stop(self):
        self._stop_flag.set()  # set the crawler's stop flag
        logger.warning('proxy crawler is stopping...')
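
The flag set by stop() is presumably polled by the crawl loop itself; a hypothetical sketch of that consumer side, where _crawl_one_page and self._rule are invented placeholders for the actual page-fetching step and rule object:

    async def start(self):
        """Hypothetical crawl loop showing how _stop_flag is consumed."""
        while not self._stop_flag.is_set():
            page = await self._crawl_one_page()  # invented placeholder
            await self._parse_proxy(self._rule, page)
        logger.info('proxy crawler stopped')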