Example #1
    def run(self):
        """
        Run one full pass of the tester over the proxy pool.

        Fetches proxies from Redis in batches and tests each batch
        concurrently; any unexpected error is logged rather than raised.

        :return: None
        """
        logger.debug('Tester is running.')
        try:
            count = self.redis.count()
            logger.info(
                'There are {} proxy (proxies) in proxy pool now.'.format(
                    count))
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                # Test in batches to keep memory usage bounded.
                logger.debug(
                    'Testing proxies with index between {} and {}.'.format(
                        start + 1, stop))
                test_proxies = self.redis.batch(start, stop)
                # Test the batch concurrently to speed things up.
                loop = asyncio.get_event_loop()
                # Wrap coroutines in Tasks explicitly: passing bare
                # coroutines to asyncio.wait() is deprecated since
                # Python 3.8 and removed in 3.11.
                tasks = [
                    loop.create_task(self.test_single_proxy(proxy))
                    for proxy in test_proxies
                ]
                if tasks:
                    # asyncio.wait() raises ValueError on an empty set.
                    loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
            logger.info('Testing finished')

        except Exception as e:
            logger.warning('Tester error {}'.format(e.args))
Example #2
 async def test_single_proxy(self, proxy):
     """
     Test the availability of a single proxy.

     Maximizes its score in Redis if it works, otherwise decreases it.

     :param proxy: proxy address (``host:port``) as ``str`` or ``bytes``
     :return: None
     """
     # ssl=False replaces the verify_ssl=False keyword, which is
     # deprecated since aiohttp 3.0.
     conn = aiohttp.TCPConnector(ssl=False)
     async with aiohttp.ClientSession(connector=conn) as session:
         try:
             if isinstance(proxy, bytes):
                 proxy = proxy.decode('utf-8')
             # The proxy URL must carry an explicit scheme; browsers
             # default to http, but aiohttp does not.
             real_proxy = 'http://' + proxy
             logger.debug('Testing {}'.format(proxy))
             async with session.get(TEST_URL,
                                    proxy=real_proxy,
                                    timeout=15,
                                    allow_redirects=False) as response:
                 if response.status in VALID_STATUS_CODES:
                     logger.debug('Proxy {} is OK'.format(proxy))
                     self.redis.maximize(proxy)
                 else:
                     logger.warning(
                         'Failed to use proxy {} because the response code was {}'
                         .format(proxy, response.status))
                     self.redis.decrease(proxy)
         # ClientConnectorError is a subclass of ClientError; it is kept
         # here for explicitness.
         except (ClientError,
                 aiohttp.client_exceptions.ClientConnectorError,
                 asyncio.TimeoutError, AttributeError) as e:
             self.redis.decrease(proxy)
             logger.warning('Failed to use proxy {} because of {}'.format(
                 proxy, repr(e)))
Example #3
 def maximize(self, proxy):
     """
     Set the score of *proxy* to the maximum value MAX_SCORE.

     :param proxy: proxy address
     :return: result of the Redis ZADD command
     """
     logger.debug('Set proxy {} by maximum score {}'.format(
         proxy, MAX_SCORE))
     mapping = {proxy: MAX_SCORE}
     return self.db.zadd(REDIS_KEY, mapping)
Example #4
 def run(self):
     """Run every registered crawl method and store the proxies found."""
     logger.debug('Getter is running.')
     # Guard clause: skip crawling entirely when the pool is full enough.
     if self.is_over_threshold():
         return
     for index in range(self.crawler.__CrawlFuncCount__):
         callback = self.crawler.__CrawlFunc__[index]
         # Collect proxies via the crawl method named by `callback`.
         collected = self.crawler.get_proxies(callback)
         sys.stdout.flush()
         for proxy in collected:
             self.redis.add(proxy)
Example #5
 def get_proxies(self, callback):
     """
     Public interface for invoking a crawl method by name.

     :param callback: the name of the crawl method to run
     :return: a list of collected proxies
     """
     proxies = []
     # getattr is safer and clearer than eval'ing a constructed
     # expression string.
     for proxy in getattr(self, callback)():
         logger.debug('Collected proxy {}'.format(proxy))
         proxies.append(proxy)
     return proxies
Example #6
 def add(self, proxy, score=INITIAL_SCORE):
     """
     Add a proxy to the pool with an initial score.

     :param proxy: proxy address, expected as ``ip:port``
     :param score: initial score for a new proxy
     :return: the Redis ZADD result, or None when the proxy is
         malformed or already present
     """
     # Raw string avoids invalid escape sequences; ':' needs no escape.
     if not re.match(r'\d+\.\d+\.\d+\.\d+:\d+', proxy):
         logger.debug('Illegal proxy {} is deprecated'.format(proxy))
         return
     # Compare against None explicitly: a legitimate score of 0 is
     # falsy and would otherwise be re-added, resetting its score.
     if self.db.zscore(REDIS_KEY, proxy) is None:
         return self.db.zadd(REDIS_KEY, {proxy: score})
Example #7
 def decrease(self, proxy):
     """
     Decrease the score of a proxy by 1.

     A proxy whose score is at or below MIN_SCORE — or which is missing
     from the pool entirely — is removed instead.

     :param proxy: proxy address
     :return: the decremented score, or the ZREM result on removal
     """
     score = self.db.zscore(REDIS_KEY, proxy)
     # Check `is not None`: zscore returns None for a missing member,
     # and int(None) in the removal log would raise TypeError.
     if score is not None and score > MIN_SCORE:
         logger.debug('Proxy {} with score {} - 1.'.format(
             proxy, int(score)))
         return self.db.zincrby(REDIS_KEY, -1, proxy)
     else:
         logger.debug('Proxy {} with score {} is removed.'.format(
             proxy, int(score) if score is not None else score))
         return self.db.zrem(REDIS_KEY, proxy)
Example #8
def get_page(url, options=None):
    """
    Fetch a page and return its body text.

    :param url: the URL to crawl
    :param options: optional extra/overriding header entries merged
        over ``base_headers``
    :return: the response text on HTTP 200, otherwise None
    """
    # options=None instead of a mutable default argument {}.
    headers = dict(base_headers, **(options or {}))
    logger.debug('Crawling: {}'.format(url))
    try:
        response = requests.get(url, headers=headers)
        logger.info('Finished crawling {}, status_code is {}'.format(
            url, response.status_code))
        if response.status_code == 200:
            return response.text
    except ConnectionError as e:
        logger.warning('Failed to crawl {} because of {}'.format(url, repr(e)))
        return None
Example #9
            'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
            'Cache-Control':
            'max-age=0',
            'Connection':
            'keep-alive',
            'Cookie':
            'JSESSIONID=47AA0C887112A2D83EE040405F837A86',
            'Host':
            'www.data5u.com',
            'Referer':
            'http://www.data5u.com/free/index.shtml',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
        }
        html = get_page(start_url, options=headers)
        if html:
            ip_address = re.compile(
                '<span><li>(\d+\.\d+\.\d+\.\d+)</li>.*?<li class=\"port.*?>(\d+)</li>',
                re.S)
            re_ip_address = ip_address.findall(html)
            for address, port in re_ip_address:
                result = address + ':' + port
                yield result.replace(' ', '')


if __name__ == '__main__':
    # Quick manual check: instantiate a crawler and dump its attributes.
    crawler = Crawler()
    logger.debug(crawler.__dict__)
    logger.debug(Crawler.__dict__)