예제 #1
0
class UsabilityTester(object):
    """Proxy tester: checks whether given proxies are still usable.

    A proxy that answers the test API within the timeout has its score
    increased in the Redis pool; any failure decreases it.
    """

    def __init__(self):
        self._headers = HEADERS
        self._logger = logging.getLogger('root')
        self.test_api = TEST_API
        self._pool = RedisOperator()

    async def test_signal_proxy(self, proxy):
        """Asynchronously test a single proxy.

        :param proxy: proxy address string (``host:port``, no scheme)
        """
        async with aiohttp.ClientSession() as sess:
            real_proxy = 'http://' + proxy
            try:
                async with sess.get(self.test_api,
                                    headers=self._headers,
                                    proxy=real_proxy,
                                    timeout=10,
                                    allow_redirects=False):
                    self._pool.increase(proxy)
            except Exception:
                # Deliberately broad: any failure (timeout, connection
                # refused, bad proxy, ...) penalises the proxy. The old
                # ``(asyncio.TimeoutError, Exception)`` tuple was redundant
                # since Exception already covers TimeoutError.
                self._pool.decrease(proxy)

    def test(self, proxies):
        """Test a list of proxies in batches of 200.

        Called by the periodic test cycle and after every crawl run.
        :param proxies: list of proxy strings
        :return: None
        """
        self._logger.info('测试器开始工作,本次测试 %s 个代理' % len(proxies))
        loop = asyncio.get_event_loop()
        for start in range(0, len(proxies), 200):
            batch = proxies[start:start + 200]
            # asyncio.gather() accepts coroutines directly and tolerates an
            # empty batch; asyncio.wait() stopped accepting bare coroutines
            # in Python 3.11, its ``loop`` argument was removed in 3.10, and
            # it raises on an empty task set.
            loop.run_until_complete(
                asyncio.gather(*(self.test_signal_proxy(p) for p in batch)))
예제 #2
0
def get_conn():
    """Return the Redis connection cached on the application context ``g``.

    The connection is created lazily on first access and reused for the
    rest of the context.
    :return: RedisOperator
    """
    try:
        return g.redis_connect
    except AttributeError:
        # First access in this context: create and cache the connection.
        g.redis_connect = RedisOperator()
        return g.redis_connect
예제 #3
0
 def run(self):
     """Process 1: watch the pool size and refill it when it drops too low.

     Loops forever, checking every ``self._cycle`` seconds and delegating
     the refill to a PoolAdder.
     """
     logging.config.dictConfig(self._logging_conf)
     log = logging.getLogger('root')
     log.info('进程1 - 代理数量监控启动,每%s秒检查一次' % self._cycle)
     adder = PoolAdder()
     pool = RedisOperator()
     while True:
         running_low = pool.usable_size < self._lower_threshold
         if running_low:
             adder.add_to_pool()
         time.sleep(self._cycle)
예제 #4
0
 def run(self):
     """Process 2: periodically test every proxy in the pool.

     Loops forever; each cycle fetches all proxies, skips the test when
     the pool is below the lower threshold, and otherwise reports how
     many proxies were weeded out.
     """
     logging.config.dictConfig(self._logging_conf)
     logger = logging.getLogger('root')
     logger.info('进程2 - 代理定时测试启动,每%s秒测试一次' % self._cycle)
     tester = UsabilityTester()
     pool = RedisOperator()
     while True:
         logger.debug('周期性测试开始,将对所有代理进行测试')
         candidates = pool.get_all()
         before = len(candidates)
         if before >= self._lower_threshold:
             tester.test(candidates)
             remaining = pool.usable_size
             logger.info('淘汰了 %s 个代理,现可用 %s 个代理' %
                         (before - remaining, remaining))
             logger.info('本次测试结束,%s秒后再次测试' % self._cycle)
         else:
             logger.debug('池中代理数量低于阈值,本次不进行测试')
         time.sleep(self._cycle)
예제 #5
0
class PoolAdder(object):
    """Adder: launches the registered spiders to replenish the proxy pool."""

    def __init__(self):
        self._threshold = POOL_UPPER_THRESHOLD
        self._pool = RedisOperator()
        self._tester = UsabilityTester()
        self._logger = logging.getLogger('root')

    def is_over(self):
        """Tell whether the pool already holds enough usable proxies.

        :return: True when the usable size has reached the upper
                 threshold, False otherwise.
        """
        # The comparison already yields a bool; the previous
        # ``True if ... else False`` wrapper was redundant.
        return self._pool.usable_size >= self._threshold

    def add_to_pool(self):
        """Keep running all spiders until the pool reaches the threshold.

        :return: None
        """
        self._logger.info('代理数量过低,启动爬虫补充代理')
        spiders = [cls() for cls in SpiderMeta.spiders]
        flag = 1
        while not self.is_over():
            new_proxies = []
            # Run every spider concurrently, one worker per spider, and
            # collect whatever each of them scraped.
            with futures.ThreadPoolExecutor(
                    max_workers=len(spiders)) as executor:
                future_todo = [executor.submit(spider.get)
                               for spider in spiders]
                for future in futures.as_completed(future_todo):
                    new_proxies.extend(future.result())
            # Only proxies that are actually new to the pool get tested.
            added_proxies = [p for p in new_proxies if self._pool.add(p)]
            self._logger.info('爬取增加了 %s 个代理,开始测试' % len(added_proxies))
            self._tester.test(added_proxies)
            if self.is_over():
                self._logger.debug('代理已经充足,结束补充')
                break
            # Every 5th unsuccessful round, warn that something is wrong.
            if flag % 5 == 0:
                self._logger.warning('已连续补充%s次代理仍未满足,请检查爬虫和配置' % flag)
            flag += 1
            self._logger.debug('代理仍然不足,进行第%s次补充' % flag)
        for spider in spiders:
            spider.flush()
예제 #6
0
class PageRequest(object):
    """Fetches pages, switching to a pooled proxy after repeated failures."""

    def __init__(self):
        # No proxy configured yet: requests go out directly until
        # load_proxy() installs one.
        self.proxies_arg = None
        self._logger = logging.getLogger('root')
        self._headers = HEADERS
        self._pool = RedisOperator()

    def get_resp(self, url, retry=2):
        """GET *url*, retrying on network/proxy errors.

        After ``retry`` extra attempts fail, a proxy is loaded from the
        pool and the retry budget is reset, so the request keeps being
        retried with fresh proxies until one succeeds.

        :param url: target URL
        :param retry: extra attempts before switching to a new proxy
        :return: the ``requests`` response object on success
        """
        budget = retry
        # Iterative loop: the previous recursive version restarted itself
        # with a fresh budget after every proxy switch, so a permanently
        # failing URL grew the stack without bound (RecursionError).
        while True:
            try:
                return requests.get(url, headers=self._headers,
                                    timeout=20, proxies=self.proxies_arg)
            except (ProxyError, ConnectionError, Timeout,
                    ChunkedEncodingError):
                if budget > 0:
                    budget -= 1
                    continue
                self._logger.warning('爬虫可能被反爬,正在加载代理重试')
                self.load_proxy()
                budget = retry

    def load_proxy(self):
        """Fetch a proxy from the pool and use it for subsequent requests."""
        self.proxies_arg = {'http': self._pool.get()}
예제 #7
0
 def __init__(self):
     """Set up request headers, logging and the Redis-backed proxy pool."""
     # No proxy configured yet; requests go out directly until
     # load_proxy() installs one.
     self.proxies_arg = None
     self._headers = HEADERS
     self._pool = RedisOperator()
     self._logger = logging.getLogger('root')
예제 #8
0
 def __init__(self):
     """Prepare request headers, the test endpoint and the proxy pool."""
     self.test_api = TEST_API
     self._headers = HEADERS
     self._pool = RedisOperator()
     self._logger = logging.getLogger('root')
예제 #9
0
 def __init__(self):
     """Wire up the pool, the tester and the upper-size threshold."""
     self._logger = logging.getLogger('root')
     self._threshold = POOL_UPPER_THRESHOLD
     self._pool = RedisOperator()
     self._tester = UsabilityTester()