class UsabilityTester(object): """代理测试器,负责检验给定代理的可用性""" def __init__(self): self._headers = HEADERS self._logger = logging.getLogger('root') self.test_api = TEST_API self._pool = RedisOperator() async def test_signal_proxy(self, proxy): """异步测试单个代理""" async with aiohttp.ClientSession() as sess: real_proxy = 'http://' + proxy try: async with sess.get(self.test_api, headers=self._headers, proxy=real_proxy, timeout=10, allow_redirects=False): self._pool.increase(proxy) except (asyncio.TimeoutError, Exception): self._pool.decrease(proxy) def test(self, proxies): """测试传入的代理列表, 将在定时测试周期和每次爬虫工作后被调用 :param proxies: 代理列表 :return: None """ self._logger.info('测试器开始工作,本次测试 %s 个代理' % len(proxies)) loop = asyncio.get_event_loop() for batch in [proxies[i:i + 200] for i in range(0, len(proxies), 200)]: tasks = [self.test_signal_proxy(proxy) for proxy in batch] loop.run_until_complete(asyncio.wait(tasks, loop=loop))
def get_conn():
    """Return the Redis connection cached on ``g``, creating it on first use.

    :return: RedisOperator
    """
    try:
        return g.redis_connect
    except AttributeError:
        # Nothing cached on ``g`` yet: create the operator and remember it.
        g.redis_connect = RedisOperator()
        return g.redis_connect
def run(self):
    """Monitor the number of usable proxies and refill the pool when low.

    Applies the logging configuration in ``self._logging_conf`` and then
    loops forever: whenever the pool's ``usable_size`` is below
    ``self._lower_threshold`` the adder is asked to refill the pool; each
    iteration ends with a sleep of ``self._cycle`` seconds.

    :return: never returns
    """
    logging.config.dictConfig(self._logging_conf)
    log = logging.getLogger('root')
    log.info('进程1 - 代理数量监控启动,每%s秒检查一次' % self._cycle)
    replenisher = PoolAdder()
    redis_pool = RedisOperator()
    while True:
        running_low = redis_pool.usable_size < self._lower_threshold
        if running_low:
            replenisher.add_to_pool()
        time.sleep(self._cycle)
def run(self):
    """Periodically re-test every proxy stored in the pool.

    Applies the logging configuration in ``self._logging_conf`` and then
    loops forever. Each round fetches all proxies from the pool; if there
    are fewer than ``self._lower_threshold`` the round is skipped,
    otherwise all of them are tested and the result is logged. Every round
    ends with a sleep of ``self._cycle`` seconds.

    :return: never returns
    """
    logging.config.dictConfig(self._logging_conf)
    log = logging.getLogger('root')
    log.info('进程2 - 代理定时测试启动,每%s秒测试一次' % self._cycle)
    checker = UsabilityTester()
    redis_pool = RedisOperator()
    while True:
        log.debug('周期性测试开始,将对所有代理进行测试')
        candidates = redis_pool.get_all()
        count_before = len(candidates)
        if count_before < self._lower_threshold:
            # Too few proxies to be worth testing in this round.
            log.debug('池中代理数量低于阈值,本次不进行测试')
        else:
            checker.test(candidates)
            count_after = redis_pool.usable_size
            log.info('淘汰了 %s 个代理,现可用 %s 个代理'
                     % (count_before - count_after, count_after))
            log.info('本次测试结束,%s秒后再次测试' % self._cycle)
        # Both branches wait one full cycle before the next round.
        time.sleep(self._cycle)
class PoolAdder(object):
    """Adder responsible for starting the crawlers to replenish proxies."""

    def __init__(self):
        self._threshold = POOL_UPPER_THRESHOLD
        self._pool = RedisOperator()
        self._tester = UsabilityTester()
        self._logger = logging.getLogger('root')

    def is_over(self):
        """Tell whether the number of usable proxies reached the threshold.

        :return: True when the threshold is reached, otherwise False.
        """
        return self._pool.usable_size >= self._threshold

    def add_to_pool(self):
        """Replenish the pool by running all registered spiders.

        Repeats crawl rounds until ``is_over()`` is true. In each round every
        spider's ``get`` runs in its own worker thread, the crawled proxies
        are added to the pool, and the ones the pool accepted are handed to
        the tester. Between rounds every spider is flushed.

        :return: None
        """
        self._logger.info('代理数量过低,启动爬虫补充代理')
        spiders = [cls() for cls in SpiderMeta.spiders]
        if not spiders:
            # ThreadPoolExecutor rejects max_workers=0 with ValueError, and
            # without spiders the pool could never be refilled anyway.
            self._logger.warning('没有可用的爬虫,无法补充代理')
            return
        attempt = 1
        while not self.is_over():
            new_proxies = []
            with futures.ThreadPoolExecutor(
                    max_workers=len(spiders)) as executor:
                pending = [executor.submit(spider.get) for spider in spiders]
                for future in futures.as_completed(pending):
                    new_proxies.extend(future.result())
            # Only proxies the pool reports as added are tested.
            added_proxies = []
            for proxy in new_proxies:
                if self._pool.add(proxy):
                    added_proxies.append(proxy)
            self._logger.info('爬取增加了 %s 个代理,开始测试', len(added_proxies))
            self._tester.test(added_proxies)
            if self.is_over():
                self._logger.debug('代理已经充足,结束补充')
                break
            if attempt % 5 == 0:
                self._logger.warning('已连续补充%s次代理仍未满足,请检查爬虫和配置', attempt)
            attempt += 1
            self._logger.debug('代理仍然不足,进行第%s次补充', attempt)
            for spider in spiders:
                spider.flush()
class PageRequest(object):
    """Page fetcher that retries failed requests and falls back to proxies."""

    # Retries granted to every freshly loaded proxy; equals the default of
    # the ``retry`` parameter of ``get_resp``.
    _DEFAULT_RETRY = 2

    def __init__(self):
        self.proxies_arg = None
        self._logger = logging.getLogger('root')
        self._headers = HEADERS
        self._pool = RedisOperator()

    def get_resp(self, url, retry=2):
        """Fetch ``url``, retrying on network errors and switching proxies.

        A request failing with one of the handled exceptions is retried up
        to ``retry`` times. Once the retries are used up, a proxy is loaded
        from the pool and the request is attempted again with a fresh budget
        of two retries; this repeats until a request succeeds.

        The retry logic is iterative: a long run of failing proxies must not
        grow the call stack until ``RecursionError`` is raised.

        :param url: address of the page to request
        :param retry: retries before a proxy is loaded
        :return: the response returned by ``requests.get``
        """
        remaining = retry
        while True:
            try:
                return requests.get(url, headers=self._headers, timeout=20,
                                    proxies=self.proxies_arg)
            except (ProxyError, ConnectionError, Timeout,
                    ChunkedEncodingError):
                if remaining > 0:
                    remaining -= 1
                    continue
                self._logger.warning('爬虫可能被反爬,正在加载代理重试')
                self.load_proxy()
                remaining = self._DEFAULT_RETRY

    def load_proxy(self):
        """Take a proxy from the pool and use it for following HTTP requests.

        :return: None
        """
        self.proxies_arg = {'http': self._pool.get()}
def __init__(self):
    """Prepare headers, logger and pool handle; start without a proxy."""
    self._headers = HEADERS
    self._logger = logging.getLogger('root')
    self._pool = RedisOperator()
    # No proxy mapping is configured until one is assigned later.
    self.proxies_arg = None
def __init__(self):
    """Store the test endpoint and headers, and set up logger and pool."""
    self.test_api = TEST_API
    self._headers = HEADERS
    self._logger = logging.getLogger('root')
    self._pool = RedisOperator()
def __init__(self):
    """Set up the logger, the upper threshold, the pool and the tester."""
    self._logger = logging.getLogger('root')
    self._threshold = POOL_UPPER_THRESHOLD
    # The pool handle is created before the tester, as in the original order.
    self._pool = RedisOperator()
    self._tester = UsabilityTester()