class Getter:
    """Proxy getter: runs every registered crawl function and stores the
    proxies it finds in Redis, unless the pool is already full."""

    def __init__(self, proxy_key=setting.REDIS_KEY):
        """
        :param proxy_key: Redis key under which the proxy pool is stored.
        """
        self.redis = RedisClient(proxy_key=proxy_key)
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Return True when the pool already holds at least
        POOL_UPPER_THRESHOLD proxies."""
        # Direct comparison instead of the verbose if/else True/False form.
        return self.redis.get_count() >= setting.POOL_UPPER_THRESHOLD

    def run(self):
        """Run each crawl function once and add every proxy it yields to
        Redis with the initial score. Does nothing when the pool is full."""
        logger.info('代理获取器开始执行')
        if self.is_over_threshold():
            return  # pool is full; skip crawling entirely
        for index in range(self.crawler.__CrawlFuncCount__):
            callback = self.crawler.__CrawlFunc__[index]
            # Fetch the proxies produced by this crawl function.
            proxies = self.crawler.get_proxies(callback)
            sys.stdout.flush()
            for proxy in proxies:
                self.redis.add_proxy(proxy, setting.INITIAL_SCORE)
class Tester(object):
    """Proxy availability tester: requests TEST_URL through each stored proxy
    and raises or lowers its score in Redis accordingly."""

    def __init__(self, proxy_key=setting.REDIS_KEY):
        """
        :param proxy_key: Redis key under which the proxy pool is stored.
        """
        self.redis = RedisClient(proxy_key=proxy_key)

    async def test_single_proxy(self, proxy):
        """
        Test one proxy and update its score.

        :param proxy: proxy address (str or bytes, with or without the
                      'http://' scheme prefix).
        :return: None
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                # Normalize to a 'http://host:port' string.
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                if not proxy.startswith('http://'):
                    proxy = 'http://' + proxy
                async with session.get(setting.TEST_URL,
                                       headers=setting.REQUEST_HEADERS,
                                       proxy=proxy,
                                       timeout=5,
                                       allow_redirects=False) as response:
                    if response.status in setting.VALID_STATUS_CODES:
                        self.redis.set_proxy_max(proxy)
                        logger.info('代理 {} 请求成功,响应码为 {}'.format(
                            proxy, response.status))
                    else:
                        self.redis.decrease_proxy(proxy)
                        logger.info('代理 {} 的响应码不合法,为 {}'.format(
                            proxy, response.status))
            except (aiohttp.ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError,
                    AttributeError) as e:
                # Any request-level failure counts against the proxy.
                self.redis.decrease_proxy(proxy)
                logger.warning('代理 {} 请求出错:{}'.format(proxy, repr(e)))

    def run(self):
        """
        Main test loop: walk the pool in batches of BATCH_TEST_SIZE and test
        each batch concurrently, sleeping 5 s between batches.

        :return: None
        """
        logger.info('测试器开始运行')
        try:
            count = self.redis.get_count()
            logger.info('当前有 {} 个代理'.format(count))
            for start in range(0, count, setting.BATCH_TEST_SIZE):
                stop = min(start + setting.BATCH_TEST_SIZE, count)
                logger.info('正在测试第 {}-{} 个代理'.format(start + 1, stop))
                test_proxies = self.redis.get_batch(start, stop)
                loop = asyncio.get_event_loop()
                tasks = [
                    self.test_single_proxy(proxy) for proxy in test_proxies
                ]
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception:
            # BUG FIX: the original caught `e` but never logged it, discarding
            # all failure detail. logger.exception keeps the same message and
            # additionally records the traceback.
            logger.exception('测试器发生错误')
class TestRedisClient(unittest.TestCase):
    """Unit tests for RedisClient, run against the dedicated TEST_REDIS_KEY
    pool so production data is never touched."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.redis = RedisClient(proxy_key=TEST_REDIS_KEY)

    def setUp(self):
        # Start every test case from an empty pool.
        self.redis.clear()
        print('')

    def tearDown(self):
        # Leave no test data behind.
        self.redis.clear()

    def test_add_proxy(self):
        # Of the two candidates only the well-formed 'host:port' entry
        # should actually be stored.
        for candidate, score in (('0.0.0.0:0', 10), ('0.0.0.0', 10)):
            self.redis.add_proxy(candidate, score)
        self.assertTrue(self.redis.exist_proxy('0.0.0.0:0'))
        self.assertEqual(self.redis.get_count(), 1)

    def test_get_random(self):
        total = 10
        for n in range(total):
            self.redis.add_proxy('0.0.0.0:' + str(n), n)
        self.redis.set_proxy_max('0.0.0.0:0')
        # While a max-score proxy exists it must be the one returned.
        self.assertEqual(self.redis.get_random(), '0.0.0.0:0')
        # Once no max-score proxy exists, a random top-100 proxy is returned.
        times = 5
        self.redis.decrease_proxy('0.0.0.0:0')
        print('不存在分数最大代理时,随机获取', times, '个代理:')
        for _ in range(times):
            print(self.redis.get_random())
class Tester(object):
    """Minimal proxy tester: requests TEST_URL through each stored proxy in
    batches and adjusts its Redis score on success/failure."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """Request TEST_URL through `proxy`; raise its score on an accepted
        status code, lower it otherwise. Request failures are best-effort
        ignored."""
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                real_proxy = 'http://' + proxy
                async with session.get(TEST_URL, proxy=real_proxy,
                                       timeout=15) as res:
                    if res.status in STATUS_CODE:
                        self.redis.set_max_score(proxy)
                    else:
                        self.redis.decrease(proxy)
            except Exception:
                # BUG FIX: the original bare `except:` also swallowed
                # BaseException subclasses such as asyncio.CancelledError,
                # which silently breaks task cancellation. Only ordinary
                # request failures are ignored now (keeping the original
                # best-effort behavior).
                pass

    def run(self):
        """Test all proxies currently in the pool, BATCH_TEST_COUNT at a
        time, sleeping 5 s between batches."""
        try:
            count = self.redis.get_count()
            for start in range(0, count, BATCH_TEST_COUNT):
                stop = min(count, start + BATCH_TEST_COUNT)
                # NOTE(review): this passes an inclusive `stop - 1`, while the
                # other Tester in this project passes `stop` — confirm which
                # convention RedisClient.get_batch expects.
                proxies = self.redis.get_batch(start, stop - 1)
                loop = asyncio.get_event_loop()
                tasks = [self.test_single_proxy(proxy) for proxy in proxies]
                loop.run_until_complete(asyncio.wait(tasks))
                time.sleep(5)
        except Exception as e:
            print('测试器发生错误', e.args)
class Getter(object):
    """Minimal proxy getter: runs every registered crawl function and adds
    the resulting proxies to Redis while the pool is below MAX_THRESHOLD."""

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Return True while the pool holds fewer than MAX_THRESHOLD proxies.

        NOTE(review): the name is inverted — this returns True when the pool
        is *under* the threshold, and run() relies on that reading ("may
        still fetch"). Renaming would break external callers, so the
        original behavior is kept unchanged.
        """
        # Direct comparison instead of the verbose if/else True/False form.
        return self.redis.get_count() < MAX_THRESHOLD

    def run(self):
        """Run each crawl function once and add every proxy it yields.
        Does nothing once the pool has reached MAX_THRESHOLD."""
        if not self.is_over_threshold():
            return  # pool already at capacity
        for index in range(self.crawler.__CrawlCount__):
            proxies = self.crawler.get_proxies(
                self.crawler.__CrawlFunc__[index])
            for proxy in proxies:
                self.redis.add(proxy)