class Getter():
    """Invoke every registered crawl function and store the resulting
    proxies in the Redis-backed pool, unless the pool is already full."""

    def __init__(self):
        # Storage backend and the crawler that aggregates crawl_* functions.
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Return True when the stored proxy count exceeds the pool limit."""
        # The redundant if/else around the comparison was removed; the
        # boolean expression itself is the result.
        return self.redis.count() > POOL_UPPER_THRESHOLD

    def run(self):
        print('获取器开始')
        if not self.is_over_threshold():
            # __CrawlFunc__ / __CrawlFuncCount__ are populated by the
            # Crawler metaclass/registry (defined elsewhere in the project).
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Fetch the proxies produced by this crawl function.
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)
class Getter(object):
    """
    Built on top of the Crawler class: dynamically calls all crawl_-prefixed
    methods and saves every proxy they yield into the database (Redis).
    """

    def __init__(self):
        """Initialize the Redis client and the crawler."""
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Check whether the proxy pool has overflowed.
        :return: True if the stored count has reached the upper limit.
        """
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def _collect_from(self, funcs):
        """Run each crawl callback in *funcs* and store every proxy it returns.

        Extracted helper: the free and paid branches of run() previously
        duplicated this loop verbatim.
        """
        for callback in funcs:
            proxies = self.crawler.get_proxies(callback)
            sys.stdout.flush()
            for proxy in proxies:
                self.redis.add(proxy)

    def run(self):
        print("获取器开始执行...")
        if not self.is_over_threshold():
            # Exactly one source family is used per run: free crawlers take
            # precedence over paid ones (elif preserved from the original).
            if CRAWLER_FREE_ENABLED:
                self._collect_from(
                    self.crawler.__CrawlFunc__[i]
                    for i in range(self.crawler.__CrawlFuncCount__)
                )
            elif CRAWLER_MONEY_ENABLED:
                self._collect_from(
                    self.crawler.__MoneyFunc__[i]
                    for i in range(self.crawler.__MoneyFuncCount__)
                )
class Tester(object):
    """Validation module: probes stored proxies against TEST_URL and adjusts
    their scores in Redis accordingly."""

    def __init__(self):
        """Initialize the Redis client used to read and score proxies."""
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy.
        :param proxy: the proxy to test, as str or bytes
        :return: None
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode("utf-8")
                real_proxy = "http://" + proxy
                print("正在测试代理", proxy)
                # allow_redirects=False: a redirect from the test URL counts
                # as an invalid response unless its status is whitelisted.
                async with session.get(TEST_URL, proxy=real_proxy, timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        # Working proxy: promote it to the maximum score.
                        self.redis.max(proxy)
                        print("代理可用", proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('请求响应码不合法 ', response.status, 'IP', proxy)
            except (ClientError, asyncio.TimeoutError, AttributeError, ConnectionError):
                # Any connection-level failure demotes the proxy.
                self.redis.decrease(proxy)
                print("代理请求失败", proxy)

    def run(self):
        """
        Main test entry point.
        :return: None
        """
        print("测试器开始启动......")
        try:
            proxies = self.redis.all()
            loop = asyncio.get_event_loop()
            # Test in batches so we never hold coroutines for the whole pool
            # in memory at once.
            for i in range(0, len(proxies), BATCH_TEST_SIZE):
                test_proxies = proxies[i: i + BATCH_TEST_SIZE]
                # Fix: asyncio.wait() no longer accepts bare coroutines
                # (deprecated in 3.8, rejected from 3.11) — wrap each one in
                # a Task on this loop first; wait() semantics are unchanged.
                tasks = [loop.create_task(self.test_single_proxy(proxy))
                         for proxy in test_proxies]
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print("测试器出错了,%s" % e.args)
def __init__(self):
    """Set up the storage backend and the proxy crawler."""
    self.redis = RedisClient()  # proxy-pool storage client
    self.crawler = Crawler()    # aggregates the crawl_* proxy sources
def get_conn():
    """Return the RedisClient cached on the application context.

    Creates the client on first use and stores it on ``g`` so every call
    within the same context shares one connection.
    """
    if not hasattr(g, 'redis'):
        g.redis = RedisClient()
    # Bug fix: return the cached client; the original returned a brand-new
    # RedisClient() every call, which defeated the g-based cache entirely.
    return g.redis
def __init__(self):
    """Create the Redis client this component reads from and writes to."""
    self.redis = RedisClient()