class Getter(object):
    """Crawler runner that tops up the proxy pool."""

    def __init__(self):
        """Set up the Redis client and instantiate every registered crawler class."""
        self.redis = RedisClient()
        self.crawlers_cls = crawlers_cls
        self.crawlers = [cls() for cls in self.crawlers_cls]

    def is_full(self):
        """Return True when the pool already holds at least PROXY_NUMBER_MAX proxies.

        return: bool
        """
        return self.redis.count() >= PROXY_NUMBER_MAX

    @logger.catch
    def run(self):
        """Run every crawler and push each proxy it yields into Redis.

        Skipped entirely when the pool is already full.

        :return:
        """
        if self.is_full():
            return
        for crawler in self.crawlers:
            for proxy in crawler.crawl():
                self.redis.add(proxy)
class Getter(object):
    """Fetches proxies via the configured crawlers and stores them in the pool."""

    def __init__(self):
        """Initialize the Redis client and build one instance per crawler class."""
        self.redis = RedisClient()
        self.crawlers_cls = crawlers_cls
        self.crawlers = [crawler() for crawler in self.crawlers_cls]

    def is_full(self):
        """Check whether the proxy pool has reached its capacity.

        return: bool
        """
        return self.redis.count() >= PROXY_NUMBER_MAX

    @logger.catch
    def run(self):
        """Run each crawler in turn, adding every crawled proxy to Redis.

        Does nothing when the pool is already full.

        :return:
        """
        if self.is_full():
            return
        for crawler in self.crawlers:
            logger.info(f'爬取 {crawler} to get proxy')
            for proxy in crawler.crawl():
                self.redis.add(proxy)
class Tester(object):
    """Validates the proxies currently queued in the pool."""

    def __init__(self):
        """Create the Redis client and grab an asyncio event loop."""
        self.redis = RedisClient()
        self.loop = asyncio.get_event_loop()

    async def test(self, proxy: Proxy):
        """Probe TEST_URL through one proxy and adjust its score.

        A response whose status is in TEST_VALID_STATUS promotes the proxy to
        the maximum score; anything else (including a raised EXCEPTIONS member)
        decreases it.

        :param proxy: Proxy object
        :return:
        """
        connector = aiohttp.TCPConnector(ssl=False)
        async with aiohttp.ClientSession(connector=connector) as session:
            try:
                logger.debug(f'testing {proxy.string()}')
                async with session.get(TEST_URL,
                                       proxy=f'http://{proxy.string()}',
                                       timeout=TEST_TIMEOUT,
                                       allow_redirects=False) as response:
                    if response.status in TEST_VALID_STATUS:
                        self.redis.max(proxy)
                        logger.debug(
                            f'proxy {proxy.string()} is valid, set max score')
                    else:
                        self.redis.decrease(proxy)
                        logger.debug(
                            f'proxy {proxy.string()} is invalid, decrease score'
                        )
            except EXCEPTIONS:
                self.redis.decrease(proxy)
                logger.debug(
                    f'proxy {proxy.string()} is invalid, decrease score')

    @logger.catch
    def run(self):
        """Test the whole pool in TEST_BATCH-sized slices.

        :return:
        """
        # drive the aiohttp coroutines with the loop captured in __init__
        logger.info('stating tester...')
        count = self.redis.count()
        logger.debug(f'{count} proxies to test')
        for start in range(0, count, TEST_BATCH):
            end = min(start + TEST_BATCH, count)
            logger.debug(f'testing proxies from {start} to {end} indices')
            batch = self.redis.batch(start, end)
            tasks = [self.test(item) for item in batch]
            # run the whole slice concurrently before fetching the next one
            self.loop.run_until_complete(asyncio.wait(tasks))
class Getter(object):
    """Fills the proxy pool from a static file and from the registered crawlers."""

    # Matches an optional "user:pass@" credential prefix followed by "ip:port".
    # Compiled once at class-creation time instead of once per input line.
    PROXY_PATTERN = re.compile(
        r'((?P<username>\S*?)\:(?P<password>\S*?)@)?(?P<ip>[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3})\:(?P<port>\d*)'
    )

    def __init__(self):
        """Init the Redis client and instantiate every crawler class."""
        self.redis = RedisClient()
        self.crawlers_cls = crawlers_cls
        self.crawlers = [crawler_cls() for crawler_cls in self.crawlers_cls]

    def is_full(self):
        """Return True if the pool already holds at least PROXY_NUMBER_MAX proxies.

        return: bool
        """
        return self.redis.count() >= PROXY_NUMBER_MAX

    def _load_static_proxies(self, proxyfile="staticproxy.txt"):
        """Read proxies from *proxyfile* and add them to the pool.

        One proxy per line, either "ip:port" or "user:pass@ip:port"; blank
        lines and lines starting with '#' are skipped. A missing file is
        logged and ignored so the crawlers still get a chance to run.

        :param proxyfile: path of the static proxy list
        :return:
        """
        try:
            with open(proxyfile, 'r') as fh:
                proxylines = fh.readlines()
        except FileNotFoundError:
            # without this guard a missing file aborted run() before any crawler ran
            logger.warning(f'{proxyfile} not found, skip static proxies')
            return
        logger.info(f'read {proxyfile}')
        for line in proxylines:
            if line.strip() == "" or line.startswith("#"):
                continue
            line = line.replace("\r\n", "").replace("\n", "")
            match = self.PROXY_PATTERN.search(line)
            if not match:
                continue
            proxy = Proxy(host=match.group('ip'),
                          port=match.group('port'),
                          username=match.group('username'),
                          password=match.group('password'))
            logger.info("getproxy " + proxy.string())
            self.redis.add(proxy)

    @logger.catch
    def run(self):
        """Fill the pool: first from the static file, then from every crawler.

        Skipped entirely when the pool is already full.

        :return:
        """
        if self.is_full():
            return
        self._load_static_proxies()
        for crawler in self.crawlers:
            logger.info(f'crawler {crawler} to get proxy')
            for proxy in crawler.crawl():
                print(proxy.string())
                self.redis.add(proxy)
class Tester(object):
    """Validates the proxies queued in the pool, optionally requiring anonymity."""

    def __init__(self):
        """Create the Redis client and an asyncio event loop."""
        self.redis = RedisClient()
        self.loop = asyncio.get_event_loop()

    async def _check_anonymous(self, session, proxy):
        """Require that *proxy* hides the real IP.

        Fetches our origin IP directly, then through the proxy; raises
        AssertionError unless the two differ and the proxied IP equals the
        proxy host.
        """
        url = 'https://httpbin.org/ip'
        async with session.get(url, timeout=TEST_TIMEOUT) as response:
            resp_json = await response.json()
            origin_ip = resp_json['origin']
        async with session.get(url,
                               proxy=f'http://{proxy.string()}',
                               timeout=TEST_TIMEOUT) as response:
            resp_json = await response.json()
            anonymous_ip = resp_json['origin']
        assert origin_ip != anonymous_ip
        assert proxy.host == anonymous_ip

    async def test(self, proxy: Proxy):
        """Probe TEST_URL through one proxy and adjust its score.

        :param proxy: Proxy object
        :return:
        """
        connector = aiohttp.TCPConnector(ssl=False)
        async with aiohttp.ClientSession(connector=connector) as session:
            try:
                logger.debug(f'testing {proxy.string()}')
                # when TEST_ANONYMOUS is on, a transparent proxy fails here
                # and falls through to the EXCEPTIONS handler below
                if TEST_ANONYMOUS:
                    await self._check_anonymous(session, proxy)
                async with session.get(TEST_URL,
                                       proxy=f'http://{proxy.string()}',
                                       timeout=TEST_TIMEOUT,
                                       allow_redirects=False) as response:
                    if response.status in TEST_VALID_STATUS:
                        self.redis.max(proxy)
                        logger.debug(
                            f'proxy {proxy.string()} is valid, set max score')
                    else:
                        self.redis.decrease(proxy)
                        logger.debug(
                            f'proxy {proxy.string()} is invalid, decrease score'
                        )
            except EXCEPTIONS:
                self.redis.decrease(proxy)
                logger.debug(
                    f'proxy {proxy.string()} is invalid, decrease score')

    @logger.catch
    def run(self):
        """Iterate the whole pool with a cursor and test each batch.

        :return:
        """
        # event loop drives the aiohttp coroutines batch by batch
        logger.info('stating tester...')
        count = self.redis.count()
        logger.debug(f'{count} proxies to test')
        cursor = 0
        while True:
            logger.debug(
                f'testing proxies use cursor {cursor}, count {TEST_BATCH}')
            cursor, proxies = self.redis.batch(cursor, count=TEST_BATCH)
            if proxies:
                coros = [self.test(item) for item in proxies]
                self.loop.run_until_complete(asyncio.wait(coros))
            # SCAN-style iteration: a zero cursor means the scan wrapped around
            if not cursor:
                break
class Tester(object):
    """Tester that validates the proxies queued in the pool."""

    def __init__(self):
        """Init the Redis client and an asyncio event loop."""
        self.redis = RedisClient()
        self.loop = asyncio.get_event_loop()

    async def test_anonymous1(self, proxy: Proxy, session):
        """Anonymity check #1 (httpbin).

        Fetches our origin IP directly, then through the proxy; raises
        AssertionError unless the two differ and the proxied IP equals the
        proxy host.
        """
        url = 'https://httpbin.org/ip'
        async with session.get(url, timeout=TEST_TIMEOUT) as response:
            resp_json = await response.json()
            origin_ip = resp_json['origin']
        async with session.get(url,
                               proxy=f'http://{proxy.string()}',
                               timeout=TEST_TIMEOUT) as response:
            resp_json = await response.json()
            anonymous_ip = resp_json['origin']
        logger.debug(
            f'只测试匿名代理: {origin_ip != anonymous_ip} -- 结果匿名ip:{anonymous_ip},代理ip:{proxy.string()}'
        )
        assert origin_ip != anonymous_ip
        assert proxy.host == anonymous_ip

    async def test_anonymous2(self, proxy: Proxy, session):
        """Anonymity check #2 (self-hosted endpoint), same contract as #1."""
        url = 'http://km.chik.cn/ip'
        async with session.get(url, timeout=TEST_TIMEOUT) as response:
            resp_json = await response.json()
            origin_ip = resp_json['origin']
        async with session.get(url,
                               proxy=f'http://{proxy.string()}',
                               timeout=TEST_TIMEOUT) as response:
            resp_json = await response.json()
            anonymous_ip = resp_json['origin']
        logger.debug(
            f'只测试匿名代理2: {origin_ip != anonymous_ip} -- 结果匿名ip:{anonymous_ip},代理ip:{proxy.string()}'
        )
        assert origin_ip != anonymous_ip
        assert proxy.host == anonymous_ip

    async def test(self, proxy: Proxy):
        """Test a single proxy and raise/lower its score accordingly.

        :param proxy: Proxy object
        :return:
        """
        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(
                ssl=False)) as session:
            try:
                # when enabled, the proxy must hide the real IP before the
                # plain reachability check; a failed assert lands in EXCEPTIONS
                if TEST_ANONYMOUS:
                    await self.test_anonymous1(proxy, session)
                if TEST_ANONYMOUS_MYSELF:
                    await self.test_anonymous2(proxy, session)
                async with session.get(TEST_URL,
                                       proxy=f'http://{proxy.string()}',
                                       timeout=TEST_TIMEOUT,
                                       allow_redirects=False) as response:
                    if response.status in TEST_VALID_STATUS:
                        self.redis.max(proxy)
                        logger.debug(f'代理 {proxy.string()} 是有效的, 设置最大分数')
                    else:
                        self.redis.decrease(proxy)
                        # literal was broken across a line break in the
                        # original; reconstructed as a single message
                        logger.debug(f'代理 {proxy.string()} 不可使用, 降低分数')
            except EXCEPTIONS:
                # failing proxies get a heavier penalty than the default step
                self.redis.decrease(proxy, score=-10)

    @logger.catch
    def run(self):
        """Scan the whole pool cursor-by-cursor and test each batch.

        :return:
        """
        # event loop drives the aiohttp coroutines batch by batch
        logger.info('开始测试...')
        count = self.redis.count()
        logger.debug(f'共 {count} 个代理等待测试')
        cursor = 0
        while True:
            logger.debug(f'测试代理游标 {cursor}, count {TEST_BATCH}')
            cursor, proxies = self.redis.batch(cursor, count=TEST_BATCH)
            if proxies:
                tasks = [self.test(proxy) for proxy in proxies]
                self.loop.run_until_complete(asyncio.wait(tasks))
            # SCAN-style iteration ends when the cursor wraps back to 0
            if not cursor:
                break