def run(self):
    """
    Run the tester: check every proxy currently in the pool, in batches.

    Reads the pool size from Redis, then for each batch of up to
    BATCH_TEST_SIZE proxies schedules one async availability test per
    proxy and waits for the whole batch before continuing.

    :return: None
    """
    logger.debug('Tester is running.')
    try:
        count = self.redis.count()
        logger.info(
            'There are {} proxy (proxies) in proxy pool now.'.format(
                count))
        for i in range(0, count, BATCH_TEST_SIZE):
            start = i
            stop = min(i + BATCH_TEST_SIZE, count)
            # Test in batches so memory usage stays bounded.
            logger.debug(
                'Testing proxies with index between {} and {}.'.format(
                    start + 1, stop))
            test_proxies = self.redis.batch(start, stop)
            if not test_proxies:
                # asyncio.wait() raises ValueError on an empty task set;
                # an empty batch can occur if the pool shrank mid-run.
                continue
            # Run the per-proxy checks concurrently to speed testing up.
            loop = asyncio.get_event_loop()
            # Wrap coroutines in tasks explicitly: passing bare
            # coroutines to asyncio.wait() is rejected on Python 3.11+.
            tasks = [
                loop.create_task(self.test_single_proxy(proxy))
                for proxy in test_proxies
            ]
            loop.run_until_complete(asyncio.wait(tasks))
            sys.stdout.flush()
            time.sleep(5)
        logger.info('Testing finished')
    except Exception as e:
        logger.warning('Tester error {}'.format(e.args))
async def test_single_proxy(self, proxy):
    """
    Check whether one proxy is usable.

    Issues a GET request to TEST_URL through the proxy. A response whose
    status is in VALID_STATUS_CODES promotes the proxy to the maximum
    score; any other status or connection failure decreases its score.

    :param proxy: proxy address ``host:port`` (str or bytes)
    :return: None
    """
    connector = aiohttp.TCPConnector(verify_ssl=False)
    async with aiohttp.ClientSession(connector=connector) as http:
        try:
            if isinstance(proxy, bytes):
                proxy = proxy.decode('utf-8')
            # The proxy URL needs an explicit scheme; browsers only
            # default to http:// when none is supplied.
            proxied_url = 'http://' + proxy
            logger.debug('Testing {}'.format(proxy))
            async with http.get(TEST_URL,
                                proxy=proxied_url,
                                timeout=15,
                                allow_redirects=False) as resp:
                if resp.status in VALID_STATUS_CODES:
                    logger.debug('Proxy {} is OK'.format(proxy))
                    self.redis.maximize(proxy)
                else:
                    logger.warning(
                        'Failed to use proxy {} because the response code was {}'
                        .format(proxy, resp.status))
                    self.redis.decrease(proxy)
        except (ClientError,
                aiohttp.client_exceptions.ClientConnectorError,
                asyncio.TimeoutError,
                AttributeError) as err:
            self.redis.decrease(proxy)
            logger.warning('Failed to use proxy {} because of {}'.format(
                proxy, repr(err)))
def maximize(self, proxy):
    """
    Promote *proxy* to the maximum score in the sorted set.

    :param proxy: proxy address string
    :return: result of the Redis ZADD command
    """
    logger.debug('Set proxy {} by maximum score {}'.format(
        proxy, MAX_SCORE))
    return self.db.zadd(REDIS_KEY, {proxy: MAX_SCORE})
def run(self):
    """
    Run the getter: invoke every registered crawler and push the
    collected proxies into Redis.

    Does nothing when the pool already holds at least the configured
    threshold of proxies.
    """
    logger.debug('Getter is running.')
    if self.is_over_threshold():
        return
    for func_index in range(self.crawler.__CrawlFuncCount__):
        crawl_func = self.crawler.__CrawlFunc__[func_index]
        # Run one crawler and collect whatever it yields.
        fetched = self.crawler.get_proxies(crawl_func)
        sys.stdout.flush()
        for proxy in fetched:
            self.redis.add(proxy)
def get_proxies(self, callback):
    """
    Public interface for invoking a crawling method by name.

    :param callback: name of the crawling method to invoke
    :return: a list of collected proxy strings
    """
    proxies = []
    # getattr is the safe, idiomatic way to dispatch on a method name;
    # the original eval("self.{}()".format(callback)) would execute
    # arbitrary code if callback were ever attacker-controlled.
    for proxy in getattr(self, callback)():
        logger.debug('Collected proxy {}'.format(proxy))
        proxies.append(proxy)
    return proxies
def add(self, proxy, score=INITIAL_SCORE):
    """
    Add a proxy with an initial score, unless it already exists.

    :param proxy: proxy address as ``host:port``
    :param score: initial score to assign (default INITIAL_SCORE)
    :return: ZADD result, or None if the proxy is malformed or already
             present in the pool
    """
    # Raw string: non-raw '\d'/'\.' escapes raise warnings (and will be
    # errors) on modern Python versions.
    if not re.match(r'\d+\.\d+\.\d+\.\d+:\d+', proxy):
        logger.debug('Illegal proxy {} is deprecated'.format(proxy))
        return
    # Only insert when the proxy has no score yet, so a re-crawled
    # proxy does not get its current score reset.
    if not self.db.zscore(REDIS_KEY, proxy):
        return self.db.zadd(REDIS_KEY, {proxy: score})
def decrease(self, proxy):
    """
    Decrease the score of *proxy* by 1; remove it once the score is at
    or below MIN_SCORE.

    :param proxy: proxy address string
    :return: ZINCRBY result (new score), ZREM result, or None when the
             proxy is not in the pool
    """
    score = self.db.zscore(REDIS_KEY, proxy)
    if score is None:
        # Proxy is absent (e.g. removed by a concurrent worker); the
        # original code fell into the else branch here and crashed on
        # int(None).
        logger.debug('Proxy {} is not in the pool.'.format(proxy))
        return None
    if score > MIN_SCORE:
        logger.debug('Proxy {} with score {} - 1.'.format(
            proxy, int(score)))
        return self.db.zincrby(REDIS_KEY, -1, proxy)
    logger.debug('Proxy {} with score {} is removed.'.format(
        proxy, int(score)))
    return self.db.zrem(REDIS_KEY, proxy)
def get_page(url, options=None):
    """
    Fetch *url* and return the response body on HTTP 200.

    :param url: URL to crawl
    :param options: optional extra header entries merged over
                    ``base_headers``
    :return: response text when status is 200, otherwise None
    """
    # None sentinel instead of a mutable default argument: a shared
    # default dict would leak state across calls if ever mutated.
    headers = dict(base_headers, **(options or {}))
    logger.debug('Crawling: {}'.format(url))
    try:
        response = requests.get(url, headers=headers)
        logger.info('Finished crawling {}, status_code is {}'.format(
            url, response.status_code))
        if response.status_code == 200:
            return response.text
    except ConnectionError as e:
        logger.warning('Failed to crawl {} because of {}'.format(
            url, repr(e)))
    return None
'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'Cookie': 'JSESSIONID=47AA0C887112A2D83EE040405F837A86', 'Host': 'www.data5u.com', 'Referer': 'http://www.data5u.com/free/index.shtml', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36', } html = get_page(start_url, options=headers) if html: ip_address = re.compile( '<span><li>(\d+\.\d+\.\d+\.\d+)</li>.*?<li class=\"port.*?>(\d+)</li>', re.S) re_ip_address = ip_address.findall(html) for address, port in re_ip_address: result = address + ':' + port yield result.replace(' ', '') if __name__ == '__main__': cr = Crawler() logger.debug(cr.__dict__) logger.debug(Crawler.__dict__)