class ValidityTester(object):
    test_api = TEST_API

    def __init__(self):
        self._raw_proxies = None
        self._usable_proxies = []

    def set_raw_proxies(self, proxies):
        self._raw_proxies = proxies
        self._conn = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test one proxy; if valid, put it into usable_proxies.
        """
        try:
            async with aiohttp.ClientSession() as session:
                try:
                    if isinstance(proxy, bytes):
                        proxy = proxy.decode('utf-8')
                    real_proxy = 'http://' + proxy
                    print('Testing', proxy)
                    async with session.get(self.test_api, proxy=real_proxy,
                                           timeout=get_proxy_timeout) as response:
                        if response.status == 200:
                            self._conn.put(proxy)
                            print('Valid proxy', proxy)
                except (ProxyConnectionError, TimeoutError, ValueError):
                    print('Invalid proxy', proxy)
        except (ServerDisconnectedError, ClientResponseError, ClientConnectorError) as s:
            print(s)

    def test(self):
        """
        Asynchronously test all proxies.
        """
        print('ValidityTester is working')
        try:
            loop = asyncio.get_event_loop()
            tasks = [self.test_single_proxy(proxy) for proxy in self._raw_proxies]
            loop.run_until_complete(asyncio.wait(tasks))
        except ValueError:
            print('Async Error')
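# A minimal usage sketch for the class above (an assumption, not part of the
# original project code): feed ValidityTester a few raw "host:port" strings
# and run one test pass. RedisClient, TEST_API and get_proxy_timeout are
# assumed to come from the project's own db/setting modules; the proxy
# addresses below are hypothetical.
if __name__ == '__main__':
    tester = ValidityTester()
    tester.set_raw_proxies(['127.0.0.1:8888', '10.0.0.2:3128'])
    tester.test()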
class Tester(object): def __init__(self): self.redis = RedisClient() async def test_single_proxy(self, proxy): """ 测试单个代理 :param proxy: :return: """ conn = aiohttp.TCPConnector(verify_ssl=False) async with aiohttp.ClientSession(connector=conn) as session: try: if isinstance(proxy, bytes): proxy = proxy.decode('utf-8') real_proxy = 'http://' + proxy print('正在测试', proxy) async with session.get(TEST_URL, proxy=real_proxy, timeout=15, allow_redirects=False) as response: if response.status in VALID_STATUS_CODES: print('代理可用', proxy) elif response.elapsed.total_seconds() > 0.5: self.redis.zrem(proxy) print('响应时间过长', response.elapsed.total_seconds(), 'IP', proxy) else: self.redis.zrem(proxy) print('请求响应码不合法 ', response.status, 'IP', proxy) except (ClientError, aiohttp.client_exceptions.ClientConnectorError, asyncio.TimeoutError, AttributeError): self.redis.zrem(proxy) print('代理请求失败', proxy) def run(self): """ 测试主函数 :return: """ print('测试器开始运行') try: count = self.redis.count() print('当前剩余', count, '个代理') for i in range(0, count, BATCH_TEST_SIZE): start = i stop = min(i + BATCH_TEST_SIZE, count) print('正在测试第', start + 1, '-', stop, '个代理') test_proxies = self.redis.batch(start, stop) loop = asyncio.get_event_loop() tasks = [ self.test_single_proxy(proxy) for proxy in test_proxies ] loop.run_until_complete(asyncio.wait(tasks)) sys.stdout.flush() time.sleep(5) except Exception as e: print('测试器发生错误', e.args)
def valid_proxy(cycle=VALID_CHECK_CYCLE):
    """Get half of the proxies in redis and re-test them."""
    conn = RedisClient()
    tester = ValidityTester()
    while True:
        print('Refreshing ip')
        count = int(0.5 * conn.queue_len)
        if count == 0:
            print('Waiting for adding')
            time.sleep(cycle)
            continue
        raw_proxies = conn.get(count)
        tester.set_raw_proxies(raw_proxies)
        tester.test()
        time.sleep(cycle)
class Tester(object):
    def __init__(self, redis_key):
        self.redis = RedisClient(redis_key)

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy.
        :param proxy:
        :return:
        """
        conn = aiohttp.TCPConnector(ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('Testing', proxy)
                headers = {
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                    'Accept-Encoding': 'gzip, deflate',
                    'Accept-Language': 'en;q=0.9,ja;q=0.8,fr;q=0.7',
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0',
                    # 'Upgrade-Insecure-Requests': 1,
                    'Connection': 'close',
                }
                async with session.get(TEST_URL, headers=headers, proxy=real_proxy,
                                       timeout=TIMEOUT, allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print('Proxy is valid', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('Invalid response status', response.status, 'IP', proxy)
            except (ClientError, ClientConnectorError, asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('Proxy request failed', proxy)

    def run(self):
        """
        Main test loop.
        :return:
        """
        print('Tester is running')
        try:
            count = self.redis.count()
            print('Proxies left:', count)
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('Testing proxies', start + 1, '-', stop)
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                tasks = [self.test_single_proxy(proxy) for proxy in test_proxies]
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('Tester error', e.args)
def __init__(self, threshold):
    # threshold: critical size for the pool
    self._threshold = threshold
    self._conn = RedisClient()
    self._tester = ValidityTester()
    # crawler for free-proxy sites
    self._crawler = FreeProxyGetter()
def get_conn(): """ 获取 Redis 对象 """ if not hasattr(g, 'redis'): g.redis = RedisClient() return g.redis
def valid_proxy(cycle=VALID_CHECK_CYCLE): """从redis里面获取一半的代理 """ conn = RedisClient() tester = VaildityTester() while True: Logger.log_high('Refreshing ip') count = int(0.5 * conn.queue_len) if count == 0: Logger.log_normal('Waiting for adding') time.sleep(cycle) continue raw_proxies = conn.get(count) tester.set_raw_proxies(raw_proxies) tester.test() time.sleep(cycle)
class PoolAdder(object):
    def __init__(self, threshold):
        self._threshold = threshold
        self._conn = RedisClient()
        self._tester = ValidityTester()
        self._crawler = FreeProxyGetter()

    def is_over_threshold(self):
        return self._conn.queue_len() >= self._threshold

    def add_to_queue(self):
        print('PoolAdder is working')
        proxy_count = 0
        while not self.is_over_threshold():
            for callback_label in range(self._crawler.__CrawlFuncCount__):
                callback = self._crawler.__CrawlFunc__[callback_label]
                raw_proxies = self._crawler.get_raw_proxies(callback)
                # test the crawled proxies before they enter the pool
                self._tester.set_raw_proxies(raw_proxies)
                self._tester.test()
                proxy_count += len(raw_proxies)
                if self.is_over_threshold():
                    print('IP is enough, waiting to be used')
                    break
            if proxy_count == 0:
                raise ResourceDepletionError
class Tester:
    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy.
        :param proxy:
        :return:
        """
        conn = aiohttp.TCPConnector(ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):  # decode bytes values to str
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('Testing')
                async with session.get(TEST_URL, proxy=real_proxy, timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:  # e.g. 200 or 302
                        self.redis.max(proxy)  # usable proxy: set its score to 100
                        print('Proxy is valid', proxy)
                    else:
                        self.redis.decrease(proxy)  # penalize the proxy's score
                        print('Invalid response status', response.status, 'IP', proxy)
            except (ClientError, aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('Proxy request failed', proxy)

    def run(self):
        """
        Main check loop.
        :return:
        """
        print('Checker is running')
        try:
            count = self.redis.count()  # number of stored proxies
            print('Proxies left:', count)
            for i in range(0, count, BATCH_TEST_SIZE):  # batch size, e.g. BATCH_TEST_SIZE = 10
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('Testing proxies', start + 1, '-', stop)
                test_proxies = self.redis.batch(start, stop)  # fetch one batch
                loop = asyncio.get_event_loop()
                tasks = [self.test_single_proxy(proxy) for proxy in test_proxies]
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('Tester error', e.args)
class Tester(object): def __init__(self): self.redis = RedisClient() async def test_single_proxy(self, proxy): """ Test single proxy :param proxy: Single proxy :return: None """ conn = aiohttp.TCPConnector(verify_ssl=False) async with aiohttp.ClientSession(connector=conn) as session: try: if isinstance(proxy, bytes): proxy = proxy.decode('utf-8') real_proxy = 'http://' + proxy print('Testing', proxy) async with session.get(TEST_URL, proxy=real_proxy, timeout=15, allow_redirects=False) as response: if response.status in VALID_STATUS_CODES: self.redis.max(proxy) print("Proxy is OK", proxy) else: self.redis.decrease(proxy) print("Response code is wrong", response.status, 'IP', proxy) except (ClientError, aiohttp.client_exceptions.ClientConnectorError, asyncio.TimeoutError, AttributeError): self.redis.decrease(proxy) print("Fail to get proxy", proxy) def run(self): """ Main function :return: None """ print("Tester starts running") try: count = self.redis.count() print("Current surplus", count, "proxies") for i in range(0, count, BATCH_TEST_SIZE): start = i stop = min(i + BATCH_TEST_SIZE, count) print("Current testing the", start + 1, '-', stop, 'th proxy') test_proxies = self.redis.batch(start, stop) loop = asyncio.get_event_loop() tasks = [ self.test_single_proxy(proxy) for proxy in test_proxies ] loop.run_until_complete(asyncio.wait(tasks)) sys.stdout.flush() time.sleep(5) except Exception as e: print("Error!", e.args)
class Scheduler(object):
    def schedule_tester(self, cycle=TESTER_CYCLE):
        """
        Test proxies on a schedule.
        """
        tester = Tester()
        while True:
            print('Tester is running')
            tester.run()
            time.sleep(cycle)

    def schedule_getter(self, cycle=GETTER_CYCLE):
        """
        Fetch proxies on a schedule.
        """
        getter = Getter()
        while True:
            print('Starting to crawl proxies')
            getter.run()
            time.sleep(cycle)

    def schedule_api(self):
        """
        Start the API.
        """
        app.run(API_HOST, API_PORT)  # port 5555 caused a 400 error here

    def schedule_redis(self):
        """
        Start Redis.
        """
        os.system("redis-server")

    def run(self):
        print('Proxy pool is starting')
        # start a redis-server process if none is reachable yet
        while True:
            self.redis = RedisClient()
            print('Checking redis')
            if self.redis.check():
                break
            else:
                redis_process = Process(target=self.schedule_redis)
                redis_process.start()
                time.sleep(0.5)
        if TESTER_ENABLED:
            tester_process = Process(target=self.schedule_tester)
            tester_process.start()
        if GETTER_ENABLED:
            getter_process = Process(target=self.schedule_getter)
            getter_process.start()
        if API_ENABLED:
            api_process = Process(target=self.schedule_api)
            api_process.start()
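# Entry-point sketch (an assumption: projects of this shape usually start the
# whole pool from a small run script like this):
if __name__ == '__main__':
    Scheduler().run()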
class Getter(object):
    def __init__(self):
        self.crawler = Crawler()
        self.redis = RedisClient()

    def run(self):
        if self.redis.count() < POOL_UPPER_THRESHOLD:
            for crawl_func in self.crawler.__CrawlFunc__:
                proxies = self.crawler.start_crawl_func(crawl_func)
                print(crawl_func, 'is crawling proxies')
                for proxy in proxies:
                    print(proxy)
                    self.redis.add(proxy)
            proxy_sum = self.redis.count()
            print('Current proxy count:', proxy_sum)
def valid_proxy(cycle=VALID_CHECK_CYCLE): """ 获取队列中一半的代理进行可用性测试! """ conn = RedisClient() tester = ValidityTester() while True: print('测试redis队列代理可用性:') count = int(0.5 * conn.queue_len) if count == 0: print('代理池为空或者全部测试完毕!') time.sleep(cycle) continue raw_proxies = conn.get(count) tester.set_raw_proxies(raw_proxies) tester.test() time.sleep(cycle)
def timingCheck(cycle=TIMING_CHECK):
    conn = RedisClient()
    valiClass = ValidityTester()
    while True:
        if conn.queue_len > 0:
            valiClass.set_timing_params()
            valiClass.TimingCheck()
        time.sleep(cycle)
class Tester(object): def __init__(self): self.redis = RedisClient() async def test_single_proxy(self, proxy): """ 测试单个代理 :param proxy: :return: """ headers = { "Connection": "keep-alive", "Host": "www.sogou.com", "Pragma": "no-cache", "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36', } conn = aiohttp.TCPConnector(verify_ssl=False) async with aiohttp.ClientSession(connector=conn) as session: try: if isinstance(proxy, bytes): proxy = proxy.decode('utf-8') real_proxy = 'http://' + proxy print('正在测试' + proxy) async with session.get(TEST_URL, proxy=real_proxy, timeout=15, allow_redirects=False, headers=headers) as response: if response.status in VALID_STATUS_CODES: self.redis.max(proxy) print('代理可用' + proxy) else: self.redis.decrease(proxy) print('请求响应码不合法 ' + str(response.status) + 'IP' + proxy) except (ClientError, aiohttp.client_exceptions.ClientConnectorError, asyncio.TimeoutError, AttributeError): self.redis.decrease(proxy) print('代理请求失败{}'.format(proxy)) def run(self): """ 测试主函数 :return: """ print('测试器开始运行') try: count = self.redis.count() print('当前剩余{}个代理'.format(count)) for i in range(0, count, BATCH_TEST_SIZE): start = i stop = min(i + BATCH_TEST_SIZE, count) print('正在测试第{}-{}个代理'.format(start + 1, stop)) test_proxies = self.redis.batch(start, stop) loop = asyncio.get_event_loop() tasks = [self.test_single_proxy(proxy) for proxy in test_proxies] loop.run_until_complete(asyncio.wait(tasks)) sys.stdout.flush() time.sleep(5) except Exception as e: print('测试器发生错误{}'.format(e.args))
def valid_proxy(cycle=VALID_CHECK_CYCLE): """ Get half of proxies which in redis """ conn = RedisClient() tester = ValidityTester() while True: print('Refreshing ip') count = int(0.5 * conn.queue_len) if count == 0: print('Waiting for adding') time.sleep(cycle) continue raw_proxies = conn.get(count) tester.set_raw_proxies(raw_proxies) tester.test() time.sleep(cycle)
def valid_proxy(cycle=VALID_CHECK_CYCLE): """ Get half of proxies which in redis """ conn = RedisClient() #Redis连接对象 tester = ValidityTester() while True: print('刷新代理池中...') count = int(0.3 * conn.queue_len) #从左侧拿出一半的代理,只剩一个时,看做0个 if count == 0: #如果队列长度不够了 print('等待添加代理中...') time.sleep(cycle) #设置暂时睡眠,等待添加 continue raw_proxies = conn.get(count) tester.set_raw_proxies(raw_proxies) #调用函数添加,raw_proxies设置为类变量 tester.test() #检测代理是否可用 time.sleep(cycle)
def get_conn(): """ Opens a new redis connection if there is none yet for the current application context. """ if not hasattr(g, 'redis_client'): g.redis_client = RedisClient() return g.redis_client
def get_conn(): """ 建立Redis连接;若已连接则直接返回 :return: 返回一个Redis连接类的全局对象 """ if not hasattr(g, 'redis_client'): g.redis_client = RedisClient() return g.redis_client
class Tester(object): def __init__(self): self.redis = RedisClient() async def test_single_proxy(self, proxy): """ 测试单个代理 :param proxy: :return: """ conn = aiohttp.TCPConnector(verify_ssl=False) async with aiohttp.ClientSession(connector=conn) as session: try: if isinstance(proxy, bytes): proxy = proxy.decode('utf-8') real_proxy = 'http://' + proxy logger.debug(f'正在测试 {proxy}') async with session.get(TEST_URL, proxy=real_proxy, timeout=15, allow_redirects=False) as response: if response.status in VALID_STATUS_CODES: self.redis.max(proxy) logger.debug(f'代理可用 {proxy}') else: self.redis.decrease(proxy) logger.debug(f'请求响应码不合法 {response.status}, IP {proxy}') except (ClientError, aiohttp.client_exceptions.ClientConnectorError, asyncio.TimeoutError, AttributeError): self.redis.decrease(proxy) logger.debug(f'代理请求失败 {proxy}') def run(self): """ 测试主函数 :return: """ logger.debug('测试器开始运行') try: count = self.redis.count() logger.debug(f'当前剩余 {count} 个代理') for i in range(0, count, BATCH_TEST_SIZE): start = i stop = min(i + BATCH_TEST_SIZE, count) logger.debug(f'正在测试第 {start + 1 - stop} 个代理') test_proxies = self.redis.batch(start, stop) loop = asyncio.get_event_loop() tasks = [ self.test_single_proxy(proxy) for proxy in test_proxies ] loop.run_until_complete(asyncio.wait(tasks)) sys.stdout.flush() time.sleep(5) except Exception as e: logger.debug(f'测试器发生错误 {e.args}')
def valid_proxy(cycle=VALID_CHECK_CYCLE):  # scheduled checker
    """
    Get half of the proxies in redis and re-test them.
    """
    conn = RedisClient()  # redis connection object
    tester = ValidityTester()  # checks whether proxies are usable
    while True:
        print('Refreshing ip')
        count = int(0.5 * conn.queue_len)  # take the front half of the queue
        if count == 0:
            print('Waiting for adding')
            time.sleep(cycle)
            continue
        raw_proxies = conn.get(count)
        tester.set_raw_proxies(raw_proxies)
        tester.test()
        time.sleep(cycle)
class ProxyMiddleware(object):
    def __init__(self):
        # connect to redis and fetch one random proxy via random()
        self.db = RedisClient()
        self.proxy = self.db.random()

    def process_request(self, request, spider):
        request.meta["proxy"] = self.proxy
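# ProxyMiddleware is a Scrapy downloader middleware, so it is wired up through
# the project's settings.py. DOWNLOADER_MIDDLEWARES is standard Scrapy; the
# module path below is hypothetical and depends on where the class lives:
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.ProxyMiddleware': 543,
}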
def valid_proxy(cycle=VALID_CHECK_CYCLE): """ Get half of proxies which in redis """ conn = RedisClient() # redis连接对象 tester = ValidityTester() # 代理检测对象 while True: print('Refreshing ip') count = int(0.5 * conn.queue_len) # 需要从redis中取出一半的代理地址 if count == 0: print('Waiting for adding') time.sleep(cycle) continue raw_proxies = conn.get(count) # 从redis中取出一半的代理地址,返回列表 tester.set_raw_proxies(raw_proxies) tester.test() time.sleep(cycle)
class Getter(object):
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def log(self):
        if not os.path.exists('log2'):
            os.mkdir('log2')
        log_file_name = 'log2/' + LOG_PATH
        log_file_1 = logging.FileHandler(log_file_name, 'a', encoding='utf-8')
        fmt = logging.Formatter(
            fmt="%(asctime)s - %(name)s - %(levelname)s - %(module)s: %(message)s")
        log_file_1.setFormatter(fmt)
        logger1 = logging.Logger('run_log', level=logging.DEBUG)
        logger1.addHandler(log_file_1)
        return logger1

    def is_over_threshold(self):
        """
        Check whether the pool has reached its size limit.
        """
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        """Store crawled proxies directly in redis with an initial score."""
        print('Getter is running')
        if not self.is_over_threshold():
            try:
                for callback_label in range(self.crawler.__CrawlFuncCount__):
                    callback = self.crawler.__CrawlFunc__[callback_label]
                    # fetch proxies from one crawl function
                    proxies = self.crawler.get_proxies(callback)
                    sys.stdout.flush()
                    if not proxies:
                        self.log().error('Proxy crawl failed, crawl function: %s' % callback)
                        continue
                    for proxy in proxies:
                        self.redis.add(proxy)
            except Exception as e:
                self.log().exception(e)
def test_proxies(cycle=VALID_CHECK_CYCLE):
    """
    Test the proxies in the left (older) half of the queue: drop the invalid
    ones and push the valid ones back onto the right side of the queue.
    :param cycle: test interval
    """
    conn = RedisClient()
    tester = ValidityTester()
    while True:
        print('testing & refreshing ips...')
        count = int(0.5 * conn.list_len)
        if count == 0:
            print('0 ip, waiting for adding...')
            time.sleep(cycle)
            continue
        raw_proxies = conn.get_for_test(count)  # fetch proxies from the database for testing
        tester.set_raw_proxies(raw_proxies)
        tester.test()
        time.sleep(cycle)
def valid_proxy(cycle=VALID_CHECK_CYCLE): """ Get half of proxies which in redis 从redis取出代理进行异步检测,将可用代理重新放回redis数据库列表中的右侧,保证代理的定时更新 """ conn = RedisClient() tester = ValidityTester() while True: print('Refreshing ip') count = int(0.5 * conn.queue_len) if count == 0: print('Waiting for adding') time.sleep(cycle) continue raw_proxies = conn.get(count) tester.set_raw_proxies(raw_proxies) tester.test() time.sleep(cycle)
def check_pool(lower_threshold=POOL_LOWER_THRESHOLD,
               upper_threshold=POOL_UPPER_THRESHOLD,
               cycle=POOL_LEN_CHECK_CYCLE):
    conn = RedisClient()
    adder = PoolAdder(upper_threshold)
    while True:
        if conn.queue_len < lower_threshold:
            adder.add_to_queue()
        time.sleep(cycle)
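# valid_proxy and check_pool are both blocking loops, so they are typically
# run in separate processes. A minimal sketch under that assumption, using
# the function names from the snippets above:
from multiprocessing import Process

if __name__ == '__main__':
    Process(target=valid_proxy).start()
    Process(target=check_pool).start()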
class Getter(object):
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_under_threshold(self):
        # True while the pool still has room to grow
        return self.redis.get_count() < MAX_THRESHOLD

    def run(self):
        if self.is_under_threshold():
            for i in range(self.crawler.__CrawlCount__):
                proxies = self.crawler.get_proxies(self.crawler.__CrawlFunc__[i])
                for proxy in proxies:
                    self.redis.add(proxy)
class Tester(object): def __init__(self): self.redis = RedisClient() async def test_single_proxy(self, proxy): """ 异步测试单个代理 :param proxy: :return: """ conn = aiohttp.TCPConnector(verify_ssl=False) async with aiohttp.ClientSession(connector=conn) as session: try: """ 在Python3以后,字符串和bytes类型彻底分开了。字符串是以字符为单位进行处理的,bytes类型是以字节为单位处理的。 直接以默认的utf-8编码解码bytes成string """ if isinstance(proxy, bytes): proxy = proxy.decode('utf-8') real_proxy = 'http://' + proxy print("正在测试", proxy) async with session.get(TEST_URL, allow_redirects=False, proxy=real_proxy, timeout=15) as response: if response.status in VALID_STATUS_CODES: # 将代理设置为分数最大 self.redis.max(proxy) print("代理", proxy, '可用, 设置为100') else: self.redis.decrease(proxy) print('请求响应码不合法', response.status, 'IP', proxy) except (ClientError, aiohttp.client_exceptions.ClientConnectorError, asyncio.TimeoutError, AttributeError): print("代理验证失败", proxy) self.redis.decrease(proxy) def run(self): """ 测试函数 :return: """ print('测试器开始运行') try: count = self.redis.count() for i in range(0, count, BATCH_TEST_SIZE): start = i stop = min(i + BATCH_TEST_SIZE, count) print('正在测试第', start + 1, '-', stop, '个代理') """获取测试代理""" test_proxies = self.redis.batch(start, stop) loop = asyncio.get_event_loop() tasks = [self.test_single_proxy(proxy) for proxy in test_proxies] loop.run_until_complete(asyncio.wait(tasks)) time.sleep(5) except Exception as e: print("测试器发生错误", e.args)
def valid_proxy(cycle=VALID_CHECK_CYCLE):  # VALID_CHECK_CYCLE is configured in setting
    """
    Get half of the proxies in redis.
    Scheduled checker: verifies that the IPs in the redis database are still valid.
    """
    conn = RedisClient()  # connect to the redis database
    tester = ValidityTester()  # class that checks whether proxies are usable
    while True:
        print('Refreshing ip')
        count = int(0.5 * conn.queue_len)  # take half of the queue
        if count == 0:
            print('Waiting for adding')
            time.sleep(cycle)
            continue
        raw_proxies = conn.get(count)
        tester.set_raw_proxies(raw_proxies)
        tester.test()
        time.sleep(cycle)
class ValidityTester(object):
    # Check whether proxies are usable and store the good ones.
    def __init__(self):
        self._raw_proxies = None

    def set_raw_proxies(self, proxies):
        self._raw_proxies = proxies
        self._redis = RedisClient()

    async def check_single_proxy(self, proxy):
        # check a single proxy; the async-with blocks close the session
        if isinstance(proxy, bytes):
            proxy = proxy.decode('utf-8')
        real_proxy = 'http://' + proxy
        try:
            async with aiohttp.ClientSession() as session:
                try:
                    async with session.get(TEST_API, proxy=real_proxy,
                                           timeout=PROXY_TIMEOUT) as response:
                        print('Check proxy', proxy)
                        if response.status == 200:
                            self._redis.add(proxy)
                            print('Add to redis', proxy)
                except (ProxyConnectionError, TimeoutError):
                    print('Dont add to proxy', proxy)
        except (ServerDisconnectedError, ClientResponseError,
                ClientConnectorError, Exception) as s:
            print(s)

    def check_some_proxies(self):
        """
        Run the event loop to check every proxy in _raw_proxies;
        return immediately if _raw_proxies is empty or None.
        """
        if not self._raw_proxies:
            return
        try:
            print('check_some_proxies is running')
            loop = asyncio.get_event_loop()
            tasks = [self.check_single_proxy(task) for task in self._raw_proxies]
            loop.run_until_complete(asyncio.wait(tasks))
        except Exception:
            print('check_some_proxies error')
class Tester(object): def __init__(self): self.redis = RedisClient() async def test_single_proxy(self, proxy): """ 测试单个代理 :param proxy: :return: """ conn = aiohttp.TCPConnector(verify_ssl=False) async with aiohttp.ClientSession(connector=conn) as session: try: if isinstance(proxy, bytes): proxy = proxy.decode('utf-8') real_proxy = 'http://' + proxy print('正在测试', proxy) async with session.get(TEST_URL, proxy=real_proxy, timeout=15, allow_redirects=False) as response: if response.status in VALID_STATUS_CODES: self.redis.max(proxy) print('代理可用', proxy) else: self.redis.decrease(proxy) print('请求响应码不合法 ', response.status, 'IP', proxy) except (ClientError, aiohttp.client_exceptions.ClientConnectorError, asyncio.TimeoutError, AttributeError): self.redis.decrease(proxy) print('代理请求失败', proxy) def run(self): """ 测试主函数 :return: """ print('测试器开始运行') try: count = self.redis.count() print('当前剩余', count, '个代理') for i in range(0, count, BATCH_TEST_SIZE): start = i stop = min(i + BATCH_TEST_SIZE, count) print('正在测试第', start + 1, '-', stop, '个代理') test_proxies = self.redis.batch(start, stop) loop = asyncio.get_event_loop() tasks = [self.test_single_proxy(proxy) for proxy in test_proxies] loop.run_until_complete(asyncio.wait(tasks)) sys.stdout.flush() time.sleep(5) except Exception as e: print('测试器发生错误', e.args)
class Getter(): def __init__(self): self.redis = RedisClient() self.crawler = Crawler() def is_over_threshold(self): """ 判断是否达到了代理池限制 """ if self.redis.count() >= POOL_UPPER_THRESHOLD: return True else: return False def run(self): print('获取器开始执行') if not self.is_over_threshold(): for callback_label in range(self.crawler.__CrawlFuncCount__): callback = self.crawler.__CrawlFunc__[callback_label] # 获取代理 proxies = self.crawler.get_proxies(callback) sys.stdout.flush() for proxy in proxies: self.redis.add(proxy)
class Tester(object):
    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, session, proxy):
        """Test a single proxy."""
        try:
            real_proxy = eval(proxy)['https']  # proxies are stored as dict literals
            print('Testing', proxy)
            async with session.get(TEST_URL, proxy=real_proxy, timeout=20,
                                   allow_redirects=False) as response:
                if response.status in VALID_STATUS_CODES:
                    rst = await response.text()
                    if rst:
                        # verify that the echo service actually saw the proxy's IP
                        resp_ip = '//' + eval(rst).get('headers').get('X-Forwarded-For')
                        proxy_ip = real_proxy.split(':')
                        if resp_ip == proxy_ip[1]:
                            self.redis.max(proxy)
                            print('Proxy is valid', proxy)
                else:
                    self.redis.decrease(proxy)
                    print('Invalid response status', response.status, 'IP', proxy)
        except (ClientError, aiohttp.client_exceptions.ClientConnectorError,
                asyncio.TimeoutError, AttributeError):
            self.redis.decrease(proxy)
            print('Proxy request failed', proxy)

    async def set_test_tasks(self, loop):
        """Build and run the test tasks."""
        count = self.redis.count
        print('Proxies left:', count)
        for start in range(0, count, BATCH_TEST_SIZE):
            # create tasks one batch at a time, with one session per batch
            # to keep memory usage down
            stop = min(start + BATCH_TEST_SIZE, count)
            print('Testing proxies', start + 1, '-', stop)
            test_proxies = self.redis.batch(start, stop)
            # conn = aiohttp.TCPConnector(verify_ssl=False)
            conn = aiohttp.TCPConnector()
            async with aiohttp.ClientSession(connector=conn, loop=loop) as session:
                tasks = [self.test_single_proxy(session, proxy) for proxy in test_proxies]
                await asyncio.wait(tasks)

    def run(self):
        """Main test function."""
        print('Tester is running')
        try:
            loop = asyncio.get_event_loop()
            loop.run_until_complete(self.set_test_tasks(loop))
            sys.stdout.flush()  # print immediately instead of at loop exit
            time.sleep(5)
        except Exception as e:
            print('Tester error', e.args)
class Crawler(object, metaclass=ProxyMetaclass):
    # ProxyMetaclass provides the __CrawlFunc__ and __CrawlFuncCount__ attributes.
    def __init__(self):
        self.redis = RedisClient()

    def save_to_db(self, proxies):
        for proxy in proxies:
            sys.stdout.flush()
            print('Got proxy', proxy)
            self.redis.add(proxy)

    async def crawl_daili66(self):
        """daili66; many foreign IPs"""
        urls = ['http://www.66ip.cn/{}.html'.format(page) for page in range(1, 5)]
        print('Crawling')
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                doc = pq(page)
                trs = doc('.containerbox table tr:gt(0)').items()
                for tr in trs:
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    ip_port = ':'.join([ip, port])
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_ip3366(self):
        """ip3366 index pages"""
        urls = ['http://www.ip3366.net/?stype=1&page={}'.format(page) for page in range(1, 5)]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                find_tr = re.compile(r'<tr>(.*?)</tr>', re.S)
                trs = find_tr.findall(page)
                for s in range(1, len(trs)):
                    find_ip = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')
                    re_ip_address = find_ip.findall(trs[s])
                    find_port = re.compile(r'<td>(\d+)</td>')
                    re_port = find_port.findall(trs[s])
                    for address, port in zip(re_ip_address, re_port):
                        address_port = address + ':' + port
                        ip_port = address_port.replace(' ', '')
                        proxy = {'https': 'http://' + ip_port}
                        proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_ip3366_(self):
        """ip3366 free pages"""
        urls = ['http://www.ip3366.net/free/?stype=1&page={}'.format(page) for page in range(1, 5)]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                ip_address = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
                re_ip_address = ip_address.findall(page)
                for address, port in re_ip_address:
                    result = address + ':' + port
                    ip_port = result.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_kuaidaili(self):
        """kuaidaili (http proxies only)"""
        urls = ['http://www.kuaidaili.com/free/inha/{}/'.format(page) for page in range(1, 5)]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                ip_address = re.compile(r'<td data-title="IP">(.*?)</td>')
                re_ip_address = ip_address.findall(page)
                find_port = re.compile(r'<td data-title="PORT">(.*?)</td>')
                re_port = find_port.findall(page)
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    ip_port = address_port.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_xicidaili(self):
        """xicidaili"""
        urls = ['http://www.xicidaili.com/nn/{}'.format(page) for page in range(1, 4)]
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWRjYzc5MmM1MTBiMDMzYTUzNTZjNzA4NjBhNWRjZjliBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUp6S2tXT3g5a0FCT01ndzlmWWZqRVJNek1WanRuUDBCbTJUN21GMTBKd3M9BjsARg%3D%3D--2a69429cb2115c6a0cc9a86e0ebe2800c0d471b3',
            'Host': 'www.xicidaili.com',
            'Referer': 'http://www.xicidaili.com/nn/3',
            'Upgrade-Insecure-Requests': '1',
        }
        html = await get_page(urls, options=headers)
        if html:
            proxies = []
            for page in html:
                find_trs = re.compile(r'<tr class.*?>(.*?)</tr>', re.S)
                trs = find_trs.findall(page)
                for tr in trs:
                    find_ip = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')
                    re_ip_address = find_ip.findall(tr)
                    find_port = re.compile(r'<td>(\d+)</td>')
                    re_port = find_port.findall(tr)
                    for address, port in zip(re_ip_address, re_port):
                        address_port = address + ':' + port
                        ip_port = address_port.replace(' ', '')
                        proxy = {'https': 'http://' + ip_port}
                        proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_iphai(self):
        """iphai"""
        urls = ['http://www.iphai.com/']
        html = await get_page(urls)
        if html:
            proxies = []
            find_tr = re.compile(r'<tr>(.*?)</tr>', re.S)
            trs = find_tr.findall(html[0])
            for s in range(1, len(trs)):
                find_ip = re.compile(r'<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
                re_ip_address = find_ip.findall(trs[s])
                find_port = re.compile(r'<td>\s+(\d+)\s+</td>', re.S)
                re_port = find_port.findall(trs[s])
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    ip_port = address_port.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_data5u(self):
        """data5u"""
        urls = ['http://www.data5u.com/free/gngn/index.shtml']
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': 'JSESSIONID=47AA0C887112A2D83EE040405F837A86',
            'Host': 'www.data5u.com',
            'Referer': 'http://www.data5u.com/free/index.shtml',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
        }
        html = await get_page(urls, options=headers)
        if html:
            proxies = []
            ip_address = re.compile(r'<span><li>(\d+\.\d+\.\d+\.\d+)</li>.*?<li class="port.*?>(\d+)</li>', re.S)
            re_ip_address = ip_address.findall(html[0])
            for address, port in re_ip_address:
                result = address + ':' + port
                ip_port = result.replace(' ', '')
                proxy = {'https': 'http://' + ip_port}
                proxies.append(proxy)
            self.save_to_db(proxies)

    # recently revised
    async def crawl_goubanjia(self):
        """goubanjia (site-wide IPs)"""
        urls = ['http://www.goubanjia.com']
        html = await get_page(urls)
        if html:
            proxies = []
            doc = pq(html[0])
            tds = doc('td.ip').items()
            for td in tds:
                td.find('p').remove()
                ip_port = td.text().replace('\n', '')
                proxy = {'https': 'http://' + ip_port}
                proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_89ip(self):
        """89ip"""
        urls = ['http://www.89ip.cn/index_{}.html'.format(page) for page in range(1, 4)]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                doc = pq(page)
                ips = doc('tr td:nth-child(1)').items()
                ports = doc('tr td:nth-child(2)').items()
                for ip, port in zip(ips, ports):
                    result = ip.text() + ':' + port.text()
                    ip_port = result.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_ip181(self):
        """ip181 (xdaili API endpoint)"""
        urls = ['http://www.ip181.com/']
        html = await get_page(urls)
        if html:
            proxies = []
            json_ = eval(html[0])  # the endpoint returns a JSON-like body; json.loads would be safer
            RESULT = json_.get('RESULT')
            for i in RESULT:
                ip = i.get('ip')
                port = i.get('port')
                result = ip + ':' + port
                ip_port = result.replace(' ', '')
                proxy = {'https': 'http://' + ip_port}
                proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_premproxy(self):
        """premproxy"""
        urls = ['https://premproxy.com/proxy-by-country/{}.htm'.format(country)
                for country in ('China-01', 'China-02', 'China-03', 'China-04', 'Taiwan-01')]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                ip_address = re.compile(r'<td data-label="IP:port ">(.*?)</td>')
                re_ip_address = ip_address.findall(page)
                for address_port in re_ip_address:
                    ip_port = address_port.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_xroxy(self):
        """xroxy; the site moved to new URLs and is very slow without a proxy"""
        urls = ['https://www.xroxy.com/proxy-country-{}'.format(country) for country in ('cn', 'tw')]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                ip_address1 = re.compile(r'<td class="sorting_1">(\d+\.\d+\.\d+\.\d+)</td>')
                re_ip_address1 = ip_address1.findall(page)
                print(re_ip_address1)
                # capture just the port digits so findall returns ports,
                # not whole <td> tags
                ip_address2 = re.compile(r'<td>(\d+)</td>')
                re_ip_address2 = ip_address2.findall(page)
                print(re_ip_address2)
                for address, port in zip(re_ip_address1, re_ip_address2):
                    address_port = address + ':' + port
                    ip_port = address_port.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)
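# Crawler assumes a ProxyMetaclass that collects every crawl_* coroutine name
# into __CrawlFunc__ and counts them in __CrawlFuncCount__. The metaclass is
# not shown in these snippets; a minimal sketch of what it likely looks like:
class ProxyMetaclass(type):
    def __new__(mcs, name, bases, attrs):
        count = 0
        attrs['__CrawlFunc__'] = []
        for k in list(attrs):
            if k.startswith('crawl_'):
                # store the method name; callers fetch it later via
                # __CrawlFunc__[i] and invoke it through get_proxies()
                attrs['__CrawlFunc__'].append(k)
                count += 1
        attrs['__CrawlFuncCount__'] = count
        return type.__new__(mcs, name, bases, attrs)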
def set_raw_proxies(self, proxies):
    self._raw_proxies = proxies
    self._conn = RedisClient()
def __init__(self):
    self.redis = RedisClient()
    self.crawler = Crawler()
def __init__(self):
    self.redis = RedisClient()