def _check_ip_list(proxy_list: List[str]):
    """Concurrently probe every ``'ip:port'`` string in *proxy_list* and print
    whether it works as an http/https proxy.

    Debug helper: results are only printed; nothing is returned or stored.

    :param proxy_list: proxy address strings, e.g. ``['1.2.3.4:8080', ...]``.
    """
    print(proxy_list)

    def __check_a_ip_str(proxy_str):
        # The same address string is tried for both schemes.
        proxies = {'https': f'https://{proxy_str}', 'http': f'http://{proxy_str}'}
        try:
            # Any response within 10s counts as "valid"; verify=False because
            # many public proxies cannot pass TLS certificate verification.
            requests.get('https://www.baidu.com/content-search.xml',
                         proxies=proxies, timeout=10, verify=False)
            print(f'有效 {proxies}')
        except Exception as e:
            print(f'无效 {proxies} {type(e)}')

    pool = BoundedThreadPoolExecutor(50)
    # Plain loop instead of a side-effect-only list comprehension.
    for pr in proxy_list:
        pool.submit(__check_a_ip_str, pr)
    # Wait for all probes to finish so the printed report is complete when the
    # function returns (the original leaked the executor and returned at once).
    pool.shutdown(wait=True)
import json import random import sys import time import nb_log import requests from threadpool_executor_shrink_able import BoundedThreadPoolExecutor from proxypool_framework.proxy_pool_config import REDIS_CLIENT pool = BoundedThreadPoolExecutor(10) logger = nb_log.LogManager(__name__).get_logger_and_add_handlers( formatter_template=7, log_filename='rate_of_success.log') suceess_count = 0 total_count = 0 total_request_time = 0 for i in range(1, 10000): """ 本项目的public代理 :return: """ pr = json.loads( requests.get('http://127.0.0.1:6795/get_a_proxy/30', auth=('user', 'mtfy123')).text) t_start = time.time() try:
class ProxyCollector:
    """Collects proxy ips from one website (via a strategy function), validates
    them, and stores the valid ones in a redis zset whose score is the time of
    the last successful check; also runs a shared background loop that
    re-checks and prunes proxies already in the database.
    """

    # Thread pools are class attributes, shared by ALL collector instances.
    pool_for_check_new = BoundedThreadPoolExecutor(100)
    pool_for_check_exists = BoundedThreadPoolExecutor(200)
    # redis_key -> bool. Guard so the "re-check existing proxies" loop is
    # started at most once per redis key, even when several collector
    # instances write to the same key (see work()).
    redis_key___has_start_check_exists_proxies_in_database_map = dict()
    logger_for_check_exists = LogManager(
        'ProxyCollector-check_exists').get_logger_and_add_handlers(
            log_filename=f'ProxyCollector-check_exists.log',
            formatter_template=7)

    @staticmethod
    def check_proxy_validity(proxy_dict: dict) -> bool:
        """Return True if a GET through *proxy_dict* succeeds within
        REQUESTS_TIMEOUT seconds, False otherwise.

        The broad ``except Exception`` is deliberate: any failure (timeout,
        connection error, bad proxy scheme, ...) just means "proxy unusable".
        """
        # noinspection PyUnusedLocal
        # noinspection PyBroadException
        try:
            # print(proxy_dict)
            requests.get(CHECK_PROXY_VALIDITY_URL, timeout=REQUESTS_TIMEOUT,
                         proxies=proxy_dict, verify=False)
            return True
        except Exception as e:
            # print(e)
            return False

    def __init__(
            self,
            function_of_get_new_https_proxies_list_from_website,
            func_args=tuple(),
            func_kwargs: dict = None,
            platform_name='xx平台',
            redis_key=PROXY_KEY_IN_REDIS_DEFAULT,
            time_sleep_for_get_new_proxies=60,
    ):
        """
        :param function_of_get_new_https_proxies_list_from_website: function
               that returns a list of proxy ip strings (strategy pattern).
        :param func_args: positional args forwarded to that function.
        :param func_kwargs: keyword args forwarded to that function.
        :param platform_name: human-readable source-site name, used in logs.
        :param redis_key: redis key (a zset) where the proxies are stored.
        :param time_sleep_for_get_new_proxies: seconds between pulls from
               this website.
        """
        self.function_of_get_new_https_proxies_list_from_website = function_of_get_new_https_proxies_list_from_website
        self._func_args = func_args
        # ``or {}`` avoids sharing a mutable default across instances.
        self._func_kwargs = func_kwargs or {}
        self.platform_name = platform_name
        self._redis_key = redis_key
        self._time_sleep_for_get_new_proxies = time_sleep_for_get_new_proxies
        self.logger = LogManager(
            f'ProxyCollector-{platform_name}').get_logger_and_add_handlers(
                log_filename=f'ProxyCollector-{platform_name}.log',
                formatter_template=7)

    def __check_a_new_proxy_and_add_to_database(self, proxy_dict: dict):
        # Validate one freshly pulled proxy; on success store it in the zset
        # with the current timestamp as score (score == last-checked time).
        if self.check_proxy_validity(proxy_dict):
            # print(type(proxy_dict))
            self.logger.info(
                f'新增 {self.platform_name} 代理ip到数据库 {json.dumps(proxy_dict, ensure_ascii=False)}'
            )
            # NOTE(review): zadd(key, member, score) is the redis-py 2.x call
            # signature; redis-py >= 3 expects zadd(key, {member: score}) —
            # confirm the pinned redis client version.
            REDIS_CLIENT.zadd(self._redis_key,
                              json.dumps(proxy_dict, ensure_ascii=False),
                              time.time())
        else:
            self.logger.warning(f'新拉取的 {self.platform_name} 平台 代理无效')

    def _check_all_new_proxies(self, ):
        """Concurrently validate newly pulled proxies and store the valid
        ones, unless the zset already holds MAX_NUM_PROXY_IN_DB or more.

        :return:
        """
        exists_num_in_db = REDIS_CLIENT.zcard(self._redis_key)
        if exists_num_in_db < MAX_NUM_PROXY_IN_DB:
            # Only the 'https' scheme is filled in here; 'platform' records
            # which source site the proxy came from.
            self.pool_for_check_new.map(
                self.__check_a_new_proxy_and_add_to_database, [{
                    'https': f'https://{ip}',
                    'platform': self.platform_name
                } for ip in self.function_of_get_new_https_proxies_list_from_website(
                    *self._func_args, **self._func_kwargs)])
        else:
            self.logger.critical(
                f'{self._redis_key} 键中的代理ip数量为 {exists_num_in_db},超过了制定阈值 {MAX_NUM_PROXY_IN_DB},此次循环暂时不拉取新代理'
            )

    def __check_a_exists_proxy_and_drop_from_database(self, proxy_dict):
        # Re-check a proxy already stored in the database: remove it when it
        # has gone bad, otherwise re-add it so its score (last-checked time)
        # is refreshed.
        if not self.check_proxy_validity(proxy_dict):
            self.logger_for_check_exists.warning(
                f'刪除数据库中失效代理ip {json.dumps(proxy_dict, ensure_ascii=False)}')
            REDIS_CLIENT.zrem(self._redis_key,
                              json.dumps(proxy_dict, ensure_ascii=False))
        else:
            self.logger_for_check_exists.info(
                f'数据库中的代理ip {json.dumps(proxy_dict, ensure_ascii=False)} 没有失效')
            REDIS_CLIENT.zadd(self._redis_key,
                              json.dumps(proxy_dict, ensure_ascii=False),
                              time.time())  # refresh the last-checked time.

    def _check_exists_proxies_in_database(self):
        """Concurrently delete invalid proxies from the database.

        Only proxies whose last check (the zset score) is older than
        MAX_SECONDS_MUST_CHECK_AGAIN seconds are re-checked in this pass.

        :return:
        """
        redis_proxies_list = REDIS_CLIENT.zrangebyscore(
            self._redis_key, 0, time.time() - MAX_SECONDS_MUST_CHECK_AGAIN)
        self.logger_for_check_exists.debug(
            f'需要检测的 {self._redis_key} 键中 {MAX_SECONDS_MUST_CHECK_AGAIN} '
            f'秒内没检查过的 存量代理数量是 {len(redis_proxies_list)},总数量是 {REDIS_CLIENT.zcard(self._redis_key)}'
        )
        self.pool_for_check_exists.map(
            self.__check_a_exists_proxy_and_drop_from_database,
            [json.loads(redis_proxy) for redis_proxy in redis_proxies_list])

    @decorator_libs.synchronized
    def work(self):
        # Entry point. Synchronized so the check-then-set on the shared guard
        # dict below is not raced by other collector threads.
        if not self.__class__.redis_key___has_start_check_exists_proxies_in_database_map.get(
                self._redis_key, False):
            self.__class__.redis_key___has_start_check_exists_proxies_in_database_map[
                self._redis_key] = True
            self.logger.warning(f'启动对数据库中 {self._redis_key} zset键 已有代理的检测')
            # Background loop (block=False): re-check existing proxies every
            # second; started once per redis key across all instances.
            decorator_libs.keep_circulating(1, block=False)(
                self._check_exists_proxies_in_database)()
        # Per-instance background loop: pull and validate new proxies from
        # this platform every _time_sleep_for_get_new_proxies seconds.
        decorator_libs.keep_circulating(self._time_sleep_for_get_new_proxies,
                                        block=False)(
                                            self._check_all_new_proxies)()