class ProxyRefresh:
    """Re-validate candidate proxies and keep only working ones in Redis.

    The candidate pool is the union of freshly fuzzed proxies and the
    proxies already stored in Redis for the chosen scheme.
    """

    def __init__(self, proxy_type='https'):
        """Build the candidate pool for `proxy_type` ('https' or 'http').

        Raises:
            ValueError: if `proxy_type` is neither 'https' nor 'http'.
        """
        if proxy_type == 'https':
            self.redis_handler = RedisClient('https_proxy')
        elif proxy_type == 'http':
            self.redis_handler = RedisClient('http_proxy')
        else:
            # ValueError is the idiomatic error for a bad argument value;
            # it subclasses Exception, so existing `except Exception`
            # handlers still catch it.
            raise ValueError('type must be https or http')
        self.proxy_type = proxy_type
        # Union of fuzzed candidates and everything already stored.
        self.proxy_pool = set(fuzz_all()) | set(self.redis_handler.get_all())

    def refresh(self, pool_num=10):
        """Validate every candidate concurrently using `pool_num` threads."""
        pool = ThreadPool(pool_num)
        pool.map(self.valid_ip, self.proxy_pool)
        pool.close()
        pool.join()

    def refresh_in_async(self):
        """Validate every candidate via the AsyncTask scheduler."""
        asynctask = AsyncTask()
        for ip in self.proxy_pool:
            asynctask.add_task(self.valid_ip, ip)
        asynctask.run()

    def valid_ip(self, ip):
        """Store `ip` in Redis if it works for self.proxy_type, else remove it."""
        if proxy_is_useful(ip, self.proxy_type):
            self.redis_handler.add(ip)
            print('ok', ip)
        else:
            self.redis_handler.delete(ip)
class SaveIp:
    """Fetch new proxies with Crawler and store them in Redis.

    Skips fetching entirely once the pool has reached
    POOL_UPPER_THRESHOLD entries.
    """

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Return True when the proxy pool has reached its size limit."""
        # Return the comparison directly instead of an if/else that
        # returns True/False.
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        """Crawl for proxies and persist each one, unless the pool is full."""
        print(' 获取器开始执行 ')
        if not self.is_over_threshold():
            proxies = self.crawler.run()
            for proxy in proxies:
                print(proxy, '存入')
                self.redis.add(proxy)
class Getter:
    """Run every registered crawl function and persist the proxies found."""

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    # NOTE(review): method name is misspelled ("theshold") but is kept
    # unchanged for backward compatibility with existing callers.
    def is_over_theshold(self):
        """Return True when the proxy pool has reached its size limit."""
        # Return the comparison directly instead of an if/else that
        # returns True/False.
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        """Invoke each crawl function and store every proxy it yields."""
        print("获取器开始执行")
        if not self.is_over_theshold():
            # __CrawlFunc__ / __CrawlFuncCount__ look like a registry of
            # crawl functions built on the Crawler class (presumably by a
            # metaclass) — TODO confirm against Crawler's definition.
            for index in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[index]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)