class Getter():
    """Top up the proxy pool by dispatching every registered crawl function."""

    def __init__(self):
        self.local = LocalDict()
        self.crawler = Crawler()
        self.factory = Factory()

    def is_over_threshold(self):
        """Tell whether the pool has reached its size limit.

        :return: True when ``self.local.count()`` is at or above
            ``POOL_UPPER_THRESHOLD``, otherwise False.
        """
        return self.local.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        """Submit one crawl job per registered crawl function and wait for them.

        Does nothing (apart from logging) when the pool is already at its
        upper threshold. Otherwise every entry of
        ``self.crawler.__CrawlFunc__`` is handed to ``self.factory.add``
        together with ``self.crawler.get_proxies``; the values returned by
        ``add`` are collected and passed to ``self.factory.wait``.

        :return: None
        """
        t = set()
        count = self.local.count()
        if self.is_over_threshold():
            Log.info("Getter:此时容量已达上限,不获取ip")
            return
        Log.info(f'Getter:开始执行, 当前容量:{count}')
        for callback_label in range(self.crawler.__CrawlFuncCount__):
            # The try lives inside the loop so that one failing crawl
            # function does not prevent the remaining ones from being queued.
            try:
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Fetch proxies through this crawl function.
                t.add(self.factory.add(self.crawler.get_proxies, callback))
                sys.stdout.flush()
            except Exception:
                # Only ordinary errors are swallowed; KeyboardInterrupt and
                # SystemExit must propagate so the process can be stopped.
                traceback.print_exc()
        self.factory.wait(t)
        Log.info(f'Getter:执行结束, 获取前容量:{count}, 当前:{self.local.count()}')
def get_counts():
    """Report how many proxies the pool currently holds.

    :return: the value of ``LocalDict().count()`` converted to ``str``
    """
    return str(LocalDict().count())
class Tester(object):
    """Probe pooled proxies against ``TEST_URLS`` and penalise the failing ones."""

    def __init__(self):
        self.local = LocalDict()
        self.factory = Factory()
        # Guards _minus_count / _removed, which are updated from the
        # callables submitted to the factory.
        self.mutex = threading.Lock()
        self._minus_count = 0
        # Proxies already counted as removed during the current run.
        self._removed = set()

    def _minus(self, proxy=None):
        """Record that a proxy was removed from the pool.

        :param proxy: the proxy being removed. When given, the proxy is
            counted at most once per run, so a proxy that fails on several
            test URLs does not inflate the counter. When omitted, the counter
            is incremented unconditionally (legacy behaviour).
        """
        with self.mutex:
            if proxy is not None:
                if proxy in self._removed:
                    return
                self._removed.add(proxy)
            self._minus_count += 1

    def test_single_proxy(self, url, proxy):
        """Test a single proxy against a single URL.

        Sends a HEAD request to ``url`` through ``proxy`` and updates the
        proxy's standing in ``self.local`` according to the outcome:

        * status in ``VALID_STATUS_CODES``: nothing is changed;
        * status in ``FORBIDEN_STATUS_CODES`` or a timeout / HTTP / proxy /
          connection error: ``decrease(proxy, -MAX_SCORE)`` is called and the
          proxy is counted as removed;
        * any other status, or a ``TypeError`` / ``AttributeError``:
          ``decrease(proxy)`` is called.

        :param url: URL to request
        :param proxy: proxy address without scheme; ``http://`` is prepended
        :return: None
        """
        proxies = {
            "http": "http://" + proxy,
        }
        try:
            response = requests.head(url, headers=base_headers, proxies=proxies,
                                     timeout=15, allow_redirects=False,
                                     verify=False)
            status_code = response.status_code
            if status_code in VALID_STATUS_CODES:
                Log.debug(f'Tester:代理可用 {proxy}')
                return
            if status_code in FORBIDEN_STATUS_CODES:
                self._minus(proxy)
                self.local.decrease(proxy, -MAX_SCORE)
            else:
                self.local.decrease(proxy)
            # Logged for every status that is not in VALID_STATUS_CODES.
            Log.error(
                f'Tester:请求响应码不合法 {status_code} ,IP {proxy}, URL: {url}')
        except (ReadTimeout, HTTPError, ProxyError, ConnectionError):
            # The log message below describes this path as deleting the
            # proxy outright.
            self._minus(proxy)
            self.local.decrease(proxy, -MAX_SCORE)
            Log.warning(f'Tester:无用ip,直接删掉, ip: {proxy}')
        except (TypeError, AttributeError) as e:
            self.local.decrease(proxy)
            Log.error(f'Tester:代理请求失败 {proxy} ERROR: {e}')

    def run(self):
        """Run one full test pass over the pool.

        Every proxy returned by ``self.local.batch(0, count)`` is paired with
        every URL in ``TEST_URLS`` and submitted to ``self.factory``; the
        method then waits for all submitted jobs and logs how many proxies
        remain (initial count minus the number of distinct proxies removed).

        :return: None
        """
        t = set()
        # Start each pass with fresh removal bookkeeping.
        self._minus_count = 0
        self._removed = set()
        count = self.local.count()
        if count == 0:
            Log.info("Tester:无代理")
            return
        Log.info(f'Tester:开始运行, 当前容量:{count}')
        try:
            stop = max(0, count)
            test_proxies = self.local.batch(0, stop)
            for proxy in test_proxies:
                for url in TEST_URLS:
                    t.add(self.factory.add(self.test_single_proxy, url, proxy))
            # NOTE(review): clear() semantics are defined by LocalDict and
            # are not visible here — confirm it is safe while jobs are queued.
            self.local.clear()
        except Exception as e:
            Log.error(f'Tester:发生错误 {e.args}')
        # Jobs queued before a failure are still awaited.
        self.factory.wait(t)
        Log.info(f'Tester:执行结束, 测试前容量:{count}, 剩余:{count-self._minus_count}')