class ValidTester(object):
    """Asynchronously checks each raw proxy against a test URL.

    Proxies that answer HTTP 200 are collected in an internal list
    (exposed via the ``usable_proxies`` property); any proxy that fails
    or errors is deleted from the database.
    """

    test_url = TEST_URL  # URL probed through each candidate proxy

    def __init__(self):
        self.__raw_proxies = None     # batch of proxies awaiting validation
        self.__usable_proxies = None  # proxies that passed the check
        self.db = MongoOperator()

    def set_raw_proxies(self, raw_proxies):
        """Load a fresh batch of proxies and reset the usable list."""
        self.__raw_proxies = raw_proxies
        self.__usable_proxies = []

    async def test_one_proxy(self, proxy):
        """Probe one proxy; keep it on HTTP 200, otherwise remove it from the DB.

        ``proxy`` is expected to be a mapping with ``'ip'`` and ``'port'``
        keys — TODO confirm against the DB schema.
        """
        async with aiohttp.ClientSession() as session:
            try:
                if isinstance(proxy, bytes):
                    # NOTE(review): decoding yields a str, which the dict-style
                    # access below cannot index — such entries fall through to
                    # the except branch and get deleted. Presumably leftover
                    # from a redis-backed store; verify whether bytes can occur.
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + ("%s:%s" % (proxy['ip'], proxy['port']))
                print('Testing ', real_proxy)
                async with session.get(self.test_url, proxy=real_proxy,
                                       timeout=15) as response:
                    if response.status == 200:
                        # proxy reached the test URL successfully
                        print('Valid proxy', proxy)
                        self.__usable_proxies.append(proxy)
                    else:
                        print('Invalid proxy', proxy)
                        self.db.delete(proxy, filterone=True)
            except Exception:
                # Any timeout / network / parsing failure means the proxy is
                # unusable: drop it. (The original tuple
                # (TimeoutError, ValueError, Exception) was redundant —
                # Exception already covers the others.)
                print('Exception ,Invalid proxy %s' % (proxy))
                self.db.delete(proxy, filterone=True)

    def test(self):
        """Validate the whole raw-proxy batch on the event loop."""
        print('ValidTester is working')
        if not self.__raw_proxies:
            # asyncio.wait() raises ValueError on an empty task set — the old
            # code silently swallowed that; skip explicitly instead.
            return
        try:
            loop = asyncio.get_event_loop()
            tasks = [
                self.test_one_proxy(proxy)
                for proxy in self.__raw_proxies
            ]
            # Block until every probe task has finished.
            loop.run_until_complete(asyncio.wait(tasks))
        except ValueError:
            print('Async Error')

    @property
    def usable_proxies(self):
        """List of proxies that passed the most recent validation run."""
        return self.__usable_proxies
def valid_proxy(cycle=VALID_CHECK_CYCLE):
    """Forever loop: every *cycle* seconds, re-validate half of the stored proxies."""
    store = MongoOperator()
    checker = ValidTester()
    while True:
        print('正在校验代理IP')
        batch_size = int(0.5 * store.count)
        if batch_size:
            candidates = store.select(batch_size)  # fetch raw proxy records from the DB
            checker.set_raw_proxies(candidates)
            checker.test()  # run the validation pass
        else:
            print('当前可用代理IP为空, 等待添加补充')
        time.sleep(cycle)
def check_pool(lower_threshold=POOL_LOWER_THRESHOLD,
               upper_threshold=POOL_UPPER_THRESHOLD,
               cycle=POOL_LEN_CHECK_CYCLE):
    """Top up the proxy pool whenever its size drops below *lower_threshold*."""
    pool = MongoOperator()
    adder = PoolAdder(upper_threshold)
    while True:
        if pool.count < lower_threshold:
            adder.add_to_queue()
        time.sleep(cycle)
def gets(self, max_page):
    """Crawl *max_page* consecutive listing pages and store every parsed proxy."""
    self.start_url = XICINN_URL  # high-anonymity proxy listing by default
    first = self._counter
    page_urls = [self.start_url.format(page)
                 for page in range(first, first + max_page)]
    self._increment(max_page)
    db = MongoOperator()
    for page_url in page_urls:
        html = self.get_one_page(page_url)
        for item in self.parse_one_page(html):
            print(item)  # echo each parsed record
            db.insert(item, MONGO_TABLE)  # persist to the database
    db.close()
def __init__(self):
    """Start with no proxy batches loaded and an open database handle."""
    self.db = MongoOperator()
    self.__raw_proxies = None
    self.__usable_proxies = None
def get_count():
    """Return the number of stored proxies, formatted as a string."""
    store = MongoOperator()
    try:
        return str(store.count)
    finally:
        store.close()
def get_proxy():
    """Return a single stored proxy serialized as JSON."""
    store = MongoOperator()
    try:
        return json.dumps(store.select(count=1))
    finally:
        store.close()
def __init__(self, threshold):
    """Remember the pool's upper bound and set up DB + validator helpers."""
    self._threshold = threshold
    self.db = MongoOperator()
    self.valid_tester = ValidTester()