Пример #1
0
 async def test_single_proxy(self, proxy):
     """
     Check one proxy by requesting TEST_URL through it and rescore it.

     A response whose status is in VALID_STATUS_CODES promotes the proxy
     via ``self.redis.max``; any other status, or one of the handled
     request errors, demotes it via ``self.redis.decrease``.

     :param proxy: proxy address as ``str`` or UTF-8 encoded ``bytes``;
         it is handed to aiohttp as-is (no scheme is prepended)
     :return: None
     """
     # The connector is created with SSL verification switched off.
     connector = aiohttp.TCPConnector(verify_ssl=False)
     async with aiohttp.ClientSession(connector=connector) as session:
         try:
             # Accept bytes input by decoding it to text first.
             if isinstance(proxy, bytes):
                 proxy = proxy.decode('utf-8')
             print_messege('正在测试', proxy)
             request = session.get(TEST_URL,
                                   proxy=proxy,
                                   timeout=15,
                                   allow_redirects=False)
             async with request as response:
                 if response.status not in VALID_STATUS_CODES:
                     self.redis.decrease(proxy)
                     error_messege('请求响应码不合法 ', response.status, 'IP',
                                   proxy)
                 else:
                     self.redis.max(proxy)
                     print_messege('代理可用', proxy)
         except (ClientError,
                 aiohttp.client_exceptions.ClientConnectorError,
                 asyncio.TimeoutError, AttributeError):
             self.redis.decrease(proxy)
             error_messege('代理请求失败', proxy)
Пример #2
0
 def schedule_getter(self, cycle=GETTER_CYCLE):
     """
     Run the proxy getter in an endless loop, sleeping between runs.

     :param cycle: seconds to sleep after each getter run
         (defaults to GETTER_CYCLE)
     :return: does not return normally; the loop has no exit condition
     """
     proxy_getter = Getter()
     while True:
         print_messege('开始抓取代理')
         proxy_getter.run()
         time.sleep(cycle)
Пример #3
0
 def schedule_tester(self, cycle=TESTER_CYCLE):
     """
     Run the proxy tester in an endless loop, sleeping between runs.

     :param cycle: seconds to sleep after each tester run
         (defaults to TESTER_CYCLE)
     :return: does not return normally; the loop has no exit condition
     """
     proxy_tester = Tester()
     while True:
         print_messege('测试器开始运行')
         proxy_tester.run()
         time.sleep(cycle)
Пример #4
0
 def max(self, proxy):
     """
     Set the given proxy's score to MAX_SCORE under REDIS_KEY.

     :param proxy: proxy to mark as usable
     :return: whatever ``self.db.zadd`` returns
     """
     print_messege('代理', proxy, '可用,设置为', MAX_SCORE)
     # zadd is called with a {member: score} mapping, the form used by
     # redis-py 3.0+; older clients took positional score/member arguments.
     score_by_member = {proxy: MAX_SCORE}
     return self.db.zadd(REDIS_KEY, score_by_member)
Пример #5
0
 def run(self):
     """
     Collect proxies from every registered crawl function and store them.

     Only logs the start message when ``self.is_over_threshold()`` is true.
     Otherwise each entry of ``self.crawler.__CrawlFunc__`` (up to
     ``__CrawlFuncCount__``) is passed to ``self.crawler.get_proxies`` and
     every returned proxy is added through ``self.redis.add``.

     :return: None
     """
     print_messege('获取器开始执行')
     if self.is_over_threshold():
         return
     for index in range(self.crawler.__CrawlFuncCount__):
         crawl_func = self.crawler.__CrawlFunc__[index]
         found = self.crawler.get_proxies(crawl_func)
         # Push out any buffered log output before storing the results.
         sys.stdout.flush()
         for item in found:
             self.redis.add(item)
Пример #6
0
 def run(self):
     """
     Launch each enabled component of the pool in its own process.

     The tester, getter and API are started (in that order) only when
     TESTER_ENABLED, GETTER_ENABLED and API_ENABLED are truthy. The
     processes are started and not joined here.

     :return: None
     """
     print_messege('代理池开始运行')

     if TESTER_ENABLED:
         Process(target=self.schedule_tester).start()

     if GETTER_ENABLED:
         Process(target=self.schedule_getter).start()

     if API_ENABLED:
         Process(target=self.schedule_api).start()
Пример #7
0
 def decrease(self, proxy):
     """
     Lower a proxy's score by one, or drop the proxy altogether.

     The proxy is removed when its stored score is missing, falsy, or
     not greater than MIN_SCORE; otherwise the score is decremented.

     :param proxy: proxy whose score should be lowered
     :return: the result of ``self.db.zincrby`` when decremented, or of
         ``self.db.zrem`` when removed
     """
     current = self.db.zscore(REDIS_KEY, proxy)
     if not (current and current > MIN_SCORE):
         print_messege('代理', proxy, '当前分数', current, '移除')
         return self.db.zrem(REDIS_KEY, proxy)
     print_messege('代理', proxy, '当前分数', current, '减1')
     # Argument order (name, amount, value) is the redis-py 3.0+ form;
     # older clients expected (name, value, amount).
     return self.db.zincrby(REDIS_KEY, -1, proxy)
Пример #8
0
def get_page_noverify(url, options=None, *, timeout=15):
    """
    Fetch a page with TLS certificate verification disabled.

    :param url: address to request
    :param options: optional mapping of extra headers merged over
        ``base_headers``; ``None`` (the default) adds nothing
    :param timeout: seconds passed to ``requests.get`` as its timeout, so
        a stalled server cannot block the caller indefinitely
    :return: the response body text when the status code is 200;
        ``None`` for any other status or when the request fails
    """
    # A ``None`` default replaces the former shared mutable ``{}`` default.
    headers = dict(base_headers, **(options or {}))
    print_messege('正在抓取', url)
    try:
        # verify=False triggers urllib3 warnings; silence them first.
        requests.packages.urllib3.disable_warnings()
        response = requests.get(url, headers=headers, verify=False,
                                timeout=timeout)
    except (ConnectionError, requests.exceptions.RequestException):
        # ``ConnectionError`` is kept as originally written (whichever name
        # the file has in scope); RequestException additionally covers
        # timeouts and the other request failures requests can raise.
        error_messege('抓取失败', url)
        return None
    print_messege('抓取成功', url, response.status_code)
    if response.status_code == 200:
        return response.text
    return None
Пример #9
0
 def run(self):
     """
     Test every stored proxy, BATCH_TEST_SIZE proxies at a time.

     For each batch the proxies are read with ``self.redis.batch``, one
     ``test_single_proxy`` task is scheduled per proxy, and the event loop
     runs until all of them finish. The method then sleeps 5 seconds
     before the next batch. Any exception is logged and ends the run.

     :return: None
     """
     print_messege('测试器开始运行')
     try:
         count = self.redis.count()
         print_messege('当前剩余', count, '个代理')
         for start in range(0, count, BATCH_TEST_SIZE):
             stop = min(start + BATCH_TEST_SIZE, count)
             print_messege('正在测试第', start + 1, '-', stop, '个代理')
             test_proxies = self.redis.batch(start, stop)
             loop = asyncio.get_event_loop()
             # asyncio.wait() no longer accepts bare coroutines on
             # Python 3.11+, so wrap each one in a Task on this loop.
             tasks = [
                 loop.create_task(self.test_single_proxy(proxy))
                 for proxy in test_proxies
             ]
             # asyncio.wait() raises ValueError on an empty collection,
             # which would abort the remaining batches; skip the wait.
             if tasks:
                 loop.run_until_complete(asyncio.wait(tasks))
             sys.stdout.flush()
             time.sleep(5)
     except Exception as e:
         error_messege('测试器发生错误', e.args)
Пример #10
0
 def get_proxies(self, callback):
     """
     Call the named crawl method on this object and collect its proxies.

     :param callback: name of a zero-argument method on ``self`` that
         returns an iterable of proxies
     :return: list of every proxy the method produced, in order
     :raises AttributeError: if ``self`` has no attribute named ``callback``
     """
     # Look the method up by name instead of eval()-ing a formatted
     # string, so ``callback`` can never be executed as arbitrary code.
     crawl = getattr(self, callback)
     proxies = []
     for proxy in crawl():
         print_messege('成功获取到代理', proxy)
         proxies.append(proxy)
     return proxies