Example #1
import requests
from requests.exceptions import ConnectionError


def get_page(url, options=None):
    """Fetch a page and return its text, or None on failure."""
    # Avoid a mutable default argument; merge per-call headers over the base.
    headers = dict(base_header, **(options or {}))
    Logger.log_normal('Getting %s' % url)
    try:
        r = requests.get(url, headers=headers)
        Logger.log_high('Getting result %s %s' % (url, r.status_code))
        if r.status_code == 200:
            return r.text
    except ConnectionError:
        Logger.log_fail('Crawling Failed %s' % url)
        return None
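get_page relies on a module-level base_header dict and a Logger helper that the snippet does not define. A minimal sketch of plausible stand-ins (the header values and logger methods here are assumptions, not the project's actual code):

# Hypothetical stand-ins for the names get_page assumes.
base_header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Accept-Encoding': 'gzip, deflate',
}


class Logger:
    # Minimal console logger exposing the three levels the snippets call.
    @staticmethod
    def log_normal(msg):
        print('[INFO]', msg)

    @staticmethod
    def log_high(msg):
        print('[HIGH]', msg)

    @staticmethod
    def log_fail(msg):
        print('[FAIL]', msg)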
Example #2
import time


def valid_proxy(cycle=VALID_CHECK_CYCLE):
    """Pull half of the queued proxies from redis and re-test them."""
    conn = RedisClient()
    tester = ValidityTester()
    while True:
        Logger.log_high('Refreshing ip')
        # Re-check half of the queue each cycle; wait if the pool is empty.
        count = int(0.5 * conn.queue_len)
        if count == 0:
            Logger.log_normal('Waiting for adding')
            time.sleep(cycle)
            continue
        raw_proxies = conn.get(count)
        tester.set_raw_proxies(raw_proxies)
        tester.test()
        time.sleep(cycle)
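valid_proxy assumes a RedisClient exposing a queue_len property and a get(count) method. A minimal sketch of that interface, assuming proxies are kept in a single Redis list (the key name and pop strategy are guesses, not the project's actual schema):

import redis


class RedisClient:
    # Illustrative sketch; connection settings and key name are assumptions.
    def __init__(self, host='localhost', port=6379, key='proxies'):
        self._db = redis.StrictRedis(host=host, port=port)
        self._key = key

    @property
    def queue_len(self):
        # Number of proxies currently queued.
        return self._db.llen(self._key)

    def get(self, count=1):
        # Pop `count` proxies (as bytes) from the head of the queue.
        proxies = self._db.lrange(self._key, 0, count - 1)
        self._db.ltrim(self._key, count, -1)
        return proxies

    def put(self, proxy):
        # Append a validated proxy to the tail of the queue.
        self._db.rpush(self._key, proxy)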
Example #3
async def test_single_proxy(self, proxy):
    """
    Test a single proxy; if it is usable, add it to _usable_proxies.
    """
    # Requires aiohttp; each call opens a short-lived session.
    async with aiohttp.ClientSession() as session:
        try:
            if isinstance(proxy, bytes):
                proxy = proxy.decode('utf-8')
            real_proxy = 'http://' + proxy
            Logger.log_normal('Testing %s' % proxy)
            async with session.get(self.test_api,
                                   proxy=real_proxy,
                                   timeout=15) as resp:
                if resp.status == 200:
                    self._conn.put(proxy)
                    Logger.log_high('Valid proxy %s' % proxy)
        except Exception:
            # Any error (timeout, refused connection) just drops the proxy.
            pass
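Since test_single_proxy is a coroutine, the tester's test() presumably fans the current batch out through an event loop. A plausible driver, assuming set_raw_proxies stores the batch on self._raw_proxies (the project's actual scheduling may differ):

import asyncio


def test(self):
    async def _run_batch():
        # One test coroutine per proxy, executed concurrently.
        await asyncio.gather(*(self.test_single_proxy(proxy)
                               for proxy in self._raw_proxies))
    asyncio.run(_run_batch())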
Example #4
def add_to_queue(self):
    """
    Ask the crawlers for a batch of untested proxies, test them, and
    add those that pass to the proxy pool.
    """
    Logger.log_normal('PoolAdder is working')
    proxy_count = 0
    while not self.is_over_threshold():
        # Iterate every registered crawl function by index.
        for callback_label in range(self._crawler.__CrawlFuncCount__):
            callback = self._crawler.__CrawlFunc__[callback_label]
            raw_proxies = self._crawler.get_raw_proxies(callback)

            # Test crawled proxies before they enter the pool.
            self._tester.set_raw_proxies(raw_proxies)
            self._tester.test()

            proxy_count += len(raw_proxies)
            if self.is_over_threshold():
                Logger.log_high('IP is enough, waiting to be used')
                break
        # Only after a full pass over all crawlers that yields nothing
        # do we treat the free-proxy sources as exhausted.
        if proxy_count == 0:
            raise ResourceDepletionError
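add_to_queue iterates __CrawlFunc__ and __CrawlFuncCount__, which suggests the crawler class collects its crawl_* methods automatically, most likely via a metaclass. A sketch of how such a collector could work (the gathering logic and class names here are assumptions):

class CrawlerMeta(type):
    # Record the names of all crawl_* methods so they can be invoked by index.
    def __new__(mcs, name, bases, attrs):
        crawl_funcs = [key for key in attrs if key.startswith('crawl_')]
        attrs['__CrawlFunc__'] = crawl_funcs
        attrs['__CrawlFuncCount__'] = len(crawl_funcs)
        return super().__new__(mcs, name, bases, attrs)


class FreeProxyCrawler(metaclass=CrawlerMeta):
    # Hypothetical crawler: each crawl_* method yields proxy strings.
    def crawl_example_site(self):
        yield '127.0.0.1:8080'

    def get_raw_proxies(self, callback):
        # Resolve the stored method name and drain its generator.
        return list(getattr(self, callback)())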
Example #5
def run(self):
    # Requires: from multiprocessing import Process
    Logger.log_high('Ip processing running')
    # Re-validate queued proxies and top up the pool in parallel.
    valid_process = Process(target=Schedule.valid_proxy)
    check_process = Process(target=Schedule.check_pool)
    valid_process.start()
    check_process.start()
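Because run hands Schedule.valid_proxy and Schedule.check_pool to separate processes, those targets must be importable callables (e.g. static methods). A minimal entry point, assuming run is a method of the Schedule class:

if __name__ == '__main__':
    # The guard is required under the 'spawn' start method (Windows/macOS).
    Schedule().run()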