예제 #1
0
class PoolThresh:
    """Refill the proxy pool until it holds at least ``threshold`` proxies.

    Candidates come from ``self.proxies`` (one batch per crawl callback) and
    are handed to ``self.check`` for validation; the pool size is read through
    ``self.conn.count()``.
    """

    def __init__(self, threshold):
        """Store the target size and create the collaborating objects.

        :param threshold: pool size at which refilling stops.
        """
        self.threshold = threshold
        self.conn = RedisClient()
        self.check = ProxyCheck()
        self.proxies = Proxy()

    def is_over_threshold(self):
        """Return True when the pool holds at least ``threshold`` proxies."""
        return self.conn.count() >= self.threshold

    def add_into_pool(self):
        """Crawl every proxy source, pass after pass, until the pool is full.

        :raises ResourceDepletionError: when a complete pass over all sources
            yields no candidate proxy at all.
        """
        print("Add proxy into proxypool....")
        proxysites = self.proxies.__crawlFunc__
        while not self.is_over_threshold():
            # Depletion is judged on the whole pass: a single empty source
            # (e.g. the last one crawled) must not abort the refill when the
            # other sources still deliver candidates.
            crawled_this_pass = 0
            for callback in proxysites:
                proxies = self.proxies.getproxies(callback)
                proxy_count = len(proxies)
                crawled_this_pass += proxy_count
                print('%s crawled %d ip, under checking' % (callback, proxy_count))
                if proxy_count:
                    # Only non-empty batches are validated; there is nothing
                    # to test otherwise.
                    self.check.get_initial_proxies(proxies)
                    self.check.test_all_proxies()
                if self.is_over_threshold():
                    print('proxypool is full')
                    return
            if crawled_this_pass == 0:
                raise ResourceDepletionError
예제 #2
0
class ProxyCheck:
    """Validate a batch of candidate proxies concurrently.

    A proxy that answers the test URL with HTTP 200 is stored through the
    Redis client's ``put``.
    """

    def __init__(self):
        self._initial_proxies = None
        self._test_url = TEST_URL
        # Set by get_initial_proxies(); declared here so the attribute
        # always exists on the instance.
        self._conn = None

    def get_initial_proxies(self, proxies):
        """Register the batch to validate and create a Redis client for it.

        :param proxies: iterable of proxy addresses (``str`` or ``bytes``).
        """
        self._initial_proxies = proxies
        self._conn = RedisClient()

    async def test_single_proxy(self, proxy):
        """Request the test URL through ``proxy`` and store it when it works.

        The proxy is stored with ``put`` only when the response status is 200
        and the proxy is not in the list returned by the Redis client.
        Connection and timeout errors are printed and swallowed.

        :param proxy: proxy address as ``str`` or ``bytes`` (decoded as UTF-8).
        """
        try:
            async with aiohttp.ClientSession() as session:
                try:
                    if isinstance(proxy, bytes):
                        proxy = proxy.decode('utf-8')
                    current_proxy = "http://" + proxy
                    # NOTE(review): this reads count() entries from the pool
                    # for every tested proxy; confirm that RedisClient.get()
                    # is non-destructive and cheap enough for that.
                    proxy_list = self._conn.get(self._conn.count())
                    async with session.get(self._test_url, proxy=current_proxy, timeout=10) as response:
                        if response.status == 200 and proxy not in proxy_list:
                            self._conn.put(proxy)
                            print("Valid proxy:", proxy)
                except (ProxyConnectionError, TimeoutError, ValueError):
                    print('Invalid proxy', proxy)
        except (ServerDisconnectedError, ClientResponseError, ClientConnectorError) as s:
            print(s)

    @staticmethod
    def _get_event_loop():
        """Return the current event loop, creating and installing one if absent."""
        try:
            return asyncio.get_event_loop()
        except RuntimeError:
            # Raised when no current loop exists, e.g. in a worker thread.
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            return loop

    def test_all_proxies(self):
        """Validate every registered proxy concurrently; block until all finish.

        Does nothing when no proxies are registered.
        """
        if not self._initial_proxies:
            # asyncio.wait() raises ValueError on an empty collection.
            return
        loop = self._get_event_loop()
        # asyncio.wait() no longer accepts bare coroutines (removed in
        # Python 3.11), so each check is wrapped in a Task first.
        tasks = [
            loop.create_task(self.test_single_proxy(proxy))
            for proxy in self._initial_proxies
        ]
        result = loop.run_until_complete(asyncio.wait(tasks))
        print(result)
예제 #3
0
 def add_proxy(limit=POOL_LIMIT, threshold=POOL_THRESHOLD, cycle=WAIT_TIME):
     """Top up the proxy pool forever, one check per ``cycle`` seconds.

     :param limit: the pool is refilled only while its size is below this.
     :param threshold: target size handed to ``PoolThresh``.
     :param cycle: seconds to sleep between two checks.
     """
     redis_client = RedisClient()
     pool_filler = PoolThresh(threshold)
     while True:
         pool_has_room = redis_client.count() < limit
         if pool_has_room:
             pool_filler.add_into_pool()
         time.sleep(cycle)
예제 #4
0
class Tester(object):
    """Re-test the proxies stored in Redis, batch by batch."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy by requesting TEST_URL through it.

        Calls ``self.redis.max(proxy)`` when the response status is in
        VALID_STATUS_CODES, and ``self.redis.decrease(proxy)`` when the
        status is not valid or the request fails.
        :param proxy: proxy address as ``str`` or ``bytes`` (decoded as UTF-8)
        :return: None
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('正在测试', proxy)
                async with session.get(TEST_URL,
                                       proxy=real_proxy,
                                       timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print('代理可用', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('请求响应码不合法 ', response.status, 'IP', proxy)
            except (ClientError, aiohttp.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('代理请求失败', proxy)

    @staticmethod
    def _get_event_loop():
        """Return the current event loop, creating and installing one if absent."""
        try:
            return asyncio.get_event_loop()
        except RuntimeError:
            # Raised when no current loop exists, e.g. in a worker thread.
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            return loop

    def run(self):
        """
        Tester entry point: test every stored proxy in batches of
        BATCH_TEST_SIZE, pausing 5 seconds after each batch.

        Any exception is printed rather than propagated.
        :return: None
        """
        print('测试器开始运行')
        try:
            count = self.redis.count()
            print('当前剩余', count, '个代理')
            for start in range(0, count, BATCH_TEST_SIZE):
                stop = min(start + BATCH_TEST_SIZE, count)
                print('正在测试第', start + 1, '-', stop, '个代理')
                test_proxies = self.redis.batch(start, stop)
                if not test_proxies:
                    # asyncio.wait() raises ValueError on an empty collection,
                    # which would abort the remaining batches.
                    continue
                loop = self._get_event_loop()
                # asyncio.wait() no longer accepts bare coroutines (removed in
                # Python 3.11), so each check is wrapped in a Task first.
                tasks = [
                    loop.create_task(self.test_single_proxy(proxy))
                    for proxy in test_proxies
                ]
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('测试器发生错误', e.args)
예제 #5
0
 def check_proxy(cycle=WAIT_TIME):
     """Re-validate half of the pooled proxies forever, once per ``cycle``.

     Each round takes ``int(0.5 * conn.count())`` proxies from the pool via
     ``conn.get`` and hands them to a ``ProxyCheck`` instance. When that
     number is zero the round is skipped.

     :param cycle: seconds to sleep between two rounds.
     """
     conn = RedisClient()
     check = ProxyCheck()
     while True:
         count = int(0.5 * conn.count())
         if count == 0:
             print('there is no data in pool, please wait...')
             time.sleep(cycle)
             # Nothing to validate: start the next round instead of falling
             # through and testing an empty batch (and sleeping twice).
             continue
         proxies = conn.get(count)
         check.get_initial_proxies(proxies)
         check.test_all_proxies()
         time.sleep(cycle)
예제 #6
0
class Getter:
    """Collect proxies from the crawler and add them to Redis."""

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Tell whether the pool has reached POOL_UPPER_THRESHOLD.
        """
        pool_is_full = self.redis.count() >= POOL_UPPER_THRESHOLD
        return True if pool_is_full else False

    def run(self):
        """
        Run every crawl function once and add each returned proxy to Redis.

        Nothing is crawled when the pool is already at its upper threshold.
        """
        print('获取器开始执行')
        if self.is_over_threshold():
            return
        for index in range(self.crawler.CrawlFuncCount):
            crawl_func = self.crawler.CrawlFunc[index]
            # Fetch the proxies produced by this crawl function.
            found = self.crawler.get_proxies(crawl_func)
            sys.stdout.flush()
            for candidate in found:
                self.redis.add(candidate)
예제 #7
0
 def get(self):
     """Write the pool's current proxy count, as text, via ``self.write``."""
     pool_size = RedisClient().count()
     self.write(str(pool_size))