Пример #1
0
class ValidityTester(object):
    """
    Check whether the proxies stored through RedisClient still work.
    """
    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy by requesting TEST_URL through it.

        Calls ``self.redis.max(proxy)`` when the response status is in
        VALID_STATUS_CODES and ``self.redis.decrase(proxy)`` otherwise,
        including when the request raises one of the handled exceptions.

        :param proxy: proxy address as ``str`` or UTF-8 encoded ``bytes``
        :return: None
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                # Give the proxy URL the same scheme as TEST_URL.
                # Original author's note: aiohttp cannot test https proxies.
                scheme = 'http' if TEST_URL.startswith('http:') else 'https'
                real_proxy = '{}://{}'.format(scheme, proxy)
                async with session.get(url=TEST_URL, proxy=real_proxy, timeout=15, allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        # Check passed (per the original note this sets the proxy's score).
                        self.redis.max(proxy)
                    else:
                        # Check failed: lower the score.
                        # NOTE(review): "decrase" looks like a typo of "decrease";
                        # kept as-is because RedisClient is not visible here - confirm.
                        self.redis.decrase(proxy)
            except(ClientError, aiohttp.client_exceptions.ClientConnectorError, asyncio.TimeoutError, AttributeError):
                # Any request failure also lowers the score.
                self.redis.decrase(proxy)

    def run(self):
        """
        Test every stored proxy in batches of BATCH_TEST_SIZE.

        Errors are reported on stdout and end the run instead of propagating.

        :return: None
        """
        try:
            count = self.redis.count()
            print('当前剩余', count, '个代理')
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('正在测试第', start + 1, '-', stop, '个代理')
                test_proxies = self.redis.batch(start, stop)  # list of proxies
                loop = asyncio.get_event_loop()
                # asyncio.wait() no longer accepts bare coroutines (Python 3.11+),
                # so every coroutine is wrapped in a Task first.
                tasks = [loop.create_task(self.test_single_proxy(proxy=proxy)) for proxy in test_proxies]
                if not tasks:
                    # asyncio.wait() raises ValueError on an empty collection;
                    # skip the batch instead of aborting the whole run.
                    continue
                loop.run_until_complete(asyncio.wait(tasks))
                # Flush so buffered progress output reaches the terminal right away.
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print("测试器错误:{}".format(e.args))
Пример #2
0
class Tester(object):
    """Test the proxies stored through RedisClient and update their status."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy by requesting TEST_URL through it.

        Calls ``self.redis.max(proxy)`` when the response status is in
        VALID_STATUS_CODES, otherwise ``self.redis.decrease(proxy)``
        (also when the request raises one of the handled exceptions).

        :param proxy: proxy address as ``str`` or UTF-8 encoded ``bytes``
        :return: None
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)

        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('正在测试', proxy)
                async with session.get(TEST_URL,
                                       proxy=real_proxy,
                                       timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print('代理可用', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('请求响应码不合法 ', response.status, 'IP', proxy)
            except (ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('代理请求失败', proxy)

    def run(self):
        """
        Test every stored proxy in batches of BATCH_TEST_SIZE.

        Errors are printed and end the run instead of propagating.

        :return: None
        """
        print('测试器开始运行')
        try:
            count = self.redis.count()
            print('当前剩余', count, '个代理')
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('正在测试第', start + 1, '-', stop, '个代理')
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                # asyncio.wait() no longer accepts bare coroutines (Python 3.11+),
                # so every coroutine is wrapped in a Task first.
                tasks = [
                    loop.create_task(self.test_single_proxy(proxy))
                    for proxy in test_proxies
                ]
                if not tasks:
                    # asyncio.wait() raises ValueError on an empty collection.
                    continue
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('测试器发生错误', e.args)
Пример #3
0
class Filter:
    """Check the stored proxies against TEST_URL and update the store."""

    def __init__(self):
        self.db = RedisClient()

    async def check_one(self, proxy):
        """
        Request TEST_URL through one proxy and update the store.

        The proxy is removed when the request raises
        ``requests.RequestException`` (timeouts included).

        :param proxy: proxy address as ``str`` (it is concatenated to 'http://')
        :return: None
        """
        proxies = {'http': 'http://' + proxy}
        try:
            print('正在测试: {}'.format(proxy))
            # Without a timeout a dead proxy can block this call indefinitely.
            # requests.Timeout derives from RequestException, so a timeout is
            # handled like any other failure below.
            # NOTE(review): requests.get is blocking, so the checks run one
            # after another even though they are scheduled as asyncio tasks.
            r = requests.get(TEST_URL, proxies=proxies, timeout=15)
        except requests.RequestException:
            print('检测失败', proxy)
            self.db.remove(proxy)
            return
        if r.status_code == 200:
            print('代理可用', proxy)
            # NOTE(review): a working proxy is passed to decrease(); this looks
            # inverted - confirm against RedisClient before changing.
            self.db.decrease(proxy)

    def run(self):
        """
        Check every proxy returned by ``self.db.batch()``.

        Errors are printed instead of propagating.

        :return: None
        """
        print('===开始测试代理===')
        try:
            # NOTE(review): count is read as an attribute, presumably a property.
            print('当前代理个数:{}'.format(self.db.count))
            tasks = [
                asyncio.ensure_future(self.check_one(proxy.decode()))
                for proxy in self.db.batch()
            ]
            if not tasks:
                # asyncio.wait() raises ValueError on an empty collection.
                return
            loop = asyncio.get_event_loop()
            loop.run_until_complete(asyncio.wait(tasks))

        except Exception as e:
            print('测试错误', e.args)
Пример #4
0
class Tester(object):
    """Test the proxies stored under one Redis key and update their status."""

    def __init__(self, redis_key):
        self.redis = RedisClient(redis_key)

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy by requesting TEST_URL through it.

        Calls ``self.redis.max(proxy)`` when the response status is in
        VALID_STATUS_CODES, otherwise ``self.redis.decrease(proxy)``
        (also when the request raises one of the handled exceptions).

        :param proxy: proxy address as ``str`` or UTF-8 encoded ``bytes``
        :return: None
        """
        conn = aiohttp.TCPConnector(ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('正在测试', proxy)
                # Browser-like request headers sent with the test request.
                headers = {
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                    'Accept-Encoding': 'gzip, deflate',
                    'Accept-Language': 'en;q=0.9,ja;q=0.8,fr;q=0.7',
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0',
                    'Connection': 'close',
                }

                async with session.get(TEST_URL, headers=headers, proxy=real_proxy, timeout=TIMEOUT, allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print('代理可用', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('请求响应码不合法 ', response.status, 'IP', proxy)
            except (ClientError, ClientConnectorError, asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('代理请求失败', proxy)

    def run(self):
        """
        Test every stored proxy in batches of BATCH_TEST_SIZE.

        Errors are printed and end the run instead of propagating.

        :return: None
        """
        print('测试器开始运行')
        try:
            count = self.redis.count()
            print('当前剩余', count, '个代理')
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('正在测试第', start + 1, '-', stop, '个代理')
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                # asyncio.wait() no longer accepts bare coroutines (Python 3.11+),
                # so every coroutine is wrapped in a Task first.
                tasks = [loop.create_task(self.test_single_proxy(proxy)) for proxy in test_proxies]
                if not tasks:
                    # asyncio.wait() raises ValueError on an empty collection.
                    continue
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('测试器发生错误', e.args)
Пример #5
0
class Tester:
    """Test the proxies stored through RedisClient and update their status."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy by requesting TEST_URL through it.

        Calls ``self.redis.max(proxy)`` when the response status is in
        VALID_STATUS_CODES, otherwise ``self.redis.decrease(proxy)``
        (also when the request raises one of the handled exceptions).

        :param proxy: proxy address as ``str`` or UTF-8 encoded ``bytes``
        :return: None
        """
        conn = aiohttp.TCPConnector(ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):  # stored values may arrive as bytes
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('正在测试')
                async with session.get(TEST_URL,
                                       proxy=real_proxy,
                                       timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        # Per the original author's note, max() sets the score
                        # of a working proxy to 100 - confirm in RedisClient.
                        self.redis.max(proxy)
                        print('代理可用', proxy)
                    else:
                        self.redis.decrease(proxy)  # lower the proxy's score
                        print('请求响应码不合理', response.status, 'IP', proxy)
            except (ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('代理请求失败______________', proxy)

    def run(self):
        """
        Test every stored proxy in batches of BATCH_TEST_SIZE.

        Errors are printed and end the run instead of propagating.

        :return: None
        """
        print('检测器开始运行')
        try:
            count = self.redis.count()  # number of stored proxies
            print('当前剩余', count, '个代理')
            # BATCH_TEST_SIZE caps how many proxies are tested concurrently.
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('正在测试第', start + 1, '-', stop, '个代理')
                test_proxies = self.redis.batch(start, stop)  # fetch one batch
                loop = asyncio.get_event_loop()
                # asyncio.wait() no longer accepts bare coroutines (Python 3.11+),
                # so every coroutine is wrapped in a Task first.
                tasks = [
                    loop.create_task(self.test_single_proxy(proxy))
                    for proxy in test_proxies
                ]
                if not tasks:
                    # asyncio.wait() raises ValueError on an empty collection.
                    continue
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('测试器发生错误', e.args)
Пример #6
0
class Tester(object):
    """Test the proxies stored through RedisClient and update their status."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy by requesting TEST_URL through it.

        Calls ``self.redis.max(proxy)`` when the response status is in
        VALID_STATUS_CODES, otherwise ``self.redis.decrease(proxy)``
        (also when the request raises one of the handled exceptions).

        :param proxy: proxy address as ``str`` or UTF-8 encoded ``bytes``
        :return: None
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('Testing', proxy)
                async with session.get(TEST_URL,
                                       proxy=real_proxy,
                                       timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print("Proxy is OK", proxy)
                    else:
                        self.redis.decrease(proxy)
                        print("Response code is wrong", response.status, 'IP',
                              proxy)
            except (ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print("Fail to get proxy", proxy)

    def run(self):
        """
        Test every stored proxy in batches of BATCH_TEST_SIZE.

        Errors are printed and end the run instead of propagating.

        :return: None
        """
        print("Tester starts running")
        try:
            count = self.redis.count()
            print("Current surplus", count, "proxies")
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print("Current testing the", start + 1, '-', stop, 'th proxy')
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                # asyncio.wait() no longer accepts bare coroutines (Python 3.11+),
                # so every coroutine is wrapped in a Task first.
                tasks = [
                    loop.create_task(self.test_single_proxy(proxy))
                    for proxy in test_proxies
                ]
                if not tasks:
                    # asyncio.wait() raises ValueError on an empty collection.
                    continue
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print("Error!", e.args)
Пример #7
0
class Tester(object):
    """Test the proxies stored through RedisClient and update their status."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy by requesting TEST_URL through it.

        Calls ``self.redis.max(proxy)`` when the response status is in
        VALID_STATUS_CODES, otherwise ``self.redis.decrease(proxy)``
        (also when the request raises one of the handled exceptions).

        :param proxy: proxy address as ``str`` or UTF-8 encoded ``bytes``
        :return: None
        """
        # NOTE(review): the Host header is hard-coded, so TEST_URL presumably
        # points at www.sogou.com - confirm against the settings.
        headers = {
            "Connection": "keep-alive",
            "Host": "www.sogou.com",
            "Pragma": "no-cache",
            "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36',
        }
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('正在测试' + proxy)
                async with session.get(TEST_URL, proxy=real_proxy, timeout=15, allow_redirects=False,
                                       headers=headers) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print('代理可用' + proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('请求响应码不合法 ' + str(response.status) + 'IP' + proxy)
            except (ClientError, aiohttp.client_exceptions.ClientConnectorError, asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('代理请求失败{}'.format(proxy))

    def run(self):
        """
        Test every stored proxy in batches of BATCH_TEST_SIZE.

        Errors are printed and end the run instead of propagating.

        :return: None
        """
        print('测试器开始运行')
        try:
            count = self.redis.count()
            print('当前剩余{}个代理'.format(count))
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('正在测试第{}-{}个代理'.format(start + 1, stop))
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                # asyncio.wait() no longer accepts bare coroutines (Python 3.11+),
                # so every coroutine is wrapped in a Task first.
                tasks = [loop.create_task(self.test_single_proxy(proxy)) for proxy in test_proxies]
                if not tasks:
                    # asyncio.wait() raises ValueError on an empty collection.
                    continue
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('测试器发生错误{}'.format(e.args))
Пример #8
0
class Tester(object):
    """Test the proxies stored through RedisClient and update their status."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy by requesting TEST_URL through it.

        Calls ``self.redis.max(proxy)`` when the response status is in
        VALID_STATUS_CODES, otherwise ``self.redis.decrease(proxy)``
        (also when the request raises one of the handled exceptions).

        :param proxy: proxy address as ``str`` or UTF-8 encoded ``bytes``
        :return: None
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                logger.debug(f'正在测试 {proxy}')
                async with session.get(TEST_URL,
                                       proxy=real_proxy,
                                       timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        logger.debug(f'代理可用 {proxy}')
                    else:
                        self.redis.decrease(proxy)
                        logger.debug(f'请求响应码不合法 {response.status}, IP {proxy}')
            except (ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                logger.debug(f'代理请求失败 {proxy}')

    def run(self):
        """
        Test every stored proxy in batches of BATCH_TEST_SIZE.

        Errors are logged and end the run instead of propagating.

        :return: None
        """
        logger.debug('测试器开始运行')
        try:
            count = self.redis.count()
            logger.debug(f'当前剩余 {count} 个代理')
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                # Log the 1-based range "first-last"; the previous message
                # evaluated ``start + 1 - stop`` as arithmetic and logged a
                # meaningless (usually negative) number.
                logger.debug(f'正在测试第 {start + 1}-{stop} 个代理')
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                # asyncio.wait() no longer accepts bare coroutines (Python 3.11+),
                # so every coroutine is wrapped in a Task first.
                tasks = [
                    loop.create_task(self.test_single_proxy(proxy))
                    for proxy in test_proxies
                ]
                if not tasks:
                    # asyncio.wait() raises ValueError on an empty collection.
                    continue
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            logger.debug(f'测试器发生错误 {e.args}')
Пример #9
0
class Tester(object):
    """Test the proxies stored through RedisClient and update their status."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Asynchronously test a single proxy by requesting TEST_URL through it.

        Calls ``self.redis.max(proxy)`` when the response status is in
        VALID_STATUS_CODES, otherwise ``self.redis.decrease(proxy)``
        (also when the request raises one of the handled exceptions).

        :param proxy: proxy address as ``str`` or UTF-8 encoded ``bytes``
        :return: None
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                # Python 3 keeps str (characters) and bytes (raw bytes) apart,
                # so a bytes value is decoded as UTF-8 before it is used.
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print("正在测试", proxy)
                async with session.get(TEST_URL, allow_redirects=False, proxy=real_proxy, timeout=15) as response:
                    if response.status in VALID_STATUS_CODES:
                        # Give the working proxy the maximum score.
                        self.redis.max(proxy)
                        print("代理", proxy, '可用, 设置为100')
                    else:
                        self.redis.decrease(proxy)
                        print('请求响应码不合法', response.status, 'IP', proxy)
            except (ClientError, aiohttp.client_exceptions.ClientConnectorError, asyncio.TimeoutError, AttributeError):
                print("代理验证失败", proxy)
                self.redis.decrease(proxy)

    def run(self):
        """
        Test every stored proxy in batches of BATCH_TEST_SIZE.

        Errors are printed and end the run instead of propagating.

        :return: None
        """
        print('测试器开始运行')
        try:
            count = self.redis.count()
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('正在测试第', start + 1, '-', stop, '个代理')
                # Fetch the proxies of this batch.
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                # asyncio.wait() no longer accepts bare coroutines (Python 3.11+),
                # so every coroutine is wrapped in a Task first.
                tasks = [loop.create_task(self.test_single_proxy(proxy)) for proxy in test_proxies]
                if not tasks:
                    # asyncio.wait() raises ValueError on an empty collection.
                    continue
                loop.run_until_complete(asyncio.wait(tasks))
                time.sleep(5)
        except Exception as e:
            print("测试器发生错误", e.args)
Пример #10
0
class Tester(object):
    """Test the proxies stored through RedisClient and update their status."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy by requesting TEST_URL through it.

        Calls ``self.redis.max(proxy)`` when the response status is in
        VALID_STATUS_CODES, otherwise ``self.redis.decrease(proxy)``
        (also when the request raises one of the handled exceptions).

        :param proxy: proxy address as ``str`` or UTF-8 encoded ``bytes``
        :return: None
        """
        # Browser-like User-Agent sent with the test request.
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"}
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('正在测试', proxy)
                async with session.get(TEST_URL, proxy=real_proxy, headers=headers, timeout=15, allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print('代理可用', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('请求响应码不合法 ', response.status, 'IP', proxy)
            except (ClientError, aiohttp.client_exceptions.ClientConnectorError, asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('代理请求失败', proxy)

    def run(self):
        """
        Test every stored proxy in batches of BATCH_TEST_SIZE.

        Errors are printed and end the run instead of propagating.

        :return: None
        """
        print('测试器开始运行')
        try:
            count = self.redis.count()
            print('当前剩余', count, '个代理')
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('正在测试第', start + 1, '-', stop, '个代理')
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                # asyncio.wait() no longer accepts bare coroutines (Python 3.11+),
                # so every coroutine is wrapped in a Task first.
                tasks = [loop.create_task(self.test_single_proxy(proxy)) for proxy in test_proxies]
                if not tasks:
                    # asyncio.wait() raises ValueError on an empty collection.
                    continue
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('测试器发生错误', e.args)
Пример #11
0
class Tester(object):
    """Test the proxies stored through RedisClient, one shared session per batch."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, session, proxy):
        """
        Test a single proxy through the given session.

        The stored value is parsed as a Python dict literal and its 'https'
        entry is used as the proxy URL. When the response status is in
        VALID_STATUS_CODES, the body is parsed as JSON and the proxy is passed
        to ``self.redis.max`` only if the echoed ``X-Forwarded-For`` header
        matches the proxy host. Invalid statuses and handled request errors
        call ``self.redis.decrease``.

        :param session: aiohttp client session shared by the batch
        :param proxy: stored proxy record (``str`` or ``bytes``)
        :return: None
        """
        import ast
        import json

        try:
            # SECURITY: the stored record and the HTTP body fetched through an
            # untrusted proxy used to go through eval(), which executes
            # arbitrary code. They are now parsed with ast.literal_eval and
            # json.loads, which only build data.
            # NOTE(review): assumes the stored record is a dict literal and
            # TEST_URL returns JSON - confirm.
            proxy_text = proxy.decode('utf-8') if isinstance(proxy, bytes) else proxy
            real_proxy = ast.literal_eval(proxy_text)['https']
            print('正在测试', proxy)
            async with session.get(TEST_URL, proxy=real_proxy, timeout=20, allow_redirects=False) as response:
                if response.status in VALID_STATUS_CODES:
                    rst = await response.text()
                    if rst:
                        forwarded = json.loads(rst).get('headers').get('X-Forwarded-For')
                        # 'scheme://host:port'.split(':')[1] is '//host'.
                        proxy_host = real_proxy.split(':')[1]
                        # A missing header counts as a mismatch instead of
                        # raising TypeError on the string concatenation.
                        if forwarded is not None and '//' + forwarded == proxy_host:
                            self.redis.max(proxy)
                            print('代理可用', proxy)
                    # NOTE(review): an empty body or a mismatching address
                    # leaves the score untouched - confirm this is intended.
                else:
                    self.redis.decrease(proxy)
                    print('请求响应码不合法 ', response.status, 'IP', proxy)
        except (ClientError, aiohttp.client_exceptions.ClientConnectorError, asyncio.TimeoutError, AttributeError,
                ValueError):
            # ValueError covers an unparsable record or a non-JSON body.
            self.redis.decrease(proxy)
            print('代理请求失败', proxy)

    async def set_test_tasks(self, loop):
        """
        Create and await the test tasks batch by batch.

        :param loop: event loop that runs the tasks
        :return: None
        """
        # NOTE(review): count is read as an attribute, presumably a property.
        count = self.redis.count
        print('当前剩余', count, '个代理')
        # Original author's intent: one session per batch to limit memory use.
        for start in range(0, count, BATCH_TEST_SIZE):
            stop = min(start + BATCH_TEST_SIZE, count)
            print('正在测试第', start + 1, '-', stop, '个代理')
            test_proxies = self.redis.batch(start, stop)
            conn = aiohttp.TCPConnector()
            async with aiohttp.ClientSession(connector=conn, loop=loop) as session:
                # asyncio.wait() no longer accepts bare coroutines (Python 3.11+),
                # so every coroutine is wrapped in a Task first.
                tasks = [loop.create_task(self.test_single_proxy(session, proxy)) for proxy in test_proxies]
                # asyncio.wait() raises ValueError on an empty collection.
                if tasks:
                    await asyncio.wait(tasks)

    def run(self):
        """
        Run the whole test pass on the event loop.

        Errors are printed instead of propagating.

        :return: None
        """
        print('测试器开始运行')
        try:
            loop = asyncio.get_event_loop()
            loop.run_until_complete(self.set_test_tasks(loop))
            sys.stdout.flush()  # push buffered output to the terminal right away
            time.sleep(5)
        except Exception as e:
            print('测试器发生错误', e.args)
Пример #12
0
class Tester(object):
    """Test the proxies stored through RedisClient and update their status."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy by requesting test_url through it.

        Calls ``self.redis.max(proxy)`` when the response status is in
        valid_status_codes, otherwise ``self.redis.decrease(proxy)``
        (also when the request raises one of the handled exceptions).

        :param proxy: proxy address as ``str`` or UTF-8 encoded ``bytes``
        :return: None
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('正在测试', proxy)
                async with session.get(test_url, proxy=real_proxy, timeout=15, allow_redirects=False) as response:
                    if response.status in valid_status_codes:
                        self.redis.max(proxy)
                        print('代理可用', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('请求响应码不合法', proxy)
            except (ClientError, aiohttp.client_exceptions.ClientConnectorError, asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('代理请求失败', proxy)

    def run(self):
        """
        Test every stored proxy in batches of batch_test_size.

        Errors are printed and end the run instead of propagating.

        :return: None
        """
        # The receiver used to be misspelled "slef", so every ``self`` in the
        # body raised NameError, which the broad except below swallowed.
        print('测试器开始运行')
        try:
            count = self.redis.count()
            print('当前剩余', count, '个代理')
            for i in range(0, count, batch_test_size):
                start = i
                stop = min(i + batch_test_size, count)
                # 1-based first index of the batch (was ``start + i``).
                print('正在测试第', start + 1, '-', stop, '个代理')
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                # asyncio.wait() no longer accepts bare coroutines (Python 3.11+),
                # so every coroutine is wrapped in a Task first.
                tasks = [loop.create_task(self.test_single_proxy(proxy)) for proxy in test_proxies]
                if not tasks:
                    # asyncio.wait() raises ValueError on an empty collection.
                    continue
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('测试器发生错误', e.args)
Пример #13
0
class Tester(object):
    """Test the proxies stored through RedisClient and update their status."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy by requesting TEST_URL through it.

        Calls ``self.redis.max(proxy)`` when the response status is in
        VALID_STATUS_CODES, otherwise ``self.redis.decrease(proxy)``
        (also when the request raises one of the handled exceptions).

        :param proxy: proxy address as ``str`` or UTF-8 encoded ``bytes``
        :return: None
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('正在测试', proxy)
                async with session.get(TEST_URL, proxy=real_proxy, timeout=15, allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print('代理可用', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('请求响应码不合法 ', response.status, 'IP', proxy)
            except (ClientError, aiohttp.client_exceptions.ClientConnectorError, asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('代理请求失败', proxy)

    def run(self):
        """
        Test every stored proxy in batches of BATCH_TEST_SIZE.

        Errors are printed and end the run instead of propagating.

        :return: None
        """
        print('测试器开始运行')
        try:
            count = self.redis.count()
            print('当前剩余', count, '个代理')
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('正在测试第', start + 1, '-', stop, '个代理')
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                # asyncio.wait() no longer accepts bare coroutines (Python 3.11+),
                # so every coroutine is wrapped in a Task first.
                tasks = [loop.create_task(self.test_single_proxy(proxy)) for proxy in test_proxies]
                if not tasks:
                    # asyncio.wait() raises ValueError on an empty collection.
                    continue
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('测试器发生错误', e.args)
Пример #14
0
class Tester(object):
    """Test the proxies stored through RedisClient and update their status."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy by requesting TEST_URL through it.

        Calls ``self.redis.max(proxy)`` when the response status is in
        VALID_STATUS, otherwise ``self.redis.derease(proxy)`` (also when the
        request raises one of the handled exceptions).

        :param proxy: proxy address as ``str`` or UTF-8 encoded ``bytes``
        :return: None
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode("utf-8")
                real_proxy = "http://" + proxy
                print("正常测试代理", proxy)
                async with session.get(TEST_URL,
                                       proxy=real_proxy,
                                       timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS:
                        self.redis.max(proxy)
                        print("代理可用", proxy)
                    else:
                        # NOTE(review): "derease" looks like a typo of
                        # "decrease"; kept as-is because RedisClient is not
                        # visible here. If the method does not exist, the
                        # AttributeError is caught below and raised again
                        # from the handler - confirm.
                        self.redis.derease(proxy)
                        print("请求验证不合法", response.status, 'IP', proxy)
            except (ClientError, aiohttp.ClientProxyConnectionError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.derease(proxy)
                print("请求代理失败", proxy)

    def run(self):
        """
        Test every stored proxy in batches of BATCH_TEST_SIZE.

        Errors are printed and end the run instead of propagating.

        :return: None
        """
        print("测试开始")
        try:
            count = self.redis.count()
            print("当前剩余:", count, "个代理")
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print("正在测试第", start + 1, '-', stop, "个代理")
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                # asyncio.wait() no longer accepts bare coroutines (Python 3.11+),
                # so every coroutine is wrapped in a Task first.
                tasks = [
                    loop.create_task(self.test_single_proxy(proxy))
                    for proxy in test_proxies
                ]
                if not tasks:
                    # asyncio.wait() raises ValueError on an empty collection.
                    continue
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print("测试发送错误", e.args)
Пример #15
0
class Tester(object):
    """Test the proxies stored through RedisClient and update their status."""

    def __init__(self):
        """
        Create the Redis client and fetch the event loop used by ``run``.
        """
        self.redis = RedisClient()
        self.loop = asyncio.get_event_loop()

    async def test(self, proxy: Proxy):
        """
        Test a single proxy by requesting TEST_URL through it.

        Calls ``self.redis.max(proxy)`` when the response status is in
        TEST_VALID_STATUS, otherwise ``self.redis.decrease(proxy)`` (also
        when the request raises one of EXCEPTIONS).

        :param proxy: proxy object; ``proxy.string()`` supplies the address
        :return: None
        """
        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
            try:
                logger.debug(f'测试 {proxy.string()}')
                async with session.get(TEST_URL, proxy=f'http://{proxy.string()}', timeout=TEST_TIMEOUT,
                                       allow_redirects=False) as response:
                    if response.status in TEST_VALID_STATUS:
                        self.redis.max(proxy)
                        logger.debug(f'代理 {proxy.string()} 可用, 加分')
                    else:
                        self.redis.decrease(proxy)
                        logger.debug(f'代理 {proxy.string()} 无效, 减分')
            except EXCEPTIONS:
                self.redis.decrease(proxy)
                logger.debug(f'代理 {proxy.string()} 无效, 减分')

    def run(self):
        """
        Test every stored proxy in batches of TEST_BATCH.

        :return: None
        """
        logger.info('启动测试器......')
        count = self.redis.count()
        logger.debug(f'{count} 个代理等待测试')

        for i in range(0, count, TEST_BATCH):
            # Index range [start, end) of the proxies in this batch.
            start, end = i, min(i + TEST_BATCH, count)
            logger.debug(f'测试索引值从 {start} 到 {end} 的代理')
            proxies = self.redis.batch(start, end)
            # asyncio.wait() no longer accepts bare coroutines (Python 3.11+),
            # so every coroutine is wrapped in a Task first.
            tasks = [self.loop.create_task(self.test(proxy)) for proxy in proxies]
            if not tasks:
                # asyncio.wait() raises ValueError on an empty collection.
                continue
            # Run the batch on the event loop until every task has finished.
            self.loop.run_until_complete(asyncio.wait(tasks))
Пример #16
0
class Tester:
    """Tests the proxies stored in Redis and adjusts their scores."""

    def __init__(self):
        """Create the Redis client used to read and score proxies."""
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy.

        Requests TEST_API through the proxy without following redirects.
        A 200 response calls ``redis.max``; any other status, or one of the
        caught client/timeout errors, calls ``redis.decrease``.

        :param proxy: proxy address as ``str`` or UTF-8 encoded ``bytes``
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode("utf-8")
                format_proxy = "http://" + proxy
                async with session.get(TEST_API,
                                       proxy=format_proxy,
                                       timeout=PROXY_TIMEOUT,
                                       allow_redirects=False) as response:
                    if response.status == 200:
                        self.redis.max(proxy)
                    else:
                        self.redis.decrease(proxy)
            except (ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)

    def run(self):
        """
        Test every stored proxy, BATCH_TEST_SIZE proxies at a time.

        Sleeps five seconds after each batch. Any exception is reported on
        stdout and ends the run.
        """
        print("测试器开始运行")
        try:
            count = self.redis.count()
            print("当前剩余%s个代理" % count)
            for start in range(0, count, BATCH_TEST_SIZE):
                stop = min(start + BATCH_TEST_SIZE, count)
                print("正在测试第%s个--第%s个代理" % (start + 1, stop))
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                # asyncio.wait() rejects bare coroutines on Python 3.11+,
                # so wrap each one in a Task first.
                tasks = [
                    loop.create_task(self.test_single_proxy(proxy))
                    for proxy in test_proxies
                ]
                if not tasks:
                    # asyncio.wait() raises ValueError on an empty set.
                    continue
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            # "% e.args" breaks when args holds zero or several items;
            # wrapping the exception in a 1-tuple always formats.
            print("代理测试器发生错误%s" % (e,))
Пример #17
0
class Tester(object):
    """Tests the proxies stored in Redis and adjusts their scores."""

    def __init__(self):
        """Create the Redis client and the logger named TESTLOGGER."""
        self.redis = RedisClient()
        self.spider_log = logging.getLogger(TESTLOGGER)

    async def test_single_proxy(self, proxy, mode=None):
        """
        Test a single proxy.

        For ``mode == REDIS_HTTPS`` the request goes to HTTPSTEST_URL; for
        every other mode (including ``None`` and REDIS_HTTP) it goes to
        TEST_URL. The proxy itself is always addressed as ``http://<proxy>``.
        A status in VALID_STATUS_CODES calls ``redis.max``; any other
        status, or a caught client/timeout error, calls ``redis.decrease``.

        :param proxy: proxy address as ``str`` or UTF-8 encoded ``bytes``
        :param mode: value forwarded to the Redis client calls
        :return: None
        """
        if isinstance(proxy, bytes):
            proxy = proxy.decode('utf-8')

        # Unrecognised modes fall back to the plain-HTTP settings instead
        # of leaving ``url``/``proxy_prefix`` unbound.
        if mode == REDIS_HTTPS:
            url = HTTPSTEST_URL
            proxy_prefix = 'https'
        else:
            url = TEST_URL
            proxy_prefix = 'http'
        test_proxy = 'http://' + proxy

        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                async with session.get(url,
                                       proxy=test_proxy,
                                       timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy, mode)
                    else:
                        self.redis.decrease(proxy, mode)
                        # Logger.warn is a deprecated alias of warning().
                        self.spider_log.warning('请求响应码不合法 ' +
                                                str(response.status) + 'IP' +
                                                proxy_prefix + ":" + proxy)
            except (ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy, mode)

    def run(self, mode=None):
        """
        Test every stored proxy for ``mode``, BATCH_TEST_SIZE at a time.

        Sleeps five seconds after each batch. Any exception is logged with
        its traceback and ends the run.

        :param mode: forwarded to the Redis client; when ``None`` the log
            messages use REDIS_KEY as the key name
        :return: None
        """
        self.spider_log.info('测试器定时开始')

        rediskey = REDIS_KEY if mode is None else mode
        try:
            count = self.redis.count(mode)
            self.spider_log.info('测试器开始运行' + rediskey + '当前剩余' + str(count) +
                                 '个代理')

            for start in range(0, count, BATCH_TEST_SIZE):
                stop = min(start + BATCH_TEST_SIZE, count)
                self.spider_log.info('正在测试第' + str(start + 1) + '-' +
                                     str(stop) + '个' + rediskey + '代理')
                test_proxies = self.redis.batch(start, stop, mode=mode)
                loop = asyncio.get_event_loop()
                # asyncio.wait() rejects bare coroutines on Python 3.11+,
                # so wrap each one in a Task first.
                tasks = [
                    loop.create_task(self.test_single_proxy(proxy, mode=mode))
                    for proxy in test_proxies
                ]
                if not tasks:
                    # asyncio.wait() raises ValueError on an empty set.
                    continue
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            self.spider_log.error('测试器发生错误' + str(e.args))
            self.spider_log.error('traceback:' + traceback.format_exc())
Пример #18
0
class Tester(object):
    """Tests the proxies stored in Redis and adjusts their scores."""

    def __init__(self):
        """Create the Redis client shared by the other methods."""
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy.

        Requests TEST_URL through the proxy without following redirects.
        A status in VALID_STATUS_CODES calls ``redis.max``; any other
        status, or a caught client/timeout error, calls ``redis.decrease``.

        :param proxy: proxy address as ``str`` or UTF-8 encoded ``bytes``
        :return: None
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        # One ClientSession per test; its get() performs the request.
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('正在测试', proxy)
                # The proxy under test is handed to get() via ``proxy=``.
                async with session.get(TEST_URL,
                                       proxy=real_proxy,
                                       timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print('代理可用', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('请求响应码不合法', response.status, 'IP', proxy)
            except (ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('请求代理失败', proxy)

    def run(self):
        """
        Test every stored proxy, BATCH_TEST_SIZE proxies at a time.

        Sleeps five seconds after each batch. Any exception is reported on
        stdout and ends the run.

        :return: None
        """
        print('测试器开始运行')
        try:
            count = self.redis.count()
            print('当前剩余', count, '个代理')
            for start in range(0, count, BATCH_TEST_SIZE):
                stop = min(start + BATCH_TEST_SIZE, count)
                print('正在测试第', start + 1, '-', stop, '个代理')
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                # asyncio.wait() rejects bare coroutines on Python 3.11+,
                # so wrap each one in a Task first.
                tasks = [
                    loop.create_task(self.test_single_proxy(proxy))
                    for proxy in test_proxies
                ]
                if not tasks:
                    # asyncio.wait() raises ValueError on an empty set.
                    continue
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('测试器发生错误', e.args)
Пример #19
0
class Tester(object):
    """Tests the proxies stored in Redis and adjusts their scores."""

    def __init__(self):
        """Create the Redis client used to read and score proxies."""
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy with an asynchronous request.

        Requests TEST_URL through the proxy. A status in VALID_STATUS_CODES
        calls ``redis.max``; any other status, or a caught client/timeout
        error, calls ``redis.decrease``.

        :param proxy: proxy address as ``str`` or UTF-8 encoded ``bytes``
        :return: None
        """
        # Certificate verification is switched off for the test request.
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('正在测试', proxy)
                # allow_redirects=False: redirects are not followed.
                async with session.get(
                        TEST_URL,
                        proxy=real_proxy,
                        timeout=15,
                        allow_redirects=False
                ) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print('代理可用', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('请求响应码不合法 ', response.status, 'IP', proxy)
            except (ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('代理请求失败', proxy)

    def run(self):
        """
        Test every stored proxy, BATCH_TEST_SIZE proxies at a time.

        Sleeps five seconds after each batch. Any exception is reported on
        stdout and ends the run.

        :return: None
        """
        print('测试器开始运行')
        try:
            count = self.redis.count()
            print('当前剩余', count, '个代理')
            for start in range(0, count, BATCH_TEST_SIZE):
                stop = min(start + BATCH_TEST_SIZE, count)
                print('正在测试第', start + 1, '-', stop, '个代理')
                # Fetch this slice of proxies from Redis in one call.
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                # asyncio.wait() rejects bare coroutines on Python 3.11+,
                # so wrap each one in a Task first.
                tasks = [
                    loop.create_task(self.test_single_proxy(proxy))
                    for proxy in test_proxies
                ]
                if not tasks:
                    # asyncio.wait() raises ValueError on an empty set.
                    continue
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('测试器发生错误', e.args)
        '''asyncio.get_event_loop方法可以创建一个事件循环,然后使用run_until_complete将协程注册到事件循环,并启动事件循环。
Пример #20
0
class Tester(object):
    """Tests the proxies stored in Redis and adjusts their scores."""

    def __init__(self):
        """Create the Redis client used to read and score proxies."""
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy.

        Requests TEST_URL through the proxy with a 5 second timeout and
        without following redirects. A status in VALID_STATUS_CODES calls
        ``redis.max``; any other status, or a caught client/timeout error,
        calls ``redis.decrease``.

        :param proxy: proxy address as ``str`` or UTF-8 encoded ``bytes``
        :return: None
        """
        # Certificate verification is switched off for the test request.
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                logger.info('正在测试%s' % proxy)
                async with session.get(TEST_URL,
                                       proxy=real_proxy,
                                       timeout=5,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        logger.info('代理可用%s' % proxy)
                    else:
                        self.redis.decrease(proxy)
                        logger.info('请求响应码不合法%s IP%s' %
                                    (response.status, proxy))
            except (ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                logger.info('代理请求失败%s' % proxy)

    def run(self):
        """
        Test every stored proxy, BATCH_TEST_SIZE proxies at a time.

        Sleeps five seconds after each batch. Any exception is logged with
        its traceback and ends the run.

        :return: None
        """
        logger.info('测试器开始运行')
        try:
            count = self.redis.count()
            logger.info('当前剩余%d个代理' % count)
            # range(0, count, BATCH_TEST_SIZE) yields each batch's start.
            for start in range(0, count, BATCH_TEST_SIZE):
                stop = min(start + BATCH_TEST_SIZE, count)
                logger.info('正在测试第%d-%d个代理' % (start + 1, stop))
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                # asyncio.wait() rejects bare coroutines on Python 3.11+,
                # so wrap each one in a Task first.
                tasks = [
                    loop.create_task(self.test_single_proxy(proxy))
                    for proxy in test_proxies
                ]
                if not tasks:
                    # asyncio.wait() raises ValueError on an empty set.
                    continue
                loop.run_until_complete(asyncio.wait(tasks))
                # The loop is deliberately not closed: it is fetched again
                # for the next batch, and closing it leads to
                # "RuntimeError: Event loop is closed".
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            # "% e.args" breaks when args holds zero or several items;
            # wrapping the exception in a 1-tuple always formats.
            logger.exception('测试器发生错误%s' % (e,))
Пример #21
0
class Tester(object):
    """Tests the proxies stored in Redis and adjusts their scores."""

    def __init__(self):
        """Create the Redis client and the headers sent with each test."""
        self.redis = RedisClient()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        }

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy.

        Requests TEST_URL through the proxy with ``self.headers`` and
        without following redirects. A status in VALID_STATUS_CODES calls
        ``redis.max``; any other status, or a caught client/timeout error,
        calls ``redis.decrease``.

        :param proxy: proxy address as ``str`` or UTF-8 encoded ``bytes``
        :return: None
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('正在测试', proxy)
                async with session.get(url=TEST_URL, proxy=real_proxy, headers=self.headers, timeout=15, allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print('代理可用', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('请求响应码不合法 ', response.status, 'IP', proxy)
            except (ClientError, aiohttp.ClientConnectorError, asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('代理请求失败', proxy)

    def run(self):
        """
        Test the stored proxies in two passes of BATCH_TEST_SIZE batches.

        The proxies returned by ``redis.all_useful()`` are tested first
        (per the original note these are the full-score proxies, re-checked
        so that stale ones are weeded out early). Afterwards every stored
        proxy is tested via ``redis.batch``.

        :return: None
        """
        print('测试器开始运行')
        count = self.redis.count()
        print('当前剩余', count, '个代理')
        useful_ip = self.redis.all_useful()
        if useful_ip:
            count_usefully = len(useful_ip)
            print('第一个有用的代理: {}'.format(useful_ip[0]), '共{}个'.format(count_usefully))
            for start in range(0, count_usefully, BATCH_TEST_SIZE):
                stop = min(start + BATCH_TEST_SIZE, count_usefully)
                print('正在测试第', start + 1, '-', stop, '个代理(usefully)')
                # Slice ends are exclusive: [start:stop] is exactly one
                # batch. The former "stop + 1" pulled the next batch's
                # first proxy in as well, testing it twice.
                self.batch_proxies(useful_ip[start:stop])
        else:
            print('当前无可用代理,请等待...')
        for start in range(0, count, BATCH_TEST_SIZE):
            stop = min(start + BATCH_TEST_SIZE, count)
            print('正在测试第', start + 1, '-', stop, '个代理(normally)')
            test_proxies = self.redis.batch(start, stop)
            self.batch_proxies(test_proxies)

    def batch_proxies(self, test_proxies):
        """
        Test one batch of proxies concurrently, then sleep five seconds.

        Any exception is reported on stdout instead of being propagated.

        :param test_proxies: iterable of proxies for ``test_single_proxy``
        :return: None
        """
        try:
            loop = asyncio.get_event_loop()
            # asyncio.wait() rejects bare coroutines on Python 3.11+, so
            # wrap each one in a Task first.
            tasks = [
                loop.create_task(self.test_single_proxy(proxy))
                for proxy in test_proxies
            ]
            if not tasks:
                # asyncio.wait() raises ValueError on an empty set.
                return
            loop.run_until_complete(asyncio.wait(tasks))
            sys.stdout.flush()
            time.sleep(5)
        except Exception as e:
            print('测试器发生错误', e.args)
Пример #22
0
class Tester(object):
    """Tests the proxies stored in Redis using settings from ``data``."""

    def __init__(self, data):
        """
        :param data: settings mapping; this class reads the keys
            'TEST_URL', 'TEST_tage', 'TEST_if' and 'BATCH_TEST_SIZE', and
            the whole mapping is also passed to RedisClient
        """
        self.redis = RedisClient(data)
        self.data = data

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy by inspecting the response body.

        Requests ``data['TEST_URL']`` through the proxy and reads the body.
        With ``data['TEST_tage'] == 'in'`` the proxy passes when the UTF-8
        bytes of ``data['TEST_if']`` occur in the body; with ``'not in'``
        it passes when they do not. A pass calls ``redis.max``, a fail
        calls ``redis.decrease``; any other 'TEST_tage' value leaves the
        score untouched. A caught client/timeout error calls
        ``redis.delete``.

        :param proxy: proxy address as ``str`` or UTF-8 encoded ``bytes``
        :return: None
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('正在测试', proxy)
                async with session.get(self.data['TEST_URL'],
                                       proxy=real_proxy,
                                       timeout=15,
                                       allow_redirects=False) as response:
                    text = await response.read()
                    condition = self.data['TEST_tage']
                    if condition in ('in', 'not in'):
                        found = bytes(self.data['TEST_if'],
                                      encoding="utf8") in text
                        # 'in' passes when found, 'not in' when absent.
                        if found == (condition == 'in'):
                            self.redis.max(proxy)
                            print('代理可用', proxy)
                        else:
                            self.redis.decrease(proxy)
                            print(
                                '不满足条件{}'.format(self.data['TEST_if'] +
                                                 self.data['TEST_tage']),
                                response.status, 'IP', proxy)
            except (ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.delete(proxy)
                print('代理请求失败', proxy)

    async def _single_proxy(self, proxy):
        """
        Placeholder coroutine with no body; it currently does nothing.

        :param proxy: unused
        :return: None
        """

    def run(self):
        """
        Test every stored proxy, ``data['BATCH_TEST_SIZE']`` at a time.

        Sleeps five seconds after each batch. Any exception is reported on
        stdout and ends the run.

        :return: None
        """
        print('测试器开始运行')
        try:
            count = self.redis.count()
            print('当前剩余', count, '个代理')
            batch_size = self.data['BATCH_TEST_SIZE']
            for start in range(0, count, batch_size):
                stop = min(start + batch_size, count)
                print('正在测试第', start + 1, '-', stop, '个代理')
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                # asyncio.wait() rejects bare coroutines on Python 3.11+,
                # so wrap each one in a Task first.
                tasks = [
                    loop.create_task(self.test_single_proxy(proxy))
                    for proxy in test_proxies
                ]
                if not tasks:
                    # asyncio.wait() raises ValueError on an empty set.
                    continue
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('测试器发生错误', e.args)


# Tester().run()
Пример #23
0
class Tester(object):
    """Tests the proxies stored in Redis, deleting those that fail."""

    def __init__(self):
        """Create the Redis client used to read and score proxies."""
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy.

        Awaits ``requests.get`` for TEST_URL through the proxy with
        browser-like headers. A status in VALID_STATUS_CODES calls
        ``redis.max``; any other status, or a caught client/timeout error,
        calls ``redis.delete``.

        NOTE(review): ``requests`` must be an awaitable, aiohttp-style
        client here (``response.status`` is read); confirm against the
        file's imports.

        :param proxy: proxy address as ``str`` or UTF-8 encoded ``bytes``
        :return: None
        """
        headers = {
            'accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'accept-encoding':
            'gzip, deflate, br',
            'accept-language':
            'zh-CN,zh;q=0.9',
            'user-agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
        }
        try:
            if isinstance(proxy, bytes):
                proxy = proxy.decode('utf-8')
            options = {
                "url": TEST_URL,
                "headers": headers,
                "proxy": 'http://' + proxy,
                "timeout": 5,
                "allow_redirects": False,
                "verify_ssl": False,
            }

            response = await requests.get(**options)
            print(response.status)
            if response.status in VALID_STATUS_CODES:
                self.redis.max(proxy)
            else:
                print('请求响应码不合法 ', response.status, 'IP', proxy)
                self.redis.delete(proxy)
        except (ClientError, aiohttp.client_exceptions.ClientConnectorError,
                asyncio.TimeoutError, AttributeError):
            self.redis.delete(proxy)
            print('代理请求失败', proxy)

    def run(self):
        """
        Test every stored proxy, BATCH_TEST_SIZE proxies at a time.

        Sleeps five seconds after each batch. Any exception is reported on
        stdout and ends the run.

        :return: None
        """
        print('测试器开始运行')
        try:
            count = self.redis.count()
            print('当前剩余', count, '个代理')
            for start in range(0, count, BATCH_TEST_SIZE):
                stop = min(start + BATCH_TEST_SIZE, count)
                print('正在测试第', start + 1, '-', stop, '个代理')
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                # asyncio.wait() rejects bare coroutines on Python 3.11+,
                # so wrap each one in a Task first.
                tasks = [
                    loop.create_task(self.test_single_proxy(proxy))
                    for proxy in test_proxies
                ]
                if not tasks:
                    # asyncio.wait() raises ValueError on an empty set.
                    continue
                loop.run_until_complete(asyncio.wait(tasks))
                time.sleep(5)
        except Exception as e:
            print('测试器发生错误', e.args)