Example #1
class Getter:
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        # The pool is full once the stored count reaches the cap.
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        print('Getter starts running')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self.redis.add(proxy)
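
Example #1 leans on several module-level names it does not define. A minimal sketch of the assumed scaffolding, following the db/crawler module layout visible in Example #8 below; the threshold value itself is an assumption:

# Scaffolding Example #1 assumes; module layout follows Example #8 below.
import sys

from db import RedisClient
from crawler import Crawler

POOL_UPPER_THRESHOLD = 10000  # assumed cap on the number of pooled proxies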
Example #2
class Getter:
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Check whether the proxy pool has reached its limit.
        """
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        print('Getter starts running')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)
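
The __CrawlFunc__ and __CrawlFuncCount__ attributes used by Examples #1 and #2 are conventionally produced by a metaclass that collects every crawl_* method defined on the Crawler. A sketch of that pattern, assuming this convention; the crawl_daili66 stub is purely illustrative:

# Conventional metaclass that fills in __CrawlFunc__/__CrawlFuncCount__;
# the crawl_daili66 body is an illustrative stub, not real parsing code.
class ProxyMetaclass(type):
    def __new__(mcs, name, bases, attrs):
        attrs['__CrawlFunc__'] = [k for k in attrs if k.startswith('crawl_')]
        attrs['__CrawlFuncCount__'] = len(attrs['__CrawlFunc__'])
        return super().__new__(mcs, name, bases, attrs)


class Crawler(object, metaclass=ProxyMetaclass):
    def get_proxies(self, callback):
        # callback is a method name collected by the metaclass
        return list(getattr(self, callback)())

    def crawl_daili66(self):
        # Stub: a real crawler would fetch and parse a listing page here
        yield '127.0.0.1:8888'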
Example #3
class Tester(object):
    def __init__(self):
        self.db = RedisClient()

    async def test_single_proxy(self, proxy):
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode("utf-8")
                real_proxy = "http://" + proxy
                async with session.get(url=TEST_URL,
                                       proxy=real_proxy,
                                       timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALUE_CODE:
                        print("代理", proxy, "有效,分数置为100")
                        self.db.max(proxy)
                    else:
                        print("代理", proxy, "响应码,分数减1")
                        self.db.decrease(proxy)
            except Exception:
                print("代理", proxy, "请求出错,分数减1")
                self.db.decrease(proxy)

    def run(self):
        count = self.db.count()
        print('Currently', count, 'proxies remaining')
        for i in range(0, count, BATCH):
            start = i
            end = min(i + BATCH, count)
            proxies = self.db.batch(start, end)
            print('Testing proxies', start + 1, '-', end)
            loop = asyncio.get_event_loop()
            tasks = [self.test_single_proxy(proxy) for proxy in proxies]
            loop.run_until_complete(asyncio.wait(tasks))
            time.sleep(5)
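
All of these examples talk to the same small RedisClient wrapper. A hedged sketch of the interface the call sites imply, backed by a Redis sorted set whose score tracks proxy quality; the key name and score constants are assumptions:

# Hedged sketch of the RedisClient interface the examples assume: a
# sorted set keyed by proxy, with the score used as a quality measure.
# Key name and score constants are assumptions inferred from call sites.
import redis

REDIS_KEY = 'proxies'  # assumed sorted-set key
INITIAL_SCORE = 10     # assumed score for a newly added proxy
MAX_SCORE = 100        # assumed score for a verified proxy
MIN_SCORE = 0          # assumed floor below which a proxy is dropped


class RedisClient(object):
    def __init__(self, host='localhost', port=6379):
        self.db = redis.StrictRedis(host=host, port=port, decode_responses=True)

    def add(self, proxy, score=INITIAL_SCORE):
        # Only add proxies that are not already in the pool
        if not self.db.zscore(REDIS_KEY, proxy):
            return self.db.zadd(REDIS_KEY, {proxy: score})

    def max(self, proxy):
        # A proxy that passed a test gets the maximum score
        return self.db.zadd(REDIS_KEY, {proxy: MAX_SCORE})

    def decrease(self, proxy):
        # Decrement on failure; remove once the score hits the floor
        score = self.db.zscore(REDIS_KEY, proxy)
        if score and score > MIN_SCORE:
            return self.db.zincrby(REDIS_KEY, -1, proxy)
        return self.db.zrem(REDIS_KEY, proxy)

    def count(self):
        return self.db.zcard(REDIS_KEY)

    def batch(self, start, stop):
        # Page through proxies by rank, highest scores first
        return self.db.zrevrange(REDIS_KEY, start, stop - 1)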
Example #4
class Fetcher:
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()
    
    def is_over_threshold(self):
        """
        Check whether the proxy pool has reached its size limit.
        """
        return self.redis.count() >= POOL_UPPER_THRESHOLD
    
    def run(self):
        print('Getter starts running')
        if not self.is_over_threshold():
            for func in self.crawler.get_funclist():
                # Fetch proxy addresses from each proxy listing site
                proxies = self.crawler.get_proxies(func)
                sys.stdout.flush()
                for proxy in proxies:
                    # Push the fetched proxy into the redis queue
                    self.redis.add(proxy)
Example #5
File: getter.py Project: ttly20/mypp
class Getter:

    def __init__(self):
        """Initialize the database client and the crawler."""
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Determine whether the proxy pool is full."""
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    async def run(self):
        print('Start fetching proxies...')
        if not self.is_over_threshold():
            for i in range(self.crawler.CrawlFuncCount):
                crawl_func = self.crawler.CrawlFunc[i]
                proxies = await self.crawler.get_proxy(crawl_func)
                for proxy in proxies:
                    print(proxy)
                    self.redis.add(proxy)
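
Because this variant's run() is a coroutine, something has to drive the event loop. A minimal driver sketch (Python 3.7+):

# Minimal driver for the coroutine-based Getter above (Python 3.7+)
import asyncio

if __name__ == '__main__':
    getter = Getter()
    asyncio.run(getter.run())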
Example #6
class Saver:
    """
    Crawl proxies and store them in the redis database.
    """
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Check whether the pool already holds enough proxies.
        """
        return self.redis.count() >= settings.proxy_enough_count

    def run(self):
        print('Getter starts running')
        if not self.is_over_threshold():
            for crawl_func in self.crawler.crawl_funcs:
                proxies = self.crawler.get_proxies(crawl_func)
                for proxy in proxies:
                    self.redis.add(proxy)
Example #7
class Tester(object):
    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy.
        :param proxy: the proxy to test
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                pattern = re.compile(r'(\d+\.\d+\.\d+\.\d+:\d+)')
                result = re.findall(pattern, proxy)
                real_proxy = 'http://' + result[0]
                print('Testing', proxy)
                async with session.get(TEST_URL,
                                       proxy=real_proxy,
                                       timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print('Proxy is valid', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('Invalid response status', response.status, 'IP', proxy)
            except (ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('Proxy request failed', proxy)
            except Exception:
                self.redis.decrease(proxy)
                print('Proxy request failed, reason unknown', proxy)

    def run(self):
        """
        Main test routine.
        """
        print('Tester starts running')
        try:
            count = self.redis.count()
            print('Currently', count, 'proxies remaining')
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('Testing proxies', start + 1, '-', stop)
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                tasks = [
                    self.test_single_proxy(proxy) for proxy in test_proxies
                ]
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(1)
        except Exception as e:
            print('Tester encountered an error', e.args)
Example #8
from db import RedisClient
from crawler import Crawler

if __name__ == "__main__":
    redisclient = RedisClient()
    print(redisclient.count())
    crawler = Crawler()
    results = crawler.crawl_dail66()
    for result in results:
        print(result)
Example #9
class Tester(object):
    def __init__(self):
        self.redis = RedisClient()

    # Test a single proxy
    async def test_single_proxy(self, proxy):
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('Testing', proxy)
                async with session.get(
                        TEST_URL,
                        proxy=real_proxy,
                        timeout=15,
                        allow_redirects=False,
                        headers={
                            'Host': 'xueqiu.com',
                            'Referer': 'https://xueqiu.com/u/8205178197',
                            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
                            'X-Requested-With': 'XMLHttpRequest',
                            'Cookie': 'device_id=5d27463e2df6a534e7ecba029eb95e29; xq_a_token=f89219d7e7ee863a5773244ad9d2db6e3dc5ea38; xq_r_token=8bdf53186f54b2c5c885621e64fd4d728f3111e0;',
                        }) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print('Proxy is valid', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('Invalid response status', response.status, 'IP', proxy)
            except (ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('Proxy request failed', proxy)

    # Run the tests
    def run(self):
        print('Tester starts running')
        try:
            count = self.redis.count()
            print('Currently', count, 'proxies remaining')
            # Test in batches
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('Testing proxies', start + 1, '-', stop)
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                tasks = [
                    self.test_single_proxy(proxy) for proxy in test_proxies
                ]
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('Tester encountered an error', e.args)
Example #10
class Tester(object):
    def __init__(self):
        self.redis = RedisClient()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
        }

    # async def test_single_proxy(self, proxy):
    def test_single_proxy(self, proxy):
        """
        Test a single proxy.
        :param proxy:
        :return:
        """
        # conn = aiohttp.TCPConnector(verify_ssl=False)
        # async with aiohttp.ClientSession(connector=conn) as session:
        #     try:
        #         if isinstance(proxy, bytes):
        #             proxy = proxy.decode('utf-8')
        #         real_proxy = 'http://' + proxy
        #         print('Testing', proxy)
        #         async with session.get(TEST_URL, proxy=real_proxy, timeout=5, allow_redirects=False) as response:
        #             if response.status in VALID_STATUS_CODES:
        #                 self.redis.max(proxy)
        #                 print('Proxy is valid', proxy)
        #             else:
        #                 self.redis.decrease(proxy)
        #                 print('Invalid response status', response.status, 'IP', proxy)
        #     except (ClientError, aiohttp.client_exceptions.ClientConnectorError, asyncio.TimeoutError, AttributeError):
        #         self.redis.decrease(proxy)
        #         print('Proxy request failed', proxy)
        # if isinstance(proxy, bytes):
        #     proxy = proxy.decode('utf-8')
        ip, port = proxy.split(':')
        print('Testing', proxy)
        try:
            conn = http.client.HTTPConnection(ip, int(port), timeout=5.0)
            conn.request(method='GET', url=TEST_URL, headers=self.headers)
            res = conn.getresponse()
            print("+++Success:" + proxy)
            self.redis.max(proxy)
        except Exception:
            print("---Failure:" + proxy)
            self.redis.decrease(proxy)

    def run(self):
        """
        Main test routine.
        :return:
        """
        print('Tester starts running')
        try:
            count = self.redis.count()
            print('Currently', count, 'proxies remaining')
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('Testing proxies', start + 1, '-', stop)
                test_proxies = self.redis.batch(start, stop)
                # loop = asyncio.get_event_loop()
                # tasks = [self.test_single_proxy(proxy) for proxy in test_proxies]
                for proxy in test_proxies:
                    self.test_single_proxy(proxy)
                    time.sleep(0.5)
                # loop.run_until_complete(asyncio.wait(tasks))
                # sys.stdout.flush()
                # time.sleep(5)
        except Exception as e:
            print('Tester encountered an error', e.args)
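
The same synchronous check can be written with requests, which folds proxy handling and the status check into one call. A sketch under the same TEST_URL assumption; requests is a swapped-in library, not part of this project:

# Alternative synchronous check using requests instead of http.client.
# TEST_URL and the scoring methods follow the excerpt; requests itself
# is a substitution, not part of the original project.
import requests

def test_proxy_with_requests(tester, proxy):
    proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
    try:
        resp = requests.get(TEST_URL, proxies=proxies,
                            headers=tester.headers, timeout=5)
        if resp.status_code == 200:
            tester.redis.max(proxy)
        else:
            tester.redis.decrease(proxy)
    except requests.RequestException:
        tester.redis.decrease(proxy)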
Example #11
class IpValidation(Utility):
    def __init__(self):
        self.redis = RedisClient()
        self.real_ip = ''
        # Score deducted on each failed validation
        self.minus_every_time = (INITIAL_SCORE -
                                 DISCARD_SCORE) // VALIDATE_TIME
        self.key = PROXY_ORIGINAL
        self.anon_check_url = 'http://httpbin.org/ip'

    @staticmethod
    async def is_proxy_valid(proxy, url=TEST_URL):
        ua = get_random_ua()
        headers = {'User-Agent': ua}
        try:
            conn = aiohttp.TCPConnector(verify_ssl=False)
            async with aiohttp.ClientSession(headers=headers,
                                             connector=conn) as session:
                async with session.get(url, proxy=proxy, ssl=False) as resp:
                    code = resp.status
                    if 200 <= code < 300:
                        logger.info('%s is valid' % proxy)
                        return True
                    else:
                        logger.info('%s is invalid, code: %s' % (proxy, code))
                        return False
        except (ClientConnectionError, ClientHttpProxyError, TimeoutError,
                CancelledError, ClientProxyConnectionError, Exception) as e:
            logger.warning(e)
            return False

    async def is_high_anon(self, proxy):
        url = ANON_CHECK_URL
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, proxy=proxy, ssl=False,
                                       timeout=15) as resp:
                    code = resp.status
                    if 200 <= code < 300:
                        x_forwarded_for_json = await resp.json()
                        if self.anon_check_url == ANON_CHECK_URL:
                            x_forwarded_for = x_forwarded_for_json['origin']
                        else:
                            # Adjust this key to match your own endpoint
                            x_forwarded_for = x_forwarded_for_json[
                                'X-Forwarded-For']
                        if self.real_ip in x_forwarded_for:
                            return False
                        return True
                    return False
        except (ClientConnectionError, ClientHttpProxyError, TimeoutError,
                CancelledError, ClientProxyConnectionError, Exception) as e:
            logger.warning('proxy: %s, %s' % (proxy, e))
            return False

    async def test_proxy(self, proxy):
        try:
            if len(proxy.split('-')[1]) > 1:
                if not await self.is_high_anon(
                        proxy.split('-')[1].replace('https://', 'http://')):
                    self.redis.adjust_score(proxy,
                                            -self.minus_every_time,
                                            key=self.key)
                else:
                    self.redis.adjust_score(proxy, +1, key=self.key)
        except CancelledError as e:
            logger.warning('proxy: %s, %s' % (proxy, e))

    def get_real_ip(self):
        resp = requests.get(ANON_CHECK_URL)
        if self.anon_check_url == ANON_CHECK_URL:
            self.real_ip = resp.json()['origin'].split(',')[0]
        else:
            self.real_ip = resp.json()['X-Real-Ip']

    def run_validation(self, key=None):
        if key:
            self.key = key
        logger.info('start checking...')
        start, end = DISCARD_SCORE + 1, INITIAL_SCORE
        while True:
            proxy_unvalidated = self.redis.count(start, end, name=self.key)
            if proxy_unvalidated:
                logger.info('checking...')
                if proxy_unvalidated <= CONCURRENCY_TASK_LIMIT:
                    self.get_real_ip()
                    proxy_list = self.redis.get_proxy_by_score(
                        start, end, proxy_unvalidated, key=self.key)
                    # loop = asyncio.get_event_loop()
                    loop = asyncio.new_event_loop()
                    asyncio.set_event_loop(loop)
                    tasks = [self.test_proxy(proxy) for proxy in proxy_list]
                    loop.run_until_complete(asyncio.wait(tasks))
                else:
                    fetch_times = proxy_unvalidated // CONCURRENCY_TASK_LIMIT
                    left_nums = proxy_unvalidated % CONCURRENCY_TASK_LIMIT
                    for i in range(fetch_times):
                        self.get_real_ip()
                        proxy_list = self.redis.get_proxy_by_score(
                            start, end, CONCURRENCY_TASK_LIMIT, key=self.key)
                        # loop = asyncio.get_event_loop()
                        loop = asyncio.new_event_loop()
                        asyncio.set_event_loop(loop)
                        tasks = [
                            self.test_proxy(proxy) for proxy in proxy_list
                        ]
                        loop.run_until_complete(asyncio.wait(tasks))
                    proxy_list = self.redis.get_proxy_by_score(start,
                                                               end,
                                                               left_nums,
                                                               key=self.key)
                    # loop = asyncio.get_event_loop()
                    loop = asyncio.new_event_loop()
                    asyncio.set_event_loop(loop)
                    tasks = [self.test_proxy(proxy) for proxy in proxy_list]
                    loop.run_until_complete(asyncio.wait(tasks))
            import settings
            if not proxy_unvalidated and not settings.SPIDER_RUNNING:
                settings.SPIDER_RUNNING = True
                self.key = PROXY_ORIGINAL
                logger.info('crawl finished, all proxies checked')
                break
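
The three run_until_complete blocks in run_validation repeat the same loop setup, and the batch arithmetic is exactly what divmod computes. A sketch of the factored-out helper (a suggested refactor, not code from the project):

# Suggested refactor: one helper owns the event-loop setup, and divmod
# replaces the fetch_times/left_nums arithmetic. Not the original code.
import asyncio

def run_batch(validator, proxy_list):
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    tasks = [validator.test_proxy(proxy) for proxy in proxy_list]
    if tasks:
        loop.run_until_complete(asyncio.wait(tasks))
    loop.close()

# Inside run_validation, the batching then reads:
# fetch_times, left_nums = divmod(proxy_unvalidated, CONCURRENCY_TASK_LIMIT)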
Example #12
class DownLoader(Utility):
    def __init__(self):
        self.rules = rules
        self.spider = ProxySpider()
        self.redis = RedisClient()
        self.crack_anti_crawl = CrackAntiCrawl()

    def start_crawl(self):
        for start_urls in self.rules:
            urls = start_urls['resources']
            gfw = start_urls['GFW']
            name = start_urls['name']
            page_type = start_urls['type']
            referer = start_urls['referer']
            host = start_urls['host']
            anti_crawl = start_urls['AntiCrawl']

            cookies = None
            if anti_crawl:
                cookies = eval('crack()', {
                    'crack':
                    eval('self.crack_anti_crawl.crack_{}'.format(name))
                })

            ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:67.0) Gecko/20100101 Firefox/67.0'
            headers = {'User-Agent': ua, 'Referer': referer, 'Host': host}
            # loop = asyncio.get_event_loop()
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            tasks = [
                self.proxy_download(url, gfw, page_type, name, headers,
                                    cookies) for url in urls
            ]
            loop.run_until_complete(asyncio.wait(tasks))
            # Stop crawling once enough validated proxies are stored
            validated_proxy_num = self.redis.count(VALIDATED_SCORE,
                                                   VALIDATED_SCORE)
            if validated_proxy_num >= VALIDATED_PROXY_NUM:
                break
        settings.SPIDER_RUNNING = False
        logger.info('crawl finished')

    async def proxy_download(self, url, gfw, page_type, name, headers,
                             cookies):
        logger.info('downloading %s' % url)
        try:
            if not gfw:
                async with aiohttp.ClientSession(headers=headers,
                                                 cookies=cookies) as session:
                    # Random delay between requests (note: time.sleep blocks the event loop)
                    time.sleep(random.randint(1, 3) + random.random())
                    async with session.get(url, ssl=False) as r:
                        code = r.status
                        if 200 <= code < 300:
                            if page_type == 'normal':
                                text = await r.text()
                                try:
                                    await eval(
                                        'parse(text)', {
                                            'parse':
                                            eval('self.spider.parse_{}'.format(
                                                name)),
                                            'text':
                                            text
                                        })
                                except Exception:
                                    logger.error('parse_%s error' % name,
                                                 exc_info=True)
                            else:
                                text = await r.text()
                                try:
                                    await eval(
                                        'parse(text,api)', {
                                            'parse':
                                            eval('self.spider.parse_{}'.format(
                                                name)),
                                            'text':
                                            text,
                                            'api':
                                            1
                                        })
                                except Exception:
                                    logger.error('parse_%s error' % name,
                                                 exc_info=True)
                        else:
                            logger.error('page %s failed, status code: %s' %
                                         (url, code),
                                         exc_info=True)
            else:
                async with aiohttp.ClientSession(headers=headers,
                                                 cookies=cookies) as session:
                    # Random delay between requests (note: time.sleep blocks the event loop)
                    time.sleep(random.randint(1, 3) + random.random())
                    async with session.get(url, proxy=GFW_PROXY,
                                           ssl=False) as r:
                        code = r.status
                        if 200 <= code < 300:
                            if page_type == 'normal':
                                text = await r.text()
                                try:
                                    await eval(
                                        'parse(text)', {
                                            'parse':
                                            eval('self.spider.parse_{}'.format(
                                                name)),
                                            'text':
                                            text
                                        })
                                except Exception:
                                    logger.error('parse_%s error' % name,
                                                 exc_info=True)
                            else:
                                text = await r.text()
                                try:
                                    await eval(
                                        'parse(text,api)', {
                                            'parse':
                                            eval('self.spider.parse_{}'.format(
                                                name)),
                                            'text':
                                            text,
                                            'api':
                                            page_type
                                        })
                                except Exception:
                                    logger.error('parse_%s error' % name,
                                                 exc_info=True)
                        else:
                            logger.error('page %s failed, status code: %s' %
                                         (url, code),
                                         exc_info=True)
        except (ClientConnectionError, ClientHttpProxyError,
                ClientProxyConnectionError, CancelledError, Exception):
            logger.error('page %s failed' % url, exc_info=True)
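
The nested eval() calls above only look up a parse_<name> coroutine and call it, which getattr does directly. A sketch of the simpler dispatch (a suggested rewrite, not the original code):

# Suggested rewrite of the eval() chains: resolve the site-specific
# parser by name with getattr and await it. Not the original code.
async def dispatch_parse(spider, name, text, api=None):
    parse = getattr(spider, 'parse_{}'.format(name))
    if api is None:
        await parse(text)
    else:
        await parse(text, api)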
Example #13
class Tester:
    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy.
        """
        # If the proxy is bytes, decode it as utf-8
        if isinstance(proxy, bytes):
            proxy = proxy.decode('utf-8')
        real_proxy = 'http://' + proxy
        # Skip SSL verification
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                # Request httpbin
                async with session.get(TEST_URL,
                                       proxy=real_proxy,
                                       timeout=7,
                                       allow_redirects=False) as req:
                    # # Read the response body
                    # response_content = await req.json()
                    # ip_response = response_content['origin']
                    # # Extract the IP seen by the server
                    # judge_proxy = re.search('(.*):', proxy).group(1)
                    # # Check whether the responding IP matches the proxy
                    # if ip_response == judge_proxy:
                    #     # Set the proxy score to the maximum
                    #     self.redis.max(proxy)
                    #     # print('Proxy is valid', proxy)
                    if req.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                    else:
                        self.redis.decrease(proxy)

            except Exception as e:
                # print(e.args)
                # On any exception, decrease the proxy score by one
                self.redis.decrease(proxy)
                # print('Proxy unavailable, score -1', proxy)

    def run(self):
        """
        Test proxies in batches.
        """
        try:
            # Get the current number of proxies in the pool
            count = self.redis.count()
            print('Currently', count, 'proxies in total!')
            # Test the proxies in batches
            for i in range(0, count, BATCH_SIZE):
                start = i
                stop = min(i + BATCH_SIZE, count - 1)
                print('Testing proxies', start + 1, '-', stop)
                proxies_list = self.redis.batch(start, stop)
                # Start an event loop
                loop = asyncio.get_event_loop()
                # Wrap the coroutine objects as tasks
                task = [
                    self.test_single_proxy(proxy) for proxy in proxies_list
                ]
                # Run them
                loop.run_until_complete(asyncio.wait(task))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('Tester encountered an error', e.args)
Example #14
class Tester(object):
    def __init__(self):
        self.redis = RedisClient()

    # async marks this method as a coroutine
    async def test_single_proxy(self, proxy):
        """
        Test a single proxy.
        :param proxy:
        :return:
        """

        conn = aiohttp.TCPConnector(ssl=False)
        # Create a session object
        # A session supports multiple operations: post, get, put, head, etc.
        async with aiohttp.ClientSession(connector=conn) as session:

            # Decode the proxy if it arrives as bytes
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('Testing', proxy)

                # Issue a GET through the session object
                async with session.get(TEST_URL,
                                       proxy=real_proxy,
                                       timeout=15,
                                       allow_redirects=False) as response:
                    # If the status code is valid
                    if response.status in VALID_STATUS_CODES:
                        # Set the proxy score to the maximum
                        self.redis.max(proxy)
                        print('Proxy is valid', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('Invalid response status', response.status, 'IP', proxy)
            except (ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('Proxy request failed', proxy)

    def run(self):
        """
        Main test routine.
        :return:
        """
        print('Tester starts running')
        try:
            count = self.redis.count()
            print('Currently', count, 'proxies remaining')

            # Take BATCH_TEST_SIZE proxies at a time
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                # min() ensures the last partial batch is included
                stop = min(i + BATCH_TEST_SIZE, count)
                print('Testing proxies', start + 1, '-', stop)
                test_proxies = self.redis.batch(start, stop)

                # Calling this avoids the "event loop is already running" error
                nest_asyncio.apply()

                # asyncio.get_event_loop() creates the event loop when called from the main thread
                loop = asyncio.get_event_loop()

                # tasks is a list of coroutine objects, one per proxy
                tasks = [
                    self.test_single_proxy(proxy) for proxy in test_proxies
                ]
                # Hand the tasks to the loop's run_until_complete() method
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('Tester encountered an error', e.args)
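
On Python 3.7+ the loop juggling and nest_asyncio can be avoided entirely by wrapping each batch in one coroutine and handing it to asyncio.run. A sketch of that alternative (not the original code):

# Alternative without nest_asyncio (Python 3.7+): gather the batch in a
# single coroutine and let asyncio.run own the event loop. A suggested
# rewrite, not the original code.
import asyncio

async def test_batch(tester, proxies):
    await asyncio.gather(*(tester.test_single_proxy(p) for p in proxies))

def run_batch(tester, proxies):
    asyncio.run(test_batch(tester, proxies))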
Example #15
class Verify:
    def __init__(self):
        self.db = RedisClient()

    async def verify_proxy(self, redis_key, proxy):
        '''
        Validate a single proxy.
        :param proxy:
        :return:
        '''
        if isinstance(proxy, bytes):
            proxy = proxy.decode('utf-8')
        re_proxy = 'http://' + proxy

        conn = aiohttp.TCPConnector(ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                async with session.get(setting.TEST_URL,
                                       proxy=re_proxy,
                                       timeout=6,
                                       allow_redirects=False) as resp:
                    if resp.status in [200, 302]:
                        print("{}||{}池:{}: ok 100点".format(
                            time.ctime(), redis_key, proxy))
                        self.db.max(redis_key, proxy)
                    else:
                        print("{}||{}池:{}: fail -1点".format(
                            time.ctime(), redis_key, proxy))
                        self.db.decrease(redis_key, proxy)
            except (aiohttp.ClientError, aiohttp.ClientConnectorError,
                    asyncio.TimeoutError) as e:
                print("{}||{}池:{}: error -1点".format(time.ctime(), redis_key,
                                                     proxy))
                self.db.decrease(redis_key, proxy)

    # async def run_by_redis(self, redis_key):
    #     count = self.db.count(redis_key)
    #     print(redis_key, 'currently has', count, 'proxies left')
    #     for i in range(0, count, setting.TEST_SIZE):
    #         start = i
    #         end = min(i + setting.TEST_SIZE, count) - 1
    #         print('Testing {} proxies'.format(redis_key), start + 1, '-', end + 1)
    #         proxies = self.db.batch(redis_key, start, end)
    #         for proxy in proxies:
    #             await self.verify_proxy(redis_key, proxy)
    #
    # def run(self):
    #     print("Start verifying proxies")
    #     try:
    #         tasks = [
    #             self.run_by_redis(setting.REDIS_KEY_HTTP),
    #             self.run_by_redis(setting.REDIS_KEY_HTTPS)
    #         ]
    #         loop = asyncio.get_event_loop()
    #         loop.run_until_complete(asyncio.wait(tasks))
    #         time.sleep(5)
    #     except Exception as e:
    #         print('Verification error: ', e)

    def run_verify_http(self, part):
        stime = time.time()

        count = self.db.count(setting.REDIS_KEY_HTTP)
        start = part * (count // 4)
        stop = start + (count // 4)
        if part == 3:
            stop = count
        try:
            logger.info("{}开始验证{}-{}".format(setting.REDIS_KEY_HTTP, start,
                                             stop))

            for i in range(start, stop, setting.HTTP_VERIFY_SIZE):
                proxies = self.db.batch(setting.REDIS_KEY_HTTP, i,
                                        i + setting.HTTP_VERIFY_SIZE)
                loop = asyncio.get_event_loop()
                tasks = [
                    self.verify_proxy(setting.REDIS_KEY_HTTP, proxy)
                    for proxy in proxies
                ]
                loop.run_until_complete(asyncio.wait(tasks))

            logger.info("{}验证完成{}-{}耗时:{}".format(setting.REDIS_KEY_HTTP,
                                                  start, stop,
                                                  time.time() - stime))
        except Exception as e:
            logger.info('{} verification error {}-{}: {}'.format(
                setting.REDIS_KEY_HTTP, start, stop, e))

    def run_verify_https(self):
        stime = time.time()
        try:
            logger.info("{}开始验证".format(setting.REDIS_KEY_HTTPS))

            count = self.db.count(setting.REDIS_KEY_HTTPS)
            for i in range(0, count, setting.HTTP_VERIFY_SIZE):
                proxies = self.db.batch(setting.REDIS_KEY_HTTPS, i,
                                        i + setting.HTTP_VERIFY_SIZE)
                loop = asyncio.get_event_loop()
                tasks = [
                    self.verify_proxy(setting.REDIS_KEY_HTTPS, proxy)
                    for proxy in proxies
                ]
                loop.run_until_complete(asyncio.wait(tasks))

            logger.info("{}验证完成,耗时:{}".format(setting.REDIS_KEY_HTTPS,
                                              time.time() - stime))
        except Exception as e:
            logger.warning('{} verification error: {}'.format(setting.REDIS_KEY_HTTPS, e))
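
run_verify_http takes a part index (0-3), which suggests the HTTP set is meant to be split across four workers. A hedged sketch of how the slices might be launched in parallel; the process layout is an assumption:

# Hedged sketch: run the four HTTP verification slices in parallel, one
# process per part index, as run_verify_http's signature suggests. The
# process layout is an assumption, not taken from the project.
from multiprocessing import Process

def _verify_part(part):
    Verify().run_verify_http(part)  # each worker builds its own client

def verify_all():
    workers = [Process(target=_verify_part, args=(part,)) for part in range(4)]
    for worker in workers:
        worker.start()
    Verify().run_verify_https()  # HTTPS runs in the main process meanwhile
    for worker in workers:
        worker.join()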
Example #16
def count():
    r = RedisClient()
    return str(r.count())
Example #17
File: count.py Project: hssss1867/fastapi
from db import RedisClient
import time

db = RedisClient()
while True:
    print(db.count())
    time.sleep(10)
Example #18
File: tester.py Project: sleepray/Python-
class Tester(object):
    def __init__(self):
        self.redis = RedisClient()

    # async before the def makes this method a coroutine
    async def test_single_proxy(self, proxy):
        '''
        Test a single proxy.
        :param proxy: 
        :return: 
        '''
        # verify_ssl=False avoids SSL certificate errors
        conn = aiohttp.TCPConnector(verify_ssl=False)
        # Create a session object (a session stores state for a conversation)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):  # check whether proxy is bytes
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('Testing', proxy)
                # allow_redirects=False disables redirects
                async with session.get(TEST_URL,
                                       proxy=real_proxy,
                                       timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)  # db max() sets the score to 100
                        print('Proxy is valid', proxy)
                    else:
                        self.redis.decrease(proxy)  # db decrease() lowers the score by one
                        print('Invalid response status', response.status, 'IP', proxy)
            except (ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('Proxy request failed', proxy)

    def run(self):
        '''
        Main test routine.
        :return: 
        '''
        print('Tester starts running')
        try:
            count = self.redis.count()
            print('Currently', count, 'proxies remaining')
            for i in range(0, count, BATCH_TEST_SIZE):  # step through in batches of BATCH_TEST_SIZE
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('Testing proxies', start + 1, '-', stop)
                # db batch() returns a batch of proxies ordered from high score to low
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()  # get the EventLoop
                tasks = [
                    self.test_single_proxy(proxy) for proxy in test_proxies
                ]
                # Run the async tasks; while one request waits on the network,
                # the loop switches to other tasks and resumes when data arrives
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()  # flush output immediately instead of at the end of the run
                time.sleep(5)
        except Exception as e:
            print('Tester encountered an error', e.args)
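
Most of these projects tie a Getter and a Tester together with a small scheduler that runs each on its own cycle in a separate process. A sketch of that glue, assuming the Getter and Tester classes from the examples above; the interval constants are assumptions:

# Sketch of the scheduler glue these projects typically use: run the
# Tester and a Getter on fixed cycles in separate processes. Interval
# constants are assumptions.
import time
from multiprocessing import Process

TESTER_CYCLE = 20   # hypothetical seconds between test passes
GETTER_CYCLE = 300  # hypothetical seconds between crawl passes

def schedule_tester(cycle=TESTER_CYCLE):
    tester = Tester()
    while True:
        tester.run()
        time.sleep(cycle)

def schedule_getter(cycle=GETTER_CYCLE):
    getter = Getter()
    while True:
        getter.run()
        time.sleep(cycle)

if __name__ == '__main__':
    Process(target=schedule_tester).start()
    Process(target=schedule_getter).start()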