예제 #1
0
 def __init__(self):
     """
     Initialise the worker pool, the queue and the database access object.
     """
     # NOTE(review): the original comment calls this a process pool; the
     # Pool import is outside this excerpt, so the concrete type is unknown.
     self.pool = Pool()
     self.queue = Queue()
     self.mongo_pool = MongoPool()
예제 #2
0
class ProxyAip(object):
    """Flask front end over the proxies reachable through ``MongoPool``.

    Routes registered in ``__init__``:

    * ``/`` -- usage hint.
    * ``/random`` -- one proxy as ``[protocol://]ip:port`` text.
    * ``/proxies`` -- JSON list of proxy attribute dicts.
    * ``/disable_domain`` -- mark ``domain`` as unusable for ``ip``.
    """

    def __init__(self):
        """Create the Flask app, the database helper and register routes."""
        self.app = Flask(__name__)
        self.mongo = MongoPool()

        @self.app.route('/')
        def hello_world():
            """Return a short usage hint."""
            tips = '/random?protocol&domain&count'

            return tips

        @self.app.route('/random')
        def random():
            """Return a single proxy picked by ``MongoPool.random_proxy``."""
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')
            proxy = self.mongo.random_proxy(protocol,
                                            domain,
                                            count=PROXIES_MAX_COUT)

            # NOTE(review): ``prot`` looks like a misspelling of ``port``,
            # but the Proxy class is not visible here -- confirm before
            # renaming.
            if protocol:
                return '{}://{}:{}'.format(protocol, proxy.ip, proxy.prot)
            return '{}:{}'.format(proxy.ip, proxy.prot)

        @self.app.route('/proxies')
        def proxies():
            """Return the matching proxies as a JSON array of dicts."""
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')
            found = self.mongo.get_proxies(protocol,
                                           domain,
                                           count=PROXIES_MAX_COUT)

            return json.dumps([proxy.__dict__ for proxy in found])

        @self.app.route('/disable_domain')
        def disable_domain():
            """Register ``domain`` as unusable for the proxy at ``ip``."""
            ip = request.args.get('ip')
            domain = request.args.get('domain')
            if ip is None:
                return '提供ip参数'
            if domain is None:
                return '提供domain参数'
            self.mongo.disable_domain(ip, domain)
            # Both values are interpolated; the previous template used
            # "()" and silently dropped the domain.
            return '{}禁用{}成功!'.format(ip, domain)

    def run(self):
        """Serve the app on 127.0.0.1:8000 (blocks)."""
        self.app.run('127.0.0.1', port=8000)

    @classmethod
    def start(cls):
        """Build an instance of ``cls`` and start serving."""
        proxy_api = cls()
        proxy_api.run()
예제 #3
0
 def __init__(self):
     """Create the database helper, the work queue and the pool."""
     # Object used to operate on the database
     self.mongo_pool = MongoPool()
     # Queue of proxies waiting to be checked
     self.queue = Queue()
     # Coroutine pool (per the original comment; Pool import not visible)
     self.coroutine_pool = Pool()
예제 #4
0
class Proxy_Api(object):
    """Flask service exposing proxies read through ``MongoPool``.

    Endpoints: ``/random``, ``/proxies`` and ``/disable_domain``.
    """

    def __init__(self):
        """Set up the Flask app, the database helper and the routes."""
        self.app = Flask(__name__)
        self.mongo_pool = MongoPool()

        @self.app.route('/random')
        def random():
            """Return one proxy as ``[protocol://]ip:port``."""
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')
            chosen = self.mongo_pool.random_proxy(
                protocol, domain, count=PROXIES_MAX_COUNT, nick_type=2)
            if not protocol:
                return '{}:{}'.format(chosen.ip, chosen.port)
            return '{}://{}:{}'.format(protocol, chosen.ip, chosen.port)

        @self.app.route('/proxies')
        def proxies():
            """Return the matching proxies as a JSON array."""
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')

            found = self.mongo_pool.get_proxies(
                protocol, domain, count=PROXIES_MAX_COUNT)
            # Proxy objects are turned into plain dicts so json can encode them.
            payload = [vars(item) for item in found]
            print(json.dumps(payload))
            return json.dumps(payload, ensure_ascii=False)

        @self.app.route('/disable_domain')
        def disable_domain():
            """Mark ``domain`` as unusable for the proxy at ``ip``."""
            ip = request.args.get('ip')
            domain = request.args.get('domain')
            if ip is None:
                return '请提供ip这个参数'
            if domain is None:
                return '请提供domain这个参数'

            self.mongo_pool.disable_domain(ip, domain)
            return '{}禁用域名{}成功'.format(ip, domain)

    def run(self):
        """Serve the app on 127.0.0.1:16889 (blocks)."""
        self.app.run('127.0.0.1', port=16889)

    @classmethod
    def start(cls):
        """Instantiate the service and run it."""
        cls().run()
예제 #5
0
    def __init__(self):
        """Create the Flask app and register the proxy endpoints."""
        self.app = Flask(__name__)
        self.proxy_pool = MongoPool()

        @self.app.route('/random')
        def random():
            """Return one proxy as ``[protocol://]ip:port``."""
            # Both filters come from the query string and may be None.
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')

            picked = self.proxy_pool.random(
                protocol=protocol,
                domain=domain,
                count=settings.AVAILABLE_IP_COUNT)

            # Without a protocol only the bare ip:port pair is returned.
            if not protocol:
                return '{}:{}'.format(picked.ip, picked.port)
            return '{}://{}:{}'.format(protocol, picked.ip, picked.port)

        @self.app.route('/proxies')
        def proxies():
            """Return the matching proxies as a JSON array of dicts."""
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')

            matches = self.proxy_pool.get_proxies(
                protocol=protocol,
                domain=domain,
                count=settings.AVAILABLE_IP_COUNT)
            return json.dumps([entry.__dict__ for entry in matches])

        @self.app.route('/disable_domain')
        def disable_domain():
            """Record ``domain`` as unusable for the proxy at ``ip``."""
            ip = request.args.get('ip')
            domain = request.args.get('domain')
            if ip is None:
                return '请传入ip参数'
            if domain is None:
                return '请传入domain参数'

            self.proxy_pool.disable_domain(ip=ip, domain=domain)
            return '该IP添加不可用域名成功'
예제 #6
0
class RunSpider(object):
    """Run every spider listed in ``PROXIES_SPIDERS`` and store usable proxies."""

    def __init__(self):
        """Create the database helper and the pool used to run spiders."""
        self.mongo_pool = MongoPool()
        self.coroutine_pool = Pool()

    def get_spider_from_settings(self):
        """Yield one spider instance per dotted path in ``PROXIES_SPIDERS``.

        Each entry is split on its last dot into module path and class name;
        the module is imported dynamically and the class is instantiated
        with no arguments.
        """
        for full_class_name in PROXIES_SPIDERS:
            module_name, class_name = full_class_name.rsplit('.', maxsplit=1)
            module = importlib.import_module(module_name)
            spider_cls = getattr(module, class_name)
            yield spider_cls()

    def run(self):
        """Submit every spider to the pool and wait until all have finished."""
        for spider in self.get_spider_from_settings():
            self.coroutine_pool.apply_async(self.__run_one_spider,
                                            args=(spider, ))
        self.coroutine_pool.join()

    def __run_one_spider(self, spider):
        """Check each proxy produced by ``spider`` and insert the usable ones.

        Any exception raised while crawling or checking is logged and
        swallowed so that one failing spider does not affect the others.
        """
        try:
            for proxy in spider.get_proxies():
                time.sleep(0.1)
                checked_proxy = check_proxy(proxy)
                # Judge the object returned by check_proxy -- the one that
                # is actually stored -- rather than the raw input.
                if checked_proxy.speed != -1:
                    self.mongo_pool.insert(checked_proxy)
        except Exception:
            # logger.exception already appends the active traceback, so a
            # single call is enough.
            logger.exception("爬虫{} 出现错误".format(spider))

    @classmethod
    def start(cls):
        """Run once, then re-run every ``SPIDERS_RUN_INTERVAL`` hours forever."""
        rs = cls()
        rs.run()
        schedule.every(SPIDERS_RUN_INTERVAL).hours.do(rs.run)

        while True:
            schedule.run_pending()
            time.sleep(60)
예제 #7
0
class RunSpider(object):
    """Launch the configured proxy spiders and store what they return."""

    def __init__(self):
        """Create the database helper and the pool used to run spiders."""
        self.mongo_pool = MongoPool()
        self.coroutine_pool = Pool()

    def get_spider_from_settings(self):
        """Lazily yield one spider instance per entry of ``PROXIES_SPDERS``."""
        for dotted_path in PROXIES_SPDERS:
            # Split on the last dot only: "<module path>.<class name>".
            module_path, attr_name = dotted_path.rsplit('.', maxsplit=1)
            spider_class = getattr(importlib.import_module(module_path), attr_name)
            yield spider_class()

    def run(self):
        """Submit every spider to the pool, then wait for all of them."""
        for spider in self.get_spider_from_settings():
            self.coroutine_pool.apply_async(
                self.__execute_one_spider_task, args=(spider,))
        self.coroutine_pool.join()

    def __execute_one_spider_task(self, spider):
        """Check and store every proxy of one spider; log any exception."""
        try:
            for raw in spider.get_proxies():
                self.mongo_pool.insert_one(check_proxy(raw))
        except Exception as ex:
            logger.exception(ex)

    @classmethod
    def start(cls):
        """Run once, then again every ``RUN_SPDERS_INTERVAL`` minutes."""
        runner = RunSpider()
        runner.run()
        schedule.every(RUN_SPDERS_INTERVAL).minutes.do(runner.run)

        while True:
            schedule.run_pending()
            time.sleep(1)
예제 #8
0
class DbProxiesCheck(object):
    """Re-check every stored proxy and update or delete it by score."""

    def __init__(self):
        """Create the database helper, the work queue and the pool."""
        self.mongo_pool = MongoPool()
        # Proxies waiting to be checked
        self.queue = Queue()
        self.coroutine_pool = Pool()

    def __check_callback(self, temp):
        """Schedule the next check once a worker has finished.

        :param temp: result handed over by the pool; unused.
        """
        # Pass the bound methods themselves. Calling them here would run the
        # check synchronously and register ``None`` as the callback.
        self.coroutine_pool.apply_async(self.__check_one,
                                        callback=self.__check_callback)

    def run(self):
        """Queue every stored proxy, start the workers and wait for the queue."""
        for proxy in self.mongo_pool.find_all():
            self.queue.put(proxy)

        # Each worker re-schedules itself through the callback.
        for _ in range(TEST_PROXIES_ASYNC_COUNT):
            self.coroutine_pool.apply_async(self.__check_one,
                                            callback=self.__check_callback)
        # Block until every queued proxy has been marked done.
        self.queue.join()

    def __check_one(self):
        """Take one proxy from the queue, check it and persist the outcome."""
        proxy = self.queue.get()
        try:
            checked_proxy = check_proxy(proxy)

            if checked_proxy.speed == -1:
                checked_proxy.score -= 1
                # "<= 0" also removes proxies whose stored score was
                # already at or below zero.
                if checked_proxy.score <= 0:
                    self.mongo_pool.delete(checked_proxy)
                else:
                    self.mongo_pool.update(checked_proxy)
            else:
                checked_proxy.score = MAX_SCORE
                self.mongo_pool.update(checked_proxy)
        finally:
            # Always account for the item, otherwise queue.join() in run()
            # would wait forever after a failure.
            self.queue.task_done()

    @classmethod
    def start(cls):
        """Run once, then re-run every ``TEST_RUN_INTERVAL`` hours forever."""
        test = cls()
        test.run()
        schedule.every(TEST_RUN_INTERVAL).hours.do(test.run)

        while True:
            schedule.run_pending()
            time.sleep(60)
예제 #9
0
class ProxyTexter(object):
    """Re-check every stored proxy and update or delete it by score."""

    def __init__(self):
        """Create the database helper, the work queue and the pool."""
        self.mongo_pool = MongoPool()
        self.queue = Queue()
        self.coroutine_proxy = Pool()

    def __check_callback(self, temp):
        """Schedule the next check once a worker has finished.

        :param temp: result handed over by the pool; unused.
        """
        self.coroutine_proxy.apply_async(self.__check_noe_proxy,
                                         callback=self.__check_callback)

    def run(self):
        """Queue every stored proxy, start the workers and wait for the queue."""
        for proxy in self.mongo_pool.find_all():
            self.queue.put(proxy)
        # Each worker re-schedules itself through the callback.
        for _ in range(TEXT_PROXIES_AXYNC_COUT):
            self.coroutine_proxy.apply_async(self.__check_noe_proxy,
                                             callback=self.__check_callback)
        # Block until every queued proxy has been marked done.
        self.queue.join()

    def __check_noe_proxy(self):
        """Take one proxy from the queue, check it and persist the outcome."""
        proxy = self.queue.get()
        try:
            proxy = check_proxy(proxy)
            if proxy.speed == -1:
                proxy.score -= 1
                if proxy.score <= 0:
                    self.mongo_pool.delete_one(proxy)
                else:
                    # Previously every branch deleted the proxy, so the
                    # score was never persisted.
                    # NOTE(review): assumes MongoPool exposes update_one
                    # alongside delete_one -- confirm.
                    self.mongo_pool.update_one(proxy)
            else:
                proxy.score = MAX_SCORE
                self.mongo_pool.update_one(proxy)
        finally:
            # Always account for the item, otherwise queue.join() in run()
            # would wait forever after a failure.
            self.queue.task_done()

    @classmethod
    def start(cls):
        """Run once, then again every ``TEXT_PROXIES_INTERVAL`` minutes."""
        proxy_tester = cls()
        proxy_tester.run()
        schedule.every(TEXT_PROXIES_INTERVAL).minutes.do(proxy_tester.run)
        while True:
            schedule.run_pending()
            time.sleep(1)
예제 #10
0
class ProxyTester(object):
    """Re-check every stored proxy and update or delete it by score."""

    def __init__(self):
        """Create the database helper, the work queue and the pool."""
        self.mongo_pool = MongoPool()
        self.queue = Queue()
        self.corourine_pool = Pool()

    def __check_callback(self, temp):
        """Schedule the next check once a worker has finished.

        :param temp: result handed over by the pool; unused.
        """
        self.corourine_pool.apply_async(self.__check_one_proxy,
                                        callback=self.__check_callback)

    def __check_one_proxy(self):
        """Take one proxy from the queue, check it and persist the outcome."""
        proxy = self.queue.get()
        try:
            proxy = check_proxy(proxy)
            if proxy.speed == -1:
                proxy.score -= 1
                if proxy.score <= 0:
                    self.mongo_pool.delete_one(proxy)
                else:
                    self.mongo_pool.update_one(proxy)
            else:
                proxy.score = MAX_SCORE
                self.mongo_pool.update_one(proxy)
        finally:
            # Always account for the item, otherwise queue.join() in run()
            # would wait forever after a failure.
            self.queue.task_done()

    def run(self):
        """Queue every stored proxy, start the workers and wait for the queue."""
        for proxy in self.mongo_pool.find_all():
            self.queue.put(proxy)
            logger.info(proxy)
        for _ in range(TEST_PROXIES_ASYNC_COUNT):
            # Hand over the bound method itself; calling it here would run
            # the check synchronously and submit its None result to the pool.
            self.corourine_pool.apply_async(self.__check_one_proxy,
                                            callback=self.__check_callback)
        self.queue.join()

    @classmethod
    def start(cls):
        """Run once, then re-run every ``RUN_PROXY_TEST_INTERVAL`` hours."""
        pt = cls()
        pt.run()
        # schedule needs the callable; ``pt.run()`` would run the test
        # again immediately and register its return value (None) as the job.
        schedule.every(RUN_PROXY_TEST_INTERVAL).hours.do(pt.run)
        while True:
            schedule.run_pending()
            time.sleep(3600)
예제 #11
0
 def __init__(self):
     """Create the database helper and the pool."""
     self.mongo_pool = MongoPool()
     # Coroutine pool (per the original comment; Pool import not visible)
     self.coroutine_pool = Pool()
예제 #12
0
class ProxyTest(object):
    """Check the availability of the stored proxies."""

    def __init__(self):
        """Create the pool, the work queue and the database helper."""
        self.pool = Pool()
        self.queue = Queue()
        self.mongo_pool = MongoPool()

    def _test_proxies(self):
        """Take one proxy from the queue, check it and persist the outcome.

        A usable proxy gets its score reset to ``DEFAULT_SCORE``. A failing
        one loses a point and is deleted when the score reaches 0. Any
        exception is printed and swallowed.
        """
        candidate = self.queue.get()
        try:
            candidate = check_proxy(candidate)
            if candidate.speed != -1:
                # Proxy works: restore the default score.
                candidate.score = DEFAULT_SCORE
                self.mongo_pool.update(candidate)
                return
            candidate.score -= 1
            if candidate.score == 0:
                self.mongo_pool.delete(candidate)
            else:
                self.mongo_pool.update(candidate)
        except Exception as e:
            print(e)

    def _test_callback(self, source):
        """Schedule the next check once a worker has finished.

        :param source: value handed over by the pool; unused.
        """
        self.pool.apply_async(self._test_proxies, callback=self._test_callback)

    def run(self):
        """Queue every stored proxy, start the workers and join the pool."""
        proxies = self.mongo_pool.find()
        if proxies is None or len(proxies) == 0:
            print("代理ip池为空")
            return

        for item in proxies:
            self.queue.put(item)

        for _ in range(TEST_ANSYC_COUNT):
            self.pool.apply_async(self._test_proxies,
                                  callback=self._test_callback)

        # NOTE(review): nothing in this class calls queue.task_done(), and the
        # workers keep re-scheduling themselves; whether this join() ever
        # returns depends on the Pool/Queue types imported outside this
        # excerpt -- confirm.
        self.pool.join()

    @classmethod
    def start(cls):
        """Run once, then re-run every ``TEST_INTERVAL`` hours forever."""
        checker = cls()
        checker.run()

        schedule.every(TEST_INTERVAL).hours.do(checker.run)
        while True:
            schedule.run_pending()
            time.sleep(1)
예제 #13
0
 def __init__(self):
     """Create the work queue, the pool and the proxy store."""
     self.queue = Queue()
     self.pool = Pool() # Coroutine pool (per the original comment)
     self.proxy_pool = MongoPool() # Proxy pool backed by MongoDB (per the original comment)
예제 #14
0
 def __init__(self):
     """Create the pool and the proxy store."""
     self.pool = Pool()
     self.proxy_pool = MongoPool()
예제 #15
0
    def __init__(self):
        """Create the database helper and the pool."""

        self.mongo_pool = MongoPool()
        self.coroutine_pool = Pool()
예제 #16
0
 def __init__(self):
     '''Create the database helper, the work queue and the pool.'''
     self.mongo_pool = MongoPool()
     self.queue = Queue()
     self.coroutine_proxy = Pool()
예제 #17
0
class RunSpider(object):
    """Run every spider in ``settings.PROXIES_SPIDERS`` and save usable proxies."""

    def __init__(self):
        """Create the pool used to run spiders and the proxy store."""
        self.pool = Pool()
        self.proxy_pool = MongoPool()

    def _auto_import_instances(self):
        """Return one spider instance per dotted path in the settings.

        Each path is split on its last dot into module path and class name;
        the module is imported dynamically and the class is instantiated
        with no arguments.
        """
        instances = []
        for path in settings.PROXIES_SPIDERS:
            module_name, cls_name = path.rsplit('.', maxsplit=1)
            module = importlib.import_module(module_name)
            spider_cls = getattr(module, cls_name)
            instances.append(spider_cls())

        return instances

    def run(self):
        """Submit every spider to the pool and wait until all have finished."""
        for spider in self._auto_import_instances():
            self.pool.apply_async(self.__run_one_spider, args=(spider, ))

        self.pool.join()

    def __run_one_spider(self, spider):
        """Check each proxy produced by ``spider`` and save the usable ones.

        ``None`` entries are skipped. Any exception is logged and swallowed
        so that one failing spider does not affect the others.
        """
        try:
            for proxy in spider.get_proxies():
                if proxy is None:
                    continue
                proxy = check_proxy(proxy)
                # A speed of -1 marks a proxy that failed the check.
                if proxy.speed != -1:
                    self.proxy_pool.save(proxy)
        except Exception:
            # logger.exception already appends the active traceback, so a
            # single call is enough.
            logger.exception("爬虫{} 出现错误".format(spider))

    @classmethod
    def start(cls):
        """Run once, then re-run every ``settings.SPIDER_INTERVAL`` hours."""
        run_spider = cls()
        run_spider.run()

        # schedule needs the callable; ``run_spider.run()`` would crawl
        # again immediately and register its return value (None) as the job.
        schedule.every(settings.SPIDER_INTERVAL).hours.do(run_spider.run)
        while True:
            schedule.run_pending()
            time.sleep(1)
예제 #18
0
class RunSpiders(object):
    """Start every configured spider and save the proxies that pass the check."""

    def __init__(self):
        """Create the pool used to run spiders and the proxy store."""
        self.pool = Pool()
        self.proxy_pool = MongoPool()

    @staticmethod
    def _import_spider_instance():
        """Return one spider instance per dotted path in ``PROXIES_SPIDERS``.

        :return: list of freshly created spider objects
        """
        def _build(dotted_path):
            # "<module path>.<class name>", split on the last dot only.
            module_path, attr_name = dotted_path.rsplit('.', maxsplit=1)
            spider_class = getattr(importlib.import_module(module_path), attr_name)
            return spider_class()

        return [_build(dotted_path) for dotted_path in PROXIES_SPIDERS]

    def _run_spider(self, spider):
        """Check every proxy produced by ``spider`` and save the usable ones.

        :param spider: object exposing ``get_proxies()``
        :return: None
        """
        try:
            for found in spider.get_proxies():
                if found is None:
                    continue
                checked = check_proxy(found)
                # A speed of -1 marks a proxy that failed the check.
                if checked.speed == -1:
                    continue
                self.proxy_pool.save(checked)
        except Exception as e:
            logger.exception("爬虫{}出错,原因:{}".format(spider, e))

    def run(self):
        """Submit every spider to the pool and wait until all have finished.

        :return: None
        """
        for spider in self._import_spider_instance():
            self.pool.apply_async(self._run_spider, args=(spider, ))
        self.pool.join()

    @classmethod
    def start(cls):
        """Run once, then re-run every ``SPIDER_INTERVAL`` hours forever.

        :return: never returns
        """
        service = cls()
        service.run()

        schedule.every(SPIDER_INTERVAL).hours.do(service.run)
        while True:
            schedule.run_pending()
            time.sleep(1)
예제 #19
0
class ProxyApi(object):
    """Flask web service exposing the stored proxies.

    Routes registered in ``__init__``: ``/random``, ``/proxies``,
    ``/disable_domain`` and ``/headers``.
    """

    def __init__(self):
        """Create the Flask app, the database helper and register routes."""
        self.app = Flask(__name__)
        self.proxy_pool = MongoPool()

        @self.app.route('/random')
        def _random():
            """Return one matching proxy as ``[protocol://]ip:port``.

            :return: the proxy address, or an empty string when none matches
            """
            protocol = request.args.get('protocol')
            # type=int makes a missing, empty or non-numeric value fall
            # back to None instead of raising ValueError (HTTP 500).
            nick_type = request.args.get('nick_type', type=int)
            domain = request.args.get('domain')

            proxy = self.proxy_pool.random(protocol=protocol,
                                           domain=domain,
                                           nick_type=nick_type,
                                           count=AVAILABLE_IP_COUNT)
            if not proxy:
                return ''
            if protocol:
                return "{}://{}:{}".format(protocol, proxy.ip, proxy.port)
            return "{}:{}".format(proxy.ip, proxy.port)

        @self.app.route('/proxies')
        def proxies():
            """Return the matching proxies as a JSON array of dicts.

            :return: JSON text
            """
            protocol = request.args.get('protocol')
            nick_type = request.args.get('nick_type', type=int)
            domain = request.args.get('domain')
            # Missing, empty or non-numeric count falls back to the default.
            count = request.args.get('count',
                                     default=AVAILABLE_IP_COUNT,
                                     type=int)

            proxies_list = self.proxy_pool.get_proxies(protocol=protocol,
                                                       domain=domain,
                                                       nick_type=nick_type,
                                                       count=count)
            return json.dumps([proxy.__dict__ for proxy in proxies_list])

        @self.app.route('/disable_domain')
        def disable_domain():
            """Record ``domain`` as unusable for the proxy at ``ip``."""
            ip = request.args.get('ip')
            domain = request.args.get('domain')

            if ip is None:
                return "请传入有效ip"
            if domain is None:
                return "请传入有效domain"

            self.proxy_pool.disable_domain(ip=ip, domain=domain)
            return "此代理ip不可用域名列表更新成功"

        @self.app.route('/headers')
        def headers():
            """Return a random line of ``user-agent.txt`` without its quotes."""
            with open('user-agent.txt', 'r', encoding='utf-8') as f:
                user_agents = f.readlines()
            # Drop the newline first, then the first and last character
            # (the surrounding quotes). A fixed [1:-2] slice would cut a
            # real character from a last line with no trailing newline.
            line = random.choice(user_agents).rstrip('\n')
            return str(line[1:-1])

    def run(self):
        """Serve the app on ``FLASK_HOST``:``FLASK_PORT`` (blocks).

        :return: None
        """
        self.app.run(host=FLASK_HOST, port=FLASK_PORT)

    @classmethod
    def start(cls):
        """Create the service and run it.

        :return: None
        """
        api = cls()
        api.run()
예제 #20
0
 def __init__(self):
     """Create the database helper, the work queue and the pool."""
     # Object used to operate on the database
     self.mongo_pool=MongoPool()
     self.queue=Queue()
     self.corourine_pool=Pool()
예제 #21
0
class ProxyTester(object):
    """Re-check every stored proxy and update or delete it by score."""

    def __init__(self):
        """Create the work queue, the pool and the proxy store."""
        self.queue = Queue()
        self.pool = Pool()
        self.proxy_pool = MongoPool()

    def _test_proxy(self):
        """Take one proxy from the queue, check it and persist the outcome.

        Exceptions are logged and swallowed; the queue item is marked done
        afterwards in either case.
        """
        item = self.queue.get()
        try:
            checked = check_proxy(item)
            if checked.speed != -1:
                # The request succeeded: restore the maximum score.
                checked.score = settings.MAX_SCORE
                self.proxy_pool.update(checked)
            else:
                # A speed of -1 means the request failed: lose one point.
                checked.score -= 1
                if checked.score != 0:
                    self.proxy_pool.update(checked)
                else:
                    self.proxy_pool.delete(checked)
                    logger.info('删除代理:{}'.format(checked))
        except Exception as ex:
            logger.exception(ex)

        self.queue.task_done()

    def _test_proxy_finish(self, temp):
        """Schedule the next check once a worker has finished (``temp`` unused)."""
        self.pool.apply_async(self._test_proxy,
                              callback=self._test_proxy_finish)

    def run(self):
        """Queue every stored proxy, start the workers and wait for the queue."""
        stored = self.proxy_pool.find()
        if stored is None or len(stored) == 0:
            print("代理池为空")
            return

        for entry in stored:
            self.queue.put(entry)

        # Each worker re-schedules itself through the callback.
        for _ in range(settings.TESTER_ANSYC_COUNT):
            self.pool.apply_async(self._test_proxy,
                                  callback=self._test_proxy_finish)

        # Block until every queued proxy has been marked done.
        self.queue.join()

    @staticmethod
    def start():
        """Run once, then re-run every ``settings.TESTER_INTERVAL`` hours."""
        checker = ProxyTester()
        checker.run()
        schedule.every(settings.TESTER_INTERVAL).hours.do(checker.run)
        while True:
            schedule.run_pending()
            time.sleep(1)
예제 #22
0
 def __init__(self):
     """
     Create the pool and the database access object.
     """
     # NOTE(review): the original comment calls this a coroutine pool; the
     # Pool import is outside this excerpt.
     self.pool = Pool()
     self.proxy_pool = MongoPool()