예제 #1
0
    def __init__(self):
        """Create the MongoPool object and the pool object held by this instance."""
        # Create the MongoPool object.
        self.mongopool = MongoPool()
        # 3. Each spider task is meant to be executed asynchronously.

        # 3.1 Create the coroutine pool object in __init__.
        self.coroutine_pool = Pool()
예제 #2
0
class RunSpider(object):
    """Run every spider listed in PROXIES_SPIDERS and store the proxies that pass the check."""

    def __init__(self):
        """Create the MongoPool handle and the pool used to schedule spider tasks."""
        self.mongo_pool = MongoPool()
        self.coroutine_pool = Pool()

    def get_spider_from_settings(self):
        """Yield one spider instance per dotted class path in PROXIES_SPIDERS.

        Each entry is split on its last dot into a module path and a class
        name; the module is imported, the class is fetched from it and
        instantiated without arguments.
        """
        for dotted_path in PROXIES_SPIDERS:
            module_path, cls_name = dotted_path.rsplit('.', maxsplit=1)
            spider_cls = getattr(importlib.import_module(module_path), cls_name)
            yield spider_cls()

    def run(self):
        """Submit one task per spider to the pool, then call the pool's join()."""
        for spider in self.get_spider_from_settings():
            self.coroutine_pool.apply_async(
                self.__execute_one_spider_task, args=(spider, ))

        self.coroutine_pool.join()

    def __execute_one_spider_task(self, spider):
        """Check every proxy produced by ``spider`` and insert the accepted ones.

        Any exception raised while iterating, checking or inserting is
        logged through ``logger.exception`` and ends this spider's task.
        """
        try:
            for candidate in spider.get_proxies():
                checked = check_proxy(candidate)
                if checked.speed == -1:
                    # A speed of -1 is not stored.
                    continue
                self.mongo_pool.insert_one(checked)
        except Exception as ex:
            logger.exception(ex)
예제 #3
0
class RunSpider(object):
    """Run the spiders named in PROXIES_SPIDERS, now and then on a fixed hourly interval."""

    def __init__(self):
        """Create the MongoPool object and the coroutine pool."""
        self.mongo_pool = MongoPool()
        self.coroutine_pool = Pool()

    def get_spider_from_settings(self):
        """Build the spider objects described by the settings.

        Iterates the fully qualified class names in PROXIES_SPIDERS, imports
        the module part, looks the class up by name and yields a fresh
        instance of it.

        :return: generator of spider instances
        """
        for dotted_path in PROXIES_SPIDERS:
            # Everything before the last dot is the module, the rest the class.
            pieces = dotted_path.rsplit('.', maxsplit=1)
            module_name, class_name = pieces
            loaded_module = importlib.import_module(module_name)
            yield getattr(loaded_module, class_name)()

    def run(self):
        """Hand each spider to the pool asynchronously, then wait via join()."""
        for spider in self.get_spider_from_settings():
            self.coroutine_pool.apply_async(
                self._execute_one_spider_task,
                args=(spider,),
            )
        # join() makes the caller wait for the submitted tasks.
        self.coroutine_pool.join()

    def _execute_one_spider_task(self, spider):
        """Process one spider: check each proxy it yields and store usable ones.

        Exceptions are logged with ``logger.exception`` and not re-raised.
        """
        try:
            for raw_proxy in spider.get_proxies():
                checked = check_proxy(raw_proxy)
                # A speed other than -1 means the proxy is usable.
                if checked.speed != -1:
                    self.mongo_pool.insert_one(checked)
        except Exception as ex:
            logger.exception(ex)

    @classmethod
    def start(cls):
        """Run the spiders once, then re-run them every RUN_SPIDERS_INTERVAL hours.

        This method never returns: it polls the scheduler once per second.
        """
        runner = RunSpider()
        runner.run()

        # The interval comes from the settings.
        schedule.every(RUN_SPIDERS_INTERVAL).hours.do(runner.run)
        while True:
            # Check once a second whether a scheduled run is due.
            schedule.run_pending()
            time.sleep(1)
예제 #4
0
    def __init__(self):
        """Create the Flask app and MongoPool handle, then register the HTTP routes.

        Routes registered on ``self.app``:
          * ``/random``         -- one proxy as ``ip:port``, prefixed with the
            requested protocol when one is given.
          * ``/all``            -- JSON list of proxy attribute dicts.
          * ``/disable_domain`` -- forwards an ip/domain pair to
            ``self.mongo_pool.disable_domain``.
        """
        self.app = Flask(__name__)
        self.mongo_pool = MongoPool()

        @self.app.route('/random')
        def random():
            # Optional filters taken from the query string.
            query = request.args
            protocol = query.get('protocol')
            domain = query.get('domain')
            proxy = self.mongo_pool.random_proxy(protocol, domain, count=API_COUNT)
            print(proxy)
            if not protocol:
                return '{}:{}'.format(proxy.ip, proxy.port)
            return '{}://{}:{}'.format(protocol, proxy.ip, proxy.port)

        @self.app.route('/all')
        def all():
            query = request.args
            protocol = query.get('protocol')
            domain = query.get('domain')
            matches = self.mongo_pool.get_proxies(protocol, domain, count=API_COUNT)
            # Serialise each proxy through its attribute dict.
            return json.dumps([item.__dict__ for item in matches])

        @self.app.route('/disable_domain')
        def disable_domain():
            query = request.args
            ip = query.get('ip')
            domain = query.get('domain')
            # Both parameters are required; report the first one missing.
            if ip is None:
                return "请提供ip参数"
            elif domain is None:
                return "请传入域名"
            self.mongo_pool.disable_domain(ip, domain)
            return "{} 禁用域名{} 成功".format(ip, domain)
예제 #5
0
class ProxyApi():
    """Flask-based HTTP API that serves proxies read through MongoPool."""

    def __init__(self):
        """Create the Flask app and the MongoPool handle, then register the routes."""
        # Initialise the Flask web service.
        self.app = Flask(__name__)
        # MongoPool object used for the database operations.
        self.mongo_pool = MongoPool()

        @self.app.route('/random')
        def random():
            """Return one proxy picked by ``MongoPool.random_proxy``.

            Query parameters (both optional):
              protocol: protocol type of the caller's request
              domain:   domain of the caller's request

            :return: ``protocol://ip:port`` when a protocol was given,
                     otherwise ``ip:port``
            """
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')
            print(protocol)
            print(domain)
            proxy = self.mongo_pool.random_proxy(protocol, domain, count=PROXIES_MAX_COUNT)
            if protocol:
                return '{}://{}:{}'.format(protocol, proxy.ip, proxy.port)
            else:
                return '{}:{}'.format(proxy.ip, proxy.port)

        @self.app.route('/proxies')
        def proxies():
            """Return several proxies as a JSON list of attribute dicts.

            Query parameters (both optional): ``protocol`` (http/https) and
            ``domain`` (e.g. jd.com).
            """
            # The filter is documented as ``protocol``; the original code read
            # the key 'proxies' by mistake, so the documented parameter was
            # ignored. The old key is still honoured as a fallback so existing
            # callers keep working.
            protocol = request.args.get('protocol') or request.args.get('proxies')
            domain = request.args.get('domain')
            proxies = self.mongo_pool.get_proxies(protocol, domain, count=PROXIES_MAX_COUNT)
            # Proxy objects are not JSON serialisable; convert them to dicts.
            proxies = [proxy.__dict__ for proxy in proxies]
            return json.dumps(proxies)

        @self.app.route('/disable_domain')
        def disable_domain():
            """Record, via ``MongoPool.disable_domain``, that ``ip`` must not be used for ``domain``."""
            ip = request.args.get('ip')
            domain = request.args.get('domain')
            if ip is None:
                return "请提供ip参数"
            if domain is None:
                return '请提供域名domain参数'
            self.mongo_pool.disable_domain(ip, domain)
            return "{} 禁用域名 {} 成功".format(ip, domain)

    def run(self):
        """Start the Flask web service on 0.0.0.0:16888."""
        self.app.run('0.0.0.0', port=16888)

    @classmethod
    def start(cls):
        """Create a ProxyApi instance and start serving."""
        proxy_api = ProxyApi()
        proxy_api.run()
예제 #6
0
class RunSpider(object):
    """Run the spiders configured in PROXIES_SPIDERS and store the proxies that pass the check."""

    def __init__(self):
        """Create the MongoPool object and the coroutine pool object."""
        self.mongo_pool = MongoPool()
        self.coroutine_pool = Pool()

    def get_spider_from_settings(self):
        """Yield a spider instance for each fully qualified class name in PROXIES_SPIDERS.

        An entry looks like ``core.proxy_spider.proxy_spiders.XiciSpider``:
        the part before the last dot is imported as a module and the part
        after it is looked up on that module as the spider class.
        """
        for dotted_path in PROXIES_SPIDERS:
            module_name, class_name = dotted_path.rsplit('.', maxsplit=1)
            spider_cls = getattr(importlib.import_module(module_name), class_name)
            instance = spider_cls()
            print(instance, "666")
            yield instance

    def run(self):
        """Submit each spider to the pool asynchronously and wait with join()."""
        for spider in self.get_spider_from_settings():
            self.coroutine_pool.apply_async(
                self._execute_one_spider_task,
                args=(spider,),
            )
        # join() makes the caller wait until the submitted tasks finish.
        self.coroutine_pool.join()

    def _execute_one_spider_task(self, spider):
        """Handle one spider: check each proxy it yields and insert the usable ones.

        Exceptions are logged through ``logger.exception`` and swallowed.
        """
        try:
            for found in spider.get_proxies():
                verified = check_proxy(found)
                if verified.speed == -1:
                    # Only proxies whose speed is not -1 are written to the database.
                    continue
                self.mongo_pool.insert_one(verified)
        except Exception as ex:
            logger.exception(ex)

    @classmethod
    def start(cls):
        """Run once immediately, then every RUN_SPIDERS_INTERVAL hours.

        Never returns: the scheduler is polled every two seconds.
        """
        runner = RunSpider()
        runner.run()
        # The interval (in hours) comes from the settings.
        schedule.every(RUN_SPIDERS_INTERVAL).hours.do(runner.run)
        while True:
            schedule.run_pending()
            time.sleep(2)
예제 #7
0
    def __init__(self):
        """Create the Flask app and MongoPool handle and register three routes.

        ``/random/`` returns a single ``ip:port`` (optionally protocol
        prefixed), ``/proxies/`` returns a JSON list of ``{'ip', 'port'}``
        dicts and ``/disabled_domain/`` forwards an ip/domain pair to
        ``self.mongo_pool.disabled_domain``.
        """
        self.app = Flask(__name__)
        self.mongo_pool = MongoPool()

        @self.app.route('/random/')
        def random():
            query = request.args
            protocol = query.get('protocol')
            domain = query.get('domain')
            proxy = self.mongo_pool.random_proxy(
                protocol, domain, count=PROXIES_DEFAULT_COUNT)
            if not protocol:
                return '{}:{}'.format(proxy.ip, proxy.port)
            return '{}://{}:{}'.format(protocol, proxy.ip, proxy.port)

        @self.app.route('/proxies/')
        def proxies():
            query = request.args
            protocol = query.get('protocol')
            domain = query.get('domain')
            matches = self.mongo_pool.get_proxies(
                protocol, domain, count=PROXIES_DEFAULT_COUNT)
            # Only the address fields are exposed.
            payload = []
            for item in matches:
                payload.append({'ip': item.ip, 'port': item.port})
            return json.dumps(payload)

        @self.app.route('/disabled_domain/')
        def disable_domain():
            query = request.args
            ip = query.get('ip')
            domain = query.get('domain')
            # Both parameters are mandatory; report the first one missing.
            if ip is None:
                return '填写ip'
            elif domain is None:
                return '填写域名'
            self.mongo_pool.disabled_domain(ip, domain)
            return '成功设置{}禁止访问{}'.format(ip, domain)
예제 #8
0
class ProxyAPI(object):
    """Flask-based HTTP API exposing the proxies returned by ``MongoPool.find_all``."""

    def __init__(self):
        """Create the Flask app and MongoPool handle, then register the routes."""
        self.app = Flask(__name__)
        self.mongo_pool = MongoPool()

        @self.app.route('/random')
        def random_proxy():
            """Return the attribute dict of one randomly chosen stored proxy.

            The original picked ``proxies[random.randint(1, 100)]``, which
            never selected the first proxy and raised IndexError whenever
            fewer than 101 proxies were stored. ``random.choice`` selects
            uniformly from whatever is actually available. The ``protocol``
            and ``domain`` query parameters were read but never used, so
            those dead reads were dropped.
            """
            # Materialise so emptiness test and indexing work for any iterable.
            proxies = list(self.mongo_pool.find_all())
            if not proxies:
                # An empty pool previously surfaced as an unhandled IndexError.
                return 'no proxy available', 404
            return random.choice(proxies).__dict__

        @self.app.route('/proxies')
        def proxies_list():
            """Return every stored proxy as a JSON list of attribute dicts."""
            proxies = self.mongo_pool.find_all()
            dict_list = [proxy.__dict__ for proxy in proxies]
            return json.dumps(dict_list)

    def run(self):
        """Start the Flask server on 0.0.0.0:9999 with debug mode enabled."""
        # NOTE(review): debug=True on a publicly bound host enables Flask's
        # debug facilities for remote clients -- confirm this is intended.
        self.app.run('0.0.0.0', port=9999, debug=True)

    @classmethod
    def start(cls):
        """Instantiate the class and start serving."""
        pa = cls()
        pa.run()
예제 #9
0
    def __init__(self):
        """Create the Flask web service and MongoPool handle, then register the routes."""
        # 2.1 Initialise the Flask web service.
        self.app = Flask(__name__)
        # MongoPool object used for the database operations.
        self.mongo_pool = MongoPool()

        @self.app.route('/random')
        def random():
            """
            2.2 Return one proxy obtained from ``MongoPool.random_proxy``.

            Query parameters (both optional):
                protocol: protocol type of the caller's request
                domain:   domain of the caller's request
            :return: ``protocol://ip:port`` or, without a protocol, ``ip:port``
            """
            query = request.args
            protocol = query.get('protocol')
            domain = query.get('domain')
            proxy = self.mongo_pool.random_proxy(
                protocol, domain, count=PROXIES_MAX_COUNT)

            if not protocol:
                return f'{proxy.ip}:{proxy.port}'
            return f'{protocol}://{proxy.ip}:{proxy.port}'

        @self.app.route('/proxies')
        def proxies():
            """
            2.3 Return several proxies, optionally filtered by the
            ``protocol`` (http/https) and ``domain`` (e.g. jd.com) parameters.

            :return: dict with a single ``proxies`` key holding attribute dicts
            """
            query = request.args
            protocol = query.get('protocol')
            domain = query.get('domain')

            matches = self.mongo_pool.get_proxies(
                protocol, domain, count=PROXIES_MAX_COUNT)
            # Proxy objects cannot be JSON serialised directly, so hand back
            # their attribute dicts instead.
            return {"proxies": [item.__dict__ for item in matches]}

        @self.app.route('/disable_domain')
        def disable_domain():
            # 2.4 Mark ``domain`` as unusable for ``ip`` so that the ip is no
            # longer handed out when that domain is requested.
            query = request.args
            ip = query.get('ip')
            domain = query.get('domain')

            if ip is None:
                return '请提供ip参数'
            elif domain is None:
                return '请提供域名domain参数'
            self.mongo_pool.disable_domain(ip, domain)
            return f"{ip} 禁用域名 {domain} 成功"
예제 #10
0
class ProxyApi(object):
    """Flask-based HTTP API serving proxies obtained through MongoPool."""

    def __init__(self):
        """Create the Flask web service and MongoPool handle, then register the routes."""
        self.app = Flask(__name__)
        self.mongo_pool = MongoPool()

        @self.app.route('/random')
        def random():
            # Optional filters from the query string.
            query = request.args
            protocol = query.get('protocol')
            domain = query.get('domain')
            proxy = self.mongo_pool.random_proxy(
                protocol, domain, count=PROXIES_MAX_COUNT)
            if not protocol:
                return '{}:{}'.format(proxy.ip, proxy.port)
            return '{}://{}:{}'.format(protocol, proxy.ip, proxy.port)

        @self.app.route('/proxies')
        def proxies():
            query = request.args
            protocol = query.get('protocol')
            domain = query.get('domain')

            matches = self.mongo_pool.get_proxies(
                protocol, domain, count=PROXIES_MAX_COUNT)
            # Serialise each proxy through its attribute dict.
            return json.dumps([item.__dict__ for item in matches])

    def run(self):
        """Start the Flask server on 0.0.0.0:9000."""
        self.app.run('0.0.0.0', port=9000)
예제 #11
0
    def __init__(self):
        """Create the Flask web service and MongoPool handle, then register the routes."""
        # Initialise the Flask web service.
        self.app = Flask(__name__)
        # MongoPool object used for the database operations.
        self.mongo_pool = MongoPool()

        @self.app.route('/random')
        def random():
            '''
            Return one proxy obtained from ``MongoPool.random_proxy``
            (called with ``nick_type=2``).

            :protocol: protocol type of the caller's request (optional)
            :domain: domain of the caller's request (optional)
            '''
            query = request.args
            protocol = query.get('protocol')
            domain = query.get('domain')
            proxy = self.mongo_pool.random_proxy(
                protocol, domain, count=PROXIES_MAX_COUNT, nick_type=2)

            if not protocol:
                return "{}:{}".format(proxy.ip, proxy.port)
            return "{}://{}:{}".format(protocol, proxy.ip, proxy.port)

        @self.app.route('/proxies')
        def proxies():
            '''
            Return several proxies (``nick_type=0``) filtered by the optional
            ``protocol`` and ``domain`` parameters.

            :return: JSON string holding a list of attribute dicts
            '''
            query = request.args
            protocol = query.get('protocol')
            domain = query.get('domain')
            matches = self.mongo_pool.get_proxies(
                protocol, domain, count=PROXIES_MAX_COUNT, nick_type=0)
            # The Proxy objects are turned into dicts before serialising.
            return json.dumps([item.__dict__ for item in matches])

        @self.app.route('/disable_domain')
        def disable_domain():
            '''
            Mark ``domain`` as unusable for ``ip`` through
            ``MongoPool.disable_domain``.

            :return: a message naming the missing parameter, or a success message
            '''
            query = request.args
            ip = query.get('ip')
            domain = query.get('domain')

            if ip is None:
                return "请提供ip参数"
            elif domain is None:
                return "请提供域名domain参数"

            self.mongo_pool.disable_domain(ip, domain)
            return f"{ip}禁用域名{domain}成功"
예제 #12
0
 def __init__(self, module_name='', spider_list=None):
     """Store optional overrides and create the MongoPool and pool objects.

     :param module_name: when truthy, assigned to ``self.module_name``;
         otherwise the attribute is not assigned here.
     :param spider_list: when truthy, assigned to ``self.spider_list``;
         otherwise the attribute is not assigned here. The default is
         ``None`` rather than a shared mutable ``[]``; both are falsy, so
         callers see no difference.
     """
     if module_name:
         self.module_name = module_name
     if spider_list:
         self.spider_list = spider_list
     self.mongo_pool = MongoPool()
     # Create the coroutine pool.
     self.coroutine_pool = Pool()
예제 #13
0
    def __init__(self):
        """Create the Flask web service and MongoPool handle, then register the routes."""
        # Initialise the Flask web service.
        self.app = Flask(__name__)
        # MongoPool object used for the database operations.
        self.mongo_pool = MongoPool()

        @self.app.route('/random')
        def random():
            """
            2.2 Return one proxy obtained from ``MongoPool.random_proxy``.

            Query parameters (both optional):
            protocol: protocol of the caller's request
            domain: domain of the caller's request
            :return: ``protocol://ip:port`` or, without a protocol, ``ip:port``
            """
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')
            proxy = self.mongo_pool.random_proxy(protocol,
                                                 domain,
                                                 count=PROXIES_MAX_COUNT)

            if protocol:
                return '{}://{}:{}'.format(protocol, proxy.ip, proxy.port)
            else:
                return '{}:{}'.format(proxy.ip, proxy.port)

        @self.app.route('/proxies')
        def proxies():
            """
            2.3 Return several proxies, optionally filtered by the
            ``protocol`` and ``domain`` query parameters.

            :return: JSON string holding a list of attribute dicts
            """
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')
            proxies = self.mongo_pool.get_proxies(protocol,
                                                  domain,
                                                  count=PROXIES_MAX_COUNT)
            # Proxy objects are not JSON serialisable; convert them to dicts.
            proxies = [proxy.__dict__ for proxy in proxies]
            return json.dumps(proxies)

        @self.app.route('/disable_domain')
        def disable_domain():
            # 2.4 Append an unusable domain to the given ip, so the ip is no
            # longer handed out when that domain is requested.
            ip = request.args.get('ip')
            domain = request.args.get('domain')

            # The original messages used the wrong character ('情' instead of
            # '请', "please"); the typo is corrected here.
            if ip is None:
                return '请提供ip参数'
            if domain is None:
                return '请提供域名domain参数'

            self.mongo_pool.disable_domain(ip, domain)
            return '{}禁用域名{}成功'.format(ip, domain)
예제 #14
0
class ProxyAPI_Flask(object):
    """Flask-based HTTP API serving proxies obtained through MongoPool."""

    def __init__(self, count):
        """Create the Flask app and MongoPool handle, then register the routes.

        :param count: forwarded as ``count=`` to ``random_proxy`` and
            ``get_proxies`` on every request.
        """
        self.count = count
        self.app = Flask(__name__, template_folder="../assets/templates")
        self.mongo_pool = MongoPool()

        @self.app.route('/')
        @self.app.route('/index')
        def index():
            return render_template("index.html")

        @self.app.route('/random')
        def random():
            query = request.args
            protocol = query.get('protocol')
            domain = query.get('domain')
            proxy = self.mongo_pool.random_proxy(
                protocol, domain, count=self.count)

            if not protocol:
                return f"{proxy.ip}:{proxy.port}"
            return f"{protocol}://{proxy.ip}:{proxy.port}"

        @self.app.route('/proxies')
        def proxies():
            query = request.args
            protocol = query.get('protocol')
            domain = query.get('domain')
            matches = self.mongo_pool.get_proxies(
                protocol, domain, count=self.count)
            # Turn the Proxy objects into dicts and return them as JSON.
            return json.dumps([item.__dict__ for item in matches])

        @self.app.route('/disable_domain')
        def disable_domain():
            query = request.args
            ip = query.get('ip')
            domain = query.get('domain')

            # Both parameters are required; report the first one missing.
            if ip is None:
                return '请提供IP参数\n'
            elif domain is None:
                return '请提供domain参数\n'

            self.mongo_pool.disable_domain(ip, domain)
            return f"{ip} 禁用域名 {domain} 成功"

    def run(self):
        """Start the Flask server on 0.0.0.0:16888."""
        self.app.run('0.0.0.0', port=16888)

    @classmethod
    def start(cls):
        """Build an instance with ``count=100``, log a start-up banner and serve."""
        server = cls(100)
        logger.info(
            '*****************Flask启动在localhost:16888端口,监听中*****************')
        server.run()
예제 #15
0
    def __init__(self):
        """Create the Flask web service and MongoPool object, then register the routes."""
        # Initialise the Flask web service.
        self.app = Flask(__name__)
        # Create the MongoPool object.
        self.mongo_pool = MongoPool()

        @self.app.route('/random')
        def random():
            """
            Return one proxy obtained from ``MongoPool.random_proxy``.

            Query parameters (both optional):
            protocol: protocol type of the caller's request
            domain: domain of the caller's request
            :return: ``protocol://ip:port`` or, without a protocol, ``ip:port``
            """
            # The documented parameter is ``protocol``; the original code only
            # read the misspelled key 'protocal', so the documented filter was
            # ignored. The misspelling is still accepted as a fallback so
            # existing callers keep working.
            protocol = request.args.get('protocol') or request.args.get('protocal')
            domain = request.args.get('domain')
            proxy = self.mongo_pool.random_proxy(protocol,
                                                 domain,
                                                 count=PROXIES_MAX_COUNT)

            if protocol:
                return '{}://{}:{}'.format(protocol, proxy.ip, proxy.port)
            else:
                return '{}:{}'.format(proxy.ip, proxy.port)

        @self.app.route('/proxies')
        def proxies():
            """Return several proxies, filtered by the optional protocol and domain, as JSON."""
            # Same key fix as in /random: prefer 'protocol', accept 'protocal'.
            protocol = request.args.get('protocol') or request.args.get('protocal')
            domain = request.args.get('domain')

            proxies = self.mongo_pool.get_proxies(protocol,
                                                  domain,
                                                  count=PROXIES_MAX_COUNT)
            # The objects are not JSON serialisable; convert them to dicts.
            proxies = [proxy.__dict__ for proxy in proxies]

            return json.dumps(proxies)

        @self.app.route('/disable_domain')
        def disable_domain():
            """Mark ``domain`` as unusable for ``ip`` through ``MongoPool.disable_domain``."""
            ip = request.args.get('ip')
            domain = request.args.get('domain')

            if ip is None:
                return '请求提供ip参数'
            if domain is None:
                return '请提供域名domain参数'

            self.mongo_pool.disable_domain(ip, domain)

            return "{} 禁用域名{} 成功".format(ip, domain)
예제 #16
0
    def __init__(self):
        """Create the Flask web server and MongoPool object, then register the routes."""
        # 2.1 Initialise the Flask web server.
        self.app = Flask(__name__)
        # MongoPool object used for the database operations.
        self.mongo_pool = MongoPool()

        @self.app.route('/random')
        def random():
            """
            2.2 Return one proxy obtained from ``MongoPool.random_proxies``.

              - protocol: protocol type of the caller's request (optional)
              - domain: domain of the caller's request (optional)
            """
            query = request.args
            protocol = query.get('protocol')
            domain = query.get('domain')
            proxy = self.mongo_pool.random_proxies(
                protocol, domain, count=PROXIES_MAX_COUNT)

            if not protocol:
                return f"{proxy.ip}:{proxy.port}"
            return f"{protocol}://{proxy.ip}:{proxy.port}"

        @self.app.route('/proxies')
        def proxies():
            """
            2.3 Return several proxies, optionally filtered by the
            ``protocol`` and ``domain`` query parameters.

            :return: JSON string holding a list of attribute dicts
            """
            query = request.args
            protocol = query.get('protocol')
            domain = query.get('domain')
            matches = self.mongo_pool.get_proxies(
                protocol, domain, count=PROXIES_MAX_COUNT)
            # Proxy objects are not JSON serialisable; use their dicts.
            return json.dumps([item.__dict__ for item in matches])

        # 2.4 Service that appends an unusable domain to a given ip.
        @self.app.route('/disable')
        def disable_domain():
            query = request.args
            ip = query.get('ip')
            domain = query.get('domain')

            if ip is None:
                return '请提供ip参数'
            elif domain is None:
                return '请提供domain参数'

            self.mongo_pool.disable_domain(ip, domain)
            return f"{ip}禁用域名{domain}成功"
예제 #17
0
class ProxyApi(object):
    """Flask-based HTTP API serving proxies obtained through MongoPool."""

    def __init__(self):
        """Create the Flask app and the MongoPool object, then register the routes."""
        self.app = Flask(__name__)
        self.mongo_pool = MongoPool()

        @self.app.route('/random')
        def random():
            '''
            Return one proxy obtained from ``MongoPool.random_proxy``.

            Query parameters (both optional):
                protocol: protocol type of the caller's request
                domain: domain of the caller's request
            :return: ``protocol://ip:port`` or, without a protocol, ``ip:port``
            '''
            query = request.args
            protocol = query.get('protocol')
            domain = query.get('domain')
            proxy = self.mongo_pool.random_proxy(
                protocol, domain, count=PROXIES_MAX_COUNT)
            if not protocol:
                return '{}:{}'.format(proxy.ip, proxy.port)
            return '{}://{}:{}'.format(protocol, proxy.ip, proxy.port)

        @self.app.route('/proxies')
        def proxies():
            query = request.args
            protocol = query.get('protocol')
            domain = query.get('domain')
            matches = self.mongo_pool.get_proxies(
                protocol, domain, count=PROXIES_MAX_COUNT)
            # The objects cannot be JSON serialised; use their attribute dicts.
            return json.dumps([item.__dict__ for item in matches])

        @self.app.route('/disable_domain')
        def disable_domain():
            query = request.args
            ip = query.get('ip')
            domain = query.get('domain')
            # Both parameters are required; report the first one missing.
            if ip is None:
                return "请提供ip参数"
            elif domain is None:
                return "请提供domain参数"
            self.mongo_pool.disable_domain(ip, domain)
            return '{} 禁用域名 {} 成功'.format(ip, domain)

    def run(self):
        """Start the Flask server on 0.0.0.0:16888."""
        self.app.run('0.0.0.0', port=16888)

    @classmethod
    def start(cls):
        """4. Start the service through the class: instantiate it and call run()."""
        cls().run()
예제 #18
0
class RunSpider(object):
    """Run the spiders configured in PROXIES_SPIDERS and store the proxies that pass the check."""

    def __init__(self):
        """Create the MongoPool object and the coroutine pool object."""
        # Create the MongoPool object.
        self.mongo_pool = MongoPool()
        # 3.1 Create the coroutine pool object in __init__.
        self.coroutine_pool = Pool()

    def get_spider_from_settings(self):
        """Yield a spider instance for each fully qualified class name in PROXIES_SPIDERS.

        An entry looks like ``core.proxy_spider.proxy_spiders.Ip66Spider``:
        the part before the last dot is imported as a module and the part
        after it is looked up on that module as the spider class.
        """
        for full_class_name in PROXIES_SPIDERS:
            module_name, class_name = full_class_name.rsplit('.', maxsplit=1)
            module = importlib.import_module(module_name)
            spider_cls = getattr(module, class_name)
            yield spider_cls()

    def run(self):
        """Submit every spider to the pool, then wait for the tasks via join()."""
        # 2.1 Build the spider objects from the settings.
        for spider in self.get_spider_from_settings():
            # 2.2 Each spider is processed by its own task in the pool.
            self.coroutine_pool.apply_async(self._execute_one_spider_task, args=(spider, ))
        # 3.4 join() makes the caller wait until the tasks finish.
        self.coroutine_pool.join()

    def _execute_one_spider_task(self, spider):
        """Check each proxy yielded by ``spider`` and insert the usable ones.

        Exceptions are logged through ``logger.exception`` and not re-raised.
        """
        try:
            for proxy in spider.get_proxies():
                # 2.3 Check the proxy.
                proxy = check_proxy(proxy)
                # A speed other than -1 means the proxy is usable.
                if proxy.speed != -1:
                    self.mongo_pool.insert_one(proxy)
        except Exception as e:
            logger.exception(e)

    @classmethod
    def start(cls):
        """Run the spiders once, then again every SPIDER_TIME_DELAY hours.

        A single instance is shared by the initial run and the scheduled
        runs; the original built a second instance (and with it a second
        MongoPool and pool) just for the scheduled job. Never returns: the
        scheduler is polled once per second.
        """
        runner = cls()
        runner.run()

        schedule.every(SPIDER_TIME_DELAY).hours.do(runner.run)
        while True:
            schedule.run_pending()
            time.sleep(1)
예제 #19
0
class RunSpider(object):
    """Run a fixed list of proxy spiders and store the proxies that pass the check."""

    # Default spider names, looked up as attributes of ``module_name``.
    spider_list = [
        'kuaiSpider',
        'jiangxianSpider',
        'xilaSpider',
        'xiaohuanSpider',
        'zhimaSpider',
        'nimaSpider',
        'qiyunSpider',
        'spider89',
    ]
    # Default module the spider names are resolved in.
    module_name = 'core.proxy_spider.proxy_spiders'

    def __init__(self, module_name='', spider_list=None):
        """Store optional overrides and create the MongoPool and pool objects.

        :param module_name: when truthy, overrides the class-level
            ``module_name`` on this instance.
        :param spider_list: when truthy, overrides the class-level
            ``spider_list`` on this instance. The default is ``None``
            rather than a shared mutable ``[]``; both are falsy, so callers
            see no difference.
        """
        if module_name:
            self.module_name = module_name
        if spider_list:
            self.spider_list = spider_list
        self.mongo_pool = MongoPool()
        # Create the coroutine pool.
        self.coroutine_pool = Pool()

    def get_spider_cls(self, spider_list, module_name):
        """Import ``module_name`` and yield the attribute named by each entry of ``spider_list``.

        The looked-up objects are yielded as they are; nothing is instantiated.
        """
        module = importlib.import_module(module_name)
        for spider_name in spider_list:
            yield getattr(module, spider_name)

    def run_spider(self):
        """Submit one task per spider to the pool, then wait via join()."""
        for spider in self.get_spider_cls(self.spider_list, self.module_name):
            self.coroutine_pool.apply_async(self.__execute_one_spider_task,
                                            args=(spider, ))
        self.coroutine_pool.join()

    def __execute_one_spider_task(self, spider):
        """Check each proxy produced by ``spider`` and insert the usable ones.

        Exceptions are logged through ``logger.exception`` and not re-raised.
        """
        # NOTE(review): ``spider`` is whatever get_spider_cls yielded (the
        # module attribute itself, not an instance) -- confirm get_proxies
        # is callable on it without instantiation.
        try:
            for proxy in spider.get_proxies():
                proxy = check_proxy(proxy)
                if proxy.delay != -1:
                    self.mongo_pool.insert_one(proxy)
                    # The original concatenated a str with dict(proxy), which
                    # always raised TypeError; the except below swallowed it
                    # and ended the spider after its first insert. str()
                    # keeps the message and cannot break the loop that way.
                    print("新代理插入成功" + str(proxy))
        except Exception as ex:
            logger.exception(ex)

    @classmethod
    def start(cls):
        """Run the spiders once, then again every RUN_SPIDER_INTERVAL hours.

        Never returns: the scheduler is polled every 30 seconds.
        """
        rs = RunSpider()
        rs.run_spider()
        schedule.every(RUN_SPIDER_INTERVAL).hours.do(rs.run_spider)
        while True:
            schedule.run_pending()
            time.sleep(30)
예제 #20
0
    def __init__(self):
        """Create the Flask app and MongoPool handle and register the ``/random`` route."""
        self.app = Flask(__name__)
        self.mongo_pool = MongoPool()

        @self.app.route('/random')
        def random():
            # The protocol only affects the formatting of the reply; it is
            # not passed to usable_proxy().
            protocol = request.args.get('protocol')
            proxy = self.mongo_pool.usable_proxy()
            if not protocol:
                return f'{proxy.ip}:{proxy.port}'
            return f'{protocol}://{proxy.ip}:{proxy.port}'
예제 #21
0
class RunSpider(object):
    """Run the spiders configured in PROXIES_SPIDERS, now and then on an hourly interval."""

    def __init__(self):
        """Create the MongoPool object and the coroutine pool object."""
        self.mongo_pool = MongoPool()
        self.coroutine_pool = Pool()

    def get_spider_from_settings(self):
        """Yield a spider instance for each fully qualified class name in PROXIES_SPIDERS."""
        for dotted_path in PROXIES_SPIDERS:
            # Split into module path and class name at the last dot.
            module_name, class_name = dotted_path.rsplit('.', maxsplit=1)
            spider_cls = getattr(importlib.import_module(module_name), class_name)
            yield spider_cls()

    def run(self):
        """Submit one task per spider to the pool, then wait via join()."""
        for spider in self.get_spider_from_settings():
            self.coroutine_pool.apply_async(
                self.__execute_one_spider_task, args=(spider, ))
        # join() makes the caller wait for the submitted tasks.
        self.coroutine_pool.join()

    def __execute_one_spider_task(self, spider):
        """Check each proxy yielded by ``spider`` and insert the usable ones.

        Exceptions are logged through ``logger.exception`` and not re-raised.
        """
        try:
            for found in spider.get_proxies():
                verified = check_proxy(found)
                if verified.speed == -1:
                    # Only proxies whose speed is not -1 are stored.
                    continue
                self.mongo_pool.insert_one(verified)
        except Exception as ex:
            logger.exception(ex)

    @classmethod
    def start(cls):
        """Run the spiders once, then again every RUN_SPIDERS_INTERVAL hours.

        Never returns: the scheduler is polled once per second.
        """
        runner = RunSpider()
        runner.run()
        schedule.every(RUN_SPIDERS_INTERVAL).hours.do(runner.run)
        while True:
            schedule.run_pending()
            time.sleep(1)
예제 #22
0
class ProxyApi(object):
    """HTTP front end (Flask) for the proxies held by ``MongoPool``.

    Routes registered in ``__init__``:
      * ``/random``         - one proxy as plain text
      * ``/proxies``        - a JSON list of proxies
      * ``/disable_domain`` - mark a proxy as unusable for a domain
    """

    def __init__(self):
        self.app = Flask(__name__)
        self.mongo_pool = MongoPool()

        @self.app.route('/random')
        def random():
            # Optional query parameters: protocol, domain.
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')
            picked = self.mongo_pool.random_proxy(protocol,
                                                  domain,
                                                  count=PROXIES_MAX_COUNT)
            # Without a protocol only "ip:port" is returned.
            if not protocol:
                return '{}:{}'.format(picked.ip, picked.port)
            return '{}://{}:{}'.format(protocol, picked.ip, picked.port)

        @self.app.route('/proxies')
        def proxies():
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')
            found = self.mongo_pool.get_proxies(protocol,
                                                domain,
                                                count=PROXIES_MAX_COUNT)
            # Each proxy object is serialised through its attribute dict.
            return json.dumps([item.__dict__ for item in found])

        @self.app.route('/disable_domain')
        def disable_domain():
            ip, domain = request.args.get('ip'), request.args.get('domain')

            # Both parameters are mandatory; report the first missing one.
            if ip is None:
                return '请求提供Ip参数'
            if domain is None:
                return '请提供域名domain参数'
            self.mongo_pool.disable_domain(ip, domain)
            return "{} 禁用域名 {} 成功".format(ip, domain)

    def run(self):
        """Serve the API on all interfaces, port 16888."""
        self.app.run('0.0.0.0', port=16888)

    @classmethod
    def start(cls):
        """Create the API object and start serving."""
        cls().run()
예제 #23
0
class ProxyApi(object):
    """HTTP front end (Flask) for the proxies held by ``MongoPool``.

    Routes registered in ``__init__``:
      * ``/``        - small HTML landing page
      * ``/random``  - one proxy as plain text
      * ``/proxies`` - a JSON list of proxies
    """

    def __init__(self):
        # Create the Flask web service and the storage back end.
        self.app = Flask(__name__)
        self.mongo_pool = MongoPool()

        @self.app.route('/')
        def index():
            # NOTE(review): the original link markup after the heading was
            # lost (the text was scrubbed mid-string, leaving the literal
            # unterminated). The links below are a reconstruction that points
            # at the two routes this class defines - confirm the wording and
            # targets against the original project.
            html = '''
                   <h2 align="center">Welcome to my proxies!</h2>
                   <div align="center"><a href="/random">random</a></div>
                   <div align="center"><a href="/proxies">proxies</a></div>
                   '''
            return html

        @self.app.route('/random')
        def random():
            # Optional query parameters: protocol, domain.
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')
            proxy = self.mongo_pool.random_proxy(protocol,
                                                 domain,
                                                 count=PROXIES_MAX_COUNT)
            # Prefix the address with the protocol only when one was requested.
            if protocol:
                return '{}://{}:{}'.format(protocol, proxy.ip, proxy.port)
            else:
                return '{}:{}'.format(proxy.ip, proxy.port)

        @self.app.route('/proxies')
        def proxies():
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')

            proxies = self.mongo_pool.get_proxies(protocol,
                                                  domain,
                                                  count=PROXIES_MAX_COUNT)
            # Serialise each proxy object through its attribute dict.
            proxies = [proxy.__dict__ for proxy in proxies]
            return json.dumps(proxies, ensure_ascii=False)

    def run(self):
        """Serve the API on localhost, port 16888."""
        self.app.run('localhost', port=16888)

    @classmethod
    def start(cls):
        """Create the API object and start serving."""
        proxy_api = cls()
        proxy_api.run()
예제 #24
0
class RunSpider(object):
    """Collect proxies from every configured spider and store the usable ones.

    The spider classes are listed by dotted path in ``PROXY_SPIDERS``; each
    spider is executed as one asynchronous task on ``self.coroutine_pool``.
    """

    def __init__(self):
        self.mongo_pool = MongoPool()
        # Coroutine pool on which one task per spider is scheduled.
        self.coroutine_pool = Pool()

    def get_spiders_from_settings(self):
        """Yield one spider instance per entry of ``PROXY_SPIDERS``.

        This used to be declared ``@staticmethod`` while still taking
        ``self``, so the call ``self.get_spiders_from_settings()`` in ``run``
        raised ``TypeError``. It is now a regular instance method.
        """
        for spider_full_path in PROXY_SPIDERS:
            module_name, class_name = spider_full_path.rsplit('.', maxsplit=1)
            # Import the module dynamically from its dotted name.
            module = importlib.import_module(module_name)
            # Look the spider class up in the module and instantiate it.
            cls = getattr(module, class_name)
            yield cls()

    def __execute_one_spider_task(self, spider):
        """Check each proxy produced by ``spider`` and store the usable ones.

        Any exception raised while crawling, checking or storing is logged
        and ends the task for this spider.
        """
        try:
            for proxy in spider.get_proxies():
                # Test the proxy; a speed of -1 marks it as unusable.
                proxy = check_proxy(proxy)
                if proxy.speed != -1:
                    self.mongo_pool.insert_one(proxy)
        except Exception as e:
            logger.exception(e)

    def run(self):
        """Schedule one task per configured spider and wait for all of them."""
        for spider in self.get_spiders_from_settings():
            self.coroutine_pool.apply_async(self.__execute_one_spider_task, args=(spider,))
        self.coroutine_pool.join()

    @classmethod
    def start(cls):
        """Run all spiders once, then again every ``RUN_SPIDERS_INTERVAL`` hours.

        This method never returns. The scheduler is polled roughly twice per
        interval rather than every second.
        """
        r = cls()
        r.run()
        logger.info("*****************本次爬取完毕,等待下次爬取*****************")
        schedule.every(RUN_SPIDERS_INTERVAL).hours.do(r.run)
        while True:
            schedule.run_pending()
            time.sleep(RUN_SPIDERS_INTERVAL * 60 * 60 / 2 + 1)
예제 #25
0
    def __init__(self, count):
        """Create the Flask app and register the proxy-pool routes.

        :param count: forwarded as ``count`` to the ``MongoPool`` look-ups made
            by the ``/random`` and ``/proxies`` routes.
        """
        self.count = count
        self.app = Flask(__name__, template_folder="../assets/templates")
        self.mongo_pool = MongoPool()

        @self.app.route('/')
        @self.app.route('/index')
        def index():
            # Landing page, served for both "/" and "/index".
            return render_template("index.html")

        @self.app.route('/random')
        def random():
            # Optional query parameters: protocol, domain.
            query = request.args
            protocol, domain = query.get('protocol'), query.get('domain')
            proxy = self.mongo_pool.random_proxy(protocol,
                                                 domain,
                                                 count=self.count)

            # Without a protocol only "ip:port" is returned.
            if not protocol:
                return f"{proxy.ip}:{proxy.port}"
            return f"{protocol}://{proxy.ip}:{proxy.port}"

        @self.app.route('/proxies')
        def proxies():
            query = request.args
            protocol, domain = query.get('protocol'), query.get('domain')
            found = self.mongo_pool.get_proxies(protocol,
                                                domain,
                                                count=self.count)
            # Serialise each proxy object through its attribute dict.
            return json.dumps([item.__dict__ for item in found])

        @self.app.route('/disable_domain')
        def disable_domain():
            ip, domain = request.args.get('ip'), request.args.get('domain')

            # Both parameters are mandatory; report the first missing one.
            if ip is None:
                return '请提供IP参数\n'
            if domain is None:
                return '请提供domain参数\n'

            self.mongo_pool.disable_domain(ip, domain)
            return f"{ip} 禁用域名 {domain} 成功"
예제 #26
0
class ProxyApi(object):
    """HTTP front end (Flask) for the proxies held by ``MongoPool``.

    Routes registered in ``__init__``:
      * ``/random``         - one proxy as plain text
      * ``/proxies``        - a JSON list of proxies
      * ``/disable_domain`` - mark a proxy as unusable for a domain
    """

    def __init__(self):
        # Create the Flask service and the storage back end.
        self.app = Flask(__name__)
        self.mongo_pool = MongoPool()

        @self.app.route('/random')
        def random():
            protocol = request.args.get("protocol")
            domain = request.args.get("domain")
            # NOTE(review): the keyword is spelled "protocl" as in the
            # original call; presumably it matches the MongoPool signature -
            # confirm before renaming.
            proxy = self.mongo_pool.get_random_proxy(protocl=protocol,
                                                     domain=domain)
            if not proxy:
                # Previously this returned the placeholder text "test".
                return "no usable proxy found", 404
            # Only prefix the protocol when the caller asked for one;
            # formatting a missing protocol produced "None://ip:port".
            if protocol:
                return "{}://{}:{}".format(protocol, proxy.ip, proxy.port)
            return "{}:{}".format(proxy.ip, proxy.port)

        @self.app.route('/proxies')
        def proxies():
            # Optional filters taken from the query string.
            protocol = request.args.get("protocol")
            domain = request.args.get("domain")
            proxies = self.mongo_pool.get_proxies(protocl=protocol,
                                                  domain=domain,
                                                  count=PROXIES_MAX_COUNT)
            # Serialise each proxy object through its attribute dict.
            proxies_list = [proxy.__dict__ for proxy in proxies]
            return json.dumps(proxies_list)

        @self.app.route('/disable_domain')
        def disable_domain():
            ip = request.args.get('ip')
            domain = request.args.get("domain")
            # Both parameters are mandatory; report the first missing one.
            if ip is None:
                return "ip不能为空"
            if domain is None:
                return "domain不能为空"
            self.mongo_pool.disable_domain(ip, domain)
            return "{} 禁用{} 成功".format(ip, domain)

    def run(self):
        """Serve the API on all interfaces, port 17777."""
        self.app.run('0.0.0.0', port=17777)

    @classmethod
    def start(cls):
        """Create the API object and start serving."""
        proxyApi = cls()
        proxyApi.run()
예제 #27
0
class RunSpider(object):
    """Collect proxies from every configured spider and store the usable ones.

    The spider classes are listed by dotted path in ``PROXY_SPIDERS``; each
    spider is executed as one asynchronous task on ``self.corutine_pool``.
    """

    def __init__(self):
        self.mongo_pool = MongoPool()
        self.corutine_pool = Pool()

    def get_spider_from_settings(self):
        """Yield one spider instance per entry of ``PROXY_SPIDERS``."""
        for full_class_name in PROXY_SPIDERS:
            # "package.module.ClassName" -> module name and class name.
            module_name, class_name = full_class_name.rsplit('.', maxsplit=1)
            # Import the module, then look the spider class up in it.
            module = importlib.import_module(module_name)
            cls = getattr(module, class_name)
            yield cls()

    def run(self):
        """Schedule one task per configured spider and wait for all of them."""
        for spider in self.get_spider_from_settings():
            # Each spider runs asynchronously on the pool.
            self.corutine_pool.apply_async(self.__execute_one_spider,
                                           args=(spider, ))
        self.corutine_pool.join()

    def __execute_one_spider(self, spider):
        """Check each proxy produced by ``spider`` and store the usable ones.

        Any exception raised while crawling, checking or storing is logged
        (with traceback) and ends the task for this spider.
        """
        try:
            for proxy in spider.get_proxies():
                proxy = check_proxy(proxy)
                # A speed of -1 marks a proxy that failed the check.
                if proxy.speed != -1:
                    self.mongo_pool.insert_one(proxy)
        except Exception as e:
            logger.exception(e)

    @classmethod
    def start(cls):
        """Run all spiders once, then again every ``RUN_SPIDERS_INTERVAL`` hours.

        This method never returns.
        """
        # Imported locally because this block did not use ``time`` before.
        import time

        rs = cls()
        rs.run()
        # Register the bound method itself. ``rs.run()`` would run the job
        # right away and hand its return value (None) to the scheduler.
        schedule.every(RUN_SPIDERS_INTERVAL).hours.do(rs.run)
        while True:
            schedule.run_pending()
            # Sleep between polls so the loop does not spin a CPU core.
            time.sleep(1)
예제 #28
0
class RunSpider(object):
    """Collect proxies from every configured spider and store the usable ones.

    The spider classes are listed by dotted path in ``PROXIES_SPIDERS``; each
    spider is executed as one asynchronous task on ``self.coroutine_pool``.
    """

    def __init__(self):
        self.mongo_pool = MongoPool()
        self.coroutine_pool = Pool()

    def run(self):
        """Schedule one task per configured spider and wait for all of them."""
        for spider in self.get_spider_from_settings():
            # Each spider runs asynchronously on the pool.
            self.coroutine_pool.apply_async(self._execute_one_spider,
                                            args=(spider, ))
        self.coroutine_pool.join()

    def _execute_one_spider(self, spider):
        """Check each proxy produced by ``spider`` and store the usable ones.

        Any exception raised while crawling, checking or storing is logged
        and ends the task for this spider.
        """
        try:
            for proxy in spider.get_proxies():
                proxy = check_proxy(proxy)
                # A speed of -1 marks a proxy that failed the check.
                if proxy.speed != -1:
                    self.mongo_pool.insert_one(proxy)
        except Exception as ex:
            logger.exception(ex)

    def get_spider_from_settings(self):
        """Yield one spider instance per entry of ``PROXIES_SPIDERS``."""
        for full_class_name in PROXIES_SPIDERS:
            # "package.module.ClassName" -> module name and class name.
            module_name, class_name = full_class_name.rsplit('.', maxsplit=1)
            # Import the module, then look the spider class up in it.
            module = importlib.import_module(module_name)
            cls = getattr(module, class_name)
            yield cls()

    @classmethod
    def start(cls):
        """Run all spiders once, then again every ``RUN_SPIDERS_INTERVAL`` hours.

        This method never returns. The scheduler is polled once per hour.
        """
        r = cls()
        r.run()
        # Register the bound method itself. ``r.run()`` would run the job
        # right away and hand its return value (None) to the scheduler.
        schedule.every(RUN_SPIDERS_INTERVAL).hours.do(r.run)
        while True:
            schedule.run_pending()
            time.sleep(3600)
예제 #29
0
    def __init__(self):
        """Create the Flask app and register the ``/random`` and ``/proxies`` routes."""
        self.app = Flask(__name__)
        self.mongo_pool = MongoPool()

        @self.app.route('/random')
        def random_proxy():
            # Materialise the result so it can be sized and sampled whatever
            # kind of iterable find_all() returns.
            proxies = list(self.mongo_pool.find_all())
            if not proxies:
                return 'no proxy available', 404
            # random.choice covers every stored proxy and cannot run past the
            # end of the list. The old randint(1, 100) index skipped the first
            # entry and raised IndexError with fewer than 101 proxies.
            return random.choice(proxies).__dict__

        @self.app.route('/proxies')
        def proxies_list():
            proxies = self.mongo_pool.find_all()
            # Serialise each proxy object through its attribute dict.
            dict_list = [proxy.__dict__ for proxy in proxies]
            return json.dumps(dict_list)
예제 #30
0
class ProxyTester(object):
    """Re-check every stored proxy and update or delete it by the result.

    Proxies are loaded into a queue and checked by a fixed number of
    asynchronous workers running on ``self.coroutine_pool``.
    """

    def __init__(self):
        # Storage back end, work queue and worker pool.
        self.mongo_pool = MongoPool()
        self.queue = Queue()
        self.coroutine_pool = Pool()

    def __check_callback(self, temp):
        """Schedule the next check once a previous one has finished.

        :param temp: value passed in by the pool's callback mechanism; unused.
        """
        self.coroutine_pool.apply_async(self.__check_one_proxy, callback=self.__check_callback)

    def run(self):
        """Queue every stored proxy, start the workers, wait until all are checked."""
        for proxy in self.mongo_pool.find_all():
            self.queue.put(proxy)
        # Start TEST_PROXIES_ASYNC_COUNT workers; each one re-schedules itself
        # through the callback after finishing a proxy.
        for _ in range(TEST_PROXIES_ASYNC_COUNT):
            self.coroutine_pool.apply_async(self.__check_one_proxy, callback=self.__check_callback)

        # Block until task_done() has been called for every queued proxy.
        self.queue.join()

    def __check_one_proxy(self):
        """Take one proxy from the queue, check it and persist the outcome.

        A failed check lowers the score by one; the proxy is deleted once the
        score is at or below the threshold, otherwise it is updated. A
        successful check resets the score to ``MAX_SCORE``. Exceptions are
        logged, and ``task_done`` is always called so ``queue.join()`` in
        ``run`` cannot block forever on a failed check.
        """
        # Imported locally because this block did not use ``logging`` before.
        import logging

        proxy = self.queue.get()
        try:
            proxy = check_proxy(proxy)
            if proxy.speed == -1:
                # Unusable proxy: lower its score.
                proxy.score -= 1
                # NOTE(review): the threshold is hard-coded to 45, while the
                # original comment spoke of deleting at score 0 - confirm the
                # intended value.
                if proxy.score <= 45:
                    self.mongo_pool.delete_one(proxy)
                else:
                    self.mongo_pool.update_one(proxy)
            else:
                # Usable proxy: restore the full score.
                proxy.score = MAX_SCORE
                self.mongo_pool.update_one(proxy)
        except Exception:
            logging.getLogger(__name__).exception("proxy check failed")
        finally:
            self.queue.task_done()

    @classmethod
    def start(cls):
        """Run one full check, then again every ``TEST_PROXIES_INTERVAL`` hours.

        This method never returns.
        """
        proxy_tester = cls()
        proxy_tester.run()
        schedule.every(TEST_PROXIES_INTERVAL).hours.do(proxy_tester.run)
        while True:
            schedule.run_pending()
            time.sleep(1)