def __init__(self): """ 初始化进程池、队列及数据库操作对象 """ self.pool = Pool() self.queue = Queue() self.mongo_pool = MongoPool()
class ProxyAip(object):
    def __init__(self):
        self.app = Flask(__name__)
        self.mongo = MongoPool()

        @self.app.route('/')
        def hello_world():
            tips = '/random?protocol&domain&count'
            return tips

        # Get one proxy at random
        @self.app.route('/random')
        def random():
            # Get the protocol filter
            protocol = request.args.get('protocol')
            # Get the domain filter
            domain = request.args.get('domain')
            proxy = self.mongo.random_proxy(protocol, domain, count=PROXIES_MAX_COUT)
            if protocol:
                return '{}://{}:{}'.format(protocol, proxy.ip, proxy.port)
            else:
                return '{}:{}'.format(proxy.ip, proxy.port)

        # Get the list of matching proxies
        @self.app.route('/proxies')
        def proxies():
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')
            proxies = self.mongo.get_proxies(protocol, domain, count=PROXIES_MAX_COUT)
            proxies = [proxy.__dict__ for proxy in proxies]
            return json.dumps(proxies)

        # Mark a domain as unusable for a proxy
        @self.app.route('/disable_domain')
        def disable_domain():
            ip = request.args.get('ip')
            domain = request.args.get('domain')
            if ip is None:
                return 'Please provide the ip parameter'
            if domain is None:
                return 'Please provide the domain parameter'
            self.mongo.disable_domain(ip, domain)
            return 'Disabled domain {} for proxy {}!'.format(domain, ip)

    def run(self):
        self.app.run('127.0.0.1', port=8000)

    @classmethod
    def start(cls):
        proxy_api = ProxyAip()
        proxy_api.run()
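Once the Flask service above is running (here on 127.0.0.1:8000), it can be exercised from any HTTP client. The following is a minimal sketch using requests; the endpoint paths come from the snippet, while the example domain, IP, and timeout are purely illustrative.

import requests

BASE = 'http://127.0.0.1:8000'

# one random proxy, optionally restricted to a protocol and a target domain
print(requests.get(BASE + '/random', params={'protocol': 'https', 'domain': 'example.com'}).text)

# full list of matching proxies as JSON
print(requests.get(BASE + '/proxies', params={'protocol': 'https'}).json())

# mark a domain as unusable for a given proxy IP
print(requests.get(BASE + '/disable_domain', params={'ip': '1.2.3.4', 'domain': 'example.com'}).text)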
def __init__(self):
    # Database access object
    self.mongo_pool = MongoPool()
    # Queue of proxy IPs waiting to be checked
    self.queue = Queue()
    # Coroutine pool
    self.coroutine_pool = Pool()
class Proxy_Api(object):
    def __init__(self):
        # Initialize a Flask service
        self.app = Flask(__name__)
        # Create a MongoPool object for database access
        self.mongo_pool = MongoPool()

        @self.app.route('/random')
        def random():
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')
            # Return one randomly chosen proxy object
            proxy = self.mongo_pool.random_proxy(protocol, domain,
                                                 count=PROXIES_MAX_COUNT, nick_type=2)
            if protocol:
                return '{}://{}:{}'.format(protocol, proxy.ip, proxy.port)
            else:
                return '{}:{}'.format(proxy.ip, proxy.port)

        @self.app.route('/proxies')
        def proxies():
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')
            proxies = self.mongo_pool.get_proxies(protocol, domain, count=PROXIES_MAX_COUNT)
            # Convert the Proxy objects to dicts so the list can be serialized to JSON
            proxies = [proxy.__dict__ for proxy in proxies]
            return json.dumps(proxies, ensure_ascii=False)

        @self.app.route('/disable_domain')
        def disable_domain():
            ip = request.args.get('ip')
            domain = request.args.get('domain')
            if ip is None:
                return 'Please provide the ip parameter'
            if domain is None:
                return 'Please provide the domain parameter'
            self.mongo_pool.disable_domain(ip, domain)
            return 'Proxy {} disabled domain {} successfully'.format(ip, domain)

    def run(self):
        self.app.run('127.0.0.1', port=16889)

    @classmethod
    def start(cls):
        pa = cls()
        pa.run()
def __init__(self):
    # Initialize a Flask web service
    self.app = Flask(__name__)
    self.proxy_pool = MongoPool()

    @self.app.route('/random')
    def random():
        # Read the filters from the query string
        # Get the protocol from the protocol parameter
        protocol = request.args.get('protocol')
        # Get the domain from the domain parameter
        domain = request.args.get('domain')
        proxy = self.proxy_pool.random(protocol=protocol, domain=domain,
                                       count=settings.AVAILABLE_IP_COUNT)
        # If a protocol was requested, return it together with the IP and port
        if protocol:
            return '{}://{}:{}'.format(protocol, proxy.ip, proxy.port)
        else:
            # Otherwise return the IP and port without a protocol
            return '{}:{}'.format(proxy.ip, proxy.port)

    @self.app.route('/proxies')
    def proxies():
        # Get the protocol from the protocol parameter
        protocol = request.args.get('protocol')
        # Get the domain from the domain parameter
        domain = request.args.get('domain')
        proxies = self.proxy_pool.get_proxies(protocol=protocol, domain=domain,
                                              count=settings.AVAILABLE_IP_COUNT)
        lis = []
        for proxy in proxies:
            lis.append(proxy.__dict__)
        return json.dumps(lis)

    @self.app.route('/disable_domain')
    def disable_domain():
        # Get the proxy's IP address
        ip = request.args.get('ip')
        # Get the unusable domain
        domain = request.args.get('domain')
        if ip is None:
            return 'Please pass the ip parameter'
        if domain is None:
            return 'Please pass the domain parameter'
        # Record the unusable domain for this proxy
        self.proxy_pool.disable_domain(ip=ip, domain=domain)
        return 'Unusable domain added for this IP'
class RunSpider(object):
    def __init__(self):
        self.mongo_pool = MongoPool()
        self.coroutine_pool = Pool()

    def get_spider_from_settings(self):
        """Create a spider object for each crawler listed in the settings file."""
        for full_class_name in PROXIES_SPIDERS:
            module_name, class_name = full_class_name.rsplit('.', maxsplit=1)
            # Import the module dynamically and look up the spider class
            module = importlib.import_module(module_name)
            cls = getattr(module, class_name)
            spider = cls()
            yield spider

    def run(self):
        """Iterate over the spider objects and execute their get_proxies methods."""
        spiders = self.get_spider_from_settings()
        for spider in spiders:
            self.coroutine_pool.apply_async(self.__run_one_spider, args=(spider,))
        # Block the current thread until all spiders have finished
        self.coroutine_pool.join()

    def __run_one_spider(self, spider):
        try:
            for proxy in spider.get_proxies():
                time.sleep(0.1)
                checked_proxy = check_proxy(proxy)
                if checked_proxy.speed != -1:
                    self.mongo_pool.insert(checked_proxy)
        except Exception as er:
            logger.exception(er)
            logger.exception("Spider {} raised an error".format(spider))

    @classmethod
    def start(cls):
        """Class method: re-run the spiders at the interval (in hours) configured in the settings file."""
        rs = RunSpider()
        rs.run()
        schedule.every(SPIDERS_RUN_INTERVAL).hours.do(rs.run)
        while 1:
            schedule.run_pending()
            time.sleep(60)
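RunSpider.get_spider_from_settings builds each crawler from a dotted path listed in the settings module. A sketch of what that configuration might look like follows; only the constant names come from the snippet, the spider module paths and the interval value are placeholders.

# settings.py (sketch; the spider paths below are hypothetical)
PROXIES_SPIDERS = [
    'proxy_spiders.spiders.XiciSpider',   # one dotted "module.Class" path per crawler,
    'proxy_spiders.spiders.Ip66Spider',   # imported dynamically with importlib
]
SPIDERS_RUN_INTERVAL = 4                  # re-run all spiders every 4 hours (schedule.every(...).hours)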
class RunSpider(object): """启动spider""" def __init__(self): '''创建数据库对象''' self.mongo_pool = MongoPool() # 创建协程池 self.coroutine_pool = Pool() def get_spider_from_settings(self): '''根据配置信息,获取爬虫列表''' for full_class_name in PROXIES_SPDERS: module_name, class_name = full_class_name.rsplit('.', maxsplit=1) # 从左往右截1次 module = importlib.import_module(module_name) cls = getattr(module, class_name) spdier = cls() yield spdier def run(self): spdiers = self.get_spider_from_settings() for spider in spdiers: # self.__execute_one_spider_task(spider) # 通过一部的方法执行 self.coroutine_pool.apply_async(self.__execute_one_spider_task, args=(spider,)) # 调用join方法,当前线程 等待 协程 任务的完成 self.coroutine_pool.join() def __execute_one_spider_task(self, spider): '''把处理一个代理爬虫的代码抽到一个方法''' try: for proxy in spider.get_proxies(): proxy = check_proxy(proxy) # print(proxy) # 写入数据库 self.mongo_pool.insert_one(proxy) except Exception as ex: logger.exception(ex) @classmethod def start(cls): '''运行时间设定,制动执行''' rs = RunSpider() rs.run() # schedule.every(RUN_SPDERS_INTERVAL).hour.do(rs.run) # 小时 schedule.every(RUN_SPDERS_INTERVAL).minutes.do(rs.run) # 分钟 while True: schedule.run_pending() time.sleep(1)
class DbProxiesCheck(object):
    def __init__(self):
        # Database access object
        self.mongo_pool = MongoPool()
        # Queue of proxy IPs waiting to be checked
        self.queue = Queue()
        # Coroutine pool
        self.coroutine_pool = Pool()

    def __check_callback(self, temp):
        # Async callback: schedule the next check so the workers keep running
        self.coroutine_pool.apply_async(self.__check_one, callback=self.__check_callback)

    def run(self):
        # Core logic for checking the proxy IPs stored in the database
        proxies = self.mongo_pool.find_all()
        for proxy in proxies:
            self.queue.put(proxy)
        # Start several asynchronous tasks
        for i in range(TEST_PROXIES_ASYNC_COUNT):
            # Each task re-schedules itself through the callback
            self.coroutine_pool.apply_async(self.__check_one, callback=self.__check_callback)
        # Block the current thread until every queued task is done
        self.queue.join()

    def __check_one(self):
        # Check the availability of one proxy IP taken from the queue
        proxy = self.queue.get()
        checked_proxy = check_proxy(proxy)
        if checked_proxy.speed == -1:
            checked_proxy.score -= 1
            if checked_proxy.score == 0:
                self.mongo_pool.delete(checked_proxy)
            else:
                self.mongo_pool.update(checked_proxy)
        else:
            checked_proxy.score = MAX_SCORE
            self.mongo_pool.update(checked_proxy)
        # Mark the queue task as done
        self.queue.task_done()

    @classmethod
    def start(cls):
        """Class method: re-check the proxy IPs in the database at the configured interval (hours)."""
        test = DbProxiesCheck()
        test.run()
        schedule.every(TEST_RUN_INTERVAL).hours.do(test.run)
        while 1:
            schedule.run_pending()
            time.sleep(60)
class ProxyTexter(object):
    def __init__(self):
        # Database access object
        self.mongo_pool = MongoPool()
        self.queue = Queue()
        self.coroutine_proxy = Pool()

    def __check_callback(self, temp):
        # Re-schedule the worker so the checks keep running
        self.coroutine_proxy.apply_async(self.__check_one_proxy, callback=self.__check_callback)

    def run(self):
        """Core logic of the checker."""
        proxies = self.mongo_pool.find_all()
        for proxy in proxies:
            # self.__check_one_proxy(proxy)
            # Put the proxy on the queue to be checked
            self.queue.put(proxy)
        # Start the asynchronous workers
        for i in range(TEXT_PROXIES_AXYNC_COUT):
            # Each task re-schedules itself through the callback
            self.coroutine_proxy.apply_async(self.__check_one_proxy, callback=self.__check_callback)
        # Make the current thread wait until the queue has been drained
        self.queue.join()

    def __check_one_proxy(self):
        """Handle a single proxy."""
        # Take one proxy from the queue
        proxy = self.queue.get()
        proxy = check_proxy(proxy)
        if proxy.speed == -1:
            proxy.score -= 1
            if proxy.score == 0:
                self.mongo_pool.delete_one(proxy)
            else:
                self.mongo_pool.update_one(proxy)
        else:
            proxy.score = MAX_SCORE
            self.mongo_pool.update_one(proxy)
        # Mark the queue task as done
        self.queue.task_done()

    @classmethod
    def start(cls):
        """Set the schedule and run automatically."""
        proxy_tester = cls()
        proxy_tester.run()
        # schedule.every(TEXT_PROXIES_INTERVAL).hours.do(proxy_tester.run)  # every N hours
        schedule.every(TEXT_PROXIES_INTERVAL).minutes.do(proxy_tester.run)  # every N minutes
        while True:
            schedule.run_pending()
            time.sleep(1)
class ProxyTester(object):
    def __init__(self):
        # Database access object
        self.mongo_pool = MongoPool()
        self.queue = Queue()
        self.coroutine_pool = Pool()

    def __check_callback(self, temp):
        # Re-schedule the worker so the checks keep running
        self.coroutine_pool.apply_async(self.__check_one_proxy, callback=self.__check_callback)

    def __check_one_proxy(self):
        proxy = self.queue.get()
        proxy = check_proxy(proxy)
        if proxy.speed == -1:
            proxy.score -= 1
            if proxy.score <= 0:
                self.mongo_pool.delete_one(proxy)
            else:
                self.mongo_pool.update_one(proxy)
        else:
            proxy.score = MAX_SCORE
            self.mongo_pool.update_one(proxy)
        self.queue.task_done()

    def run(self):
        proxies = self.mongo_pool.find_all()
        for proxy in proxies:
            self.queue.put(proxy)
            logger.info(proxy)
        # Start several asynchronous workers
        for i in range(TEST_PROXIES_ASYNC_COUNT):
            self.coroutine_pool.apply_async(self.__check_one_proxy, callback=self.__check_callback)
        self.queue.join()

    @classmethod
    def start(cls):
        pt = cls()
        pt.run()
        schedule.every(RUN_PROXY_TEST_INTERVAL).hours.do(pt.run)
        while True:
            schedule.run_pending()
            time.sleep(3600)
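The tester variants above all rely on the same tricky detail: apply_async must be given the worker function itself, and the callback must be the re-scheduling function, not a call to either of them. A stripped-down sketch of the pattern in isolation, with placeholder queue items instead of real proxies:

from gevent import monkey
monkey.patch_all()
from gevent.pool import Pool
from gevent.queue import JoinableQueue

pool = Pool()
queue = JoinableQueue()

def check_one():
    item = queue.get()            # take one item off the queue
    try:
        print('checking', item)   # a real tester would validate the proxy here
    finally:
        queue.task_done()         # always acknowledge the item

def reschedule(_result):
    # the callback receives check_one's return value; it just queues the next run
    pool.apply_async(check_one, callback=reschedule)

for proxy in ['1.2.3.4:80', '5.6.7.8:8080']:      # placeholder items
    queue.put(proxy)

for _ in range(4):                # TEST_PROXIES_ASYNC_COUNT in the snippets above
    pool.apply_async(check_one, callback=reschedule)

queue.join()                      # returns once every queued item is task_done()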
def __init__(self):
    # Database access object
    self.mongo_pool = MongoPool()
    # Coroutine pool
    self.coroutine_pool = Pool()
class ProxyTest(object):
    """Test the availability of the proxy IPs."""

    def __init__(self):
        """Initialize the pool, the queue, and the database access object."""
        self.pool = Pool()
        self.queue = Queue()
        self.mongo_pool = MongoPool()

    def _test_proxies(self):
        """Test one proxy IP and write the result back to the database."""
        # Take one proxy IP from the queue
        proxy = self.queue.get()
        try:
            # Check the proxy IP
            proxy = check_proxy(proxy)
            # If the check failed, decrement the score; otherwise reset it to the default
            if proxy.speed == -1:
                proxy.score -= 1
                # A score of 0 means the proxy is unusable: remove it from the database
                if proxy.score == 0:
                    self.mongo_pool.delete(proxy)
                else:
                    # Update the proxy
                    self.mongo_pool.update(proxy)
            else:
                # Restore the default score
                proxy.score = DEFAULT_SCORE
                # Update the proxy
                self.mongo_pool.update(proxy)
        except Exception as e:
            print(e)

    def _test_callback(self, source):
        """Callback of _test_proxies.

        :param source: value returned by the finished task
        """
        # Re-schedule the worker so the proxies keep being checked
        self.pool.apply_async(self._test_proxies, callback=self._test_callback)

    def run(self):
        """Start checking the proxy IPs."""
        # Fetch all proxy IPs from the database
        proxies = self.mongo_pool.find()
        # Check whether there are any proxies at all
        if proxies is None or len(proxies) == 0:
            print("The proxy IP pool is empty")
            return
        # Put every proxy on the queue
        for proxy in proxies:
            self.queue.put(proxy)
        # Start several workers that check the proxies
        for test in range(TEST_ANSYC_COUNT):
            # Asynchronous, non-blocking
            self.pool.apply_async(self._test_proxies, callback=self._test_callback)
        # Make the main thread wait for the asynchronous tasks
        self.pool.join()

    @classmethod
    def start(cls):
        """Start the proxy-checking service."""
        # Create and run one check immediately
        test = cls()
        test.run()
        # Then re-run the check on a schedule
        schedule.every(TEST_INTERVAL).hours.do(test.run)
        while True:
            # Run any pending scheduled task
            schedule.run_pending()
            time.sleep(1)
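Every tester and API variant in this section reads the same attributes off the proxy objects: ip, port, protocol, nick_type, speed, score, and (for the API) disable_domains. A minimal sketch of such a model follows; the field defaults and the protocol encoding are assumptions, not the original class.

class Proxy(object):
    """Minimal proxy model; field names follow the attributes used above, defaults are assumed."""

    def __init__(self, ip, port, protocol=-1, nick_type=-1,
                 speed=-1, score=50, disable_domains=None):
        self.ip = ip                    # proxy IP address
        self.port = port                # proxy port
        self.protocol = protocol        # e.g. -1 unknown, 0 http, 1 https, 2 both (assumed encoding)
        self.nick_type = nick_type      # anonymity level
        self.speed = speed              # response time; -1 means the last check failed
        self.score = score              # remaining "lives", decremented on failure (cf. MAX_SCORE/DEFAULT_SCORE)
        self.disable_domains = disable_domains or []   # domains this proxy may not be used for

    def __str__(self):
        return str(self.__dict__)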
def __init__(self):
    self.queue = Queue()
    self.pool = Pool()                # coroutine pool
    self.proxy_pool = MongoPool()     # MongoDB-backed proxy pool
def __init__(self):
    self.pool = Pool()
    self.proxy_pool = MongoPool()
def __init__(self):
    self.mongo_pool = MongoPool()
    self.coroutine_pool = Pool()
def __init__(self):
    """Create the database access object, the queue, and the coroutine pool."""
    self.mongo_pool = MongoPool()
    self.queue = Queue()
    self.coroutine_proxy = Pool()
class RunSpider(object):
    def __init__(self):
        self.pool = Pool()
        self.proxy_pool = MongoPool()

    def _auto_import_instances(self):
        """Automatically import the spiders listed in the configuration."""
        instances = []
        # Iterate over the configured spider paths
        for path in settings.PROXIES_SPIDERS:
            # Split the path into module name and class name
            module_name, cls_name = path.rsplit('.', maxsplit=1)
            # Import the module by name
            module = importlib.import_module(module_name)
            # Look up the spider class in the module
            cls = getattr(module, cls_name)
            # Instantiate the spider and collect it
            instances.append(cls())
        # Return the list of spider objects
        return instances

    def run(self):
        """Start the spiders."""
        # Get the proxy spiders
        spiders = self._auto_import_instances()
        # Run each spider to collect proxies
        for spider in spiders:
            # Run the spider asynchronously in the coroutine pool for better throughput
            self.pool.apply_async(self.__run_one_spider, args=(spider,))
        # Wait for all spider tasks to finish
        self.pool.join()

    def __run_one_spider(self, spider):
        try:
            for proxy in spider.get_proxies():
                if proxy is None:
                    # Skip empty results
                    continue
                # Check the proxy to determine its protocol, anonymity level, and speed
                proxy = check_proxy(proxy)
                # A speed other than -1 means the proxy is usable
                if proxy.speed != -1:
                    # Save the proxy to the database
                    self.proxy_pool.save(proxy)
        except Exception as e:
            logger.exception(e)
            logger.exception("Spider {} raised an error".format(spider))

    @classmethod
    def start(cls):
        # Create an instance of this class and run it once
        run_spider = RunSpider()
        run_spider.run()
        # Re-run the spiders every SPIDER_INTERVAL hours
        schedule.every(settings.SPIDER_INTERVAL).hours.do(run_spider.run)
        while True:
            schedule.run_pending()
            time.sleep(1)
class RunSpiders(object):
    """Launch all of the spiders."""

    def __init__(self):
        """Create the coroutine pool and the database access object."""
        self.pool = Pool()
        self.proxy_pool = MongoPool()

    @staticmethod
    def _import_spider_instance():
        """Dynamically import the spider modules and instantiate the spiders."""
        # Collected spider objects
        instances = []
        # Import each spider module and create the spider object
        for instance_path in PROXIES_SPIDERS:
            # Split the path into module name and class name
            module_name, class_name = instance_path.rsplit('.', maxsplit=1)
            # Import the module
            module = importlib.import_module(module_name)
            # Look up the spider class in the module
            _class = getattr(module, class_name)
            # Instantiate the spider and add it to the list
            instances.append(_class())
        return instances

    def _run_spider(self, spider):
        """Run one spider.

        :param spider: the spider instance to run
        """
        try:
            # Collect proxy IPs from the spider
            for proxy in spider.get_proxies():
                if proxy is None:
                    continue
                # Check this proxy IP
                proxy = check_proxy(proxy)
                # Keep only proxies that passed the check
                if proxy.speed != -1:
                    # Save the proxy to the database
                    self.proxy_pool.save(proxy)
        except Exception as e:
            logger.exception("Spider {} failed: {}".format(spider, e))

    def run(self):
        """Submit every spider to the coroutine pool and start them."""
        # Import the spider modules and create the spider objects
        spiders = self._import_spider_instance()
        # Submit each spider to the coroutine pool
        for spider in spiders:
            # Asynchronous, non-blocking
            self.pool.apply_async(self._run_spider, args=(spider,))
        # Make the main thread wait for the asynchronous tasks
        self.pool.join()

    @classmethod
    def start(cls):
        """Start the spider service."""
        # Create and run the spiders once
        spiders = cls()
        spiders.run()
        # Then re-run them on a schedule
        schedule.every(SPIDER_INTERVAL).hours.do(spiders.run)
        while True:
            # Run any pending scheduled task
            schedule.run_pending()
            time.sleep(1)
class ProxyApi(object):
    """Web service that exposes the proxy IP pool."""

    def __init__(self):
        """Initialize the Flask service."""
        # Create the Flask application
        self.app = Flask(__name__)
        # Create the database access object
        self.proxy_pool = MongoPool()

        @self.app.route('/random')
        def _random():
            """Return one random proxy IP matching the given filters."""
            # Read the request parameters from the URL
            protocol = request.args.get('protocol')
            nick_type = request.args.get('nick_type')
            domain = request.args.get('domain')
            nick_type = int(nick_type) if nick_type else None
            # Fetch a proxy from the database
            proxy = self.proxy_pool.random(protocol=protocol, domain=domain,
                                           nick_type=nick_type, count=AVAILABLE_IP_COUNT)
            if proxy:
                # Include the protocol if one was requested
                if protocol:
                    return "{}://{}:{}".format(protocol, proxy.ip, proxy.port)
                else:
                    return "{}:{}".format(proxy.ip, proxy.port)
            else:
                return ''

        @self.app.route('/proxies')
        def proxies():
            """Return several proxy IPs matching the given filters."""
            # Read the request parameters from the URL
            protocol = request.args.get('protocol')
            nick_type = request.args.get('nick_type')
            domain = request.args.get('domain')
            count = request.args.get('count')
            nick_type = int(nick_type) if nick_type else None
            count = int(count) if count else AVAILABLE_IP_COUNT
            # Fetch the proxies from the database
            proxies_list = self.proxy_pool.get_proxies(protocol=protocol, domain=domain,
                                                       nick_type=nick_type, count=count)
            # Collect the proxies
            result = []
            # Convert each proxy to a dict and add it to the result list
            for proxy in proxies_list:
                result.append(proxy.__dict__)
            # Serialize the result list to a JSON string
            return json.dumps(result)

        @self.app.route('/disable_domain')
        def disable_domain():
            """Add a domain to a proxy's list of unusable domains."""
            # Read the request parameters from the URL
            ip = request.args.get('ip')
            domain = request.args.get('domain')
            # Validate the parameters
            if ip is None:
                return "Please pass a valid ip"
            if domain is None:
                return "Please pass a valid domain"
            # Update this proxy's list of unusable domains
            self.proxy_pool.disable_domain(ip=ip, domain=domain)
            return "Unusable-domain list updated for this proxy"

        @self.app.route('/headers')
        def headers():
            # Return a random User-Agent header
            with open('user-agent.txt', 'r', encoding='utf-8') as f:
                USER_AGENTS_LIST = f.readlines()
            # Strip the leading quote and the trailing quote plus newline
            return str(random.choice(USER_AGENTS_LIST)[1:-2])

    def run(self):
        """Run the Flask service."""
        self.app.run(host=FLASK_HOST, port=FLASK_PORT)

    @classmethod
    def start(cls):
        """Start the web service."""
        # Create and run the web service
        api = cls()
        api.run()
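On the consuming side, a crawler would typically ask the API above for a proxy, use it for a request, and report the proxy's IP back to /disable_domain when it stops working for a site. A rough sketch using requests; the API host/port and the IP-parsing line are illustrative only.

import requests

PROXY_API = 'http://127.0.0.1:16888'   # adjust to the FLASK_HOST/FLASK_PORT of the running service

def fetch_with_pool(url, domain):
    # ask the pool for one proxy that is still allowed for this domain
    proxy_url = requests.get(PROXY_API + '/random',
                             params={'protocol': 'https', 'domain': domain}).text
    try:
        return requests.get(url, proxies={'http': proxy_url, 'https': proxy_url}, timeout=10)
    except requests.RequestException:
        # report the failure so the pool stops handing this proxy out for the domain
        ip = proxy_url.split('://')[-1].split(':')[0]
        requests.get(PROXY_API + '/disable_domain', params={'ip': ip, 'domain': domain})
        raise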
def __init__(self):
    # Database access object
    self.mongo_pool = MongoPool()
    self.queue = Queue()
    self.coroutine_pool = Pool()
class ProxyTester(object):
    def __init__(self):
        self.queue = Queue()
        self.pool = Pool()                # coroutine pool
        self.proxy_pool = MongoPool()     # MongoDB-backed proxy pool

    def _test_proxy(self):
        # Take one proxy from the queue
        proxy = self.queue.get()
        try:
            # Re-validate the proxy
            proxy = check_proxy(proxy)
            # A speed of -1 means the request failed
            if proxy.speed == -1:
                # Decrement the proxy's score
                proxy.score -= 1
                # Delete the proxy once its score reaches 0
                if proxy.score == 0:
                    self.proxy_pool.delete(proxy)
                    logger.info('Deleted proxy: {}'.format(proxy))
                else:
                    # Otherwise update the proxy with the new score
                    self.proxy_pool.update(proxy)
            else:
                # The request succeeded: restore the maximum score
                proxy.score = settings.MAX_SCORE
                self.proxy_pool.update(proxy)
        except Exception as ex:
            logger.exception(ex)
        self.queue.task_done()

    def _test_proxy_finish(self, temp):
        # Re-schedule the worker so the checks keep running
        self.pool.apply_async(self._test_proxy, callback=self._test_proxy_finish)

    def run(self):
        # 1. Fetch all proxy IPs
        proxies = self.proxy_pool.find()
        # 2. Return immediately if the pool is empty
        if proxies is None or len(proxies) == 0:
            print("The proxy pool is empty")
            return
        # Put every proxy on the queue
        for proxy in proxies:
            self.queue.put(proxy)
        # Start several asynchronous tasks that check the proxies
        for i in range(settings.TESTER_ANSYC_COUNT):
            self.pool.apply_async(self._test_proxy, callback=self._test_proxy_finish)
        # Make the main thread wait until the queue has been drained
        self.queue.join()

    @staticmethod
    def start():
        tester = ProxyTester()
        tester.run()
        # Re-check the proxies every TESTER_INTERVAL hours
        schedule.every(settings.TESTER_INTERVAL).hours.do(tester.run)
        while True:
            schedule.run_pending()
            time.sleep(1)
def __init__(self): """ 创建协程池及数据库操作对象 """ self.pool = Pool() self.proxy_pool = MongoPool()