def __init__(self):
    """Create the shared resources used by the spider runner.

    Builds the MongoDB access object and the coroutine pool that lets
    each spider task execute asynchronously.
    """
    # Database access object (note: attribute is 'mongopool', no underscore).
    self.mongopool = MongoPool()
    # Coroutine pool so every spider task can be run asynchronously.
    self.coroutine_pool = Pool()
class RunSpider(object):
    """Runs every configured proxy spider and stores working proxies in MongoDB."""

    def __init__(self):
        # Database access object plus a coroutine pool for concurrent crawling.
        self.mongo_pool = MongoPool()
        self.coroutine_pool = Pool()

    def get_spider_from_settings(self):
        """Yield one spider instance per dotted class path in PROXIES_SPIDERS."""
        for full_class_name in PROXIES_SPIDERS:
            # Split 'pkg.module.ClassName' into module path and class name.
            module_name, class_name = full_class_name.rsplit('.', maxsplit=1)
            # Import the module dynamically, then pull the class off it.
            module = importlib.import_module(module_name)
            cls = getattr(module, class_name)
            spider = cls()
            yield spider

    def run(self):
        """Dispatch one async task per spider and wait for all of them."""
        spiders = self.get_spider_from_settings()
        for spider in spiders:
            self.coroutine_pool.apply_async(self.__execute_one_spider_task, args=(spider, ))
        # Block the current thread until every queued spider task finishes.
        self.coroutine_pool.join()

    def __execute_one_spider_task(self, spider):
        # Crawl, validate and persist proxies from one spider; a failing
        # spider must not abort the whole run.
        try:
            for proxy in spider.get_proxies():
                proxy = check_proxy(proxy)
                # speed == -1 marks an unusable proxy.
                if proxy.speed != -1:
                    self.mongo_pool.insert_one(proxy)
        except Exception as ex:
            logger.exception(ex)
class RunSpider(object):
    """Runs all configured proxy spiders on a schedule, persisting usable proxies."""

    def __init__(self):
        # Database access object.
        self.mongo_pool = MongoPool()
        # Coroutine pool for concurrent spider execution.
        self.coroutine_pool = Pool()

    def get_spider_from_settings(self):
        '''
        Yield a spider instance for every dotted class path configured
        in PROXIES_SPIDERS.
        :return: generator of spider objects
        '''
        # Walk every fully-qualified spider class name in the settings.
        for full_class_name in PROXIES_SPIDERS:
            # Split into module path and class name.
            module_name, class_name = full_class_name.rsplit('.', maxsplit=1)
            # Import the module dynamically.
            module = importlib.import_module(module_name)
            # Fetch the class from the imported module and instantiate it.
            cls = getattr(module, class_name)
            spider = cls()
            yield spider

    def run(self):
        # Get the spider objects from configuration.
        spiders = self.get_spider_from_settings()
        for spider in spiders:
            # Execute each spider task asynchronously on the coroutine pool.
            self.coroutine_pool.apply_async(self._execute_one_spider_task, args=(spider,))
        # Wait for all coroutine tasks to complete before returning.
        self.coroutine_pool.join()

    def _execute_one_spider_task(self, spider):
        # Handle one spider: crawl, validate, and store its proxies.
        try:
            for proxy in spider.get_proxies():
                # Check proxy availability.
                proxy = check_proxy(proxy)
                # speed != -1 means the proxy is usable.
                if proxy.speed != -1:
                    self.mongo_pool.insert_one(proxy)
        except Exception as ex:
            # Log and continue; one broken spider must not stop the run.
            logger.exception(ex)

    @classmethod
    def start(cls):
        """Run once immediately, then re-run every RUN_SPIDERS_INTERVAL hours."""
        rs = RunSpider()
        rs.run()
        # The interval is configured in settings (hours).
        schedule.every(RUN_SPIDERS_INTERVAL).hours.do(rs.run)
        while True:
            # Poll the scheduler once per second.
            schedule.run_pending()
            time.sleep(1)
def __init__(self):
    """Build the Flask app and register the proxy-pool API routes."""
    self.app = Flask(__name__)
    # Database access object.
    self.mongo_pool = MongoPool()

    @self.app.route('/random')
    def random():
        # One random high-availability proxy; optional protocol/domain filters.
        protocol = request.args.get('protocol')
        domain = request.args.get('domain')
        proxy = self.mongo_pool.random_proxy(protocol, domain, count=API_COUNT)
        print(proxy)
        if protocol:
            return '{}://{}:{}'.format(protocol, proxy.ip, proxy.port)
        else:
            return '{}:{}'.format(proxy.ip, proxy.port)

    @self.app.route('/all')
    def all():
        # JSON list of high-availability proxies, optionally filtered.
        protocol = request.args.get('protocol')
        domain = request.args.get('domain')
        proxies = self.mongo_pool.get_proxies(protocol, domain, count=API_COUNT)
        # Proxy objects are not JSON serializable; dump their attribute dicts.
        proxies = [proxy.__dict__ for proxy in proxies]
        return json.dumps(proxies)

    @self.app.route('/disable_domain')
    def disable_domain():
        # Mark a domain as unusable for the given proxy IP.
        ip = request.args.get('ip')
        domain = request.args.get('domain')
        if ip is None:
            return "请提供ip参数"
        if domain is None:
            return "请传入域名"
        self.mongo_pool.disable_domain(ip, domain)
        return "{} 禁用域名{} 成功".format(ip, domain)
class ProxyApi():
    """Flask web service exposing the proxy pool over HTTP."""

    def __init__(self):
        # Flask application serving the API endpoints.
        self.app = Flask(__name__)
        # Database access object.
        self.mongo_pool = MongoPool()

        @self.app.route('/random')
        def random():
            """Return one random high-availability proxy address.

            Optional query args:
              protocol -- filter by protocol type (http/https)
              domain   -- skip proxies disabled for this domain
            """
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')
            proxy = self.mongo_pool.random_proxy(protocol, domain, count=PROXIES_MAX_COUNT)
            if protocol:
                return '{}://{}:{}'.format(protocol, proxy.ip, proxy.port)
            else:
                return '{}:{}'.format(proxy.ip, proxy.port)

        @self.app.route('/proxies')
        def proxies():
            """Return a JSON list of high-available proxies, optionally filtered."""
            # BUG FIX: the protocol filter was read from the wrong query
            # argument name ('proxies'); it must be 'protocol' to match the
            # /random endpoint and actually apply the filter.
            protocol = request.args.get('protocol')
            # Domain filter, e.g. jd.com
            domain = request.args.get('domain')
            proxies = self.mongo_pool.get_proxies(protocol, domain, count=PROXIES_MAX_COUNT)
            # Proxy objects are not JSON serializable; convert to dicts first.
            proxies = [proxy.__dict__ for proxy in proxies]
            return json.dumps(proxies)

        @self.app.route('/disable_domain')
        def disable_domain():
            """Append an unusable domain to the given proxy IP."""
            ip = request.args.get('ip')
            domain = request.args.get('domain')
            if ip is None:
                return "请提供ip参数"
            if domain is None:
                return '请提供域名domain参数'
            self.mongo_pool.disable_domain(ip, domain)
            return "{} 禁用域名 {} 成功".format(ip, domain)

    def run(self):
        """Start the Flask development server on all interfaces."""
        self.app.run('0.0.0.0', port=16888)

    @classmethod
    def start(cls):
        """Entry point: build a ProxyApi and run it."""
        proxy_api = ProxyApi()
        proxy_api.run()
class RunSpider(object):
    """Crawls proxies with every configured spider and stores the usable ones."""

    def __init__(self):
        # Database connection / collection access object.
        self.mongo_pool = MongoPool()
        # Coroutine pool for asynchronous spider execution.
        self.coroutine_pool = Pool()

    def get_spider_from_settings(self):
        # Yield a spider instance per fully-qualified class name in settings,
        # e.g. core.proxy_spider.proxy_spiders.XiciSpider
        for full_class_name in PROXIES_SPIDERS:
            # Split into module path and class name.
            module_name, class_name = full_class_name.rsplit('.', maxsplit=1)
            # Import the module dynamically.
            module = importlib.import_module(module_name)
            # Fetch the class from the module.
            cls = getattr(module, class_name)
            # Instantiate the spider.
            spider = cls()
            print(spider, "666")
            yield spider

    def run(self):
        # Build the spider objects from configuration.
        spiders = self.get_spider_from_settings()
        # Execute each spider's crawl on the coroutine pool.
        for spider in spiders:
            self.coroutine_pool.apply_async(self._execute_one_spider_task, args=(spider,))
        # Wait until all coroutine tasks have completed.
        self.coroutine_pool.join()

    def _execute_one_spider_task(self, spider):
        """Handle one spider task: crawl, validate, persist."""
        try:
            for proxy in spider.get_proxies():
                # Validate the proxy (proxy-check module).
                proxy = check_proxy(proxy)
                # speed != -1 means the proxy is usable.
                if proxy.speed != -1:
                    # Persist to the database.
                    self.mongo_pool.insert_one(proxy)
        except Exception as ex:
            logger.exception(ex)

    @classmethod
    def start(cls):
        """Run once, then re-run every RUN_SPIDERS_INTERVAL hours (from settings)."""
        rs = RunSpider()
        rs.run()
        # Interval is configured in hours in the settings module.
        schedule.every(RUN_SPIDERS_INTERVAL).hours.do(rs.run)
        while True:
            schedule.run_pending()
            time.sleep(2)
def __init__(self):
    """Build the Flask app and register the proxy-pool API routes.

    Note the trailing slashes on the routes and the 'disabled_domain'
    spelling — both are this variant's public URL contract.
    """
    self.app = Flask(__name__)
    self.mongo_pool = MongoPool()

    @self.app.route('/random/')
    def random():
        # One random proxy; optional protocol/domain filters.
        protocol = request.args.get('protocol')
        domain = request.args.get('domain')
        proxy = self.mongo_pool.random_proxy(protocol, domain, count=PROXIES_DEFAULT_COUNT)
        if protocol:
            return '{}://{}:{}'.format(protocol, proxy.ip, proxy.port)
        else:
            return '{}:{}'.format(proxy.ip, proxy.port)

    @self.app.route('/proxies/')
    def proxies():
        # JSON list of proxies; only ip/port are exposed.
        protocol = request.args.get('protocol')
        domain = request.args.get('domain')
        proxies = self.mongo_pool.get_proxies(protocol, domain, count=PROXIES_DEFAULT_COUNT)
        proxies = [{'ip': proxy.ip, 'port': proxy.port} for proxy in proxies]
        return json.dumps(proxies)

    @self.app.route('/disabled_domain/')
    def disable_domain():
        # Mark a domain as unusable for a proxy IP; both args are required.
        ip = request.args.get('ip', None)
        domain = request.args.get('domain', None)
        if ip is None:
            return '填写ip'
        if domain is None:
            return '填写域名'
        self.mongo_pool.disabled_domain(ip, domain)
        return '成功设置{}禁止访问{}'.format(ip, domain)
class ProxyAPI(object):
    """Minimal Flask API over the proxy collection."""

    def __init__(self):
        self.app = Flask(__name__)
        self.mongo_pool = MongoPool()

        @self.app.route('/random')
        def random_proxy():
            """Return one randomly chosen proxy as a dict."""
            # Filters are read but not applied by this simple implementation.
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')
            # Materialize the result set so it can be sampled.
            proxies = list(self.mongo_pool.find_all())
            # BUG FIX: indexing with random.randint(1, 100) raised IndexError
            # whenever fewer than 101 proxies exist and could never pick
            # index 0; choose uniformly from the actual list instead.
            pr = random.choice(proxies)
            return pr.__dict__

        @self.app.route('/proxies')
        def proxies_list():
            """Return every stored proxy as a JSON list."""
            proxies = self.mongo_pool.find_all()
            dict_list = [proxy.__dict__ for proxy in proxies]
            return json.dumps(dict_list)

    def run(self):
        """Start the development server (debug mode) on all interfaces."""
        self.app.run('0.0.0.0', port=9999, debug=True)

    @classmethod
    def start(cls):
        """Entry point: build the API object and serve it."""
        pa = cls()
        pa.run()
def __init__(self):
    """Build the Flask app and register the proxy-pool API routes."""
    self.app = Flask(__name__)
    # Database access object.
    self.mongo_pool = MongoPool()

    @self.app.route('/random')
    def random():
        """
        Return one random high-availability proxy.
        Optional filters:
        protocol -- protocol type of the caller's request (http/https)
        domain   -- skip proxies disabled for this domain
        :return: proxy address string
        """
        protocol = request.args.get('protocol')
        domain = request.args.get('domain')
        proxy = self.mongo_pool.random_proxy(protocol, domain, count=PROXIES_MAX_COUNT)
        if protocol:
            return f'{protocol}://{proxy.ip}:{proxy.port}'
        else:
            return f'{proxy.ip}:{proxy.port}'

    @self.app.route('/proxies')
    def proxies():
        """
        Return multiple high-availability proxies as JSON.
        Optional protocol/domain query args filter the result.
        :return: dict (Flask serializes it to a JSON response)
        """
        # Protocol filter: http/https
        protocol = request.args.get('protocol')
        # Domain filter, e.g. jd.com
        domain = request.args.get('domain')
        proxies = self.mongo_pool.get_proxies(protocol, domain, count=PROXIES_MAX_COUNT)
        # Proxy objects are not JSON serializable; convert to dicts first.
        proxies = [proxy.__dict__ for proxy in proxies]
        return {"proxies": proxies}

    @self.app.route('/disable_domain')
    def disable_domain():
        # Append an unusable domain to the given proxy IP so later lookups
        # with that domain skip this proxy.
        ip = request.args.get('ip')
        domain = request.args.get('domain')
        if ip is None:
            return '请提供ip参数'
        if domain is None:
            return '请提供域名domain参数'
        self.mongo_pool.disable_domain(ip, domain)
        return f"{ip} 禁用域名 {domain} 成功"
class ProxyApi(object):
    """HTTP front-end for the proxy pool: one random proxy, or the full list."""

    def __init__(self):
        # Flask application plus the database access object.
        self.app = Flask(__name__)
        self.mongo_pool = MongoPool()

        @self.app.route('/random')
        def random():
            # Optional filters supplied by the caller.
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')
            proxy = self.mongo_pool.random_proxy(protocol, domain, count=PROXIES_MAX_COUNT)
            address = '{}:{}'.format(proxy.ip, proxy.port)
            # Prefix with the scheme only when the caller asked for one.
            return '{}://{}'.format(protocol, address) if protocol else address

        @self.app.route('/proxies')
        def proxies():
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')
            found = self.mongo_pool.get_proxies(protocol, domain, count=PROXIES_MAX_COUNT)
            # Proxy objects are not JSON serializable; dump their attribute dicts.
            return json.dumps([item.__dict__ for item in found])

    def run(self):
        """Serve the API on all interfaces, port 9000."""
        self.app.run('0.0.0.0', port=9000)
def __init__(self):
    """Build the Flask app and register the proxy-pool API routes."""
    # Flask web service.
    self.app = Flask(__name__)
    # Database access object.
    self.mongo_pool = MongoPool()

    @self.app.route('/random')
    def random():
        '''
        Return one random high-availability proxy, filtered by protocol
        and domain.
        :protocol: protocol type of the caller's request
        :domain: domain of the caller's request
        '''
        protocol = request.args.get('protocol')
        domain = request.args.get('domain')
        # nick_type=2 restricts the anonymity level here — TODO confirm the
        # intended difference from the nick_type=0 used in /proxies.
        proxy = self.mongo_pool.random_proxy(protocol, domain, count=PROXIES_MAX_COUNT, nick_type=2)
        if protocol:
            return "{}://{}:{}".format(protocol, proxy.ip, proxy.port)
        else:
            return "{}:{}".format(proxy.ip, proxy.port)

    @self.app.route('/proxies')
    def proxies():
        '''
        Return multiple high-availability proxies as a JSON list, filtered
        by protocol and domain.
        :return: JSON string
        '''
        protocol = request.args.get('protocol')
        domain = request.args.get('domain')
        proxies = self.mongo_pool.get_proxies(protocol, domain, count=PROXIES_MAX_COUNT, nick_type=0)
        # Proxy objects are not JSON serializable; convert to dicts first.
        proxies = [proxy.__dict__ for proxy in proxies]
        # Return a JSON-formatted string.
        return json.dumps(proxies)

    @self.app.route('/disable_domain')
    def disable_domain():
        '''
        Append an unusable domain to the given proxy IP so later lookups
        with that domain skip this proxy.
        :return: status message
        '''
        ip = request.args.get('ip')
        domain = request.args.get('domain')
        if ip is None:
            return "请提供ip参数"
        if domain is None:
            return "请提供域名domain参数"
        self.mongo_pool.disable_domain(ip, domain)
        return f"{ip}禁用域名{domain}成功"
def __init__(self, module_name='', spider_list=None):
    """Optionally override the class-level spider configuration.

    :param module_name: dotted module path holding the spider classes;
                        falls back to the class attribute when empty.
    :param spider_list: iterable of spider class names; falls back to the
                        class attribute when empty/None.

    BUG FIX: spider_list previously defaulted to a shared mutable list
    ([]); a None sentinel avoids the mutable-default pitfall while
    keeping identical truthiness behavior for callers.
    """
    if module_name:
        self.module_name = module_name
    if spider_list:
        self.spider_list = spider_list
    # Database access object.
    self.mongo_pool = MongoPool()
    # Coroutine pool for concurrent spider execution.
    self.coroutine_pool = Pool()
def __init__(self):
    """Build the Flask app and register the proxy-pool API routes."""
    # Flask web service.
    self.app = Flask(__name__)
    # Database access object.
    self.mongo_pool = MongoPool()

    @self.app.route('/random')
    def random():
        """
        Return one random high-availability proxy.
        Optional protocol/domain query args filter the result.
        protocol: protocol type of the caller's request
        domain:   domain of the caller's request
        :return: proxy address string
        """
        protocol = request.args.get('protocol')
        domain = request.args.get('domain')
        proxy = self.mongo_pool.random_proxy(protocol, domain, count=PROXIES_MAX_COUNT)
        if protocol:
            return '{}://{}:{}'.format(protocol, proxy.ip, proxy.port)
        else:
            return '{}:{}'.format(proxy.ip, proxy.port)

    @self.app.route('/proxies')
    def proxies():
        """
        Return multiple high-availability proxies as JSON, optionally
        filtered by protocol and domain.
        """
        protocol = request.args.get('protocol')
        domain = request.args.get('domain')
        proxies = self.mongo_pool.get_proxies(protocol, domain, count=PROXIES_MAX_COUNT)
        # Proxy objects are not JSON serializable; convert to dicts first.
        proxies = [proxy.__dict__ for proxy in proxies]
        # Return the JSON string.
        return json.dumps(proxies)

    @self.app.route('/disable_domain')
    def disable_domain():
        # Append an unusable domain to a proxy IP so later lookups with
        # that domain skip this proxy.
        ip = request.args.get('ip')
        domain = request.args.get('domain')
        if ip is None:
            return '情提供ip参数'
        if domain is None:
            return '情提供域名domain参数'
        self.mongo_pool.disable_domain(ip, domain)
        return '{}禁用域名{}成功'.format(ip, domain)
class ProxyAPI_Flask(object):
    """Flask API over the proxy pool, with an HTML index page.

    The proxy count limit is injected via the constructor instead of a
    module-level setting.
    """

    def __init__(self, count):
        # Max number of proxies considered/returned by the endpoints.
        self.count = count
        # Templates live outside the package, next to the assets directory.
        self.app = Flask(__name__, template_folder="../assets/templates")
        self.mongo_pool = MongoPool()

        @self.app.route('/')
        @self.app.route('/index')
        def index():
            # Landing page rendered from the external template directory.
            return render_template("index.html")

        @self.app.route('/random')
        def random():
            # One random proxy; optional protocol/domain filters.
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')
            proxy = self.mongo_pool.random_proxy(protocol, domain, count=self.count)
            if protocol:
                return f"{protocol}://{proxy.ip}:{proxy.port}"
            else:
                return f"{proxy.ip}:{proxy.port}"

        @self.app.route('/proxies')
        def proxies():
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')
            proxies = self.mongo_pool.get_proxies(protocol, domain, count=self.count)
            # Proxy objects are not JSON serializable; convert to dicts.
            proxies = [proxy.__dict__ for proxy in proxies]
            # Return the JSON string.
            return json.dumps(proxies)

        @self.app.route('/disable_domain')
        def disable_domain():
            # Mark a domain as unusable for the given proxy IP.
            ip = request.args.get('ip')
            domain = request.args.get('domain')
            if ip is None:
                return '请提供IP参数\n'
            if domain is None:
                return '请提供domain参数\n'
            self.mongo_pool.disable_domain(ip, domain)
            return f"{ip} 禁用域名 {domain} 成功"

    def run(self):
        """Serve the API on all interfaces, port 16888."""
        self.app.run('0.0.0.0', port=16888)

    @classmethod
    def start(cls):
        """Entry point: serve with a default count limit of 100."""
        pf = cls(100)
        logger.info(
            '*****************Flask启动在localhost:16888端口,监听中*****************')
        pf.run()
def __init__(self):
    """Build the Flask app and register the proxy-pool API routes."""
    self.app = Flask(__name__)
    # Database access object.
    self.mongo_pool = MongoPool()

    @self.app.route('/random')
    def random():
        """
        Return one random high-availability proxy.
        Optional protocol/domain query args filter the result.
        protocol: protocol type of the caller's request
        domain:   domain of the caller's request
        :return: proxy address string
        """
        # BUG FIX: the query argument was read as 'protocal' (misspelled),
        # so a caller passing the documented 'protocol' arg was silently
        # ignored. Corrected to match the docstring and sibling endpoints.
        protocol = request.args.get('protocol')
        domain = request.args.get('domain')
        proxy = self.mongo_pool.random_proxy(protocol, domain, count=PROXIES_MAX_COUNT)
        if protocol:
            return '{}://{}:{}'.format(protocol, proxy.ip, proxy.port)
        else:
            return '{}:{}'.format(proxy.ip, proxy.port)

    @self.app.route('/proxies')
    def proxies():
        """Return multiple high-availability proxies as JSON, optionally filtered."""
        # BUG FIX: same 'protocal' -> 'protocol' spelling fix as /random.
        protocol = request.args.get('protocol')
        domain = request.args.get('domain')
        proxies = self.mongo_pool.get_proxies(protocol, domain, count=PROXIES_MAX_COUNT)
        # Proxy objects are not JSON serializable; convert to dicts first.
        proxies = [proxy.__dict__ for proxy in proxies]
        return json.dumps(proxies)

    @self.app.route('/disable_domain')
    def disable_domain():
        """Append an unusable domain to the given proxy IP."""
        ip = request.args.get('ip')
        domain = request.args.get('domain')
        if ip is None:
            return '请求提供ip参数'
        if domain is None:
            return '请提供域名domain参数'
        self.mongo_pool.disable_domain(ip, domain)
        return "{} 禁用域名{} 成功".format(ip, domain)
def __init__(self):
    """Build the Flask app and register the proxy-pool API routes."""
    self.app = Flask(__name__)
    # Database access object.
    self.mongo_pool = MongoPool()

    @self.app.route('/random')
    def random():
        """
        Return one random high-availability proxy.
        - protocol: protocol type of the caller's request (optional filter)
        - domain:   domain of the caller's request (optional filter)
        """
        protocol = request.args.get('protocol')
        domain = request.args.get('domain')
        # NOTE(review): this variant calls random_proxies (plural); sibling
        # implementations use random_proxy — confirm MongoPool exposes this name.
        proxy = self.mongo_pool.random_proxies(protocol, domain, count=PROXIES_MAX_COUNT)
        if protocol:
            return f"{protocol}://{proxy.ip}:{proxy.port}"
        else:
            return f"{proxy.ip}:{proxy.port}"

    @self.app.route('/proxies')
    def proxies():
        """
        Return multiple high-availability proxies as JSON, optionally
        filtered by protocol and domain.
        :return: JSON string
        """
        protocol = request.args.get('protocol')
        domain = request.args.get('domain')
        proxies = self.mongo_pool.get_proxies(protocol, domain, count=PROXIES_MAX_COUNT)
        # Proxy objects are not JSON serializable; convert to dicts first.
        proxies = [proxy.__dict__ for proxy in proxies]
        return json.dumps(proxies)

    # Append an unusable domain to a given proxy IP.
    @self.app.route('/disable')
    def disable_domain():
        ip = request.args.get('ip')
        domain = request.args.get('domain')
        if ip is None:
            return '请提供ip参数'
        if domain is None:
            return '请提供domain参数'
        self.mongo_pool.disable_domain(ip, domain)
        return f"{ip}禁用域名{domain}成功"
class ProxyApi(object):
    """Flask web service exposing the proxy pool over HTTP."""

    def __init__(self):
        self.app = Flask(__name__)
        # Database access object.
        self.mongo_pool = MongoPool()

        @self.app.route('/random')
        def random():
            '''
            Return one random high-availability proxy, filtered by
            protocol and domain.
            protocol: protocol type of the caller's request
            domain:   domain of the caller's request
            :return: proxy address string
            '''
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')
            proxy = self.mongo_pool.random_proxy(protocol, domain, count=PROXIES_MAX_COUNT)
            if protocol:
                return '{}://{}:{}'.format(protocol, proxy.ip, proxy.port)
            else:
                return '{}:{}'.format(proxy.ip, proxy.port)

        @self.app.route('/proxies')
        def proxies():
            # JSON list of high-availability proxies, optionally filtered.
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')
            proxies = self.mongo_pool.get_proxies(protocol, domain, count=PROXIES_MAX_COUNT)
            # Proxy objects are not JSON serializable; convert to dicts first.
            proxies = [proxy.__dict__ for proxy in proxies]
            return json.dumps(proxies)

        @self.app.route('/disable_domain')
        def disable_domain():
            # Mark a domain as unusable for the given proxy IP.
            ip = request.args.get('ip')
            domain = request.args.get('domain')
            if ip is None:
                return "请提供ip参数"
            if domain is None:
                return "请提供domain参数"
            self.mongo_pool.disable_domain(ip, domain)
            return '{} 禁用域名 {} 成功'.format(ip, domain)

    def run(self):
        """Serve the API on all interfaces, port 16888."""
        self.app.run('0.0.0.0', port=16888)

    @classmethod
    def start(cls):
        """Entry point: build a ProxyApi and run it."""
        proxy_api = cls()
        proxy_api.run()
class RunSpider(object):
    """Runs all configured proxy spiders on a coroutine pool and stores
    validated proxies in MongoDB."""

    def __init__(self):
        # Database access object.
        self.mongo_pool = MongoPool()
        # Coroutine pool used to run spiders concurrently.
        self.coroutine_pool = Pool()

    def get_spider_from_settings(self):
        """Yield a spider instance for every dotted class path in PROXIES_SPIDERS."""
        for full_class_name in PROXIES_SPIDERS:
            # e.g. core.proxy_spider.proxy_spiders.Ip66Spider
            module_name, class_name = full_class_name.rsplit('.', maxsplit=1)
            # Dynamically import the module, then pull the class off it.
            module = importlib.import_module(module_name)
            cls = getattr(module, class_name)
            yield cls()

    def run(self):
        """Dispatch one async task per spider, then wait for completion."""
        spiders = self.get_spider_from_settings()
        for spider in spiders:
            self.coroutine_pool.apply_async(self._execute_one_spider_task, args=(spider, ))
        # Wait for every queued coroutine task to finish.
        self.coroutine_pool.join()

    def _execute_one_spider_task(self, spider):
        """Crawl, validate and persist the proxies of one spider."""
        try:
            for proxy in spider.get_proxies():
                proxy = check_proxy(proxy)
                # speed == -1 marks an unusable proxy.
                if proxy.speed != -1:
                    self.mongo_pool.insert_one(proxy)
        except Exception as e:
            # One broken spider must not abort the whole run.
            logger.exception(e)

    @classmethod
    def start(cls):
        """Run once immediately, then re-run every SPIDER_TIME_DELAY hours.

        BUG FIX: the original scheduled ``cls().run`` — building a second
        RunSpider (second Mongo connection and coroutine pool) just to hold
        the job. Reuse the instance that already ran.
        """
        runner = cls()
        runner.run()
        schedule.every(SPIDER_TIME_DELAY).hours.do(runner.run)
        while True:
            schedule.run_pending()
            time.sleep(1)
class RunSpider(object):
    """Runs the configured spider classes and stores working proxies."""

    # Default spider class names, resolved inside module_name below.
    spider_list = [
        'kuaiSpider',
        'jiangxianSpider',
        'xilaSpider',
        'xiaohuanSpider',
        'zhimaSpider',
        'nimaSpider',
        'qiyunSpider',
        'spider89',
    ]
    module_name = 'core.proxy_spider.proxy_spiders'

    def __init__(self, module_name='', spider_list=None):
        """Optionally override the class-level module/spider configuration.

        BUG FIX: spider_list previously defaulted to a shared mutable []
        (mutable default argument); None keeps identical truthiness behavior.
        """
        if module_name:
            self.module_name = module_name
        if spider_list:
            self.spider_list = spider_list
        self.mongo_pool = MongoPool()
        # Coroutine pool for concurrent spider execution.
        self.coroutine_pool = Pool()

    def get_spider_cls(self, spider_list, module_name):
        """Yield a spider *instance* for each class name in spider_list.

        BUG FIX: the original yielded the class object itself, so the later
        spider.get_proxies() call operated on the class, not an instance —
        TODO confirm get_proxies is an instance method on these spiders.
        """
        module = importlib.import_module(module_name)
        for spider_name in spider_list:
            spider_cls = getattr(module, spider_name)
            yield spider_cls()

    def run_spider(self):
        """Dispatch each spider on the coroutine pool and wait for all."""
        for spider in self.get_spider_cls(self.spider_list, self.module_name):
            self.coroutine_pool.apply_async(self.__execute_one_spider_task, args=(spider, ))
        self.coroutine_pool.join()

    def __execute_one_spider_task(self, spider):
        """Crawl, validate and persist the proxies of one spider."""
        try:
            for proxy in spider.get_proxies():
                proxy = check_proxy(proxy)
                # delay == -1 marks an unusable proxy in this variant.
                if proxy.delay != -1:
                    self.mongo_pool.insert_one(proxy)
                    # BUG FIX: '新代理插入成功' + dict(proxy) raised TypeError
                    # (cannot concatenate str and dict); stringify instead.
                    print("新代理插入成功" + str(proxy.__dict__))
        except Exception as ex:
            logger.exception(ex)

    @classmethod
    def start(cls):
        """Run once, then re-run every RUN_SPIDER_INTERVAL hours."""
        rs = RunSpider()
        rs.run_spider()
        schedule.every(RUN_SPIDER_INTERVAL).hours.do(rs.run_spider)
        while True:
            schedule.run_pending()
            time.sleep(30)
def __init__(self):
    """Flask service exposing a single /random proxy endpoint."""
    self.app = Flask(__name__)
    self.mongo_pool = MongoPool()

    @self.app.route('/random')
    def random():
        # Optional protocol prefix requested by the caller.
        protocol = request.args.get('protocol')
        proxy = self.mongo_pool.usable_proxy()
        address = f'{proxy.ip}:{proxy.port}'
        # Prefix with the scheme only when one was requested.
        return f'{protocol}://{address}' if protocol else address
class RunSpider(object):
    """Crawls proxies with every configured spider and stores the usable ones."""

    def __init__(self):
        self.mongo_pool = MongoPool()
        # Coroutine pool for concurrent spider execution.
        self.coroutine_pool = Pool()

    def get_spider_from_settings(self):
        """Yield a spider instance per dotted class path in PROXIES_SPIDERS."""
        # Walk every fully-qualified spider class name in the settings.
        for full_class_name in PROXIES_SPIDERS:
            # Split into module path and class name.
            module_name, class_name = full_class_name.rsplit('.', maxsplit=1)
            # Import the module dynamically.
            module = importlib.import_module(module_name)
            # Fetch the class from the module.
            cls = getattr(module, class_name)
            # Instantiate the spider.
            spider = cls()
            yield spider

    def run(self):
        # Build the spider objects from configuration.
        spiders = self.get_spider_from_settings()
        for spider in spiders:
            # Handling one spider is factored into its own method so it can
            # be scheduled as a single coroutine task.
            self.coroutine_pool.apply_async(self.__execute_one_spider_task, args=(spider, ))
        # Wait for all queued coroutine tasks to complete.
        self.coroutine_pool.join()

    def __execute_one_spider_task(self, spider):
        # Crawl, validate and persist the proxies of one spider.
        try:
            for proxy in spider.get_proxies():
                # Validate the proxy (proxy-check module).
                proxy = check_proxy(proxy)
                # speed != -1 means the proxy is usable (database module).
                if proxy.speed != -1:
                    self.mongo_pool.insert_one(proxy)
        except Exception as ex:
            logger.exception(ex)

    @classmethod
    def start(cls):
        """Run once, then re-run every RUN_SPIDERS_INTERVAL hours."""
        rs = RunSpider()
        rs.run()
        # Interval between crawler runs, in hours.
        schedule.every(RUN_SPIDERS_INTERVAL).hours.do(rs.run)
        while True:
            schedule.run_pending()
            time.sleep(1)
class ProxyApi(object):
    """Flask web service exposing the proxy pool over HTTP."""

    def __init__(self):
        self.app = Flask(__name__)
        self.mongo_pool = MongoPool()

        @self.app.route('/random')
        def random():
            # One random high-availability proxy; optional filters.
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')
            proxy = self.mongo_pool.random_proxy(protocol, domain, count=PROXIES_MAX_COUNT)
            if protocol:
                return '{}://{}:{}'.format(protocol, proxy.ip, proxy.port)
            else:
                return '{}:{}'.format(proxy.ip, proxy.port)

        @self.app.route('/proxies')
        def proxies():
            # JSON list of high-availability proxies, optionally filtered.
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')
            proxies = self.mongo_pool.get_proxies(protocol, domain, count=PROXIES_MAX_COUNT)
            # Convert Proxy objects to dicts so they can be serialized.
            proxies = [proxy.__dict__ for proxy in proxies]
            # Return the JSON-formatted list.
            return json.dumps(proxies)

        @self.app.route('/disable_domain')
        def disable_domain():
            # Mark a domain as unusable for the given proxy IP.
            ip = request.args.get('ip')
            domain = request.args.get('domain')
            if ip is None:
                return '请求提供Ip参数'
            if domain is None:
                return '请提供域名domain参数'
            self.mongo_pool.disable_domain(ip, domain)
            return "{} 禁用域名 {} 成功".format(ip, domain)

    def run(self):
        """Serve the API on all interfaces, port 16888."""
        self.app.run('0.0.0.0', port=16888)

    @classmethod
    def start(cls):
        """Entry point: build a ProxyApi and run it."""
        proxy_api = cls()
        proxy_api.run()
class ProxyApi(object):
    """Flask API over the proxy pool, with a small HTML landing page."""

    def __init__(self):
        # Flask application serving the API endpoints.
        self.app = Flask(__name__)
        self.mongo_pool = MongoPool()

        @self.app.route('/')
        def index():
            # NOTE(review): the original HTML literal was truncated/garbled
            # in the source (a redacted credentials-like token swallowed the
            # string terminator). Reconstructed as a minimal landing page
            # linking to the /random endpoint — confirm the intended markup.
            html = '''
            <h2 align="center">Welcome to my proxies!</h2>
            <div align="center"><a href="/random">random proxy</a></div>
            '''
            return html

        @self.app.route('/random')
        def random():
            # One random high-availability proxy; optional filters.
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')
            proxy = self.mongo_pool.random_proxy(protocol, domain, count=PROXIES_MAX_COUNT)
            if protocol:
                return '{}://{}:{}'.format(protocol, proxy.ip, proxy.port)
            else:
                return '{}:{}'.format(proxy.ip, proxy.port)

        @self.app.route('/proxies')
        def proxies():
            # JSON list of high-availability proxies, optionally filtered.
            protocol = request.args.get('protocol')
            domain = request.args.get('domain')
            proxies = self.mongo_pool.get_proxies(protocol, domain, count=PROXIES_MAX_COUNT)
            # Proxy objects are not JSON serializable; convert to dicts.
            proxies = [proxy.__dict__ for proxy in proxies]
            return json.dumps(proxies, ensure_ascii=False)

    def run(self):
        """Serve the API on localhost:16888."""
        self.app.run('localhost', port=16888)

    @classmethod
    def start(cls):
        """Entry point: build a ProxyApi and run it."""
        proxy_api = cls()
        proxy_api.run()
class RunSpider(object):
    """Runs all configured proxy spiders on a coroutine pool and stores
    validated proxies in MongoDB."""

    def __init__(self):
        self.mongo_pool = MongoPool()
        # Coroutine pool for concurrent spider execution.
        self.coroutine_pool = Pool()

    def get_spiders_from_settings(self):
        """Yield one spider instance per dotted class path in PROXY_SPIDERS.

        BUG FIX: this was decorated with @staticmethod while still taking
        'self'; the call self.get_spiders_from_settings() in run() would
        then raise TypeError (missing positional argument 'self'). It is a
        regular instance method.
        """
        for spider_full_path in PROXY_SPIDERS:
            module_name, class_name = spider_full_path.rsplit('.', maxsplit=1)
            # Dynamically import the module named by the path.
            module = importlib.import_module(module_name)
            # Fetch the class from the module and instantiate the spider.
            cls = getattr(module, class_name)
            yield cls()

    def __execute_one_spider_task(self, spider):
        """Crawl, validate and persist the proxies of one spider."""
        try:
            for proxy in spider.get_proxies():
                # Check the proxy's availability.
                proxy = check_proxy(proxy)
                # speed == -1 marks an unusable proxy.
                if proxy.speed != -1:
                    self.mongo_pool.insert_one(proxy)
        except Exception as e:
            # One broken spider must not abort the whole run.
            logger.exception(e)

    def run(self):
        """Dispatch one async task per spider and wait for all to finish."""
        spiders = self.get_spiders_from_settings()
        for spider in spiders:
            self.coroutine_pool.apply_async(self.__execute_one_spider_task, args=(spider,))
        self.coroutine_pool.join()

    @classmethod
    def start(cls):
        """Run once, then re-run every RUN_SPIDERS_INTERVAL hours."""
        r = RunSpider()
        r.run()
        logger.info("*****************本次爬取完毕,等待下次爬取*****************")
        schedule.every(RUN_SPIDERS_INTERVAL).hours.do(r.run)
        while True:
            schedule.run_pending()
            # Coarse poll: sleep roughly half the run interval between checks.
            time.sleep(RUN_SPIDERS_INTERVAL * 60 * 60 / 2 + 1)
def __init__(self, count):
    """Build the Flask app and register the proxy-pool API routes.

    :param count: max number of proxies considered/returned by the endpoints.
    """
    self.count = count
    # Templates live outside the package, next to the assets directory.
    self.app = Flask(__name__, template_folder="../assets/templates")
    self.mongo_pool = MongoPool()

    @self.app.route('/')
    @self.app.route('/index')
    def index():
        # Landing page rendered from the external template directory.
        return render_template("index.html")

    @self.app.route('/random')
    def random():
        # One random proxy; optional protocol/domain filters.
        protocol = request.args.get('protocol')
        domain = request.args.get('domain')
        proxy = self.mongo_pool.random_proxy(protocol, domain, count=self.count)
        if protocol:
            return f"{protocol}://{proxy.ip}:{proxy.port}"
        else:
            return f"{proxy.ip}:{proxy.port}"

    @self.app.route('/proxies')
    def proxies():
        protocol = request.args.get('protocol')
        domain = request.args.get('domain')
        proxies = self.mongo_pool.get_proxies(protocol, domain, count=self.count)
        # Convert the Proxy object list into a dict list for serialization.
        proxies = [proxy.__dict__ for proxy in proxies]
        # Return the JSON string.
        return json.dumps(proxies)

    @self.app.route('/disable_domain')
    def disable_domain():
        # Mark a domain as unusable for the given proxy IP.
        ip = request.args.get('ip')
        domain = request.args.get('domain')
        if ip is None:
            return '请提供IP参数\n'
        if domain is None:
            return '请提供domain参数\n'
        self.mongo_pool.disable_domain(ip, domain)
        return f"{ip} 禁用域名 {domain} 成功"
class ProxyApi(object):
    """Flask web service exposing the proxy pool over HTTP."""

    def __init__(self):
        # Initialize the Flask service.
        self.app = Flask(__name__)
        self.mongo_pool = MongoPool()

        @self.app.route('/random')
        def random():
            # One random proxy; returns "test" when none is found.
            protocol = request.args.get("protocol")
            domain = request.args.get("domain")
            # NOTE(review): the keyword is spelled 'protocl' here and below —
            # presumably matching a misspelled MongoPool parameter; confirm
            # against the MongoPool method signatures before "fixing" it.
            proxy = self.mongo_pool.get_random_proxy(protocl=protocol, domain=domain)
            if proxy:
                return "{}://{}:{}".format(protocol, proxy.ip, proxy.port)
            return "test"

        @self.app.route('/proxies')
        def proxies():
            # Protocol filter (http/https).
            protocol = request.args.get("protocol")
            # Domain filter.
            domain = request.args.get("domain")
            proxies = self.mongo_pool.get_proxies(protocl=protocol, domain=domain, count=PROXIES_MAX_COUNT)
            # Convert Proxy objects to dicts so they can be serialized.
            proxies_list = [proxy.__dict__ for proxy in proxies]
            return json.dumps(proxies_list)

        @self.app.route('/disable_domain')
        def disable_domain():
            # Mark a domain as unusable for the given proxy IP.
            ip = request.args.get('ip')
            domain = request.args.get("domain")
            if ip is None:
                return "ip不能为空"
            if domain is None:
                return "domain不能为空"
            self.mongo_pool.disable_domain(ip, domain)
            return "{} 禁用{} 成功".format(ip, domain)

    def run(self):
        """Serve the API on all interfaces, port 17777."""
        self.app.run('0.0.0.0', port=17777)

    @classmethod
    def start(cls):
        """Entry point: build a ProxyApi and run it."""
        proxyApi = cls()
        proxyApi.run()
class RunSpider(object):
    """Runs every configured proxy spider and stores working proxies."""

    def __init__(self):
        self.mongo_pool = MongoPool()
        # Coroutine pool for concurrent spider execution.
        self.corutine_pool = Pool()

    def get_spider_from_settings(self):
        """Yield a spider instance per dotted class path in PROXY_SPIDERS."""
        for spider in PROXY_SPIDERS:
            # Split into module path and class name.
            module_name, class_name = spider.rsplit('.', maxsplit=1)
            # Import the module dynamically, then pull the class off it.
            module = importlib.import_module(module_name)
            cls = getattr(module, class_name)
            spider = cls()
            yield spider

    def run(self):
        """Dispatch one async task per spider and wait for all to finish."""
        spiders = self.get_spider_from_settings()
        for spider in spiders:
            # Execute asynchronously on the coroutine pool.
            self.corutine_pool.apply_async(self.__execute_one_spider, args=(spider, ))
        self.corutine_pool.join()

    def __execute_one_spider(self, spider):
        """Crawl, validate and persist the proxies of one spider."""
        try:
            for proxy in spider.get_proxies():
                proxy = check_proxy(proxy)
                # speed == -1 marks an unusable proxy.
                if proxy.speed != -1:
                    self.mongo_pool.insert_one(proxy)
        except Exception as e:
            logger.error(e)

    @classmethod
    def start(cls):
        """Run once, then re-run every RUN_SPIDERS_INTERVAL hours."""
        # Local import: this block cannot see the module's import section.
        import time

        rs = RunSpider()
        rs.run()
        # BUG FIX: the original passed rs.run() — calling run immediately and
        # registering its None return value as the job, which would raise
        # TypeError when the schedule fired. Pass the bound method itself.
        schedule.every(RUN_SPIDERS_INTERVAL).hours.do(rs.run)
        while True:
            schedule.run_pending()
            # BUG FIX: the original loop busy-spun a full CPU core; sleep
            # briefly between scheduler polls.
            time.sleep(1)
class RunSpider(object):
    """Runs every configured proxy spider and stores working proxies."""

    def __init__(self):
        self.mongo_pool = MongoPool()
        # Coroutine pool for concurrent spider execution.
        self.coroutine_pool = Pool()

    def run(self):
        """Dispatch one async task per spider and wait for all to finish."""
        spiders = self.get_spider_from_settings()
        for spider in spiders:
            # Execute asynchronously on the coroutine pool.
            self.coroutine_pool.apply_async(self._execute_one_spider, args=(spider, ))
        self.coroutine_pool.join()

    def _execute_one_spider(self, spider):
        """Crawl, validate and persist the proxies of one spider."""
        try:
            for proxy in spider.get_proxies():
                # Validate the proxy.
                proxy = check_proxy(proxy)
                # speed == -1 marks an unusable proxy.
                if proxy.speed != -1:
                    self.mongo_pool.insert_one(proxy)
        except Exception as ex:
            logger.exception(ex)

    def get_spider_from_settings(self):
        """Yield a spider instance per dotted class path in PROXIES_SPIDERS."""
        for full_class_name in PROXIES_SPIDERS:
            # Split into module path and class name.
            module_name, class_name = full_class_name.rsplit('.', maxsplit=1)
            # Import the module (equivalent to: import <module_name>).
            module = importlib.import_module(module_name)
            # Fetch the class (equivalent to: from <module> import <class>).
            cls = getattr(module, class_name)
            spider = cls()
            yield spider

    @classmethod
    def start(cls):
        """Run once, then re-run every RUN_SPIDERS_INTERVAL hours."""
        r = RunSpider()
        r.run()
        # BUG FIX: the original passed r.run() — calling run immediately and
        # registering its None return value as the scheduled job, which would
        # raise TypeError when the schedule fired. Pass the bound method.
        schedule.every(RUN_SPIDERS_INTERVAL).hours.do(r.run)
        while True:
            schedule.run_pending()
            time.sleep(3600)
def __init__(self):
    """Build the Flask app and register /random and /proxies routes."""
    self.app = Flask(__name__)
    self.mongo_pool = MongoPool()

    @self.app.route('/random')
    def random_proxy():
        """Return one randomly chosen proxy as a dict."""
        # Filters are read but not applied by this simple implementation.
        protocol = request.args.get('protocol')
        domain = request.args.get('domain')
        # Materialize the result set so it can be sampled.
        proxies = list(self.mongo_pool.find_all())
        # BUG FIX: indexing with random.randint(1, 100) raised IndexError
        # whenever fewer than 101 proxies exist and could never pick
        # index 0; choose uniformly from the actual list instead.
        pr = random.choice(proxies)
        pr = pr.__dict__
        return pr

    @self.app.route('/proxies')
    def proxies_list():
        """Return every stored proxy as a JSON list."""
        proxies = self.mongo_pool.find_all()
        dict_list = [proxy.__dict__ for proxy in proxies]
        return json.dumps(dict_list)
class ProxyTester(object):
    """Periodically re-checks every stored proxy, adjusting scores and
    deleting proxies whose score falls below the threshold."""

    def __init__(self):
        # Database access object.
        self.mongo_pool = MongoPool()
        # Work queue of proxies awaiting a check.
        self.queue = Queue()
        # Coroutine pool running the checker tasks.
        self.coroutine_pool = Pool()

    def __check_callback(self,temp):
        # Completion callback: immediately schedule another check task so
        # each worker keeps draining the queue (self-perpetuating loop).
        self.coroutine_pool.apply_async(self.__check_one_proxy,callback=self.__check_callback)

    def run(self):
        """Core test loop: queue every stored proxy, then check them concurrently."""
        # Fetch all proxies from the database.
        proxies = self.mongo_pool.find_all()
        for proxy in proxies:
            # Enqueue each proxy for checking.
            self.queue.put(proxy)
        # Start TEST_PROXIES_ASYNC_COUNT concurrent checker tasks (the
        # async worker count comes from the settings file).
        for i in range(TEST_PROXIES_ASYNC_COUNT):
            # Each task re-schedules itself via the callback above.
            self.coroutine_pool.apply_async(self.__check_one_proxy,callback=self.__check_callback)
        # Block the current thread until the queue is fully processed.
        self.queue.join()

    def __check_one_proxy(self):
        """Take one proxy off the queue, re-check it, and update the DB."""
        proxy = self.queue.get()
        # Re-validate the proxy.
        proxy = check_proxy(proxy)
        if proxy.speed == -1:
            # Unusable: decrement its score.
            proxy.score -= 1
            # Below the retention threshold (45): drop it from the database.
            if proxy.score <= 45:
                self.mongo_pool.delete_one(proxy)
            else:
                # Still above threshold: persist the decremented score.
                self.mongo_pool.update_one(proxy)
        else:
            # Usable again: restore the score to the maximum.
            proxy.score = MAX_SCORE
            self.mongo_pool.update_one(proxy)
        # Mark this queue item as done so queue.join() can return.
        self.queue.task_done()

    @classmethod
    def start(cls):
        """Run once, then re-run every TEST_PROXIES_INTERVAL hours."""
        proxy_tester = cls()
        proxy_tester.run()
        schedule.every(TEST_PROXIES_INTERVAL).hours.do(proxy_tester.run)
        while True:
            schedule.run_pending()
            time.sleep(1)