def __init__(self, queue: Queue):
    """Bind the work queue and pick the storage backend from the app config.

    :param queue: queue this checker reads its proxy entries from.
    """
    self.queue = queue
    flask_app = schedule.app
    # The config is only reachable through ``current_app`` inside an
    # application context, so one is pushed explicitly here.
    with flask_app.app_context():
        self.store_type = current_app.config.get('DATA_STORE_TYPE')
        # 'mysql' selects the shared ``sql`` handle; any other value falls
        # back to a Redis helper bound to the valid-proxy key.
        self.db = (
            sql
            if self.store_type == 'mysql'
            else RedisHelper(jobs.PROXY_VALID_KEY)
        )
def FetchJob():
    """Run every fetcher function on ``FetchFreeProxy`` and store new proxies.

    Each function found on ``FetchFreeProxy`` is called and iterated. Every
    yielded proxy string is stripped, checked with ``verifyProxyFormat``,
    de-duplicated within this run, and then written to the raw store
    (``ProxyRaw`` rows when ``DATA_STORE_TYPE`` is ``'mysql'``, otherwise the
    Redis helper bound to ``jobs.PROXY_RAW_KEY``).

    A failure inside one fetcher is logged and does not stop the others.
    """
    import inspect

    # Obtain the app (and thereby an application context) via the scheduler.
    app = schedule.app
    with app.app_context():
        store_type = current_app.config.get('DATA_STORE_TYPE')
        if store_type == 'mysql':
            db = sql
        else:
            db = RedisHelper(jobs.PROXY_RAW_KEY)

    member_list = inspect.getmembers(FetchFreeProxy,
                                     predicate=inspect.isfunction)
    # Proxy *strings* accepted so far in this run; used for de-duplication.
    proxy_set = set()
    for func_name, func in member_list:
        log.debug(u"开始获取代理: {}".format(func_name))
        try:
            for proxy in func():
                proxy = proxy.strip()
                if not proxy or not verifyProxyFormat(proxy):
                    log.error('ProxyFetch - {func}: '
                              '{proxy} illegal'.format(func=func_name,
                                                       proxy=proxy.ljust(20)))
                    continue
                if proxy in proxy_set:
                    log.debug('ProxyFetch - {func}: '
                              '{proxy} exist'.format(func=func_name,
                                                     proxy=proxy.ljust(20)))
                    continue
                log.debug('ProxyFetch - {func}: '
                          '{proxy} success'.format(func=func_name,
                                                   proxy=proxy.ljust(20)))
                # Build the record to save.
                last_time = datetime.datetime.now().strftime(
                    "%Y-%m-%d %H:%M:%S")
                p = Proxy(name=func_name, proxy=proxy, last_time=last_time)
                # Persist it.
                if store_type == 'mysql':
                    record = ProxyRaw(name=func_name,
                                      proxy=proxy,
                                      https=False,
                                      gmt_create=last_time,
                                      gmt_modified=last_time)
                    with app.app_context():
                        db.session.add(record)
                        db.session.commit()
                else:
                    db.add(p.proxy, p.Json)
                # Remember the string itself: the membership test above
                # compares against the stripped string, so storing the
                # ``Proxy`` object here would never match a duplicate.
                proxy_set.add(proxy)
        except Exception as e:
            # Best effort per fetcher: log and continue with the next one,
            # but keep the cause visible instead of dropping it.
            log.error(u"代理获取函数 {} 运行出错!".format(func_name))
            log.error('ProxyFetch - {func}: {error!r}'.format(func=func_name,
                                                              error=e))
    # Summary for this run.
    log.debug('本次插入代理总数: %s', len(proxy_set))
def checkRawProxyJob():
    """Validate every raw proxy in worker processes, then empty the raw store.

    Steps:
      1. Load all raw entries (``ProxyRaw`` rows converted to ``Proxy``
         objects for MySQL, or whatever ``getAll()`` returns for Redis).
      2. Push them onto a queue.
      3. Clear the raw store.
      4. Start ``getCpuNumber()`` worker processes, each running
         ``CheckProcess.run`` on the shared queue.
      5. Enqueue one ``None`` sentinel per worker and wait for all of them.
    """
    app = schedule.app
    with app.app_context():
        store_type = current_app.config.get('DATA_STORE_TYPE')
        if store_type == 'mysql':
            db = sql
        else:
            db = RedisHelper(jobs.PROXY_RAW_KEY)

    proxy_queue = Queue()
    # Fill the queue up front; the workers only need queued data to start.
    if store_type == 'mysql':
        with app.app_context():
            rows = db.session.query(ProxyRaw).all()
            # Missing counters are normalised to 0.
            data = [Proxy(name=item.name,
                          proxy=item.proxy,
                          https=item.https,
                          success=item.success or 0,
                          fail=item.fail or 0,
                          total=item.total or 0,
                          quality=item.quality or 0,
                          last_time=item.gmt_modified) for item in rows]
    else:
        data = db.getAll()

    if len(data) > 0:
        for proxy in data:
            proxy_queue.put(proxy)
            # Workaround kept from the original: without a short pause the
            # queue was observed to die with
            # "BrokenPipeError: [Errno 32] Broken pipe".
            time.sleep(0.01)

    # Clear the temporary raw store.
    if store_type == 'mysql':
        with app.app_context():
            db.session.query(ProxyRaw).delete()
            # Without an explicit commit the bulk delete stays in an
            # uncommitted transaction and is not persisted.
            db.session.commit()
    else:
        db.clear()

    # Start the worker processes that validate the queued proxies.
    process_list = []
    for _ in range(getCpuNumber()):
        # Pass the bound method itself. Calling ``run()`` here would do the
        # whole check synchronously in this process and hand ``Process`` a
        # ``None`` target.
        process = Process(target=CheckProcess(proxy_queue).run)
        process.daemon = True
        process_list.append(process)
    for work in process_list:
        work.start()

    # One sentinel per worker tells it to stop.
    for _ in process_list:
        proxy_queue.put(None)
    for work in process_list:
        work.join()
    log.debug("RawProxyCheck - 本次验证执行结束,验证数量: {}".format(len(data)))
def get(self):
    """Return every valid proxy from the configured store as a JSON response.

    With ``DATA_STORE_TYPE == 'mysql'`` the ``ProxyValid`` rows are
    serialized; otherwise the entries read from Redis are converted with
    ``Proxy.fromJson``. An empty result sets ``WebResponseCode.NO_RECORD``.
    """
    response = WebResponse()
    store_type = current_app.config.get('DATA_STORE_TYPE')
    use_mysql = store_type == 'mysql'

    if use_mysql:
        records = sql.session.query(ProxyValid).all()
    else:
        records = RedisHelper(jobs.PROXY_VALID_KEY).getAll()

    # Guard clause: nothing stored.
    if not records:
        response.code = WebResponseCode.NO_RECORD
        return response.tojson()

    if use_mysql:
        response.data = [record.serialize() for record in records]
    else:
        response.data = [Proxy.fromJson(record) for record in records]
    return response.tojson()
class CheckProcess(object):
    """Worker that validates raw proxies from a queue and stores the good ones."""

    def __init__(self, queue: Queue):
        """Bind the queue and select the storage backend from the app config.

        :param queue: queue of proxy entries to validate.
        """
        self.queue = queue
        flask_app = schedule.app
        with flask_app.app_context():
            self.store_type = current_app.config.get('DATA_STORE_TYPE')
            self.db = (
                sql
                if self.store_type == 'mysql'
                else RedisHelper(jobs.PROXY_VALID_KEY)
            )

    def run(self):
        """Consume the queue until it is empty or a ``None`` sentinel arrives.

        Every entry is validated with ``validateProxy()``. Entries that pass
        are written to the valid store; entries that fail are only logged.
        """
        uses_mysql = self.store_type == 'mysql'
        if not uses_mysql:
            self.db.change(jobs.PROXY_VALID_KEY)

        while not self.queue.empty():
            entry = self.queue.get()
            if entry is None:
                # Sentinel: stop this worker.
                break

            # MySQL mode queues ready-made objects; otherwise the entry is
            # decoded with ``Proxy.fromJson`` first.
            candidate = entry if uses_mysql else Proxy.fromJson(entry)
            proxy, status = candidate.validateProxy()

            if not status:
                log.error(
                    'RawProxyCheck - {} : {}, into time: {} validation fail'.format(
                        proxy.name, proxy.proxy.ljust(20), proxy.last_time))
                continue

            # Persist the validated proxy.
            if uses_mysql:
                proxy_valid = ProxyValid(name=proxy.name,
                                         proxy=proxy.proxy,
                                         https=proxy.https,
                                         proxy_type=proxy.type,
                                         china=proxy.china,
                                         location=proxy.location,
                                         success=proxy.success,
                                         fail=proxy.fail,
                                         total=proxy.total,
                                         quality=proxy.quality,
                                         last_status=proxy.last_status,
                                         gmt_modified=proxy.last_time)
                flask_app = schedule.app
                with flask_app.app_context():
                    self.db.session.add(proxy_valid)
                    self.db.session.commit()
            else:
                self.db.add(proxy.proxy, proxy.Json)
            log.debug('RawProxyCheck - {} : {} validation pass'.format(
                proxy.name, proxy.proxy.ljust(20)))
class CheckProcess(object):
    """Worker that re-validates stored valid proxies taken from a queue."""

    def __init__(self, queue: Queue):
        """Bind the queue and select the storage backend from the app config.

        :param queue: queue of proxy entries to re-validate.
        """
        self.queue = queue
        app = schedule.app
        with app.app_context():
            self.store_type = current_app.config.get('DATA_STORE_TYPE')
            if self.store_type == 'mysql':
                self.db = sql
            else:
                self.db = RedisHelper(jobs.PROXY_VALID_KEY)

    def run(self):
        """Consume the queue until it is empty or a ``None`` sentinel arrives.

        An entry is kept (its stored record refreshed) when validation
        succeeds or its failure count is still below ``FAIL_COUNT``;
        otherwise it is removed from the valid store.
        """
        if self.store_type != 'mysql':
            self.db.change(jobs.PROXY_VALID_KEY)
        while True:
            if self.queue.empty():
                break
            proxy_data = self.queue.get()
            if proxy_data is None:
                # Sentinel: stop this worker.
                break
            if self.store_type != 'mysql':
                proxyObj = Proxy.fromJson(proxy_data)
            else:
                proxyObj = proxy_data
            proxy, status = proxyObj.validateProxy()

            if status or proxy.fail < FAIL_COUNT:
                # Keep the proxy and write the refreshed statistics back.
                if self.store_type != 'mysql':
                    if self.db.exists(proxy.proxy):
                        log.debug('ValidProxyCheck - {} : {} validation exists'.format(
                            proxy.name, proxy.proxy.ljust(20)))
                    self.db.add(proxy.proxy, proxy.Json)
                else:
                    app = schedule.app
                    with app.app_context():
                        proxy_info = self.db.session.query(ProxyValid).filter(
                            ProxyValid.proxy == proxy.proxy).first()
                        if proxy_info is None:
                            # ``first()`` yields None when the row has gone
                            # away since the queue was filled; there is
                            # nothing to update then.
                            log.debug('ValidProxyCheck - {} : {} record missing, skip update'.format(
                                proxy.name, proxy.proxy.ljust(20)))
                        else:
                            proxy_info.success = proxy.success
                            proxy_info.fail = proxy.fail
                            proxy_info.total = proxy.total
                            proxy_info.quality = proxy.quality
                            proxy_info.last_status = proxy.last_status
                            proxy_info.gmt_modified = proxy.last_time
                            self.db.session.commit()
                log.debug('ValidProxyCheck - {} : {} validation pass'.format(
                    proxy.name, proxy.proxy.ljust(20)))
            else:
                log.debug('ValidProxyCheck - {} : {} validation fail'.format(
                    proxy.name, proxy.proxy.ljust(20)))
                if self.store_type != 'mysql':
                    self.db.delete(proxy.proxy)
                else:
                    app = schedule.app
                    with app.app_context():
                        self.db.session.query(ProxyValid).filter(
                            ProxyValid.proxy == proxy.proxy).delete()
                        # Commit explicitly; otherwise the delete remains in
                        # an uncommitted transaction and is not persisted.
                        self.db.session.commit()
def get(self):
    '''
    Return one randomly chosen proxy as a JSON response.

    Request examples:
        /get
        /get?type=valid
        /get?type=raw
    :return: the serialized response
    '''
    # Which pool is wanted; a missing or empty ``type`` means 'valid'.
    wanted = request.args.get('type') or 'valid'
    want_valid = wanted == 'valid'

    response = WebResponse()
    store_type = current_app.config.get('DATA_STORE_TYPE')
    use_mysql = store_type == 'mysql'

    if use_mysql:
        if want_valid:
            # Only rows with quality 100, ordered by total descending.
            query = sql.session.query(ProxyValid).filter(
                ProxyValid.quality == 100).order_by(desc(ProxyValid.total))
        else:
            query = sql.session.query(ProxyRaw).order_by(
                desc(ProxyRaw.gmt_modified))
        data = query.all()
    else:
        redis_key = jobs.PROXY_VALID_KEY if want_valid else jobs.PROXY_RAW_KEY
        data = RedisHelper(redis_key).getAll()

    if not data:
        response.code = WebResponseCode.NO_RECORD
        return response.tojson()

    picked = random.choice(data)
    # MySQL rows serialize themselves; Redis entries are decoded as JSON.
    response.data = picked.serialize() if use_mysql else json.loads(picked)
    return response.tojson()
def get(self):
    """Return one random valid proxy whose ``total`` and ``success`` are both 1.

    Entries are first brought to dict form (``serialize()`` for MySQL rows,
    ``json.loads`` for Redis entries) and then filtered. When the store is
    empty, or no entry passes the filter, the response carries
    ``WebResponseCode.NO_RECORD``.
    """
    response = WebResponse()
    store_type = current_app.config.get('DATA_STORE_TYPE')
    if store_type == 'mysql':
        rows = sql.session.query(ProxyValid).all()
        records = [row.serialize() for row in rows]
    else:
        raw_items = RedisHelper(jobs.PROXY_VALID_KEY).getAll()
        # Redis entries are decoded with json.loads (as this handler already
        # did for the chosen item), so decode before filtering instead of
        # calling ``serialize()`` on them.
        records = [json.loads(item) for item in (raw_items or [])]

    candidates = [
        record for record in records
        if record.get('total') == 1 and record.get('success') == 1
    ]
    if candidates:
        response.data = random.choice(candidates)
    else:
        # Also covers "data exists but nothing matched"; random.choice on an
        # empty list would raise IndexError.
        response.code = WebResponseCode.NO_RECORD
    return response.tojson()
def get(self):
    """Return the number of raw and valid proxies as a JSON response.

    The response data is a dict with the keys ``'raw'`` and ``'valid'``.
    """
    response = WebResponse()
    if current_app.config.get('DATA_STORE_TYPE') == 'mysql':
        valid_number = sql.session.query(ProxyValid).count()
        raw_number = sql.session.query(ProxyRaw).count()
    else:
        # A single helper is reused: measure under the valid key first,
        # then switch it to the raw key and measure again.
        helper = RedisHelper(jobs.PROXY_VALID_KEY)
        valid_number = helper.len()
        helper.change(jobs.PROXY_RAW_KEY)
        raw_number = helper.len()

    response.data = {'raw': raw_number, 'valid': valid_number}
    return response.tojson()