def task2json(self, parent_pid):
    """Serialize the jobs of this instance's task into a JSON array string.

    :param parent_pid: PID recorded on every serialized job entry.
    :return: JSON string — list of dicts with job_id, decoded params,
             command and parent_pid.
    """
    task = TaskManager.get_task(self.task_id)
    job_list = [
        {
            "job_id": job.id,
            "params": json_decode(job.params),
            "command": job.command,
            "parent_pid": parent_pid,
        }
        for job in task.jobs
    ]
    return json.dumps(job_list)
def rerun(request, job_id):
    """Re-run the specific job identified by ``job_id``.

    :param request: incoming HTTP request (only used for routing).
    :param job_id: primary key of the job to re-execute.
    :return: dict with the re-queued job serialized under 'task'.
    """
    # TODO rewrite it there is no that TaskManager
    job = get_object_or_404(JobDataModel, id=job_id)
    TaskManager.get_instance().rerun_task(job)
    return {'task': job.to_dict()}
def do(self):
    """Run the uninstall flow for the requested products, then exit.

    Resolves the products, queues an "uninstall" task on the tornado
    worker and starts the core event loop; the process terminates with
    exit code 0 once the loop returns.
    """
    products = ProductCollection(
        list(self.product_names),  # was a pointless identity comprehension
        feed=Core.get_instance().current_storage.feed,
        fail_on_product_not_found=False)
    TornadoWorker.create_instance()
    # everything is fine — create the uninstall task
    task = TaskFactory.create_task("uninstall", products, EmptyParameters())
    TaskManager.queue_task(task)

    def callback_exit(job, exit_code):
        # forward worker completion to the console exit handler
        return UninstallCommand.console_exit(self, job, exit_code)

    TornadoWorker.start_new_task(task, callback_exit, False)
    core.core.core_event_loop_start()
    sys.exit(0)
def uninstall(request):
    """Handle product uninstall requests (POST only).

    A "start" command returns the resolved product list for confirmation;
    any other command queues an "uninstall" task and reports its id.
    """
    if request.method != 'POST':
        # only POST requests are accepted
        return HttpResponseNotAllowed(['POST'])
    # parse the JSON request
    req = json_request(request)
    # product names requested for uninstallation
    requested_products = req['requested_products']
    # resolve products from the list of names
    products = ProductCollection(
        list(requested_products),  # was a pointless identity comprehension
        feed=Core.get_instance().current_storage.feed,
        fail_on_product_not_found=False)
    if req["command"] == "start":
        # initial request: respond with the product list
        resp = {
            'task': None,
            'state': 'product_list',
            'items': [
                {
                    'product': product.to_dict(True),
                    'parameters': [],
                    'error': None
                }
                for product in products
            ]
        }
    else:
        # create the uninstall task
        task = TaskFactory.create_task("uninstall", products)
        task_id = TaskManager.queue_task(task)
        TornadoWorker.start_new_task(task)
        # tell the web UI that uninstallation has started
        resp = {'task': {'id': task_id}, "state": "uninstalling"}
    return resp
def do(self):
    """Install products via the tornado worker.

    Resolves dependencies for the requested product names, collects the
    install parameters (yml file, json file or command-line string; empty
    parameters otherwise), queues an "install" task and runs the core
    event loop.

    :raises InstallCommandError: when some install parameters are unfilled.
    """
    dm = DependencyManager()
    dependency_trees = [dm.get_dependencies(name) for name in self.product_names]
    product_list = InstallCommand.flat_deps4products(dependency_trees, False)

    # pick the parameters source in priority order
    args = self.args_parameters
    if args.yml_params:
        parameters = ParametersParserYmlFile(args.yml_params).get()
    elif args.json_params:
        parameters = ParametersParserJsonFile(args.json_params).get()
    elif args.parameters:
        parameters = ParametersParserStr(args.parameters).get()
    else:
        # no explicit parameters: fill with empty defaults
        parameters = EmptyParameters().get(product_list)

    # resolve products from the flattened name list
    products = ProductCollection(product_list,
                                 feeds=(self.core.feed, self.core.current),
                                 ready_json=True)
    # build the parameters manager from the parsed install parameters
    parameter_manager = ParametersManager(self.core, products, parameters)

    # guard clause: refuse to start with unfilled parameters
    if not parameter_manager.are_all_parameters_filled():
        raise InstallCommandError("Not all parameters specified")

    # TODO move TornadoWorker to core
    # create tornado worker
    web.taskqueue.tornado_worker.TornadoWorker.create_instance()
    # everything is fine — create the install task
    task = TaskFactory.create_task("install", products, parameter_manager)
    task_id = TaskManager.queue_task(task)
    callback_exit = lambda job, exit_code: InstallCommand.console_exit(self, job, exit_code)
    TornadoWorker.start_new_task(task, callback_exit, False)
    core.core.core_event_loop_start()
def upgrade(request):
    """Handle product upgrade requests (POST only).

    Request and response formats match those of install().
    """
    if request.method != 'POST':
        # only POST requests are accepted
        return HttpResponseNotAllowed(['POST'])
    # parse the JSON request
    req = json_request(request)
    dm = DependencyManager()
    requested_products = req['requested_products']
    core = Core.get_instance()

    if 'initial' in req:
        # initial request: respond with the dependency tree
        return {
            'task': None,
            'items': [dm.get_dependencies(name) for name in requested_products]
        }

    # upgrade request: product names to upgrade, dependencies included,
    # in reverse (dependency-first) order
    product_list = [item['product'] for item in req['install_products']]
    product_list.reverse()
    # resolve products from the list of names
    products = ProductCollection(product_list)
    # parse upgrade parameters from the request and build the manager
    parsed_parameters = ParametersParserJson(req['install_products']).get()
    parameter_manager = ParametersManager(core, products, parsed_parameters)
    # queue the upgrade task
    task = TaskFactory.create_task("upgrade", products, parameter_manager)
    task_id = TaskManager.queue_task(task)
    TornadoWorker.start_new_task(task)
    return {
        'task': {
            'id': task_id,
        },
        'items': None
    }
def __init__(self):
    # Single TaskManager instance drives all task handling for this object.
    self._task_manager = TaskManager()
    # Start from a clean slate: clear any previously stored task state.
    self._task_manager.reset_task()
class DealData():
    """Parses WeChat official-account pages and saves results via data_pipeline.

    NOTE(review): the original source contained a scrubbed span ('******')
    inside __parse_account_info; the qr_code/account_data code there was
    reconstructed and should be confirmed against the upstream project.
    """

    def __init__(self):
        self._task_manager = TaskManager()
        self._task_manager.reset_task()

    def __parse_account_info(self, data, req_url):
        '''
        @summary: parse official-account profile info from the history page
        ---------
        @param data: page html
        @param req_url: request url (carries the __biz account id)
        ---------
        @result: saves the account record via data_pipeline
        '''
        __biz = tools.get_param(req_url, '__biz')

        regex = 'id="nickname">(.*?)</strong>'
        account = tools.get_info(data, regex, fetch_one=True).strip()

        regex = 'profile_avatar">.*?<img src="(.*?)"'
        head_url = tools.get_info(data, regex, fetch_one=True)

        regex = 'class="profile_desc">(.*?)</p>'
        summary = tools.get_info(data, regex, fetch_one=True).strip()

        # verification info (absent when viewing history of a followed account)
        regex = '<i class="icon_verify success">.*?</i>(.*?)</span>'
        verify = tools.get_info(data, regex, fetch_one=True)
        verify = verify.strip() if verify else ''

        # QR code; '||' must be escaped inside the regex
        regex = r'var username = "" \|\| "(.*?)";'
        qr_code = tools.get_info(data, regex, fetch_one=True)
        # reconstructed from scrubbed source — TODO confirm against upstream
        qr_code = 'http://open.weixin.qq.com/qr/code?username=' + qr_code if qr_code else ''

        account_data = {
            '__biz': __biz,
            'account': account,
            'head_url': head_url,
            'summary': summary,
            'qr_code': qr_code,
            'verify': verify,
            'spider_time': tools.get_current_date()
        }

        if account_data:
            data_pipeline.save_account(account_data)

    def __parse_article_list(self, article_list, __biz, is_first_page=False):
        '''
        @summary: parse the article list
        ---------
        @param article_list: article list info, json str
        @param __biz: account id
        @param is_first_page: whether this list came from the first (html) page
        ---------
        @result: latest publish_time when crawling should continue, else None
        '''

        def parse_article_info(article_info, comm_msg_info):
            # Build one article record; returns None when info is empty or
            # the article has no 'sn' url parameter.
            if not article_info:
                return

            title = article_info.get('title')
            digest = article_info.get('digest')
            url = article_info.get('content_url').replace('\\', '').replace('amp;', '')
            source_url = article_info.get('source_url').replace('\\', '')  # quoted article link
            cover = article_info.get('cover').replace('\\', '')
            subtype = article_info.get('subtype')
            is_multi = article_info.get('is_multi')
            author = article_info.get('author')
            copyright_stat = article_info.get('copyright_stat')
            duration = article_info.get('duration')
            del_flag = article_info.get('del_flag')
            msg_type = comm_msg_info.get('type')  # renamed: was 'type', shadowed builtin
            publish_time = tools.timestamp_to_date(comm_msg_info.get('datetime'))
            sn = tools.get_param(url, 'sn')

            if sn:
                # cache the article info
                return {
                    'title': title,
                    'digest': digest,
                    'url': url,
                    'source_url': source_url,
                    'cover': cover,
                    'subtype': subtype,
                    'is_multi': is_multi,
                    'author': author,
                    'copyright_stat': copyright_stat,
                    'duration': duration,
                    'del_flag': del_flag,
                    'type': msg_type,
                    'publish_time': publish_time,
                    'sn': sn,
                    '__biz': __biz,
                    'spider_time': tools.get_current_date()
                }

        article_list = tools.get_json(article_list)

        article_list_data = []
        publish_time = None
        is_need_get_more = True

        article_list = article_list.get('list', [])
        is_first_article = True
        for article in article_list:
            comm_msg_info = article.get('comm_msg_info', {})
            publish_timestamp = comm_msg_info.get('datetime')
            publish_time = tools.timestamp_to_date(publish_timestamp)

            # remember the newest publish time seen on the first page
            if is_first_page and is_first_article:
                self._task_manager.record_new_last_article_publish_time(__biz, publish_time)
                is_first_article = False

            # zombie check: account that has not published recently
            if publish_timestamp and self._task_manager.is_zombie_account(publish_timestamp):
                log.info('公众号 {} 为僵尸账号 不再监控'.format(__biz))
                self._task_manager.sign_account_is_zombie(__biz, publish_time)
                is_need_get_more = False
                break

            # stop once we reach the publish time recorded by the previous crawl
            is_reach = self._task_manager.is_reach_last_article_publish_time(__biz, publish_time)
            if is_reach:
                log.info('采集到上次发布时间 公众号 {} 采集完成'.format(__biz))
                new_last_publish_time = self._task_manager.get_new_last_article_publish_time(__biz)
                self._task_manager.update_account_last_publish_time(__biz, new_last_publish_time)
                is_need_get_more = False
                break
            elif is_reach is None:
                log.info('公众号 {} 为爬虫启动时的手点公众号。不遍历历史消息,即将抓取监控池里的公众号'.format(__biz))
                return

            article_type = comm_msg_info.get('type')
            if article_type != 49:  # 49 = common image/text message; skip text/voice/video
                continue

            # check the crawl time range
            publish_time_status = self._task_manager.is_in_crawl_time_range(publish_time)
            if publish_time_status == TaskManager.OVER_MIN_TIME_RANGE:
                log.info('公众号 {} 超过采集时间范围 采集完成'.format(__biz))
                new_last_publish_time = self._task_manager.get_new_last_article_publish_time(__biz)
                self._task_manager.update_account_last_publish_time(__biz, new_last_publish_time)
                is_need_get_more = False
                break
            elif publish_time_status == TaskManager.NOT_REACH_TIME_RANGE:
                log.info('公众号 {} 当前采集到的时间 {} 未到采集时间范围 不采集'.format(__biz, publish_time))
                continue

            # within range; one post can carry several articles — first article
            app_msg_ext_info = article.get('app_msg_ext_info', {})
            article_data = parse_article_info(app_msg_ext_info, comm_msg_info)
            if article_data:
                article_list_data.append(article_data)

            # additional articles attached to the same post
            # fix: guard against a missing list (original iterated None)
            multi_app_msg_item_list = app_msg_ext_info.get('multi_app_msg_item_list') or []
            for multi_app_msg_item in multi_app_msg_item_list:
                article_data = parse_article_info(multi_app_msg_item, comm_msg_info)
                if article_data:
                    article_list_data.append(article_data)

        if article_list_data:
            data_pipeline.save_article_list(article_list_data)

        if is_need_get_more:
            return publish_time

    def deal_article_list(self, req_url, text):
        '''
        @summary: fetch the article list. Two response formats:
            1. first visit to history: html, includes account info
            2. "pull down for more": json; article lists themselves are json
        Strategy:
            1. html format: parse articles, then build the next-page json url
            2. json format: parse articles, continue paging via next_offset
        ---------
        @param req_url: request url
        @param text: response body
        ---------
        @result: next task from the task manager
        '''
        try:
            # blocked accounts have no article list at all
            __biz = tools.get_param(req_url, '__biz')
            if 'list' in text:
                if 'action=home' in req_url:  # html format article list
                    # parse account profile info
                    self.__parse_account_info(text, req_url)

                    # parse the article list embedded in the page
                    regex = "msgList = '(.*?})';"
                    article_list = tools.get_info(text, regex, fetch_one=True)
                    # fix: un-escape the html entity (was a corrupted no-op replace)
                    article_list = article_list.replace('&quot;', '"')
                    publish_time = self.__parse_article_list(article_list, __biz, is_first_page=True)

                    # any more articles? if not, move on to the next account
                    regex = r"can_msg_continue = '(\d)'"
                    can_msg_continue = tools.get_info(text, regex, fetch_one=True)
                    if can_msg_continue == '0':  # no more articles
                        log.info('抓取到列表底部 无更多文章,公众号 {} 抓取完毕'.format(__biz))
                        new_last_publish_time = self._task_manager.get_new_last_article_publish_time(__biz)
                        if not new_last_publish_time:
                            # mark as zombie account
                            log.info('公众号 {} 为僵尸账号 不再监控'.format(__biz))
                            self._task_manager.sign_account_is_zombie(__biz)
                        else:
                            self._task_manager.update_account_last_publish_time(__biz, new_last_publish_time)
                    elif publish_time:
                        # build the "pull down for more" history url
                        # appmsg_token lives in the html
                        regex = 'appmsg_token = "(.*?)";'
                        appmsg_token = tools.get_info(text, regex, fetch_one=True)
                        # remaining parameters live in the url
                        __biz = tools.get_param(req_url, '__biz')
                        pass_ticket = tools.get_param(req_url, 'pass_ticket')
                        next_page_url = 'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={__biz}&f=json&offset={offset}&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={pass_ticket}&wxtoken=&appmsg_token={appmsg_token}&x5=0&f=json'.format(
                            __biz=__biz, offset=10, pass_ticket=pass_ticket, appmsg_token=appmsg_token)
                        return self._task_manager.get_task(
                            next_page_url,
                            tip='正在抓取列表 next_offset {} 抓取到 {}'.format(10, publish_time))
                else:  # json format
                    text = tools.get_json(text)
                    article_list = text.get('general_msg_list', {})
                    publish_time = self.__parse_article_list(article_list, __biz)

                    # any more articles? if not, move on to the next account
                    can_msg_continue = text.get('can_msg_continue')
                    if not can_msg_continue:  # no more articles
                        log.info('抓取到列表底部 无更多文章,公众号 {} 抓取完毕'.format(__biz))
                        new_last_publish_time = self._task_manager.get_new_last_article_publish_time(__biz)
                        self._task_manager.update_account_last_publish_time(__biz, new_last_publish_time)
                    elif publish_time:
                        # build the next history page url; parameters live in the url
                        __biz = tools.get_param(req_url, '__biz')
                        pass_ticket = tools.get_param(req_url, 'pass_ticket')
                        appmsg_token = tools.get_param(req_url, 'appmsg_token')
                        # offset lives in the json body
                        offset = text.get('next_offset', 0)
                        next_page_url = 'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={__biz}&f=json&offset={offset}&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={pass_ticket}&wxtoken=&appmsg_token={appmsg_token}&x5=0&f=json'.format(
                            __biz=__biz, offset=offset, pass_ticket=pass_ticket, appmsg_token=appmsg_token)
                        return self._task_manager.get_task(
                            next_page_url,
                            tip='正在抓取列表 next_offset {} 抓取到 {}'.format(offset, publish_time))
            else:
                # this __biz account has been blocked
                self._task_manager.sign_account_is_zombie(__biz)

        except Exception as e:
            log.exception(e)

        return self._task_manager.get_task()

    def deal_article(self, req_url, text):
        """Parse one article page and save it; returns the next task."""
        sn = tools.get_param(req_url, 'sn')

        if not text:
            # empty response: mark the article task as failed
            self._task_manager.update_article_task_state(sn, -1)
            return None

        selector = Selector(text)

        content = selector.xpath(
            '//div[@class="rich_media_content "]|//div[@class="rich_media_content"]|//div[@class="share_media"]')
        title = selector.xpath('//h2[@class="rich_media_title"]/text()').extract_first(default='').strip()
        account = selector.xpath('//a[@id="js_name"]/text()').extract_first(default='').strip()
        author = selector.xpath(
            '//span[@class="rich_media_meta rich_media_meta_text"]//text()').extract_first(default='').strip()

        publish_timestamp = selector.re_first(r'n="(\d{10})"')
        publish_timestamp = int(publish_timestamp) if publish_timestamp else None
        publish_time = tools.timestamp_to_date(publish_timestamp) if publish_timestamp else None

        pics_url = content.xpath('.//img/@src|.//img/@data-src').extract()
        biz = tools.get_param(req_url, '__biz')
        digest = selector.re_first('var msg_desc = "(.*?)"')
        cover = selector.re_first('var cover = "(.*?)";') or selector.re_first('msg_cdn_url = "(.*?)"')
        source_url = selector.re_first("var msg_source_url = '(.*?)';")
        content_html = content.extract_first(default='')
        comment_id = selector.re_first(r'var comment_id = "(\d+)"')

        article_data = {
            'account': account,
            'title': title,
            'url': req_url,
            'author': author,
            'publish_time': publish_time,
            '__biz': biz,
            'digest': digest,
            'cover': cover,
            "pics_url": pics_url,
            "content_html": content_html,
            "source_url": source_url,
            "comment_id": comment_id,
            "sn": sn,
            "spider_time": tools.get_current_date()
        }

        # save; only mark the task done when the save succeeded
        if article_data and data_pipeline.save_article(article_data) is not None:
            self._task_manager.update_article_task_state(sn, 1)

        return self._task_manager.get_task()

    def deal_article_dynamic_info(self, req_data, text):
        """
        Parse article dynamic info: read count, likes, comment count.
        :param req_data: POST request data, str
        :param text: response body
        :return:
        """
        data = tools.get_json(text)

        dynamic_data = dict(
            sn=tools.get_param(req_data, 'sn'),
            __biz=tools.get_param(req_data, '__biz').replace('%3D', '='),
            read_num=data.get('appmsgstat', {}).get('read_num'),
            like_num=data.get('appmsgstat', {}).get('like_num'),
            comment_count=data.get('comment_count'),
            spider_time=tools.get_current_date()
        )

        if dynamic_data:
            data_pipeline.save_article_dynamic(dynamic_data)

    def deal_comment(self, req_url, text):
        """Parse the elected comments of an article and save them."""
        data = tools.get_json(text)

        __biz = tools.get_param(req_url, '__biz')
        comment_id = tools.get_param(req_url, 'comment_id')  # links comments to the article
        elected_comment = data.get('elected_comment', [])

        comment_datas = [
            dict(
                __biz=__biz,
                comment_id=comment_id,
                nick_name=comment.get('nick_name'),
                logo_url=comment.get('logo_url'),
                content=comment.get('content'),
                create_time=tools.timestamp_to_date(comment.get('create_time')),
                content_id=comment.get('content_id'),
                like_num=comment.get('like_num'),
                is_top=comment.get('is_top'),
                spider_time=tools.get_current_date()
            )
            for comment in elected_comment
        ]

        if comment_datas:
            # NOTE(review): 'save_article_commnet' is the project API's (misspelled) name
            data_pipeline.save_article_commnet(comment_datas)

    def get_task(self):
        """Fetch the next crawl task from the task manager."""
        return self._task_manager.get_task()
def install(request):
    """Hand the dependency tree to the web interface and accept its install request.

    Example requests and responses:

    initial request:
        command: "start"
        requested_products: [JavaJettyTemplate]
    response:
        task: {id: 168, url: /task/168/}
        items:
            paramters: [...]
            product: {name: ..., title: ...}
            and: [item 1..., item 2...]
    """
    # TODO do not give ability to install application through this function
    if request.method != 'POST':
        # only POST requests are accepted
        return HttpResponseNotAllowed(['POST'])

    # decode the JSON request into a python dict
    req = json_request(request)

    dm = DependencyManager()
    core = Core.get_instance()
    # products requested for installation (without dependencies)
    requested_products = req['requested_products']

    if req["command"] == "start":
        # initial request: respond with the dependency tree
        return {
            'task': None,
            "state": "requirements",
            'items': [dm.get_dependencies(name) for name in requested_products]
        }

    # install request: product names to install — IMPORTANT: dependencies included,
    # reversed so dependencies come first
    product_list = [item['product'] for item in req['install_products']]
    product_list.reverse()
    # resolve products from the list of names
    products = ProductCollection(product_list, feeds=(core.feed, core.current))
    # parse installation parameters from the request and build the manager
    parsed_parameters = ParametersParserJson(req['install_products']).get()
    parameter_manager = ParametersManager(core, products, parsed_parameters)

    if parameter_manager.are_all_parameters_filled():
        # everything is fine — queue the install task
        task = TaskFactory.create_task("install", products, parameter_manager)
        task_id = TaskManager.queue_task(task)
        TornadoWorker.start_new_task(task)
        # tell the web UI the installation has started
        return {
            'task': {
                'id': task_id,
            },
        }

    # something is wrong with the parameters — report errors to the web UI
    return {
        'task': None,
        'items': [dm.get_dependencies(name) for name in req['requested_products']],
        'error': [parameter_manager.get_error(product) for product in products]
    }
class DealData:
    """Parses WeChat official-account pages and saves results via data_pipeline.

    NOTE(review): the original source contained a scrubbed span ('******')
    inside __parse_account_info; the qr_code/account_data code there was
    reconstructed and should be confirmed against the upstream project.
    """

    def __init__(self):
        self._task_manager = TaskManager()
        self._task_manager.reset_task()

    def __parse_account_info(self, data, req_url):
        """Parse official-account profile info from the history page html."""
        __biz = tools.get_param(req_url, "__biz")

        regex = 'id="nickname">(.*?)</strong>'
        account = tools.get_info(data, regex, fetch_one=True).strip()

        regex = 'profile_avatar">.*?<img src="(.*?)"'
        head_url = tools.get_info(data, regex, fetch_one=True)

        regex = 'class="profile_desc">(.*?)</p>'
        summary = tools.get_info(data, regex, fetch_one=True).strip()

        # verification info (absent when viewing history of a followed account)
        regex = '<i class="icon_verify success">.*?</i>(.*?)</span>'
        verify = tools.get_info(data, regex, fetch_one=True)
        verify = verify.strip() if verify else ""

        # QR code; '||' must be escaped inside the regex
        regex = r'var username = "" \|\| "(.*?)";'
        qr_code = tools.get_info(data, regex, fetch_one=True)
        # reconstructed from scrubbed source — TODO confirm against upstream
        qr_code = "http://open.weixin.qq.com/qr/code?username=" + qr_code if qr_code else ""

        account_data = {
            "__biz": __biz,
            "account": account,
            "head_url": head_url,
            "summary": summary,
            "qr_code": qr_code,
            "verify": verify,
            "spider_time": tools.get_current_date(),
        }

        if account_data:
            data_pipeline.save_account(account_data)

    def __parse_article_list(self, article_list, __biz, is_first_page=False):
        """
        @summary: parse the article list
        ---------
        @param article_list: article list info, json str
        @param __biz: account id
        @param is_first_page: whether this list came from the first (html) page
        ---------
        @result: latest publish_time when crawling should continue, else None
        """

        def parse_article_info(article_info, comm_msg_info):
            # Build one article record; returns None when info is empty or
            # the article has no 'sn' url parameter.
            if not article_info:
                return

            title = article_info.get("title")
            digest = article_info.get("digest")
            url = article_info.get("content_url").replace("\\", "").replace(
                "amp;", "")
            source_url = article_info.get("source_url").replace("\\", "")  # quoted article link
            cover = article_info.get("cover").replace("\\", "")
            subtype = article_info.get("subtype")
            is_multi = article_info.get("is_multi")
            author = article_info.get("author")
            copyright_stat = article_info.get("copyright_stat")
            duration = article_info.get("duration")
            del_flag = article_info.get("del_flag")
            msg_type = comm_msg_info.get("type")  # renamed: was 'type', shadowed builtin
            publish_time = tools.timestamp_to_date(
                comm_msg_info.get("datetime"))
            sn = tools.get_param(url, "sn")

            if sn:
                # cache the article info
                return {
                    "title": title,
                    "digest": digest,
                    "url": url,
                    "source_url": source_url,
                    "cover": cover,
                    "subtype": subtype,
                    "is_multi": is_multi,
                    "author": author,
                    "copyright_stat": copyright_stat,
                    "duration": duration,
                    "del_flag": del_flag,
                    "type": msg_type,
                    "publish_time": publish_time,
                    "sn": sn,
                    "__biz": __biz,
                    "spider_time": tools.get_current_date(),
                }

        article_list = tools.get_json(article_list)

        article_list_data = []
        publish_time = None
        is_need_get_more = True

        article_list = article_list.get("list", [])
        is_first_article = True
        for article in article_list:
            comm_msg_info = article.get("comm_msg_info", {})
            publish_timestamp = comm_msg_info.get("datetime")
            publish_time = tools.timestamp_to_date(publish_timestamp)

            # remember the newest publish time seen on the first page
            if is_first_page and is_first_article:
                self._task_manager.record_new_last_article_publish_time(
                    __biz, publish_time)
                is_first_article = False

            # zombie check: account that has not published recently
            if publish_timestamp and self._task_manager.is_zombie_account(
                    publish_timestamp):
                log.info("公众号 {} 为僵尸账号 不再监控".format(__biz))
                self._task_manager.sign_account_is_zombie(
                    __biz, publish_time)
                is_need_get_more = False
                break

            # stop once we reach the publish time recorded by the previous crawl
            is_reach = self._task_manager.is_reach_last_article_publish_time(
                __biz, publish_time)
            if is_reach:
                log.info("采集到上次发布时间 公众号 {} 采集完成".format(__biz))
                new_last_publish_time = self._task_manager.get_new_last_article_publish_time(
                    __biz)
                self._task_manager.update_account_last_publish_time(
                    __biz, new_last_publish_time)
                is_need_get_more = False
                break
            elif is_reach is None:
                log.info(
                    "公众号 {} 为爬虫启动时的手点公众号。不遍历历史消息,即将抓取监控池里的公众号".format(__biz))
                return

            article_type = comm_msg_info.get("type")
            if article_type != 49:  # 49 = common image/text message; skip text/voice/video
                continue

            # check the crawl time range
            publish_time_status = self._task_manager.is_in_crawl_time_range(
                publish_time)
            if publish_time_status == TaskManager.OVER_MIN_TIME_RANGE:
                log.info("公众号 {} 超过采集时间范围 采集完成".format(__biz))
                new_last_publish_time = self._task_manager.get_new_last_article_publish_time(
                    __biz)
                self._task_manager.update_account_last_publish_time(
                    __biz, new_last_publish_time)
                is_need_get_more = False
                break
            elif publish_time_status == TaskManager.NOT_REACH_TIME_RANGE:
                log.info("公众号 {} 当前采集到的时间 {} 未到采集时间范围 不采集".format(
                    __biz, publish_time))
                continue

            # within range; one post can carry several articles — first article
            app_msg_ext_info = article.get("app_msg_ext_info", {})
            article_data = parse_article_info(app_msg_ext_info, comm_msg_info)
            if article_data:
                article_list_data.append(article_data)

            # additional articles attached to the same post
            # fix: guard against a missing list (original iterated None)
            multi_app_msg_item_list = app_msg_ext_info.get(
                "multi_app_msg_item_list") or []
            for multi_app_msg_item in multi_app_msg_item_list:
                article_data = parse_article_info(multi_app_msg_item,
                                                  comm_msg_info)
                if article_data:
                    article_list_data.append(article_data)

        if article_list_data:
            data_pipeline.save_article_list(article_list_data)

        if is_need_get_more:
            return publish_time

    def deal_article_list(self, req_url, text):
        """
        @summary: fetch the article list. Two response formats:
            1. first visit to history: html, includes account info
            2. "pull down for more": json; article lists themselves are json
        Strategy:
            1. html format: parse articles, then build the next-page json url
            2. json format: parse articles, continue paging via next_offset
        ---------
        @param req_url: request url
        @param text: response body
        ---------
        @result: next task from the task manager
        """
        try:
            # blocked accounts have no article list at all
            __biz = tools.get_param(req_url, "__biz")
            if "list" in text:
                if "action=home" in req_url:  # html format article list
                    # parse account profile info
                    self.__parse_account_info(text, req_url)

                    # parse the article list embedded in the page
                    regex = "msgList = '(.*?})';"
                    article_list = tools.get_info(text, regex, fetch_one=True)
                    # fix: un-escape the html entity (was corrupted in source)
                    article_list = article_list.replace("&quot;", '"')
                    publish_time = self.__parse_article_list(
                        article_list, __biz, is_first_page=True)

                    # any more articles? if not, move on to the next account
                    regex = r"can_msg_continue = '(\d)'"
                    can_msg_continue = tools.get_info(text, regex,
                                                      fetch_one=True)
                    if can_msg_continue == "0":  # no more articles
                        log.info("抓取到列表底部 无更多文章,公众号 {} 抓取完毕".format(__biz))
                        new_last_publish_time = self._task_manager.get_new_last_article_publish_time(
                            __biz)
                        if not new_last_publish_time:
                            # mark as zombie account
                            log.info("公众号 {} 为僵尸账号 不再监控".format(__biz))
                            self._task_manager.sign_account_is_zombie(__biz)
                        else:
                            self._task_manager.update_account_last_publish_time(
                                __biz, new_last_publish_time)
                    elif publish_time:
                        # build the "pull down for more" history url
                        # appmsg_token lives in the html
                        regex = 'appmsg_token = "(.*?)";'
                        appmsg_token = tools.get_info(text, regex,
                                                      fetch_one=True)
                        # remaining parameters live in the url
                        __biz = tools.get_param(req_url, "__biz")
                        pass_ticket = tools.get_param(req_url, "pass_ticket")
                        next_page_url = "https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={__biz}&f=json&offset={offset}&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={pass_ticket}&wxtoken=&appmsg_token={appmsg_token}&x5=0&f=json".format(
                            __biz=__biz,
                            offset=10,
                            pass_ticket=pass_ticket,
                            appmsg_token=appmsg_token,
                        )
                        return self._task_manager.get_task(
                            next_page_url,
                            tip="正在抓取列表 next_offset {} 抓取到 {}".format(
                                10, publish_time),
                        )
                else:  # json format
                    text = tools.get_json(text)
                    article_list = text.get("general_msg_list", {})
                    publish_time = self.__parse_article_list(
                        article_list, __biz)

                    # any more articles? if not, move on to the next account
                    can_msg_continue = text.get("can_msg_continue")
                    if not can_msg_continue:  # no more articles
                        log.info("抓取到列表底部 无更多文章,公众号 {} 抓取完毕".format(__biz))
                        new_last_publish_time = self._task_manager.get_new_last_article_publish_time(
                            __biz)
                        self._task_manager.update_account_last_publish_time(
                            __biz, new_last_publish_time)
                    elif publish_time:
                        # build the next history page url; parameters live in the url
                        __biz = tools.get_param(req_url, "__biz")
                        pass_ticket = tools.get_param(req_url, "pass_ticket")
                        appmsg_token = tools.get_param(req_url, "appmsg_token")
                        # offset lives in the json body
                        offset = text.get("next_offset", 0)
                        next_page_url = "https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={__biz}&f=json&offset={offset}&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={pass_ticket}&wxtoken=&appmsg_token={appmsg_token}&x5=0&f=json".format(
                            __biz=__biz,
                            offset=offset,
                            pass_ticket=pass_ticket,
                            appmsg_token=appmsg_token,
                        )
                        return self._task_manager.get_task(
                            next_page_url,
                            tip="正在抓取列表 next_offset {} 抓取到 {}".format(
                                offset, publish_time),
                        )
            else:
                # this __biz account has been blocked
                self._task_manager.sign_account_is_zombie(__biz)

        except Exception as e:
            log.exception(e)

        return self._task_manager.get_task()

    def deal_article(self, req_url, text):
        """
        Parse one article page, save a plain-text record, return the next task.
        :param req_url: article url
        :param text: article page html
        :return: next task, or None when the page was empty
        """
        sn = tools.get_param(req_url, "sn")

        if not text:
            # empty response: mark the article task as failed
            self._task_manager.update_article_task_state(sn, -1)
            return None

        selector = Selector(text)

        content = selector.xpath(
            '//div[@class="rich_media_content "]|//div[@class="rich_media_content"]|//div[@class="share_media"]'
        ).extract_first(default="")
        title = (selector.xpath('//h2[@class="rich_media_title"]/text()').
                 extract_first(default="").strip())
        account = (selector.xpath('//a[@id="js_name"]/text()').extract_first(
            default="").strip())
        author = (selector.xpath(
            '//span[@class="rich_media_meta rich_media_meta_text"]//text()').
                  extract_first(default="").strip())

        publish_timestamp = selector.re_first(r'n="(\d{10})"')
        publish_timestamp = int(
            publish_timestamp) if publish_timestamp else None
        publish_time = (tools.timestamp_to_date(publish_timestamp)
                        if publish_timestamp else None)

        biz = tools.get_param(req_url, "__biz")

        # strip markup: this variant stores plain text, not html
        text = remove_tags(content).strip()

        spider_name = 'wechat'
        collection_mode = 'spider'
        data_source_type = '微信公众号'

        article_data = {
            "data_type": account,
            "title": title,
            "data_address": req_url,
            "author": author,
            "publish_time": publish_time,
            "__biz": biz,
            "text": text,
            "spider_name": spider_name,
            "collection_mode": collection_mode,
            "data_source_type": data_source_type,
            "sn": sn,
            "collection_time": tools.get_current_date(),
        }

        # save; only mark the task done when the save succeeded
        if article_data and data_pipeline.save_article(
                article_data) is not None:
            self._task_manager.update_article_task_state(sn, 1)

        return self._task_manager.get_task()

    # NOTE(review): large commented-out implementations of
    # deal_article_dynamic_info() and deal_comment() were removed as dead
    # code; recover them from version-control history if they are needed.

    def get_task(self):
        """Fetch the next crawl task from the task manager."""
        return self._task_manager.get_task()
class OctoBot:
    """
    Top-level bot object: loads configs and wires together the initializer,
    task manager, exchange factory and evaluator factory, then exposes
    thin accessors over them.
    """

    def __init__(self, config, ignore_config=False, reset_trading_history=False):
        # wall-clock start, used as the bot's startup reference time
        self.start_time = time.time()
        self.config = config
        self.reset_trading_history = reset_trading_history
        # keep independent deep copies: the original startup config and an
        # editable working copy
        self.startup_config = copy.deepcopy(config)
        self.edited_config = copy.deepcopy(config)

        # tools: used for alternative operations on a bot on the fly
        # (ex: backtesting started from web interface)
        self.tools = {
            BOT_TOOLS_BACKTESTING: None,
            BOT_TOOLS_STRATEGY_OPTIMIZER: None,
            BOT_TOOLS_RECORDER: None,
        }

        # unique aiohttp session: to be initialized from getter in a task
        self._aiohttp_session = None

        # metrics if enabled
        self.metrics_handler = None

        # Logger
        self.logger = get_logger(self.__class__.__name__)

        # collaborators that drive the bot lifecycle
        self.initializer = Initializer(self)
        self.task_manager = TaskManager(self)
        self.exchange_factory = ExchangeFactory(self, ignore_config=ignore_config)
        self.evaluator_factory = EvaluatorFactory(self)

    async def initialize(self):
        """Run startup: initializer, async loop, exchanges, evaluators."""
        await self.initializer.create()
        self.task_manager.init_async_loop()
        await self.exchange_factory.create()
        self.evaluator_factory.create()

    async def start(self, run_in_new_thread=False):
        """Start the bot's tasks, optionally in a new thread."""
        await self.task_manager.start_tasks(run_in_new_thread=run_in_new_thread)

    def stop(self):
        """Stop the task manager's threads."""
        self.task_manager.stop_threads()

    def run_in_main_asyncio_loop(self, coroutine):
        # delegate scheduling to the task manager's main loop
        return self.task_manager.run_in_main_asyncio_loop(coroutine)

    def set_watcher(self, watcher):
        self.task_manager.watcher = watcher

    # --- accessors delegating to the evaluator factory ---

    def get_symbols_tasks_manager(self):
        return self.evaluator_factory.symbol_tasks_manager

    # --- accessors delegating to the exchange factory ---

    def get_exchange_traders(self):
        return self.exchange_factory.exchange_traders

    def get_exchange_trader_simulators(self):
        return self.exchange_factory.exchange_trader_simulators

    def get_exchange_trading_modes(self):
        return self.exchange_factory.exchange_trading_modes

    def get_exchanges_list(self):
        return self.exchange_factory.exchanges_list

    def get_symbol_evaluator_list(self):
        return self.evaluator_factory.symbol_evaluator_list

    def get_symbols_list(self):
        # symbols are the keys of the symbol evaluator mapping
        return self.evaluator_factory.symbol_evaluator_list.keys()

    def get_crypto_currency_evaluator_list(self):
        return self.evaluator_factory.crypto_currency_evaluator_list

    def get_dispatchers_list(self):
        return self.evaluator_factory.dispatchers_list

    def get_global_updaters_by_exchange(self):
        return self.exchange_factory.global_updaters_by_exchange

    def get_trading_mode(self):
        return self.exchange_factory.trading_mode

    # --- misc accessors ---

    def is_ready(self):
        return self.task_manager.ready

    def get_config(self):
        return self.config

    def get_tools(self):
        return self.tools

    def get_time_frames(self):
        return self.initializer.time_frames

    def get_relevant_evaluators(self):
        return self.initializer.relevant_evaluators

    def get_async_loop(self):
        return self.task_manager.async_loop

    def get_aiohttp_session(self):
        # lazily create the single shared aiohttp session
        if self._aiohttp_session is None:
            self._aiohttp_session = aiohttp.ClientSession()
        return self._aiohttp_session
def install(request):
    """
    This endpoint hands the dependency tree to the web interface and
    accepts the install request the interface sends back.

    Example requests and responses:

    initial request:
        command: "start"
        requested_products: [JavaJettyTemplate]

    response:
        task:
            id: 168
            url: /task/168/
        items:
            paramters: [...]
            product:
                name: ...
                title: ...
            and:
                - item 1...
                - item 2...
    """
    # TODO do not give ability to install application through this function
    if request.method != 'POST':
        # only POST requests are accepted
        resp = HttpResponseNotAllowed(['POST'])
        return resp

    # decode the JSON request into a python dict
    req = json_request(request)

    # is this the initial request?
    dm = DependencyManager()
    core = Core.get_instance()

    # products requested for installation (without dependencies)
    requested_products = req['requested_products']
    if req["command"] == "start":
        # initial request: respond with the dependency tree
        resp = {
            'task': None,
            "state": "requirements",
            'items': [
                dm.get_dependencies(product_name)
                for product_name in requested_products
            ]
        }
    else:
        # this is the install request
        # product names to install — IMPORTANT: dependencies included
        product_list = [item['product'] for item in req['install_products']]
        # reverse it so dependencies come first
        product_list.reverse()

        # resolve products from the list of names
        products = ProductCollection(product_list, feeds=(core.feed, core.current))

        # parse the installation parameters from the request
        parsed_parameters = ParametersParserJson(req['install_products']).get()

        # build the parameters manager
        parameter_manager = ParametersManager(core, products, parsed_parameters)

        # are all parameters filled in?
        if parameter_manager.are_all_parameters_filled():
            # everything is fine — queue the install task
            task = TaskFactory.create_task("install", products, parameter_manager)
            task_id = TaskManager.queue_task(task)
            TornadoWorker.start_new_task(task)
            # tell the web UI the installation has started
            resp = {
                'task': {
                    'id': task_id,
                },
            }
        else:
            # something is wrong with the parameters — report errors to the web UI
            resp = {
                'task': None,
                'items': [
                    dm.get_dependencies(product_name)
                    for product_name in req['requested_products']
                ],
                'error':
                [parameter_manager.get_error(product) for product in products]
            }

    return resp