def deal_article(self, req_url, text):
    """
    Parse a WeChat article page and persist the extracted plain-text record.

    :param req_url: article URL; carries the ``sn`` and ``__biz`` query params
    :param text: raw HTML of the article page; falsy means the fetch failed
    :return: the next task from the task manager (``None`` on fetch failure)
    """
    sn = tools.get_param(req_url, "sn")

    if not text:
        # Fetch failed: flag the task as errored (-1) and stop here.
        self._task_manager.update_article_task_state(sn, -1)
        return None

    selector = Selector(text)

    # The body div's class attribute varies across page templates (note the
    # trailing-space variant), so match all known forms in one XPath union.
    content = selector.xpath(
        '//div[@class="rich_media_content "]'
        '|//div[@class="rich_media_content"]'
        '|//div[@class="share_media"]'
    ).extract_first(default="")
    title = (
        selector.xpath('//h2[@class="rich_media_title"]/text()')
        .extract_first(default="")
        .strip()
    )
    account = (
        selector.xpath('//a[@id="js_name"]/text()')
        .extract_first(default="")
        .strip()
    )
    author = (
        selector.xpath(
            '//span[@class="rich_media_meta rich_media_meta_text"]//text()'
        )
        .extract_first(default="")
        .strip()
    )

    # The 10-digit epoch timestamp is embedded in inline script as n="...".
    # Raw string avoids the invalid-escape warning for \d on modern Python.
    publish_timestamp = selector.re_first(r'n="(\d{10})"')
    publish_timestamp = int(publish_timestamp) if publish_timestamp else None
    publish_time = (
        tools.timestamp_to_date(publish_timestamp) if publish_timestamp else None
    )

    biz = tools.get_param(req_url, "__biz")

    # Strip HTML tags to keep only readable article text.
    text = remove_tags(content).strip()

    spider_name = "wechat"
    collection_mode = "spider"
    data_source_type = "微信公众号"

    article_data = {
        "data_type": account,
        "title": title,
        "data_address": req_url,
        "author": author,
        "publish_time": publish_time,
        "__biz": biz,
        "text": text,
        "spider_name": spider_name,
        "collection_mode": collection_mode,
        "data_source_type": data_source_type,
        "sn": sn,
        "collection_time": tools.get_current_date(),
    }

    # 入库: mark the task done (1) only when the pipeline accepted the record.
    if data_pipeline.save_article(article_data) is not None:
        self._task_manager.update_article_task_state(sn, 1)

    return self._task_manager.get_task()
def deal_article(self, req_url, text):
    """
    Parse a WeChat article page and persist the full structured record
    (HTML body, images, digest, cover, comment id).

    :param req_url: article URL; carries the ``sn`` and ``__biz`` query params
    :param text: raw HTML of the article page; falsy means the fetch failed
    :return: the next task from the task manager (``None`` on fetch failure)
    """
    sn = tools.get_param(req_url, 'sn')

    if not text:
        # Fetch failed: flag the task as errored (-1) and stop here.
        self._task_manager.update_article_task_state(sn, -1)
        return None

    selector = Selector(text)

    # Kept as a SelectorList (not extracted) so image URLs can be queried
    # from it below. Class attribute varies across templates, hence the union.
    content = selector.xpath(
        '//div[@class="rich_media_content "]'
        '|//div[@class="rich_media_content"]'
        '|//div[@class="share_media"]'
    )
    title = (
        selector.xpath('//h2[@class="rich_media_title"]/text()')
        .extract_first(default='')
        .strip()
    )
    account = (
        selector.xpath('//a[@id="js_name"]/text()')
        .extract_first(default='')
        .strip()
    )
    author = (
        selector.xpath(
            '//span[@class="rich_media_meta rich_media_meta_text"]//text()'
        )
        .extract_first(default='')
        .strip()
    )

    # 10-digit epoch timestamp embedded in inline script as n="...".
    # Raw strings avoid invalid-escape warnings for \d on modern Python.
    publish_timestamp = selector.re_first(r'n="(\d{10})"')
    publish_timestamp = int(publish_timestamp) if publish_timestamp else None
    publish_time = (
        tools.timestamp_to_date(publish_timestamp) if publish_timestamp else None
    )

    # Lazy-loaded images keep the real URL in data-src; collect both.
    pics_url = content.xpath('.//img/@src|.//img/@data-src').extract()

    biz = tools.get_param(req_url, '__biz')
    digest = selector.re_first('var msg_desc = "(.*?)"')
    # Cover variable name differs between page versions; try both.
    cover = (
        selector.re_first('var cover = "(.*?)";')
        or selector.re_first('msg_cdn_url = "(.*?)"')
    )
    source_url = selector.re_first("var msg_source_url = '(.*?)';")
    content_html = content.extract_first(default='')
    comment_id = selector.re_first(r'var comment_id = "(\d+)"')

    article_data = {
        'account': account,
        'title': title,
        'url': req_url,
        'author': author,
        'publish_time': publish_time,
        '__biz': biz,
        'digest': digest,
        'cover': cover,
        "pics_url": pics_url,
        "content_html": content_html,
        "source_url": source_url,
        "comment_id": comment_id,
        "sn": sn,
        "spider_time": tools.get_current_date(),
    }

    # 入库: mark the task done (1) only when the pipeline accepted the record.
    if data_pipeline.save_article(article_data) is not None:
        self._task_manager.update_article_task_state(sn, 1)

    return self._task_manager.get_task()
def deal_article(self, req_url, text):
    """
    Parse a WeChat article page and persist the full structured record
    (HTML body, images, digest, cover, comment id).

    :param req_url: article URL; carries the ``sn`` and ``__biz`` query params
    :param text: raw HTML of the article page; falsy means the fetch failed
    :return: the next task from the task manager (``None`` on fetch failure)
    """
    sn = tools.get_param(req_url, "sn")

    if not text:
        # Fetch failed: flag the task as errored (-1) and stop here.
        self._task_manager.update_article_task_state(sn, -1)
        return None

    selector = Selector(text)

    # Kept as a SelectorList (not extracted) so image URLs can be queried
    # from it below. Class attribute varies across templates, hence the union.
    content = selector.xpath(
        '//div[@class="rich_media_content "]'
        '|//div[@class="rich_media_content"]'
        '|//div[@class="share_media"]'
    )
    title = (
        selector.xpath('//h2[@class="rich_media_title"]/text()')
        .extract_first(default="")
        .strip()
    )
    account = (
        selector.xpath('//a[@id="js_name"]/text()')
        .extract_first(default="")
        .strip()
    )
    author = (
        selector.xpath(
            '//span[@class="rich_media_meta rich_media_meta_text"]//text()'
        )
        .extract_first(default="")
        .strip()
    )

    # 10-digit epoch timestamp embedded in inline script as n="...".
    # Raw strings avoid invalid-escape warnings for \d on modern Python.
    publish_timestamp = selector.re_first(r'n="(\d{10})"')
    publish_timestamp = int(publish_timestamp) if publish_timestamp else None
    publish_time = (
        tools.timestamp_to_date(publish_timestamp) if publish_timestamp else None
    )

    # Lazy-loaded images keep the real URL in data-src; collect both.
    pics_url = content.xpath(".//img/@src|.//img/@data-src").extract()

    biz = tools.get_param(req_url, "__biz")
    digest = selector.re_first('var msg_desc = "(.*?)"')
    # Cover variable name differs between page versions; try both.
    cover = (
        selector.re_first('var cover = "(.*?)";')
        or selector.re_first('msg_cdn_url = "(.*?)"')
    )
    source_url = selector.re_first("var msg_source_url = '(.*?)';")
    content_html = content.extract_first(default="")
    comment_id = selector.re_first(r'var comment_id = "(\d+)"')

    article_data = {
        "account": account,
        "title": title,
        "url": req_url,
        "author": author,
        "publish_time": publish_time,
        "__biz": biz,
        "digest": digest,
        "cover": cover,
        "pics_url": pics_url,
        "content_html": content_html,
        "source_url": source_url,
        "comment_id": comment_id,
        "sn": sn,
        "spider_time": tools.get_current_date(),
    }

    # 入库: mark the task done (1) only when the pipeline accepted the record.
    if data_pipeline.save_article(article_data) is not None:
        self._task_manager.update_article_task_state(sn, 1)

    return self._task_manager.get_task()