示例#1
0
    def deal_article(self, req_url, text):
        """
        Parse an article page and hand the extracted fields to the pipeline.

        :param req_url: URL the article was requested from. The ``sn`` and
            ``__biz`` values are read from it with ``tools.get_param``.
        :param text: Page source of the article. When it is empty the task
            identified by ``sn`` is set to state ``-1`` and nothing is parsed.
        :return: ``None`` when ``text`` is empty, otherwise whatever
            ``self._task_manager.get_task()`` returns.
        """
        sn = tools.get_param(req_url, "sn")

        # Nothing to parse: record state -1 for this task and stop.
        # NOTE(review): -1 presumably means "failed" - confirm in task manager.
        if not text:
            self._task_manager.update_article_task_state(sn, -1)
            return None

        selector = Selector(text)

        # First node matching any of the known article-body containers,
        # as markup; empty string when none matches.
        content = selector.xpath(
            '//div[@class="rich_media_content "]'
            '|//div[@class="rich_media_content"]'
            '|//div[@class="share_media"]'
        ).extract_first(default="")
        title = selector.xpath(
            '//h2[@class="rich_media_title"]/text()'
        ).extract_first(default="").strip()
        account = selector.xpath(
            '//a[@id="js_name"]/text()'
        ).extract_first(default="").strip()
        author = selector.xpath(
            '//span[@class="rich_media_meta rich_media_meta_text"]//text()'
        ).extract_first(default="").strip()

        # Raw string so that ``\d`` reaches the regex engine instead of being
        # treated as an (invalid) string escape sequence.
        publish_timestamp = selector.re_first(r'n="(\d{10})"')
        publish_timestamp = int(publish_timestamp) if publish_timestamp else None
        publish_time = (
            tools.timestamp_to_date(publish_timestamp)
            if publish_timestamp
            else None
        )
        biz = tools.get_param(req_url, "__biz")

        # Body with markup removed; kept under a separate name so the ``text``
        # parameter is not shadowed.
        plain_text = remove_tags(content).strip()

        article_data = {
            "data_type": account,
            "title": title,
            "data_address": req_url,
            "author": author,
            "publish_time": publish_time,
            "__biz": biz,
            "text": plain_text,
            "spider_name": "wechat",
            "collection_mode": "spider",
            "data_source_type": "微信公众号",
            "sn": sn,
            "collection_time": tools.get_current_date(),
        }

        # Store the article. The task is set to state 1 only when
        # save_article returns something other than None.
        if data_pipeline.save_article(article_data) is not None:
            self._task_manager.update_article_task_state(sn, 1)

        return self._task_manager.get_task()
示例#2
0
    def deal_article(self, req_url, text):
        """
        Parse an article page and hand the extracted fields to the pipeline.

        :param req_url: URL the article was requested from. The ``sn`` and
            ``__biz`` values are read from it with ``tools.get_param``.
        :param text: Page source of the article. When it is empty the task
            identified by ``sn`` is set to state ``-1`` and nothing is parsed.
        :return: ``None`` when ``text`` is empty, otherwise whatever
            ``self._task_manager.get_task()`` returns.
        """
        sn = tools.get_param(req_url, 'sn')

        # Nothing to parse: record state -1 for this task and stop.
        # NOTE(review): -1 presumably means "failed" - confirm in task manager.
        if not text:
            self._task_manager.update_article_task_state(sn, -1)
            return None

        selector = Selector(text)

        # Kept as a selector list (not extracted yet) because the image URLs
        # below are looked up relative to it.
        content = selector.xpath(
            '//div[@class="rich_media_content "]'
            '|//div[@class="rich_media_content"]'
            '|//div[@class="share_media"]'
        )
        title = selector.xpath(
            '//h2[@class="rich_media_title"]/text()'
        ).extract_first(default='').strip()
        account = selector.xpath(
            '//a[@id="js_name"]/text()'
        ).extract_first(default='').strip()
        author = selector.xpath(
            '//span[@class="rich_media_meta rich_media_meta_text"]//text()'
        ).extract_first(default='').strip()

        # Raw string so that ``\d`` reaches the regex engine instead of being
        # treated as an (invalid) string escape sequence.
        publish_timestamp = selector.re_first(r'n="(\d{10})"')
        publish_timestamp = int(publish_timestamp) if publish_timestamp else None
        publish_time = (
            tools.timestamp_to_date(publish_timestamp)
            if publish_timestamp
            else None
        )

        # Both ``src`` and ``data-src`` attributes of images inside the body.
        pics_url = content.xpath('.//img/@src|.//img/@data-src').extract()
        biz = tools.get_param(req_url, '__biz')

        # Values pulled out of inline script variables in the page source.
        digest = selector.re_first('var msg_desc = "(.*?)"')
        # Fall back to ``msg_cdn_url`` when ``cover`` is missing or empty.
        cover = (
            selector.re_first('var cover = "(.*?)";')
            or selector.re_first('msg_cdn_url = "(.*?)"')
        )
        source_url = selector.re_first("var msg_source_url = '(.*?)';")

        content_html = content.extract_first(default='')
        comment_id = selector.re_first(r'var comment_id = "(\d+)"')

        article_data = {
            'account': account,
            'title': title,
            'url': req_url,
            'author': author,
            'publish_time': publish_time,
            '__biz': biz,
            'digest': digest,
            'cover': cover,
            'pics_url': pics_url,
            'content_html': content_html,
            'source_url': source_url,
            'comment_id': comment_id,
            'sn': sn,
            'spider_time': tools.get_current_date(),
        }

        # Store the article. The task is set to state 1 only when
        # save_article returns something other than None.
        if data_pipeline.save_article(article_data) is not None:
            self._task_manager.update_article_task_state(sn, 1)

        return self._task_manager.get_task()
示例#3
0
    def deal_article(self, req_url, text):
        """
        Parse an article page and hand the extracted fields to the pipeline.

        :param req_url: URL the article was requested from. The ``sn`` and
            ``__biz`` values are read from it with ``tools.get_param``.
        :param text: Page source of the article. When it is empty the task
            identified by ``sn`` is set to state ``-1`` and nothing is parsed.
        :return: ``None`` when ``text`` is empty, otherwise whatever
            ``self._task_manager.get_task()`` returns.
        """
        sn = tools.get_param(req_url, "sn")

        # Nothing to parse: record state -1 for this task and stop.
        # NOTE(review): -1 presumably means "failed" - confirm in task manager.
        if not text:
            self._task_manager.update_article_task_state(sn, -1)
            return None

        selector = Selector(text)

        # Kept as a selector list (not extracted yet) because the image URLs
        # below are looked up relative to it.
        content = selector.xpath(
            '//div[@class="rich_media_content "]'
            '|//div[@class="rich_media_content"]'
            '|//div[@class="share_media"]'
        )
        title = selector.xpath(
            '//h2[@class="rich_media_title"]/text()'
        ).extract_first(default="").strip()
        account = selector.xpath(
            '//a[@id="js_name"]/text()'
        ).extract_first(default="").strip()
        author = selector.xpath(
            '//span[@class="rich_media_meta rich_media_meta_text"]//text()'
        ).extract_first(default="").strip()

        # Raw string so that ``\d`` reaches the regex engine instead of being
        # treated as an (invalid) string escape sequence.
        publish_timestamp = selector.re_first(r'n="(\d{10})"')
        publish_timestamp = int(publish_timestamp) if publish_timestamp else None
        publish_time = (
            tools.timestamp_to_date(publish_timestamp)
            if publish_timestamp
            else None
        )

        # Both ``src`` and ``data-src`` attributes of images inside the body.
        pics_url = content.xpath(".//img/@src|.//img/@data-src").extract()
        biz = tools.get_param(req_url, "__biz")

        # Values pulled out of inline script variables in the page source.
        digest = selector.re_first('var msg_desc = "(.*?)"')
        # Fall back to ``msg_cdn_url`` when ``cover`` is missing or empty.
        cover = (
            selector.re_first('var cover = "(.*?)";')
            or selector.re_first('msg_cdn_url = "(.*?)"')
        )
        source_url = selector.re_first("var msg_source_url = '(.*?)';")

        content_html = content.extract_first(default="")
        comment_id = selector.re_first(r'var comment_id = "(\d+)"')

        article_data = {
            "account": account,
            "title": title,
            "url": req_url,
            "author": author,
            "publish_time": publish_time,
            "__biz": biz,
            "digest": digest,
            "cover": cover,
            "pics_url": pics_url,
            "content_html": content_html,
            "source_url": source_url,
            "comment_id": comment_id,
            "sn": sn,
            "spider_time": tools.get_current_date(),
        }

        # Store the article. The task is set to state 1 only when
        # save_article returns something other than None.
        if data_pipeline.save_article(article_data) is not None:
            self._task_manager.update_article_task_state(sn, 1)

        return self._task_manager.get_task()