Пример #1
0
 def parse_article_detail_html(self, response):
     """
     Parse an article detail page (HTML variant) into a result item.

     Title, publish time and content markup are read from the page with
     XPath; every other field is copied from ``response.meta``.

     :param response: response of the article detail page. Its ``meta``
         must provide ``task_id``, ``platform_id``, ``channel_id``,
         ``article_id``, ``follow_id``, ``follow_name`` and
         ``article_abstract``.
     :return: generator yielding a single ``FetchResultItem``
     """
     def first_match(query):
         # First XPath hit as a string; '' is passed as the default for
         # the no-match case.
         return response.xpath(query).extract_first(default='')

     title = first_match('//div[@class="title"]/text()')
     pub_time = first_match('//span[@class="time"]/text()')
     content = first_match('//div[@class="WB_editor_iframe"]')

     meta = response.meta
     item = FetchResultItem()

     # Task / platform / channel identification comes from the request meta.
     item['task_id'] = meta['task_id']
     platform_id = meta['platform_id']
     item['platform_id'] = platform_id
     item['platform_name'] = platform_name_map.get(platform_id, '')
     channel_id = meta['channel_id']
     item['channel_id'] = channel_id
     item['channel_name'] = channel_name_map.get(channel_id, '')

     # Article fields: partly from meta, partly scraped above.
     item['article_id'] = meta['article_id']
     item['article_title'] = title
     item['article_author_id'] = meta['follow_id']
     item['article_author_name'] = meta['follow_name']
     item['article_pub_time'] = pub_time
     item['article_url'] = response.url
     item['article_tags'] = ''
     item['article_abstract'] = meta['article_abstract']
     item['article_content'] = content
     yield item
Пример #2
0
    def parse_article_detail_js(self, response):
        """
        Parse an article detail page (JS variant) into a result item.

        The article markup is embedded as an escaped string in
        ``<script>FM.view({"ns":...,"html":"..."})</script>`` blocks. All
        captured fragments are concatenated, unescaped, parsed as HTML and
        queried for title, publish time and content.

        :param response: response of the article detail page. Its ``meta``
            must provide ``task_id``, ``platform_id``, ``channel_id``,
            ``article_id``, ``follow_id``, ``follow_name`` and
            ``article_abstract``.
        :return: generator yielding one ``FetchResultItem``, or nothing when
            no ``FM.view`` script block is found
        """
        page_text = response.body_as_unicode()
        view_pattern = r'<script>FM.view\({"ns":.*?"html":"(.*?)"}\)</script>'
        fragments = re.findall(view_pattern, page_text, re.S)
        if not fragments:
            return

        # Undo the string escaping: escaped whitespace is dropped, escaped
        # quotes and slashes are restored. Order matters and is kept as is.
        markup = ''.join(fragments)
        unescape_table = (
            ('\\r', ''),
            ('\\t', ''),
            ('\\n', ''),
            ('\\"', '"'),
            ('\\/', '/'),
        )
        for escaped, plain in unescape_table:
            markup = markup.replace(escaped, plain)

        doc = fromstring(markup)

        title_nodes = doc.xpath('//h1[@class="title"]/text()')
        if title_nodes:
            article_title = title_nodes[0].strip()
        else:
            article_title = ''

        time_nodes = doc.xpath('//span[@class="time"]/text()')
        if time_nodes:
            article_pub_time = self.trans_time(time_nodes[0].strip())
        else:
            # No timestamp on the page: use the current local time instead.
            article_pub_time = time.strftime('%Y-%m-%d %H:%M:%S')

        content_nodes = doc.xpath('//div[@class="WBA_content"]')
        if content_nodes:
            # Keep the content element's serialized markup, not just its text.
            article_content = tostring(content_nodes[0],
                                       encoding='unicode').strip()
        else:
            article_content = ''

        meta = response.meta
        item = FetchResultItem()
        item['task_id'] = meta['task_id']
        platform_id = meta['platform_id']
        item['platform_id'] = platform_id
        item['platform_name'] = platform_name_map.get(platform_id, '')
        channel_id = meta['channel_id']
        item['channel_id'] = channel_id
        item['channel_name'] = channel_name_map.get(channel_id, '')
        item['article_id'] = meta['article_id']
        item['article_title'] = article_title
        item['article_author_id'] = meta['follow_id']
        item['article_author_name'] = meta['follow_name']
        utc_pub_time = time_local_to_utc(article_pub_time)
        item['article_pub_time'] = utc_pub_time.strftime('%Y-%m-%d %H:%M:%S')
        item['article_url'] = response.url
        item['article_tags'] = ''
        item['article_abstract'] = meta['article_abstract']
        item['article_content'] = article_content
        yield item
Пример #3
0
    def parse_article_detail_js(self, response):
        """
        Parse an article detail page (JS variant) into a result item.

        The article markup is embedded as an escaped string in
        ``<script>FM.view({"ns":...,"html":"..."})</script>`` blocks. All
        captured fragments are concatenated, unescaped, parsed as HTML and
        queried for title, publish time and content.

        Title, publish time and content each fall back to an empty string
        when the corresponding node is missing. Only the first direct text
        node of ``div.WBA_content`` is kept as content.

        :param response: response of the article detail page. Its ``meta``
            must provide ``task_id``, ``platform_id``, ``channel_id``,
            ``article_id``, ``follow_id``, ``follow_name`` and
            ``article_abstract``.
        :return: generator yielding one ``FetchResultItem``, or nothing when
            no ``FM.view`` script block is found
        """
        article_detail_body = response.body_as_unicode()
        # Plain raw literal: the pattern is ASCII-only, and the ``ur`` prefix
        # is a syntax error on Python 3.
        article_detail_rule = r'<script>FM.view\({"ns":.*?"html":"(.*?)"}\)</script>'
        article_detail_re_parse = re.compile(article_detail_rule,
                                             re.S).findall(article_detail_body)
        if not article_detail_re_parse:
            return
        article_detail_html = u''.join(article_detail_re_parse)

        # Remove escaped whitespace first. Stripping the backslash alone
        # would leave the letters r / t / n behind in the markup.
        for escaped_whitespace in (u'\\r', u'\\t', u'\\n'):
            article_detail_html = article_detail_html.replace(
                escaped_whitespace, u'')
        # Every remaining escape (\" and \/ among them) is undone by dropping
        # its backslash.
        article_detail_doc = fromstring(article_detail_html.replace(
            u'\\', u''))

        article_title_parse = article_detail_doc.xpath(
            '//h1[@class="title"]/text()')
        article_title = article_title_parse[0].strip(
        ) if article_title_parse else u''

        article_pub_time_parse = article_detail_doc.xpath(
            '//span[@class="time"]/text()')
        # Guarded like title and content, so a page without the time span
        # yields an empty publish time instead of raising IndexError.
        article_pub_time = self.trans_time(article_pub_time_parse[0].strip(
        )) if article_pub_time_parse else u''

        article_content_parse = article_detail_doc.xpath(
            '//div[@class="WBA_content"]/text()')
        article_content = article_content_parse[0].strip(
        ) if article_content_parse else u''

        fetch_result_item = FetchResultItem()
        fetch_result_item['task_id'] = response.meta['task_id']
        fetch_result_item['platform_id'] = response.meta['platform_id']
        fetch_result_item['platform_name'] = platform_name_map.get(
            response.meta['platform_id'], u'')
        fetch_result_item['channel_id'] = response.meta['channel_id']
        fetch_result_item['channel_name'] = channel_name_map.get(
            response.meta['channel_id'], u'')
        fetch_result_item['article_id'] = response.meta['article_id']
        fetch_result_item['article_title'] = article_title
        fetch_result_item['article_author_id'] = response.meta['follow_id']
        fetch_result_item['article_author_name'] = response.meta['follow_name']
        fetch_result_item['article_pub_time'] = article_pub_time
        fetch_result_item['article_url'] = response.url
        fetch_result_item['article_tags'] = u''
        fetch_result_item['article_abstract'] = response.meta[
            'article_abstract']
        fetch_result_item['article_content'] = article_content
        yield fetch_result_item
Пример #4
0
    def parse_detail(self, response):
        """
        Parse an article detail page into a result item.

        The article content is the concatenation of the stripped markup of
        every child element of ``div#js_content``. When that content is empty
        the link in ``a#js_share_source`` is requested with this same
        callback and the same ``meta``; if that link is missing too, nothing
        is yielded.

        :param response: response of the detail page. Its ``meta`` must
            provide ``task_id``, ``platform_id``, ``channel_id``,
            ``article_id``, ``article_title``, ``follow_id``,
            ``follow_name``, ``article_pub_time``, ``article_url`` and
            ``article_abstract``.
        :return: generator yielding a follow-up ``scrapy.Request``, a
            ``FetchResultItem``, or nothing
        """
        # NOTE(review): looks like leftover debug output; kept so stdout
        # stays unchanged.
        print(response.meta)

        content_children = response.xpath(
            '//div[@id="js_content"]/*').extract()
        article_content = ''.join(
            child.strip() for child in content_children)

        # Empty content: follow the share-source link, if there is one.
        if not article_content:
            share_source_url = response.xpath(
                '//a[@id="js_share_source"]/@href').extract_first()
            # extract_first() returns None when the link is absent, and
            # scrapy.Request refuses a non-string url, so only follow a real
            # link.
            if share_source_url:
                yield scrapy.Request(url=share_source_url,
                                     callback=self.parse_detail,
                                     meta=response.meta)
            return

        fetch_result_item = FetchResultItem()
        fetch_result_item['task_id'] = response.meta['task_id']
        fetch_result_item['platform_id'] = response.meta['platform_id']
        fetch_result_item['platform_name'] = platform_name_map.get(
            response.meta['platform_id'], u'')
        fetch_result_item['channel_id'] = response.meta['channel_id']
        fetch_result_item['channel_name'] = channel_name_map.get(
            response.meta['channel_id'], u'')
        fetch_result_item['article_id'] = response.meta['article_id']
        fetch_result_item['article_title'] = response.meta['article_title']
        fetch_result_item['article_author_id'] = response.meta['follow_id']
        fetch_result_item['article_author_name'] = response.meta['follow_name']
        fetch_result_item['article_pub_time'] = response.meta[
            'article_pub_time']
        fetch_result_item['article_url'] = response.meta['article_url']
        fetch_result_item['article_tags'] = u''
        fetch_result_item['article_abstract'] = response.meta[
            'article_abstract']
        fetch_result_item['article_content'] = article_content

        yield fetch_result_item
Пример #5
0
    def parse_article_detail(self, response):
        """
        Parse an article detail page into a result item.

        The page text and ``meta['detail_url']`` are handed to
        ``parse_toutiao_js_body``; its result is wrapped in ``ParseJsTt``,
        which supplies id, title, abstract, content, publish time and tags.

        :param response: response of the article detail page. Its ``meta``
            must provide ``detail_url``, ``task_id``, ``platform_id``,
            ``channel_id``, ``follow_id`` and ``follow_name``.
        :return: generator yielding one ``FetchResultItem``, or nothing when
            ``parse_toutiao_js_body`` returns a falsy value
        """
        meta = response.meta
        js_body = parse_toutiao_js_body(response.body_as_unicode(),
                                        meta['detail_url'])
        if not js_body:
            return

        # Pull every article field out of the embedded JS up front.
        js_parser = ParseJsTt(js_body=js_body)
        item_id = js_parser.parse_js_item_id()
        title = js_parser.parse_js_title()
        abstract = js_parser.parse_js_abstract()
        content = js_parser.parse_js_content()
        pub_time = js_parser.parse_js_pub_time()
        tags = js_parser.parse_js_tags()

        item = FetchResultItem()
        item['task_id'] = meta['task_id']
        platform_id = meta['platform_id']
        item['platform_id'] = platform_id
        item['platform_name'] = platform_name_map.get(platform_id, '')
        channel_id = meta['channel_id']
        item['channel_id'] = channel_id
        item['channel_name'] = channel_name_map.get(channel_id, '')
        item['article_id'] = item_id
        item['article_title'] = title
        item['article_author_id'] = meta['follow_id']
        item['article_author_name'] = meta['follow_name']
        utc_pub_time = time_local_to_utc(pub_time)
        item['article_pub_time'] = utc_pub_time.strftime('%Y-%m-%d %H:%M:%S')
        # Prefer the response's own URL; use the requested one only if empty.
        item['article_url'] = response.url or meta['detail_url']
        item['article_tags'] = tags
        item['article_abstract'] = abstract
        item['article_content'] = content

        yield item
Пример #6
0
    def parse_article_detail(self, response):
        """
        Build a result item from a JSON article detail response.

        The response text is decoded as JSON and its ``data`` member supplies
        the content plus the optional impression and comment counts (0 when
        absent). Task, platform and channel ids are fixed to 0 and the
        platform name is looked up under key 3; title, publish time, URL and
        tags come from ``response.meta``.

        :param response: JSON response. Its ``meta`` must provide
            ``item_id``, ``article_title``, ``article_pub_time`` and
            ``article_url``, and may provide ``keywords``.
        :return: generator yielding a single ``FetchResultItem``
        """
        result = json.loads(response.body_as_unicode())['data']

        # FIXME: add comment count and read count.
        print('\n====result:', result)

        impression_count = 0
        if 'impression_count' in result:
            impression_count = result['impression_count']
        comment_count = 0
        if 'comment_count' in result:
            comment_count = result['comment_count']

        meta = response.meta
        item = FetchResultItem()
        item['task_id'] = 0
        item['platform_id'] = 0
        item['platform_name'] = platform_name_map.get(3, '')
        item['channel_id'] = 0
        item['channel_name'] = '0'
        item['article_id'] = meta['item_id']
        item['article_title'] = meta['article_title']
        utc_pub_time = time_local_to_utc(meta['article_pub_time'])
        item['article_pub_time'] = utc_pub_time.strftime('%Y-%m-%d %H:%M:%S')
        item['article_url'] = meta['article_url']
        item['article_tags'] = meta.get('keywords')
        item['article_abstract'] = ''
        item['article_content'] = result['content']
        item['impression_count'] = impression_count
        item['comment_count'] = comment_count

        print("===crawl url:", item['article_url'])

        yield item