def scrape_news_and_text(self, response):
        """Build the news record and its text record from an article page.

        The title is taken from the text nodes under ``h1.title_tyodebu`` and
        the body from the ``<p>`` text nodes under
        ``section.container_cz8tiun``. When ``extract_json_ld_or_none``
        returns a truthy value, the thumbnail and both timestamps are read
        from it; otherwise only ``published_at`` is filled, from the
        ``<time datetime>`` attribute inside ``div.TimeStamp_t165nkxq``, and
        only if that attribute exists.

        Args:
            response: The response for the article page. It must support
                ``css``/``xpath`` selection and expose ``url`` and ``text``.

        Returns:
            A ``(news, news_text)`` tuple.
        """
        json_ld = extract_json_ld_or_none(response)
        title = strip_join(
            response.css('h1.title_tyodebu').xpath('.//text()').getall(),
            sep=' ')
        body = strip_join(
            response.css('section.container_cz8tiun').xpath(
                './/p/text()').getall())

        news = build_news(response.url, self.publisher)
        news.title = title
        # The marker string is the page's "members only" notice. Use
        # response.text so the body is decoded with the encoding detected for
        # the page rather than assuming UTF-8 (which can raise
        # UnicodeDecodeError on pages served in another charset).
        news.is_paid = 'この記事は会員限定です' in response.text

        if json_ld:
            thumbnail = extract_thumbnail_or_none(json_ld)
            if thumbnail:
                news.thumbnail = thumbnail
            # NOTE(review): both keys are read unconditionally; presumably the
            # publisher's JSON-LD always carries them -- confirm.
            news.published_at = self.to_datetime(json_ld['datePublished'])
            news.last_modified_at = self.to_datetime(json_ld['dateModified'])
        else:
            # No JSON-LD: fall back to the visible timestamp, which goes
            # through a different parser (to_datetime2) than the JSON-LD dates.
            published_at_str = response.css(
                'div.TimeStamp_t165nkxq').xpath('.//time/@datetime').get()
            if published_at_str:
                news.published_at = self.to_datetime2(published_at_str)

        news_text = NewsText({'id': news.id})
        news_text.title = title
        news_text.body = body

        return news, news_text
    def scrape_news_and_text(self, response):
        """Build the news record and its text record from an article page.

        The title is the first text node of ``<h1 class="title-page">`` and
        the body is built from the ``<p>`` text nodes under
        ``section#articledetail-body``. When ``extract_json_ld_or_none``
        returns a truthy value, the thumbnail, both timestamps and
        ``news_text.date`` are filled from it; otherwise they are left as
        ``build_news`` / ``NewsText`` created them.

        Args:
            response: The response for the article page. It must support
                ``css``/``xpath`` selection and expose ``url`` and ``text``.

        Returns:
            A ``(news, news_text)`` tuple.
        """
        json_ld = extract_json_ld_or_none(response)
        # default='' keeps a page without the expected <h1> from crashing on
        # None.strip(); the title is simply empty in that case.
        title = response.xpath(
            '//h1[@class="title-page"]/text()').get(default='').strip()
        article = response.css('section#articledetail-body')
        body = strip_join(article.xpath('.//p/text()').getall())

        news = build_news(response.url, self.publisher)
        news.title = title
        # The marker string is the page's "paid article" notice. Use
        # response.text so the body is decoded with the encoding detected for
        # the page rather than assuming UTF-8.
        news.is_paid = 'この記事は有料記事です' in response.text

        news_text = NewsText({'id': news.id})
        news_text.title = title
        news_text.body = body

        if json_ld:
            thumbnail = extract_thumbnail_or_none(json_ld)
            if thumbnail:
                news.thumbnail = thumbnail
            # NOTE(review): both keys are read unconditionally; presumably the
            # publisher's JSON-LD always carries them -- confirm.
            news.published_at = self.to_datetime(json_ld['datePublished'])
            news.last_modified_at = self.to_datetime(json_ld['dateModified'])
            news_text.date = to_date_str(news.published_at)

        return news, news_text
示例#3
0
    def scrape_news_and_text(self, response):
        """Build the news record and its text record from an article page.

        The title is the first text node of the first ``<h1>`` matched in the
        document and the body is built from the text nodes of ``<p>``
        elements that are direct children of
        ``<div class="ArticleBodyWrapper">``. Articles are always marked as
        not paid. When ``extract_json_ld_or_none`` returns a truthy value,
        the thumbnail, both timestamps and ``news_text.date`` are filled from
        it; otherwise they are left as ``build_news`` / ``NewsText`` created
        them.

        Args:
            response: The response for the article page. It must support
                ``xpath`` selection and expose ``url``.

        Returns:
            A ``(news, news_text)`` tuple.
        """
        json_ld = extract_json_ld_or_none(response)
        # default='' keeps a page without an <h1> from crashing on
        # None.strip(); the title is simply empty in that case.
        title = response.xpath('//h1/text()').get(default='').strip()
        paragraphs = response.xpath(
            '//div[@class="ArticleBodyWrapper"]/p/text()').getall()
        body = strip_join(paragraphs)

        news = build_news(response.url, self.publisher)
        news.title = title
        news.is_paid = False

        news_text = NewsText({'id': news.id})
        news_text.title = title
        news_text.body = body

        if json_ld:
            thumbnail = extract_thumbnail_or_none(json_ld)
            if thumbnail:
                news.thumbnail = thumbnail
            # NOTE(review): both keys are read unconditionally; presumably the
            # publisher's JSON-LD always carries them -- confirm.
            news.published_at = self.to_datetime(json_ld['datePublished'])
            news.last_modified_at = self.to_datetime(json_ld['dateModified'])
            news_text.date = to_date_str(news.published_at)

        return news, news_text