def scrape_news_and_text(self, response):
    """Build the news record and its text record from an article page.

    The title and body are read from the page markup. The thumbnail and
    both timestamps come from the page's JSON-LD when
    ``extract_json_ld_or_none`` returns something truthy; otherwise the
    publication time falls back to the ``<time datetime="...">``
    attribute inside the ``div.TimeStamp_t165nkxq`` block, if present.

    Args:
        response: The downloaded article page. It must provide ``css``,
            ``text`` and ``url`` (presumably a Scrapy ``TextResponse``
            -- confirm against the spider that registers this callback).

    Returns:
        A ``(news, news_text)`` tuple: the object returned by
        ``build_news`` and a ``NewsText`` constructed with that
        object's ``id``.
    """
    json_ld = extract_json_ld_or_none(response)

    title = strip_join(
        response.css('h1.title_tyodebu').xpath('.//text()').getall(),
        sep=' ')
    body = strip_join(
        response.css('section.container_cz8tiun')
        .xpath('.//p/text()').getall())

    news = build_news(response.url, self.publisher)
    news.title = title
    # The marker reads "This article is for members only".
    # ``response.text`` is used instead of decoding ``response.body`` by
    # hand: it is the already-decoded page text, so the body is not
    # decoded a second time and bytes that are not valid UTF-8 can no
    # longer abort the scrape with UnicodeDecodeError.
    news.is_paid = 'この記事は会員限定です' in response.text

    if json_ld:
        thumbnail = extract_thumbnail_or_none(json_ld)
        if thumbnail:
            news.thumbnail = thumbnail
        # NOTE(review): if json_ld is a plain dict, a document lacking
        # either key raises KeyError here -- confirm that is intended.
        news.published_at = self.to_datetime(json_ld['datePublished'])
        news.last_modified_at = self.to_datetime(json_ld['dateModified'])
    else:
        # No JSON-LD: only the publication time can be recovered, and it
        # goes through the separate ``to_datetime2`` parser.
        published_at_str = response.css(
            'div.TimeStamp_t165nkxq').xpath('.//time/@datetime').get()
        if published_at_str:
            news.published_at = self.to_datetime2(published_at_str)

    news_text = NewsText({'id': news.id})
    news_text.title = title
    news_text.body = body
    return news, news_text
def scrape_news_and_text(self, response):
    """Build the news record and its text record from an article page.

    The title is the text of ``h1.title-page`` and the body is the joined
    paragraph text under ``section#articledetail-body``. When
    ``extract_json_ld_or_none`` returns something truthy, the thumbnail,
    both timestamps and ``news_text.date`` are filled in from it;
    otherwise those fields are left as ``build_news`` / ``NewsText``
    created them.

    Args:
        response: The downloaded article page. It must provide ``css``,
            ``xpath``, ``text`` and ``url`` (presumably a Scrapy
            ``TextResponse`` -- confirm against the spider that
            registers this callback).

    Returns:
        A ``(news, news_text)`` tuple: the object returned by
        ``build_news`` and a ``NewsText`` constructed with that
        object's ``id``.

    Raises:
        AttributeError: If no ``h1.title-page`` text node matches, since
            ``.get()`` then yields ``None`` and ``.strip()`` fails.
    """
    json_ld = extract_json_ld_or_none(response)

    # Deliberately not defaulted: a page without a title fails loudly
    # instead of producing a record with an empty title.
    title = response.xpath(
        '//h1[@class="title-page"]/text()').get().strip()
    article = response.css('section#articledetail-body')
    body = strip_join(article.xpath('.//p/text()').getall())

    news = build_news(response.url, self.publisher)
    news.title = title
    # The marker reads "This article is a paid article".
    # ``response.text`` is used instead of decoding ``response.body`` by
    # hand: it is the already-decoded page text, so the body is not
    # decoded a second time and bytes that are not valid UTF-8 can no
    # longer abort the scrape with UnicodeDecodeError.
    news.is_paid = 'この記事は有料記事です' in response.text

    news_text = NewsText({'id': news.id})
    news_text.title = title
    news_text.body = body

    if not json_ld:
        # Without JSON-LD there is no thumbnail or timestamp source.
        return news, news_text

    thumbnail = extract_thumbnail_or_none(json_ld)
    if thumbnail:
        news.thumbnail = thumbnail
    # NOTE(review): if json_ld is a plain dict, a document lacking either
    # key raises KeyError here -- confirm that is intended.
    news.published_at = self.to_datetime(json_ld['datePublished'])
    news.last_modified_at = self.to_datetime(json_ld['dateModified'])
    # Read the value back from ``news`` so the date string reflects
    # whatever the record actually stores.
    news_text.date = to_date_str(news.published_at)
    return news, news_text
def scrape_news_and_text(self, response):
    """Turn an article page into a ``(news, news_text)`` pair.

    The headline is the first ``<h1>`` text node on the page and the body
    is the joined text of the ``<p>`` children of
    ``div.ArticleBodyWrapper``. Articles scraped here are always marked
    as not paid. Thumbnail, timestamps and ``news_text.date`` are only
    populated when ``extract_json_ld_or_none`` returns something truthy.

    Args:
        response: The downloaded article page; must provide ``xpath``
            and ``url``.

    Returns:
        A ``(news, news_text)`` tuple: the object returned by
        ``build_news`` and a ``NewsText`` constructed with that
        object's ``id``.

    Raises:
        AttributeError: If the page has no ``<h1>`` text node, since
            ``.get()`` then yields ``None`` and ``.strip()`` fails.
    """
    structured_data = extract_json_ld_or_none(response)

    headline = response.xpath('//h1/text()').get().strip()
    paragraphs = response.xpath(
        '//div[@class="ArticleBodyWrapper"]/p/text()').getall()
    article_body = strip_join(paragraphs)

    news = build_news(response.url, self.publisher)
    news.title = headline
    news.is_paid = False

    news_text = NewsText({'id': news.id})
    news_text.title = headline
    news_text.body = article_body

    if not structured_data:
        # Nothing to enrich the records with.
        return news, news_text

    thumbnail = extract_thumbnail_or_none(structured_data)
    if thumbnail:
        news.thumbnail = thumbnail
    news.published_at = self.to_datetime(structured_data['datePublished'])
    news.last_modified_at = self.to_datetime(
        structured_data['dateModified'])
    # Derived from the value stored on ``news``, not from the raw string.
    news_text.date = to_date_str(news.published_at)
    return news, news_text