Python extract_thumbnail_or_none示例

编程语言: Python

命名空间/包名称: crawler.utils

方法/功能: extract_thumbnail_or_none

hotexamples.com的示例: 3

Python extract_thumbnail_or_none - 共找到 3 个示例。以下是从开源项目中提取的、评分最高的 crawler.utils.extract_thumbnail_or_none 真实 Python 用法示例。您可以为示例评分，以帮助我们提高示例质量。

示例#1

显示文件

文件： nikkei_spider.py 项目： sarcastic555/politylink-crawler

    def scrape_news_and_text(self, response):
        """Build the news record and its text record from an article page.

        The title is joined (space-separated) from every text node under
        ``h1.title_tyodebu``; the body is joined from the ``<p>`` text
        nodes under ``section.container_cz8tiun``.  The article is flagged
        as paid when the UTF-8 decoded page contains the members-only
        notice string.

        When JSON-LD could be extracted from the response, the thumbnail
        (if any) and both timestamps are taken from it.  Otherwise the
        published time falls back to the ``datetime`` attribute of a
        ``<time>`` element inside ``div.TimeStamp_t165nkxq``, if present.

        Args:
            response: the response object for the article page.

        Returns:
            A ``(news, news_text)`` tuple.
        """
        json_ld = extract_json_ld_or_none(response)

        headline_nodes = response.css('h1.title_tyodebu')
        headline = strip_join(
            headline_nodes.xpath('.//text()').getall(), sep=' ')
        container = response.css('section.container_cz8tiun')
        article_body = strip_join(container.xpath('.//p/text()').getall())

        news = build_news(response.url, self.publisher)
        news.title = headline
        page_source = response.body.decode('UTF-8')
        news.is_paid = 'この記事は会員限定です' in page_source

        if not json_ld:
            # No structured data: fall back to the on-page timestamp.
            stamp_nodes = response.css('div.TimeStamp_t165nkxq')
            published_raw = stamp_nodes.xpath('.//time/@datetime').get()
            if published_raw:
                news.published_at = self.to_datetime2(published_raw)
        else:
            thumbnail = extract_thumbnail_or_none(json_ld)
            if thumbnail:
                news.thumbnail = thumbnail
            news.published_at = self.to_datetime(json_ld['datePublished'])
            news.last_modified_at = self.to_datetime(json_ld['dateModified'])

        news_text = NewsText({'id': news.id})
        news_text.title = headline
        news_text.body = article_body

        return news, news_text

示例#2

显示文件

文件： mainichi_spider.py 项目： politylink/politylink-crawler

    def scrape_news_and_text(self, response):
        """Build the news record and its text record from an article page.

        The title is the stripped text of the ``h1`` whose class is
        ``title-page``; the body is joined from the ``<p>`` text nodes
        under ``section#articledetail-body``.  The article is flagged as
        paid when the UTF-8 decoded page contains the paid-article notice
        string.

        When JSON-LD could be extracted from the response, the thumbnail
        (if any), both timestamps and the text record's date are filled
        in from it; otherwise those fields are left untouched.

        Args:
            response: the response object for the article page.

        Returns:
            A ``(news, news_text)`` tuple.
        """
        json_ld = extract_json_ld_or_none(response)

        # NOTE(review): if the selector matches nothing, .get() presumably
        # yields None and .strip() would raise AttributeError -- confirm.
        raw_title = response.xpath('//h1[@class="title-page"]/text()').get()
        headline = raw_title.strip()
        detail_section = response.css('section#articledetail-body')
        paragraphs = detail_section.xpath('.//p/text()').getall()
        article_body = strip_join(paragraphs)

        news = build_news(response.url, self.publisher)
        news.title = headline
        page_source = response.body.decode('UTF-8')
        news.is_paid = 'この記事は有料記事です' in page_source

        news_text = NewsText({'id': news.id})
        news_text.title = headline
        news_text.body = article_body

        # Without structured data there is nothing more to fill in.
        if not json_ld:
            return news, news_text

        thumbnail = extract_thumbnail_or_none(json_ld)
        if thumbnail:
            news.thumbnail = thumbnail
        news.published_at = self.to_datetime(json_ld['datePublished'])
        news.last_modified_at = self.to_datetime(json_ld['dateModified'])
        news_text.date = to_date_str(news.published_at)

        return news, news_text

示例#3

显示文件

    def scrape_news_and_text(self, response):
        """Build the news record and its text record from an article page.

        The title is the stripped text of the first ``h1`` text node found
        in the page; the body is joined from the text of ``<p>`` elements
        that are direct children of ``div`` elements whose class is
        ``ArticleBodyWrapper``.  Articles are always marked as not paid.

        When JSON-LD could be extracted from the response, the thumbnail
        (if any), both timestamps and the text record's date are filled
        in from it; otherwise those fields are left untouched.

        Args:
            response: the response object for the article page.

        Returns:
            A ``(news, news_text)`` tuple.
        """
        json_ld = extract_json_ld_or_none(response)

        # NOTE(review): if no <h1> text is found, .get() presumably yields
        # None and .strip() would raise AttributeError -- confirm.
        h1_text = response.xpath('//h1/text()').get()
        headline = h1_text.strip()
        paragraph_nodes = response.xpath(
            '//div[@class="ArticleBodyWrapper"]/p/text()')
        article_body = strip_join(paragraph_nodes.getall())

        news = build_news(response.url, self.publisher)
        news.title = headline
        news.is_paid = False

        news_text = NewsText({'id': news.id})
        news_text.title = headline
        news_text.body = article_body

        if json_ld:
            thumbnail = extract_thumbnail_or_none(json_ld)
            if thumbnail:
                news.thumbnail = thumbnail
            published = self.to_datetime(json_ld['datePublished'])
            news.published_at = published
            modified = self.to_datetime(json_ld['dateModified'])
            news.last_modified_at = modified
            news_text.date = to_date_str(news.published_at)

        return news, news_text