def parse_homepage(self, response):
    article_urls_raw = response.xpath(
        "//a[@class = 'js-hlp-LinkSwap js-tsr-Base_ContentLink tsr-Base_ContentLink']//@href"
    ).extract()
    # Keep only absolute https links; everything else is not an article page.
    article_urls = [url for url in article_urls_raw if url.startswith("https")]
    # article_titles = response.xpath("//a[@class = 'js-hlp-LinkSwap js-tsr-Base_ContentLink tsr-Base_ContentLink']//@title").extract()

    for ii, article_url in enumerate(article_urls):
        # In principle only one kind of item needs to be generated, because
        # the pipeline treats them all identically.
        article = newsItem()
        article['index'] = ii
        article['url'] = article_url
        # Derive the rubric from the URL path (cf. parse_rubrics).
        if 'politik' in article_url.split('/'):
            article['rubrics'] = 'politics'
        elif 'wirtschaft' in article_url.split('/'):
            article['rubrics'] = 'economics'
        else:
            article['rubrics'] = 'homepage'
        article_request = scrapy.Request(article_url, callback=self.parse_article)
        # The callback only receives the response, not the item, so the item
        # is stored in the request's meta and transferred to the next parser.
        article_request.meta['item'] = article
        yield article_request
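# A minimal sketch of the downstream callback, assuming item fields 'title'
# and 'text' and generic XPaths -- the real parse_article will differ per
# site. It shows how the item stored in request.meta above is recovered
# from response.meta and completed before being handed to the pipeline.
def parse_article(self, response):
    article = response.meta['item']  # item handed over by parse_homepage
    # Hypothetical selectors; adjust to the target site's markup.
    article['title'] = response.xpath("//h1//text()").get()
    article['text'] = " ".join(response.xpath("//p//text()").extract())
    yield article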
def parse_homepage(self, response):
    article_urls_raw = response.xpath(
        "//article[contains(@class, 'articulo')]//@href"
    ).extract()
    # Keep only links that stay on elpais.com; anything else leads off-site.
    article_urls = [
        url for url in article_urls_raw
        if url.startswith("https://elpais.com")
    ]

    for ii, article_url in enumerate(article_urls):
        # In principle only one kind of item needs to be generated, because
        # the pipeline treats them all identically.
        article = newsItem()
        article['index'] = ii
        article['url'] = article_url
        # Derive the rubric from the URL path (cf. parse_rubrics).
        if 'politica' in article_url.split('/'):
            article['rubrics'] = 'politics'
        elif 'economia' in article_url.split('/'):
            article['rubrics'] = 'economics'
        else:
            article['rubrics'] = 'homepage'
        article_request = scrapy.Request(article_url, callback=self.parse_article)
        # The callback only receives the response, not the item, so the item
        # is stored in the request's meta and transferred to the next parser.
        article_request.meta['item'] = article
        yield article_request
def parse_homepage(self, response):
    article_urls_raw = response.xpath(
        "//a[contains(@class, 'css-m47150 esdb6og4')]//@href"
    ).extract()
    # Links that already start with http usually point to external sites;
    # internal links are relative, so prepend the domain.
    article_urls = [
        'https://www.lefigaro.fr' + url
        for url in article_urls_raw
        if not url.startswith("http")
    ]

    for ii, article_url in enumerate(article_urls):
        # In principle only one kind of item needs to be generated, because
        # the pipeline treats them all identically.
        article = newsItem()
        article['index'] = ii
        article['url'] = article_url
        # Derive the rubric from the URL path (cf. parse_rubrics).
        if 'politique' in article_url.split('/'):
            article['rubrics'] = 'politics'
        elif 'economie' in article_url.split('/'):
            article['rubrics'] = 'economics'
        else:
            article['rubrics'] = 'homepage'
        article_request = scrapy.Request(article_url, callback=self.parse_article)
        # The callback only receives the response, not the item, so the item
        # is stored in the request's meta and transferred to the next parser.
        article_request.meta['item'] = article
        yield article_request
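# The hard-coded domain prefix above can also be built with Scrapy's
# response.urljoin, which resolves relative hrefs against the page URL and
# avoids keeping the domain string in sync by hand. A sketch:
#
#   article_urls = [
#       response.urljoin(url)
#       for url in article_urls_raw
#       if not url.startswith("http")
#   ]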
def parse_homepage(self, response):
    # Most Bild-internal article URLs end with .html.
    article_urls_raw = response.xpath(
        "//a[contains(@href, '.html')]//@href"
    ).extract()
    # Bild.de uses relative paths for its internal pages, so prepend the domain.
    article_urls = [
        'https://www.bild.de' + url
        for url in article_urls_raw
        if not url.startswith("http")
    ]

    for ii, article_url in enumerate(article_urls):
        # In principle only one kind of item needs to be generated, because
        # the pipeline treats them all identically.
        article = newsItem()
        article['index'] = ii
        article['url'] = article_url
        article_request = scrapy.Request(article_url, callback=self.parse_article)
        # The callback only receives the response, not the item, so the item
        # is stored in the request's meta and transferred to the next parser.
        article_request.meta['item'] = article
        yield article_request
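# Alternative hand-over, a sketch assuming Scrapy >= 1.7: cb_kwargs passes
# the item directly as a callback keyword argument, which keeps
# response.meta free for middleware data (proxies, retry counters, etc.).
# The method names below are hypothetical, chosen to avoid clashing with
# the existing parse_article.
def parse_homepage_via_kwargs(self, response, article_urls):
    for ii, article_url in enumerate(article_urls):
        article = newsItem()
        article['index'] = ii
        article['url'] = article_url
        yield scrapy.Request(article_url,
                             callback=self.parse_article_via_kwargs,
                             cb_kwargs={'item': article})

def parse_article_via_kwargs(self, response, item):
    # The item arrives as a keyword argument instead of via response.meta.
    item['title'] = response.xpath("//h1//text()").get()
    yield item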