Example #1
    def parse_page(self, response):
        """@url http://www.dailymail.co.uk/news/article-4242322/Milo-Yiannopoulos-BANNED-CPAC-conference.html?ITO=1490
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url
        """

        s = response.selector
        # Remove some content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        mutate_selector_del_xpath(s, '//script')
        mutate_selector_del_xpath(s, '//*[@style="display:none"]')
        mutate_selector_del_xpath(
            s, '//div[contains(@class, "related-carousel")]')

        l = NewsLoader(selector=s)

        # Prefer the canonical URL over the fluff-laden RSS-source URL
        l.add_xpath('url', 'head/link[@rel="canonical"]/@href')
        l.add_value(
            'bylines',
            response.xpath(
                "//span[@data-component='Byline']"
                "//span[@data-component='Text']//a/text()").getall())
        l.add_value(
            'headline',
            response.xpath("//h1[@data-component='Headline']/text()").getall())
        bodytext = ""
        for r in response.xpath('//div[@id="body"]//p//text()'):
            bodytext += r.extract()
        l.add_value('bodytext', bodytext)
        #l.add_xpath('keywords',
        #               'head/meta[@property="keywords"]/@content')
        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_scrapymeta(response)
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_readability(response)

        # TODO: JS dross in body; might need a standard solution to keep this
        #       out.
        # TODO: Related article dross in body. <div class=related-carousel>

        item = l.load_item()

        #        self.logger.debug('bodytext', item['bodytext'])

        return item
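The mutate_selector_del_xpath helper used above is not a Scrapy built-in. A minimal sketch of what it might do, relying on the fact that a Scrapy Selector exposes its underlying lxml element as .root, so nodes removed from that tree disappear from all later queries on the same selector (the project's real helper may differ):

def mutate_selector_del_xpath(selector, xpath):
    # Remove every node matching `xpath` from the selector's lxml tree.
    for node in selector.root.xpath(xpath):
        parent = node.getparent()
        if parent is not None:  # the root element itself cannot be detached
            parent.remove(node)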
Example #2
    def parse_page(self, response):
        """@url https://www.bild.de/politik/ausland/politik-ausland/wef-in-davos-die-top-gaeste-und-die-wichtigsten-themen-67441554.bild.html
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url language
        """
        s = response.selector

        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        # Drop noscript JS warnings (which are otherwise included as the
        # bodytext for video pages).
        mutate_selector_del(s, 'xpath',
                            '//noscript[contains(@class, "warning")]')
        # Remove the BildPLUS subscribe notice. (Note that BildPLUS articles
        # are paywalled, so the text fetched will be only the opening
        # paragraphs.)
        mutate_selector_del(
            s, 'xpath',
            '//strong[text()="Ihre neuesten Erkenntnisse lesen Sie mit BILDplus."]'
        )
        # Remove "related topics" etc.
        mutate_selector_del(s, 'xpath',
                            '//aside[contains(@class, "related-topics")]')
        mutate_selector_del(
            s, 'xpath',
            '//div[contains(@class, "tsr-info") and contains(text(), "Lesen Sie auch")]'
        )

        l = NewsLoader(selector=s)

        # Breadcrumbs section
        l.add_xpath('section',
                    '//div[@id="breadcrumb"]//a[@rel="home"]//text()')

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)
        l.add_readability(response)
        #l.add_schemaorg_bylines()
        #l.add_dublincore()

        # Readability is pretty good on Bild, but sometimes misses the body.
        # Try fallback (this won't be as clean as the readability version as
        # we haven't removed all the "more reading" sections etc.)
        l.add_xpath('bodytext', '//div[contains(@class, "txt")]//text()')
        l.add_xpath('bodytext',
                    '//div[contains(@class, "article-body")]//text()')

        return l.load_item()
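mutate_selector_del generalises the same trick, taking the selection method ('xpath' or 'css') as an argument. A plausible sketch under the same .root assumption; again not the project's actual implementation:

def mutate_selector_del(selector, method, expression):
    # `method` is 'xpath' or 'css'; both return Selector objects whose
    # .root attribute is the matched lxml node.
    for match in getattr(selector, method)(expression):
        parent = match.root.getparent()
        if parent is not None:
            parent.remove(match.root)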
Example #3
    def parse_page(self, response):
        s = response.selector
        mutate_selector_del_xpath(s, '//script')
        mutate_selector_del_xpath(s, '//*[@style="display:none"]')

        l = NewsLoader(selector=s)

        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_readability(response)

        return l.load_item()
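The ordering note repeated in these examples ("can override these for TakeFirst() fields by making l.add_* calls above this line") follows from ItemLoader semantics: with a TakeFirst() output processor, the first value collected for a field wins. A small self-contained illustration (the field setup here is an assumption, not NewsLoader's actual item definition):

import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst

class Article(scrapy.Item):
    headline = scrapy.Field(output_processor=TakeFirst())

l = ItemLoader(item=Article())
l.add_value('headline', 'Site-specific headline')    # added first, so it wins
l.add_value('headline', 'Generic metadata headline')
assert l.load_item()['headline'] == 'Site-specific headline'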
Example #4
    def parse_page(self, response):
        """@url https://www.theguardian.com/business/2017/feb/20/how-unilever-foiled-kraft-heinzs-115m-takeover-bid-warren-buffett
        @returns items 1
        @scrapes bodytext fetchtime firstpubtime headline bylines
        @scrapes section source summary url modtime keywords
        """
        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        # CSS is better for operating on classes than XPath, otherwise
        # either will do.
        #    1. Strip the submeta footer
        mutate_selector_del(s, 'xpath', '//div[contains(@class, "submeta")]')
        #    2. All the <aside> boxes
        mutate_selector_del(s, 'xpath', '//aside')

        l = NewsLoader(selector=s)

        l.add_value('source', 'The Guardian')
        try:
            l.add_value(
                'bylines',
                [author['name']
                 for author in response.meta['json-ld'][0][0]['author']])
            l.add_value('headline',
                        response.meta['json-ld'][0][0]['headline'])
        except (KeyError, IndexError, TypeError):
            self.logger.debug("No bylines in json-ld")

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()

        # Some Guardian articles are missing their OpenGraph article section
        # tag. Fall back to the data-link-name anchors, which often appear
        # more than once (hence TakeFirst()).
        l.add_xpath('section', '//a[@data-link-name="article section"]/text()',
                    TakeFirst(), lambda x: x.strip())

        # The body tagging varies depending on the type of article, so let's
        # try several
        # TODO: There's still a bit of grot in this: <aside> tags, the social links
        #       under videos etc.
        # TODO: The <span class="drop-cap"> setup leaves a spurious line break
        #       after the first letter, which results in a space in the output
        #       text.
        l.add_xpath(
            'bodytext',
            '//article//div[contains(@class, "content__main-column")]/*[not(contains(@class, "meta"))]//text()'
        )  # Eyewitness, plus video?
        l.add_readability(response)
        #        l.add_xpath('bodytext', '//div[@data-component="body"]//*[not(contains(@class, "meta"))]//text()') # Video
        #        l.add_xpath('bodytext', '//div[@id="mainCol"]//text()') # Australian poll briefing
        #        l.add_xpath('bodytext', '//ul[contains(@class, "gallery")]//text()') # In Pictures
        #        l.add_xpath('bodytext', '//div[contains(@class, "gv-slice") and contains(@class, "second-strip")]//text()') # Interactive
        #        #item['headline'] = join_strip_list(response.xpath('//h1//text()').extract()),
        #        #item['bylines'] = response.xpath('//p[@class="byline"]//span[@itemprop="name"]/text()').extract(),

        return l.load_item()
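The response.meta['json-ld'] structure indexed above is evidently populated elsewhere, most likely by a middleware that parses the page's JSON-LD blocks before this callback runs. A hedged sketch of how such data could be collected; the nested [0][0] shape the spider indexes into is an assumption about that middleware's output:

import json

def extract_json_ld(response):
    # Parse every <script type="application/ld+json"> block on the page.
    # Sketch only; the real project presumably stores the result in
    # response.meta['json-ld'] via a spider middleware.
    blobs = []
    for raw in response.xpath(
            '//script[@type="application/ld+json"]/text()').getall():
        try:
            blobs.append(json.loads(raw))
        except json.JSONDecodeError:
            continue  # skip malformed blocks rather than failing the page
    return blobs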
Example #5
    def parse_page(self, response):
        """@url https://www.liverpoolecho.co.uk/news/liverpool-news/police-issue-warning-over-gift-19660932
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url language
        """
        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        # CSS is better for operating on classes than XPath, otherwise
        # either will do.
        mutate_selector_del(s, 'xpath', '//aside')
        #mutate_selector_del(s, 'css', '.classname')

        l = NewsLoader(selector=s)

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)
        l.add_readability(response)
        #l.add_schemaorg_bylines()
        #l.add_dublincore()

        l.add_xpath('articleid', '//meta[@property="article:id"]/@content')

        # We're going to try to scrape the comments. This is a little involved.
        # In order to get comments, we need two things:
        #   1. The site_uuid (which *should* probably be permanently fixed,
        #      but which live pages nevertheless bootstrap every time)
        #   2. The content_container_uuid
        # The content_container_uuid can't be had without knowing both the site
        # uuid and the container_id (which is embedded in the page metadata).
        #
        # We have obtained the site_uuid at crawler startup using this class's
        # start_requests() function. But the content_container_uuid is different
        # for every article. So we need to fetch it now, and continue the
        # processing of this article in the callback.
        #
        # We are, therefore, going to yield a new Request, containing our
        # half-finished loader as a metadata item. The callback function for the
        # new Request will try to fetch comments and add them in, finishing by
        # yielding a complete Item for the crawler to handle.
        if l.get_xpath('//vf-conversations') and self.comments_bootstrap:
            site_uuid = self.comments_bootstrap['settings']['site_uuid']
            containerid = l.get_xpath(
                '//meta[@name="vf:container_id"]/@content')[0]

            yield Request(
                f"https://livecomments.viafoura.co/v4/livecomments/{site_uuid}/contentcontainer/id?container_id={containerid}",
                method="GET",
                priority=5,
                callback=self.parse_comments_get_contentcontainer,
                errback=self.errback_comments,
                cb_kwargs={
                    'l': l,
                    'site_uuid': site_uuid
                },
                meta={  # We don't even want to fetch robots.txt here.
                    'dont_obey_robotstxt': True
                })
        else:
            logger.debug(f'No comments section: {response.url}')
            l.add_value('notes', 'No comments section')
            yield l.load_item()
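Neither parse_comments_get_contentcontainer nor errback_comments is shown in this excerpt. As one illustration of the "half-finished loader travels in the Request" pattern described above, the errback might salvage the article item when the comment fetch fails. A sketch under the assumption that the loader arrives via the cb_kwargs set up in the Request:

def errback_comments(self, failure):
    # If the comment fetch fails, emit the half-finished article item
    # rather than dropping it. Assumes the loader travelled in cb_kwargs;
    # this is an illustrative guess at the handler, not the real code.
    l = failure.request.cb_kwargs['l']
    l.add_value('notes', 'Comment fetch failed')
    yield l.load_item()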