def parse_page(self, response):
        """@url https://www.liverpoolecho.co.uk/news/liverpool-news/police-issue-warning-over-gift-19660932
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url language
        """
        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        # CSS is better for operating on classes than XPath, otherwise
        # either will do.
        mutate_selector_del(s, 'xpath', '//aside')
        #mutate_selector_del(s, 'css', '.classname')

        l = NewsLoader(selector=s)

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)
        l.add_readability(response)
        #l.add_schemaorg_bylines()
        #l.add_dublincore()

        l.add_xpath('articleid', '//meta[@property="article:id"]/@content')

        # We're going to try to scrape the comments. This is a little involved.
        # In order to get comments, we need two things:
        #   1. The site_uuid (which *should* probably be permanently fixed,
        #      but which live pages nevertheless bootstrap every time)
        #   2. The content_container_uuid
        # The content_container_uuid can't be had without knowing both the site
        # uuid and the container_id (which is embedded in the page metadata).
        #
        # We have obtained the site_uuid at crawler startup using this class's
        # start_requests() function. But the content_container_uuid is different
        # for every article. So we need to fetch it now, and continue the
        # processing of this article in the callback.
        #
        # We are, therefore, going to yield a new Request, containing our
        # half-finished loader as a metadata item. The callback function for the
        # new Request will try to fetch comments and add them in, finishing by
        # yielding a complete Item for the crawler to handle.
        if l.get_xpath('//vf-conversations') and self.comments_bootstrap:
            site_uuid = self.comments_bootstrap['settings']['site_uuid']
            containerid = l.get_xpath(
                '//meta[@name="vf:container_id"]/@content')[0]

            yield Request(
                f"https://livecomments.viafoura.co/v4/livecomments/{site_uuid}/contentcontainer/id?container_id={containerid}",
                method="GET",
                priority=5,
                callback=self.parse_comments_get_contentcontainer,
                errback=self.errback_comments,
                cb_kwargs={
                    'l': l,
                    'site_uuid': site_uuid
                },
                meta={  # We don't even want to fetch robots.txt here.
                    'dont_obey_robotstxt': True
                })
        else:
            logger.debug(f'No comments section: {response.url}')
            l.add_value('notes', 'No comments section')
            yield l.load_item()
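
The callback chain continues outside this excerpt. A minimal sketch of what the two comment-fetching callbacks might look like, assuming the Viafoura v4 endpoint above returns JSON; the response field name, the follow-up comments URL, and parse_comments itself are assumptions, not confirmed API:

import json

from scrapy import Request

def parse_comments_get_contentcontainer(self, response, l, site_uuid):
    # Assumed response shape: JSON carrying the container's UUID.
    data = json.loads(response.text)
    container_uuid = data.get('content_container_uuid')  # assumed field name
    if not container_uuid:
        l.add_value('notes', 'No content container UUID')
        yield l.load_item()
        return
    # Assumed follow-up endpoint for the comments themselves.
    yield Request(
        f"https://livecomments.viafoura.co/v4/livecomments/"
        f"{site_uuid}/{container_uuid}",
        callback=self.parse_comments,  # hypothetical final callback
        errback=self.errback_comments,
        cb_kwargs={'l': l},
        meta={'dont_obey_robotstxt': True})

def errback_comments(self, failure):
    # On any failure, emit the half-finished item rather than losing it.
    l = failure.request.cb_kwargs['l']
    l.add_value('notes', 'Comment fetch failed')
    yield l.load_item()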
Example #2
    def parse_page(self, response):
        """@url http://www.dailymail.co.uk/news/article-4242322/Milo-Yiannopoulos-BANNED-CPAC-conference.html?ITO=1490
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url
        """

        s = response.selector
        # Remove some content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        mutate_selector_del_xpath(s, '//script')
        mutate_selector_del_xpath(s, '//*[@style="display:none"]')
        mutate_selector_del_xpath(
            s, '//div[contains(@class, "related-carousel")]')

        l = NewsLoader(selector=s)

        # Get alternative to RSS-source URL fluff
        l.add_xpath('url', 'head/link[@rel="canonical"]/@href')
        l.add_value(
            'bylines',
            response.xpath(
                "//span[@data-component='Byline']//span[@data-component='Text']//a/text()"
            ).getall())
        l.add_xpath('headline', "//h1[@data-component='Headline']/text()")
        bodytext = "".join(
            response.xpath('//div[@id="body"]//p//text()').getall())
        l.add_value('bodytext', bodytext)
        #l.add_xpath('keywords',
        #               'head/meta[@property="keywords"]/@content')
        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_scrapymeta(response)
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_readability(response)

        # TODO: JS dross in body; might need a standard solution to keep this
        #       out.
        # TODO: Related article dross in body. <div class=related-carousel>

        item = l.load_item()

        #        self.logger.debug('bodytext', item['bodytext'])

        return item
Example #3
    def parse_page(self, response):
        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        #mutate_selector_del_xpath(s, '//*[@style="display:none"]')

        l = NewsLoader(selector=s)

        l.add_xpath(
            'bylines',
            '//*[contains(@class, "author-card__details__name")]//text()')
        # UK
        l.add_xpath('bodytext',
                    '//div[contains(@class, "entry__body")]//text()')
        # DE
        l.add_xpath('bodytext', '//div[@id="mainentrycontent"]//text()')

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)

        return l.load_item()
Example #4
    def parse_page(self, response):
        """@url https://www.yahoo.com/news/school-principal-trump-chants-crossed-line-hate-speech-155230984.html
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime headline
        @scrapes source summary url keywords
        @noscrapes modtime section
        """
        # Depressing lack of modtime, keywords or section.

        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.        
        #mutate_selector_del_xpath(s, '//*[@style="display:none"]')

        l = NewsLoader(selector=s)

        l.add_value('source', 'Yahoo! News [US]')

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
#        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)

        l.add_xpath('bodytext', '//div[contains(@class, "canvas-body")]/p/text()')
        # NOTE: Maybe modtime
        l.add_xpath('firstpubtime', '//div[contains(@class, "auth-attr")]//time/@datetime')
        l.add_xpath('bylines',      '//div[contains(@class, "auth-attr")]//div[contains(@class, "author-name")]//text()')

        return l.load_item()
Example #5
    def parse_page(self, response):
        """@url https://www.washingtonpost.com/news/politics/wp/2017/03/27/trumps-approval-hits-a-new-low-of-36-percent-but-thats-not-the-bad-news/
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime headline
        @scrapes keywords section source summary url
        @noscrapes modtime
        """
        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        #mutate_selector_del_xpath(s, '//*[@style="display:none"]')

        l = NewsLoader(selector=s)

        # WaPo's ISO date/time strings are invalid: <datetime>-500 instead of
        # <datetime>-05:00. Note that the various standardised l.add_* methods
        # will generate 'Failed to parse data' log items; we parse the date
        # correctly here, so those warnings can be ignored.
        l.add_xpath('firstpubtime', '//*[@itemprop="datePublished" or '
                    '@property="datePublished"]/@content',
                    MapCompose(self.fix_iso_date))  # CreativeWork

        # These are duplicated in the markup, so de-duplicate them.
        l.add_xpath('bylines',
                    '//*[@itemprop="author"]//*[@itemprop="name"]//text()',
                    set)
        l.add_xpath('section',
                    '//*[contains(@class, "headline-kicker")]//text()')

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)

        return l.load_item()
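
fix_iso_date is referenced above but not defined in this excerpt. A plausible sketch of such a method, based only on the comment (WaPo emits "-500" where ISO 8601 requires "-05:00"); the real implementation may differ:

import re

def fix_iso_date(self, value):
    # Rewrite a malformed trailing UTC offset such as "-500" or "-0500"
    # into the ISO 8601 form "-05:00". Already-valid strings pass through
    # unchanged (the ":" stops the pattern from matching).
    m = re.search(r'([+-])(\d{1,2})(\d{2})$', value)
    if m:
        sign, hours, minutes = m.groups()
        value = value[:m.start()] + f'{sign}{int(hours):02d}:{minutes}'
    return value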
Example #6
    def parse_page(self, response):
        """@url http://www.prnewswire.com/news-releases/xti-aircraft-company-and-bye-aerospace-form-alliance-on-hybridelectric-vertical-takeoff-airplane-300418161.html
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime headline
        @scrapes keywords source summary url
        @noscrapes modtime section
        """
        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        #mutate_selector_del_xpath(s, '//*[contains(@class, "classname")]')

        l = NewsLoader(selector=s)

        l.add_value('source', 'PR Newswire')
        # Not parsing as head/meta for some reason
        l.add_xpath('summary', '//meta[@name="description"]/@content')
        l.add_xpath('bylines', '//meta[@name="author"]/@content')

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)
        #l.add_schemaorg_bylines()
        #l.add_dublincore()

        l.add_xpath('firstpubtime', '//meta[@name="date"]/@content')

        return l.load_item()
Example #7
    def parse_page(self, response):
        """@url http://www.mirror.co.uk/news/uk-news/lesbian-couple-who-launched-crowdfunding-9902318
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url
        """
        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        mutate_selector_del_xpath(s, '//form')
        mutate_selector_del_xpath(
            s, '//aside[contains(@class,"read-more-links")]')

        l = NewsLoader(selector=s)

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)

        l.add_css('bodytext', '.article-body ::text')  # Live

        return l.load_item()
Example #8
    def parse_page(self, response):
        """@url http://edition.cnn.com/2017/03/01/politics/joe-biden-hunter-beau/index.html
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url
        """
        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        mutate_selector_del_xpath(
            s, '//div[contains(@class, "read-more-button")]')
        mutate_selector_del_xpath(s, '//div[contains(@class, "el__embedded")]')
        mutate_selector_del_xpath(s, '//div[contains(@class, "owl-carousel")]')

        l = NewsLoader(selector=s)

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)

        l.add_xpath(
            'headline',
            '//article//meta[@itemprop="alternativeHeadline"]/@content')
        l.add_xpath('headline', '//h1[contains(@class, "headline")]/text()')

        return l.load_item()
Example #9
    def parse_page(self, response):
        """@url http://metro.co.uk/2017/02/22/telescope-spots-our-best-bet-for-finding-aliens-a-nearby-star-with-seven-earth-sized-planets-6464648/
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url
        """
        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        #mutate_selector_del_xpath(s, '//*[@style="display:none"]')

        l = NewsLoader(selector=s)

        # articleBody full of headline/byline/fluff
        l.add_xpath('bodytext',
                    '//div[contains(@class, "article-body")]//text()')

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)

        # Sort out bylines with less fluff
        l.add_xpath(
            'bylines',
            '//span[contains(@class, "byline")]//a[@rel="author"]//text()',
            MapCompose(lambda s: re.sub(
                r' For Metro\.co\.uk', r'', s, flags=re.IGNORECASE)))

        return l.load_item()
Example #10
    def parse_page(self, response):
        """Note: firstpubtime also fetched, but via RSS feed (which can't be
                 contracted for)

        @url https://www.buzzfeed.com/maryanngeorgantopoulos/white-supremacists-are-spreading-their-message-on-college-ca
        @returns items 1
        @scrapes bodytext bylines fetchtime headline
        @scrapes section source summary url keywords language
        @noscrapes modtime
        """

        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        mutate_selector_del_xpath(
            s, '//*[contains(@class, "print") or contains(@class, "hidden")]'
        )  # Physical print only

        l = NewsLoader(selector=s)

        # Remove referer params from end of URLs
        l.add_xpath('url', 'head/link[@rel="canonical"]/@href')

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)

        l.add_xpath(
            'bodytext',
            '//div[@data-print="body"]/*[not(contains(@class, "user-bio") or contains(@class, "_shares") or contains(@class, "inline-promo"))]//text()'
        )
        l.add_xpath('bodytext',
                    '//div[contains(@class, "_item_text")]//text()')
        l.add_xpath(
            'bodytext', '//article//*[contains(@class, "subbuzz-text") or '
            'contains(@class, "subbuzz__title")]//text()')

        return l.load_item()
Example #11
    def parse_page(self, response):
        """@url https://www.nytimes.com/2017/02/28/science/california-aging-dams.html
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url
        """
        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        mutate_selector_del(s, 'xpath',
                            '//footer[contains(@class, "story-footer")]')
        mutate_selector_del(s, 'css', '.nocontent')
        mutate_selector_del(s, 'css', '.visually-hidden')
        mutate_selector_del(s, 'css', '.newsletter-signup')

        l = NewsLoader(selector=s)

        l.add_value('source', 'New York Times')
        # Response header from NYT leads to non-canonical URL with ?_r=0 at end
        l.add_xpath('url', 'head/link[@rel="canonical"]/@href')

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)

        l.add_xpath('headline',
                    '//*[contains(@class, "Post__headline")]//text()')
        l.add_xpath('section', '//*[contains(@class, "Post__kicker")]//text()')
        l.add_xpath(
            'bodytext', '//*[contains(@class, "story-body") or '
            'contains(@class, "Post__body")]//text()')
        l.add_xpath('bodytext',
                    '//div[contains(@class, "body--story")]//p//text()')
        l.add_css('bodytext', '.interactive-graphic ::text')

        return l.load_item()
Example #12
    def parse_page(self, response):
        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        # CSS is better for operating on classes than XPath, otherwise
        # either will do.
        #mutate_selector_del(s, 'xpath', "//*[@id='someid']")
        #mutate_selector_del(s, 'css', '.classname')

        l = NewsLoader(selector=s)

        # There are multiple articles in a single (JS-rendered) Vice page.
        # We are interested only in the first.
        # There are also, unhelpfully, several levels of <div>s with classes
        # containing "article__body". We only want the ultimate one.
        l.add_xpath(
            'bodytext',
            '(//article)[1]//div[contains(@class, "article__body") and contains(@class, "bod-")]//text()'
        )

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)
        #l.add_schemaorg_bylines()
        #l.add_dublincore()

        return l.load_item()
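
The mutate_selector_del/mutate_selector_del_xpath helpers used throughout are not shown in this excerpt. A minimal sketch of how such a helper can be built on the lxml tree that backs a parsel/Scrapy Selector; the real helpers may differ in detail:

def mutate_selector_del(selector, method, expression):
    # Delete every element matched by `expression` ('xpath' or 'css') from
    # the lxml tree underlying `selector`, in place. This assumes element
    # matches: each matched Selector's .root is then an lxml element we
    # can detach from its parent.
    for node in getattr(selector, method)(expression):
        node.root.getparent().remove(node.root)

def mutate_selector_del_xpath(selector, expression):
    mutate_selector_del(selector, 'xpath', expression)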
Example #13

    def parse_page(self, response):
        s = response.selector
        # Remove script tags and hidden content from the tree before passing
        # it to the loader.
        mutate_selector_del_xpath(s, '//script')
        mutate_selector_del_xpath(s, '//*[@style="display:none"]')

        l = NewsLoader(selector=s)

        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_readability(response)

        return l.load_item()
Example #14
    def parse_page(self, response):
        """@url http://www.nbcnews.com/news/asian-america/denied-visas-u-s-tibet-women-s-soccer-team-hold-n728626
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url
        """
        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        #mutate_selector_del_xpath(s, '//*[@style="display:none"]')

        l = NewsLoader(selector=s)

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)

        l.add_xpath('bodytext',
                    '//div[contains(@class, "article-body")]//text()')

        return l.load_item()
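
Every example repeats the note that add_* calls placed above the standardised block override TakeFirst() fields, while calls placed below only fill gaps. A minimal stand-alone illustration of why (the Article item here is a stand-in for the real news item class):

import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst  # scrapy.loader.processors in older Scrapy

class Article(scrapy.Item):
    # TakeFirst() emits the first non-null value collected for the field.
    source = scrapy.Field(output_processor=TakeFirst())

l = ItemLoader(item=Article())
l.add_value('source', 'Site-specific value')    # collected first: wins
l.add_value('source', 'Standardised fallback')  # collected, never emitted
assert l.load_item()['source'] == 'Site-specific value'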
Example #15
    def parse_page(self, response):
        """@url http://www.businessinsider.fr/meilleures-entreprises-equilibre-vie-pro-vie-privee-selon-glassdoor?IR=C
        @returns items 1
        @scrapes bodytext bylines fetchtime modtime headline
        @scrapes section source summary url language
        @noscrapes keywords

        """
        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        # CSS is better for operating on classes than XPath, otherwise
        # either will do.        
#        mutate_selector_del(s, 'xpath', '//div[@id="see-also-links"]')
#        mutate_selector_del(s, 'xpath', '//div[contains(@class, "popular-video")]')
#        mutate_selector_del(s, 'xpath', '//span[contains(@class, "caption-source")]')
        mutate_selector_del(s, 'xpath', '//p[contains(@class, "wp-caption-text")]')
        mutate_selector_del(s, 'xpath', '//div[contains(@class, "pod-fb-like")]')

        l = NewsLoader(selector=s)


        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)
        #l.add_schemaorg_bylines()
        #l.add_dublincore()

        l.add_xpath('bodytext', '//div[contains(@class, "post-content")]//text()')
        l.add_xpath('bylines', '//a[@rel="author"]//text()')
        # BI prints times for recent articles as "hours since published". But
        # helpfully includes a unix timestamp in its metadata.
#        ts = s.xpath('//span[@data-bi-format="date"]/@rel').extract_first()
#        if ts:
#            l.add_value('modtime', datetime.fromtimestamp(int(ts)).isoformat())
#        l.add_xpath('section', '//h2[contains(@class, "vert-name")]//text()')

        return l.load_item()
Example #16
    def parse_page(self, response):
        """@url http://uk.reuters.com/article/us-heart-nih-funding-idUKKBN16Y2EI
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url
        """
        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.        
        mutate_selector_del(s, 'css', 'div.related-content')

        l = NewsLoader(selector=s)

        l.add_value('source', 'Reuters [UK]')

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        #l.add_opengraph()
        l.add_scrapymeta(response)
        #l.add_schemaorg_bylines()
        #l.add_dublincore()

        l.add_xpath('bodytext',
                    '//span[@id="article-text"]/'
                        '*[not(@class="author")]//text()')
        l.add_xpath('summary',
                    '//meta[@name="description"]/@content')

        l.add_value('notes', 'fetchtime delayed by slow feed')

        return l.load_item()
Example #17
    def parse_page(self, response):
        # firstpubtime comes from the RSS feed, so it won't appear in the
        # contract below.
        """@url http://www.bbc.co.uk/news/uk-politics-39020260
        @returns items 1
        @scrapes bodytext fetchtime headline
        @scrapes section source summary url
        @noscrapes modtime keywords
        """
        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        mutate_selector_del(s, 'xpath', '//*[@class="off-screen"]')

        l = NewsLoader(selector=s)

        l.add_value('source', 'BBC News')

        # BBC titles all have dross at the end, even the embedded ones.
        l.add_xpath(
            'headline', 'head/title/text()',
            lambda x: [re.sub(r' - BBC (News(beat)?|Sport)$', '', x[0])])

        # TODO: The BBC publishes data (including datePublished) as JSON-LD;
        # we need a parser for it. Note that it doesn't seem complete:
        # articleBody in the JSON-LD seems to contain only the standfirst.

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)

        l.add_xpath(
            'bodytext',
            '//div[contains(@class, "main_article_text")]//text()')  # Newsbeat
        l.add_xpath(
            'bodytext',
            '//div[contains(@class, "map-body")]//text()')  # media-asset-page
        l.add_xpath('bodytext',
                    '//div[contains(@class, "story-body")]//text()')  # Sport
        l.add_xpath(
            'summary',
            '//div[contains(@class, "vxp-media__summary")]//text()')  # Videos
        l.add_xpath(
            'bodytext',
            '//div[contains(@class, "vxp-media__summary")]//text()')  # Videos

        bodytext = ""
        for r in response.xpath(
                '//article//div[@data-component="text-block"]//text()'):
            bodytext += r.extract()
        l.add_value('bodytext', bodytext)

        #l.add_readability(response)

        # Newsbeat seems to use a different CMS, which doesn't supply the
        # usual metadata (but which does publish bylines!)
        if response.xpath('//div[contains(@class, "newsbeatlogo")]'):
            l.add_value('section', 'Newsbeat')

#        def strip_by(strl):
#            for s in strl:
#                yield re.sub(r'.*[Bb]y (.*)', r'\1', s).strip()
        l.add_xpath(
            'bylines', '//span[contains(@class, "byline__name")]/text()')
        l.add_xpath(
            'bylines', '//p[contains(@class, "byline")]/text()')  # Newsbeat
        l.add_xpath(
            'bylines',
            '//*[contains(@class, "story__byline")]//p[contains(@class, "gel-long-primer") and not(contains(@class, "gel-long-primer-bold"))]/text()'
        )  # Sport. Grot selecting by layout code.

        # TODO: Keywords (none?)

        return l.load_item()
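
The TODO above notes that the BBC publishes datePublished as JSON-LD but the spider has no parser for it. A minimal sketch of what one could look like; the schema.org field names are standard, but this is not the spider's actual code:

import json

def add_jsonld_dates(l, response):
    # Pull datePublished/dateModified out of any JSON-LD blocks on the page.
    for blob in response.xpath(
            '//script[@type="application/ld+json"]/text()').getall():
        try:
            data = json.loads(blob)
        except ValueError:
            continue
        for node in data if isinstance(data, list) else [data]:
            if not isinstance(node, dict):
                continue
            if node.get('datePublished'):
                l.add_value('firstpubtime', node['datePublished'])
            if node.get('dateModified'):
                l.add_value('modtime', node['dateModified'])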
Example #18
    def parse_page(self, response):
        """@url http://www.prnewswire.co.uk/news-releases/virtual-reality-huge-investment-support-accelerates-innovations-and-expands-application-scope-615544713.html
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime headline
        @scrapes keywords source summary url
        @noscrapes modtime section
        """

        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        #mutate_selector_del_xpath(s, '//*[contains(@class, "classname")]')

        l = NewsLoader(selector=s)

        l.add_value('source', 'PR Newswire [UK]')
        # Not parsing as head/meta for some reason
        l.add_xpath('summary', '//meta[@name="description"]/@content')
        l.add_xpath('bylines', '//meta[@name="author"]/@content')
        l.add_xpath('keywords', '//meta[@name="keywords"]/@content')

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)
        #l.add_schemaorg_bylines()
        #l.add_dublincore()

        l.add_xpath('firstpubtime', '//meta[@name="date"]/@content')
        l.add_xpath('bodytext', '//div[contains(@class, "news-col")]//text()')
        l.add_xpath('headline', '//h1/text()')

        return l.load_item()
Example #19
    def parse_page(self, response):
        """@url https://www.thesun.co.uk/living/2937147/human-ken-doll-quentin-dehar-who-spent-92k-to-look-like-his-idol-has-dumped-his-surgery-obsessed-barbie-girlfriend-for-dying-her-hair-brown/
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url
        """

        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.

        # TODO: 'Keywords' and 'tags' for The Sun are different. Decide
        #       which we want.

        # Lose "The Sun" link on the bottom of each page
        mutate_selector_del_xpath(
            s, '//div[contains(@class, "social--fb-page-button")]')
        # Lose the "related articles" carousel
        mutate_selector_del_xpath(s,
                                  '//div[contains(@class, "rail--trending")]')

        l = NewsLoader(selector=s)

        l.add_xpath('summary', 'head/meta[@name="description"]/@content')

        # TODO: This is kinda grot. Fine except for names like "John da Silva".
        l.add_xpath(
            'bylines',
            '//span[contains(@class, "article__author-name")]//text()',
            lambda names: (n.title() for n in names))

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)

        l.add_xpath(
            'bodytext',
            '//article//div[contains(@class, "article__content")]//text()')

        return l.load_item()
Example #20
    def parse_page(self, response):
        """@url http://www.usatoday.com/story/money/markets/2017/02/28/bonds-telling-less-bullish-tale-than-stocks/98503646/
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url
        """
        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        mutate_selector_del_xpath(
            s, '//*[contains(@class, "inline-share-tools")]')
        mutate_selector_del_xpath(
            s, '//*[contains(@class, "article-print-url")]')
        mutate_selector_del_xpath(s, '//aside')

        l = NewsLoader(selector=s)

        l.add_xpath('bylines',
                    'head/meta[@name="cXenseParse:author"]/@content')
        # Section metadata comes out as "news,world". For this, take "News".
        l.add_xpath(
            'section', 'head/meta[@itemprop="articleSection"]/@content',
            Compose(
                TakeFirst(),
                lambda x: x.split(','),
                TakeFirst(),
                lambda x: x.title(),
            ))

        # Video pages
        l.add_xpath('summary',
                    '//p[contains(@class, "vgm-video-description")]//text()')

        # USA Today provide timestamps to millisecond precision, in a format
        # which dateparser can't handle.
        l.add_xpath(
            'firstpubtime',
            '//*[@itemprop="datePublished" or @property="datePublished"]/@content',
            MapCompose(self.fix_usatoday_date))  # CreativeWork
        l.add_xpath(
            'modtime',
            '//*[@itemprop="dateModified" or @property="dateModified"]/@content',
            MapCompose(self.fix_usatoday_date))  # CreativeWork

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)

        return l.load_item()
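
fix_usatoday_date is referenced above but not defined in this excerpt. Per the comment, USA Today's timestamps carry millisecond precision that dateparser cannot handle; a plausible sketch that simply drops the fractional seconds (the real method may differ):

import re

def fix_usatoday_date(self, value):
    # "2017-02-28T12:34:56.789Z" -> "2017-02-28T12:34:56Z": strip the
    # fractional-seconds component, leaving any trailing offset intact.
    return re.sub(r'\.\d+(?=(?:Z|[+-]\d{2}:?\d{2})?$)', '', value)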
Example #21
    def parse_page(self, response):
        """@url https://www.bild.de/politik/ausland/politik-ausland/wef-in-davos-die-top-gaeste-und-die-wichtigsten-themen-67441554.bild.html
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url language
        """
        s = response.selector

        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        # Drop noscript JS warnings (which are otherwise included as the
        # bodytext for video pages).
        mutate_selector_del(s, 'xpath',
                            '//noscript[contains(@class, "warning")]')
        # Remove the BildPLUS subscribe notice (note that BildPLUS articles
        # are paywalled, so the text fetched will be only the opening
        # paragraphs).
        mutate_selector_del(
            s, 'xpath',
            '//strong[text()="Ihre neuesten Erkenntnisse lesen Sie mit BILDplus."]'
        )
        # Remove "related topics" etc.
        mutate_selector_del(s, 'xpath',
                            '//aside[contains(@class, "related-topics")]')
        mutate_selector_del(
            s, 'xpath',
            '//div[contains(@class, "tsr-info") and contains(text(), "Lesen Sie auch")]'
        )

        l = NewsLoader(selector=s)

        # Breadcrumbs section
        l.add_xpath('section',
                    '//div[@id="breadcrumb"]//a[@rel="home"]//text()')

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)
        l.add_readability(response)
        #l.add_schemaorg_bylines()
        #l.add_dublincore()

        # Readability is pretty good on Bild, but sometimes misses the body.
        # Try fallback (this won't be as clean as the readability version as
        # we haven't removed all the "more reading" sections etc.)
        l.add_xpath('bodytext', '//div[contains(@class, "txt")]//text()')
        l.add_xpath('bodytext',
                    '//div[contains(@class, "article-body")]//text()')

        return l.load_item()
Example #22
    def parse_page(self, response):
        """@url http://www.telegraph.co.uk/news/2017/02/27/grandmother-has-married-briton-27-years-deported-singapore-just/
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime headline
        @scrapes keywords section source summary url
        @noscrapes modtime
        """
        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        #mutate_selector_del_xpath(s, '//*[@style="display:none"]')

        l = NewsLoader(selector=s)

        # This extracts the (top-level) section from the Navigation headline
        # bar. Probably a bit fragile.
        l.add_xpath(
            'section',
            '//a[contains(@class, "header-breadcrumbs__link")]//text()',
            TakeFirst())

        l.add_xpath(
            'bylines',
            '//main//*[@itemprop="author"]//*[@itemprop="name"]//text()')

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)

        if response.xpath('//div[contains(@class, "premium-paywall")]'):
            l.add_value('notes', 'Premium paywall')

        return l.load_item()
Example #23
    def parse_page(self, response):
        """@url http://bigstory.ap.org/article/fc451fdf7e9a47c1b2b9ab95f55c3bfe/tusk-closing-2nd-term-eu-council-president
        @returns items 1
        @scrapes bodytext bylines fetchtime modtime headline
        @scrapes keywords source summary url
        @noscrapes firstpubtime section
        """

        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        #mutate_selector_del_xpath(s, '//*[contains(@class, "classname")]')

        l = NewsLoader(selector=s)

        l.add_value('source', 'Associated Press')

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)
        #l.add_schemaorg_bylines()
        #l.add_dublincore()

        l.add_xpath('headline', 'head/title/text()')
        l.add_xpath('summary', 'head/meta[@name="description"]/@content')
        l.add_xpath('bylines', '//div[@id="byline"]//a/text()')
        l.add_xpath('bodytext',
                    '//div[contains(@class, "field-name-body")]//text()')
        l.add_xpath(
            'modtime',
            '//div[@id="dateline"]/span[contains(@class, "updated")]/@title')
        # These are sometimes exposed as <meta name='keywords'>, sometimes not.
        l.add_xpath('keywords', '//div[contains(@class, "tags")]//a/text()')

        return l.load_item()
Example #24
    def parse_page(self, response):
        """@url http://www.cbsnews.com/news/iraqi-boy-trapped-in-mosul-for-years-finally-reunited-with-mother/
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url
        """
        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        #mutate_selector_del_xpath(s, '//*[@style="display:none"]')

        l = NewsLoader(selector=s)

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        #        l.add_schemaorg_mde(response, jsonld=True, rdfa=False, microdata=False)
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)

        # Media pages. NOTE: These can be multipage; this will only get the
        # first page's text.
        l.add_xpath('bodytext', '//div[contains(@class, "post")]//text()')
        l.add_xpath('bodytext', '//div[@itemid="#article-entry"]//text()')

        return l.load_item()
Example #25
    def parse_page(self, response):
        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        mutate_selector_del(
            s, 'xpath',
            '//*[contains(@class, "print") or contains(@class, "hidden")]'
        )  # Physical print only

        l = NewsLoader(selector=s)

        # Remove referer params from end of URLs
        l.add_xpath('url', 'head/link[@rel="canonical"]/@href')

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)

        l.add_xpath(
            'bodytext',
            '//div[@data-print="body"]/*[not(contains(@class, "user-bio") or contains(@class, "_shares") or contains(@class, "inline-promo"))]//text()'
        )
        l.add_xpath('bodytext',
                    '//div[contains(@class, "_item_text")]//text()')
        l.add_xpath(
            'bodytext', '//article//*[contains(@class, "subbuzz-text") or '
            'contains(@class, "subbuzz__title")]//text()')

        # BF helpfully includes a unix timestamp in its metadata.
        ts = s.xpath('//time/@data-unix').extract_first()
        if ts:
            l.add_value('modtime', datetime.fromtimestamp(int(ts)).isoformat())

        return l.load_item()
Example #26
    def parse_page(self, response):
        """@url http://www.dailymail.co.uk/news/article-4242322/Milo-Yiannopoulos-BANNED-CPAC-conference.html?ITO=1490
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url
        """

        s = response.selector
        # Remove some content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        mutate_selector_del_xpath(s, '//script')
        mutate_selector_del_xpath(s, '//*[@style="display:none"]')
        mutate_selector_del_xpath(
            s, '//div[contains(@class, "related-carousel")]')

        l = NewsLoader(selector=s)

        # Get alternative to RSS-source URL fluff
        l.add_xpath('url', 'head/link[@rel="canonical"]/@href')

        dross_re = (r' for (Dailymail.com|The Daily Mail|'
                    r'Daily Mail Australia|MailOnline)')
        # Sort out bylines with less fluff. Note re.sub's fourth positional
        # argument is count, so flags must be passed by keyword.
        l.add_xpath(
            'bylines', 'head/meta[@property="article:author"]/@content',
            MapCompose(split_multiple_byline_string,
                       lambda b: re.sub(dross_re, '', b,
                                        flags=re.IGNORECASE)))

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()

        # TODO: JS dross in body; might need a standard solution to keep this
        #       out.
        # TODO: Related article dross in body. <div class=related-carousel>

        item = l.load_item()

        #        self.logger.debug('bodytext', item['bodytext'])

        return item
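
split_multiple_byline_string is referenced above but not defined in this excerpt. A plausible sketch, assuming combined Mail bylines of the form "John Smith, Jane Doe and Bob Roe For Mailonline"; the separators handled here are assumptions:

import re

def split_multiple_byline_string(s):
    # Split a combined byline string into individual names. MapCompose
    # flattens the returned list into separate 'bylines' values.
    return [name.strip()
            for name in re.split(r',|;|\band\b|&', s, flags=re.IGNORECASE)
            if name.strip()]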
Example #27
    def parse_page(self, response):
        """@url http://www.foxnews.com/opinion/2017/02/28/if-trump-really-wants-to-restore-america-to-greatness-hell-have-to-compromise-with-democrats.html
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes section source summary url
        @noscrapes keywords
        """
        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        #mutate_selector_del_xpath(s, '//*[@style="display:none"]')

        l = NewsLoader(selector=s)

        l.add_xpath('bodytext',
                    '//*[contains(@class, "article-text")]//text()')
        l.add_xpath('section',
                    '//*[contains(@class, "section-title")]//text()')
        l.add_xpath('section', 'head/meta[@name="prism-section"]/@content')
        # Well, this is awkward. Bylines are (normally) not in the metadata,
        # and are not given a suitable class label in the HTML source.
        l.add_xpath(
            'bylines',
            '//div[contains(@class, "article-info")]//p[contains(., "By")]/span//text()'
        )

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_dublincore()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)

        return l.load_item()
Example #28
    def parse_page(self, response):
        """@url https://www.theguardian.com/business/2017/feb/20/how-unilever-foiled-kraft-heinzs-115m-takeover-bid-warren-buffett
        @returns items 1
        @scrapes bodytext fetchtime firstpubtime headline bylines
        @scrapes section source summary url modtime keywords
        """
        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        # CSS is better for operating on classes than XPath, otherwise
        # either will do.
        #    1. Strip the submeta footer
        mutate_selector_del(s, 'xpath', '//div[contains(@class, "submeta")]')
        #    2. All the <aside> boxes
        mutate_selector_del(s, 'xpath', '//aside')

        l = NewsLoader(selector=s)

        l.add_value('source', 'The Guardian')

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()

        # Some Guardian articles are missing their OpenGraph article section
        # tag. These data-link-name tags are often multiple.
        l.add_xpath('section', '//a[@data-link-name="article section"]/text()', TakeFirst(), lambda x: x.strip())

        # The body tagging varies depending on the type of article, so let's
        # try several
        # TODO: There's still a bit of grot in this: <aside> tags, the social links
        #       under videos etc.
        # TODO: The <span class="drop-cap"> setup leaves a spurious line break
        #       after the first letter, which results in a space in the output
        #       text.
        l.add_xpath('bodytext', '//article//div[contains(@class, "content__main-column")]/*[not(contains(@class, "meta"))]//text()') # Eyewitness, plus video?

#        l.add_xpath('bodytext', '//div[@data-component="body"]//*[not(contains(@class, "meta"))]//text()') # Video
#        l.add_xpath('bodytext', '//div[@id="mainCol"]//text()') # Australian poll briefing
#        l.add_xpath('bodytext', '//ul[contains(@class, "gallery")]//text()') # In Pictures
#        l.add_xpath('bodytext', '//div[contains(@class, "gv-slice") and contains(@class, "second-strip")]//text()') # Interactive
#        #item['headline'] = join_strip_list(response.xpath('//h1//text()').extract()),
#        #item['bylines'] = response.xpath('//p[@class="byline"]//span[@itemprop="name"]/text()').extract(),

        return l.load_item()
Example #29
    def parse_page(self, response):
        """@url http://www.independent.co.uk/news/world/americas/muslim-american-activist-tarek-el-messidi-jewish-cemetery-mt-carmel-philadelphia-vandalised-a7601266.html
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url
        """
        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.

        # Picture galleries, generally unrelated to the story
        mutate_selector_del(s, 'css', '.type-gallery')
        # "More about" grot
        mutate_selector_del(
            s, 'xpath', '//li[contains(text(), "More about")]/'
            'parent::*[contains(@class, '
            '"inline-pipes-list")]')

        l = NewsLoader(selector=s)

        l.add_xpath(
            'bylines',
            '//article//*[@itemprop="author"]//*[@itemprop="name"]//text()')

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)

        return l.load_item()
Example #30
    def parse_page(self, response):
        """@url http://abcnews.go.com/Politics/house-intelligence-committee-sets-framework-russian-probe/story?id=45846073
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url
        """
        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.        
        #mutate_selector_del_xpath(s, '//*[@style="display:none"]')

        l = NewsLoader(selector=s)

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)

        l.add_xpath('section', '//article/@data-section')
        l.add_xpath('modtime', 'head/meta[@name="Last-Modified"]/@content')
        l.add_xpath('firstpubtime', '//div[contains(@class, "article-meta")]//span[contains(@class, "timestamp")]/text()', self._strip_timestamp)

        return l.load_item()
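
self._strip_timestamp above is not shown in this excerpt. Judging by its name and its position as an add_xpath processor (which receives the full list of extracted strings), a minimal sketch; the actual helper may do more:

def _strip_timestamp(self, values):
    # Trim surrounding whitespace from each extracted timestamp string
    # before the loader's date handling sees it.
    return [v.strip() for v in values]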