Пример #1
0
    def parse(self, response):
        """Extract a CNN article page into a ``NewsArticle`` item.

        Single-valued fields take the first XPath match; ``body`` collects
        every text node found beneath the body-text container as a list.

        Args:
            response: The downloaded article page; must expose ``url`` and
                ``xpath``.

        Returns:
            The populated ``NewsArticle``.
        """
        item = NewsArticle()
        item['url'] = response.url
        item['source'] = 'CNN'

        headline = response.xpath('//h1[@class="pg-headline"]/text()')
        item['title'] = headline.get()

        summary = response.xpath('//meta[@itemprop="description"]/@content')
        item['description'] = summary.get()

        # All descendant text nodes, not just direct paragraph children.
        body_nodes = response.xpath(
            '//section[@data-zone-label="bodyText"]/div[@class="l-container"]//*/text()')
        item['body'] = body_nodes.getall()

        byline = response.xpath('//meta[@itemprop="author"]/@content')
        item['author'] = byline.get()

        published = response.xpath('//meta[@itemprop="datePublished"]/@content')
        item['datePublished'] = published.get()

        return item
 def parse(self, response):
     """Extract a CNN article page into a ``NewsArticle`` item.

     Single-valued fields take the first XPath match and are left as
     ``None`` when the page has no such element; ``text`` is the list of
     all text nodes beneath the body-text container.

     Args:
         response: The downloaded article page; must expose ``url`` and
             ``xpath``.

     Returns:
         The populated ``NewsArticle``.
     """
     article = NewsArticle()
     article['url'] = response.url
     article['source'] = 'CNN'
     article['title'] = response.xpath('//h1/text()').get()
     article['description'] = response.xpath('//meta[@name="description"]/@content').get()
     article['date'] = response.xpath('//meta[@itemprop="datePublished"]/@content').get()
     # .get() returns None when the author meta tag is absent, so only
     # remove the ', CNN' marker when there is a string to remove it from.
     # This keeps a missing author consistent with the other optional fields
     # instead of raising AttributeError and losing the whole item.
     author = response.xpath('//meta[@itemprop="author"]/@content').get()
     article['author'] = author.replace(', CNN', '') if author is not None else None
     article['text'] = response.xpath('//section[@data-zone-label="bodyText"]/div[@class="l-container"]//*/text()').getall()
     return article
Пример #3
0
    def parse_item(self, response):
        """Turn an Associated Press article page into a ``NewsArticle``.

        Title, description, date and author are read from the JSON payload
        of the page's ``<script data-rh="true">`` tag; the article text is
        the list of text nodes of the ``<p>`` children of the ``Article``
        div.

        Args:
            response: The downloaded article page; must expose ``url`` and
                ``xpath``.

        Returns:
            The populated ``NewsArticle``.
        """
        item = NewsArticle()
        item['url'] = response.url
        item['source'] = 'Associated Press'

        # NOTE(review): nothing guards against a page without this script
        # tag, or with an empty author list -- confirm whether either occurs.
        raw_metadata = response.xpath('//script[@data-rh="true"]/text()').get()
        metadata = json.loads(raw_metadata)

        item['title'] = metadata['headline']
        item['description'] = metadata['description']
        item['date'] = metadata['datePublished']

        # Only the first listed author is kept.
        authors = metadata['author']
        item['author'] = authors[0]

        paragraphs = response.xpath('//div[@class="Article"]/p/text()')
        item['text'] = paragraphs.getall()
        return item
Пример #4
0
    def parse_item(self, response):
        """Turn a Yahoo News article page into a ``NewsArticle``.

        Title, description, date and author name are read from the JSON-LD
        script nested in the page's ``<article role="article">`` element;
        the article text is the list of text nodes of the ``<p>`` children
        of the ``caas-body`` div.

        Args:
            response: The downloaded article page; must expose ``url`` and
                ``xpath``.

        Returns:
            The populated ``NewsArticle``.
        """
        item = NewsArticle()
        item['url'] = response.url
        item['source'] = 'Yahoo News'

        # NOTE(review): nothing guards against a page without the JSON-LD
        # script -- confirm whether that can occur.
        ld_json_script = response.xpath(
            '//article[@role="article"]/script[@type="application/ld+json"]/text()')
        metadata = json.loads(ld_json_script.get())

        item['title'] = metadata['headline']
        item['description'] = metadata['description']
        item['date'] = metadata['datePublished']

        # Here the author is a single object carrying a 'name' key.
        author_info = metadata['author']
        item['author'] = author_info['name']

        paragraphs = response.xpath('//div[@class="caas-body"]/p/text()')
        item['text'] = paragraphs.getall()
        return item