def parse(self, response):
    """Build a ``NewsArticle`` from a CNN article page.

    Every field except ``url`` and ``source`` is read from ``response``
    with an XPath query: the scalar fields take the first match and
    ``body`` collects all matching text nodes as a list.

    Args:
        response: The downloaded page; must expose ``url`` and ``xpath``.

    Returns:
        The populated ``NewsArticle``.
    """
    def first(query):
        # First result of the XPath query against this response.
        return response.xpath(query).get()

    body_nodes = '//section[@data-zone-label="bodyText"]/div[@class="l-container"]//*/text()'

    article = NewsArticle()
    article['url'] = response.url
    article['source'] = 'CNN'
    article['title'] = first('//h1[@class="pg-headline"]/text()')
    article['description'] = first('//meta[@itemprop="description"]/@content')
    article['body'] = response.xpath(body_nodes).getall()
    article['author'] = first('//meta[@itemprop="author"]/@content')
    article['datePublished'] = first('//meta[@itemprop="datePublished"]/@content')
    return article
def parse(self, response):
    """Build a ``NewsArticle`` from a CNN article page.

    Title, description, date and author are the first match of their
    XPath query; ``text`` is the list of all text nodes below the
    ``bodyText`` section's ``l-container`` div. The ``', CNN'`` suffix is
    removed from the author string.

    Args:
        response: The downloaded page; must expose ``url`` and ``xpath``.

    Returns:
        The populated ``NewsArticle``. ``author`` is ``None`` when the
        page has no author meta tag.
    """
    article = NewsArticle()
    article['url'] = response.url
    article['source'] = 'CNN'
    article['title'] = response.xpath('//h1/text()').get()
    article['description'] = response.xpath(
        '//meta[@name="description"]/@content').get()
    article['date'] = response.xpath(
        '//meta[@itemprop="datePublished"]/@content').get()

    # .get() yields None when nothing matches, so the suffix can only be
    # stripped when an author string was actually found; calling
    # .replace() unconditionally would abort the whole callback.
    author = response.xpath('//meta[@itemprop="author"]/@content').get()
    article['author'] = (
        author.replace(', CNN', '') if author is not None else None
    )

    article['text'] = response.xpath(
        '//section[@data-zone-label="bodyText"]/div[@class="l-container"]//*/text()'
    ).getall()
    return article
def parse_item(self, response):
    """Build a ``NewsArticle`` from an Associated Press article page.

    Title, description, date and author are taken from the JSON payload
    of the first ``<script data-rh="true">`` element. ``text`` is the
    list of text nodes of the ``<p>`` children of the div whose class is
    ``Article``.

    A page with no such script (or an empty one), or whose payload lacks
    some of the expected keys, produces an article with those fields set
    to ``None`` rather than aborting the callback.

    Args:
        response: The downloaded page; must expose ``url`` and ``xpath``.

    Returns:
        The populated ``NewsArticle``.

    Raises:
        json.JSONDecodeError: If the script content is not valid JSON.
    """
    article = NewsArticle()
    article['url'] = response.url
    article['source'] = 'Associated Press'

    raw_metadata = response.xpath('//script[@data-rh="true"]/text()').get()
    # json.loads(None) raises TypeError; treat a missing script as
    # "no metadata" so the URL, source and text are still collected.
    metadata = json.loads(raw_metadata) if raw_metadata else {}
    if not isinstance(metadata, dict):
        # A non-object payload has none of the keys read below.
        metadata = {}

    article['title'] = metadata.get('headline')
    article['description'] = metadata.get('description')
    article['date'] = metadata.get('datePublished')

    # Only the first author is kept; an absent or empty author value
    # would otherwise raise on the [0] lookup.
    authors = metadata.get('author')
    article['author'] = authors[0] if authors else None

    article['text'] = response.xpath(
        '//div[@class="Article"]/p/text()').getall()
    return article
def parse_item(self, response):
    """Build a ``NewsArticle`` from a Yahoo News article page.

    Title, description, date and author name are taken from the
    ``application/ld+json`` script inside ``<article role="article">``.
    ``text`` is the list of text nodes of the ``<p>`` children of the div
    whose class is ``caas-body``.

    A page with no such script (or an empty one), or whose payload lacks
    some of the expected keys, produces an article with those fields set
    to ``None`` rather than aborting the callback.

    Args:
        response: The downloaded page; must expose ``url`` and ``xpath``.

    Returns:
        The populated ``NewsArticle``.

    Raises:
        json.JSONDecodeError: If the script content is not valid JSON.
    """
    article = NewsArticle()
    article['url'] = response.url
    article['source'] = 'Yahoo News'

    raw_metadata = response.xpath(
        '//article[@role="article"]/script[@type="application/ld+json"]/text()'
    ).get()
    # json.loads(None) raises TypeError; treat a missing script as
    # "no metadata" so the URL, source and text are still collected.
    metadata = json.loads(raw_metadata) if raw_metadata else {}
    if not isinstance(metadata, dict):
        # A non-object payload has none of the keys read below.
        metadata = {}

    article['title'] = metadata.get('headline')
    article['description'] = metadata.get('description')
    article['date'] = metadata.get('datePublished')

    # The author is expected to be an object with a "name" key. A list
    # of such objects is also accepted (first entry wins); anything else
    # leaves the field as None.
    author = metadata.get('author')
    if isinstance(author, list):
        author = author[0] if author else None
    article['author'] = (
        author.get('name') if isinstance(author, dict) else None
    )

    article['text'] = response.xpath(
        '//div[@class="caas-body"]/p/text()').getall()
    return article