def parse(self, response):
    """Parse an article page and yield a populated ScrapenewsItem."""
    canonical_url = response.xpath(
        '//link[@rel="canonical"]/@href').extract_first()
    title = response.xpath('//h1/span/text()').extract_first()
    self.logger.info('%s %s', response.url, title)

    article_body = response.css('div.text')
    if not article_body:
        return

    # Articles may be split over several text sections; glue them together.
    body_html = " ".join(article_body.extract())
    byline = response.css('span.article-author').xpath(
        'span/text()').extract_first()

    # Raw date looks like '30 August 2018'.
    raw_date = response.css(
        'span.article-pub-date::text').extract_first().strip()
    published = SAST.localize(datetime.strptime(raw_date, '%d %B %Y'))

    item = ScrapenewsItem()
    item['body_html'] = body_html
    item['title'] = title
    item['byline'] = byline
    item['published_at'] = published.isoformat()
    item['retrieved_at'] = datetime.utcnow().isoformat()
    item['url'] = canonical_url
    item['file_name'] = response.url.split('/')[-2]
    item['spider_name'] = self.name
    item['publication_name'] = self.publication_name
    yield item
def parse(self, response):
    """Parse an article page and yield a populated ScrapenewsItem."""
    canonical_url = response.xpath(
        '//link[@rel="canonical"]/@href').extract_first()
    title = response.xpath('//h1/span/text()').extract_first()
    # NOTE(review): consider logging canonical_url instead of response.url.
    self.logger.info('%s %s', response.url, title)

    article_body = response.css('div.article-widget-text')
    if not article_body:
        return

    body_html = " ".join(article_body.css('::text').extract())
    byline = response.css('span.article-author').xpath(
        '@data-author').extract_first()

    # Raw date looks like u'26 June 2018 - 07:28'.
    raw_date = response.css(
        'span.article-pub-date::text').extract_first().strip()
    published = SAST.localize(
        datetime.strptime(raw_date, '%d %B %Y - %H:%M'))

    item = ScrapenewsItem()
    item['body_html'] = body_html
    item['title'] = title
    item['byline'] = byline
    item['published_at'] = published.isoformat()
    item['retrieved_at'] = datetime.utcnow().isoformat()
    item['url'] = canonical_url
    # NOTE(review): file_name is derived from response.url, not canonical_url.
    item['file_name'] = response.url.split('/')[-2]
    item['spider_name'] = self.name
    item['publication_name'] = self.publication_name
    yield item
def parse_item(self, response):
    """Parse an article page; only og:type == 'activity' pages yield items."""
    title = response.xpath(
        '//h1[contains(@class, "entry-title")]/text()').extract_first()
    self.logger.info('%s %s', response.url, title)

    og_type = response.xpath(
        '//meta[@property="og:type"]/@content').extract_first()
    if og_type != 'activity':
        return

    body_html = response.xpath(
        '//div[contains(@class, "td-post-content")]').extract_first()
    # Published date is stored verbatim as provided by the page.
    publication_date = response.xpath('//time/@datetime').extract_first()

    if not body_html:
        self.logger.info("No body found for %s", response.url)
        return

    item = ScrapenewsItem()
    item['body_html'] = body_html
    item['title'] = title
    item['published_at'] = publication_date
    item['retrieved_at'] = datetime.utcnow().isoformat()
    item['url'] = response.url
    item['file_name'] = response.url.split('/')[-1]
    item['publication_name'] = self.publication_name
    item['spider_name'] = self.name
    yield item
def parse(self, response):
    """Parse an article page, skipping /opinionistas content."""
    canonical_url = response.xpath(
        '//link[@rel="canonical"]/@href').extract_first()
    # Prefer the canonical URL when the page declares one.
    url = canonical_url if canonical_url else response.url

    if '/opinionistas' in url:
        self.logger.info("Ignoring %s", url)
        return

    title = response.xpath('//div[@class="titles"]/h1/text()').extract_first()
    self.logger.info('%s %s', url, title)

    article_body = response.xpath('//div[@class="article-container"]')
    if not article_body:
        return

    body_html = article_body.extract_first()
    byline = response.xpath('//meta[@name="author"]/@content').extract_first()
    raw_date = response.xpath(
        '//meta[@name="published"]/@content').extract_first()
    published = SAST.localize(datetime.strptime(raw_date, '%Y-%m-%d'))

    item = ScrapenewsItem()
    item['body_html'] = body_html
    item['title'] = title
    item['byline'] = byline
    item['published_at'] = published.isoformat()
    item['retrieved_at'] = datetime.utcnow().isoformat()
    item['url'] = url
    item['file_name'] = url.split('/')[-2]
    item['spider_name'] = self.name
    item['publication_name'] = self.publication_name
    yield item
def parse(self, response):
    """Parse an article page and yield a populated ScrapenewsItem."""
    title = response.xpath('//header/h1/text()').extract_first()
    self.logger.info('%s %s', response.url, title)

    article_body = response.xpath('//div[@itemprop="articleBody"]')
    if not article_body:
        return

    body_html = article_body.extract_first()
    byline = response.xpath(
        '//span[@itemprop="author"]/strong/text()').extract_first()

    # Keep only 'YYYY-MM-DDTHH:MM' (first 16 chars) of the timestamp.
    raw_date = response.xpath(
        '//span[@itemprop="datePublished"]/@content').extract_first()
    raw_date = raw_date.strip()[:16]
    published = SAST.localize(datetime.strptime(raw_date, '%Y-%m-%dT%H:%M'))

    item = ScrapenewsItem()
    item['body_html'] = body_html
    item['title'] = title
    item['byline'] = byline
    item['published_at'] = published.isoformat()
    item['retrieved_at'] = datetime.utcnow().isoformat()
    item['url'] = response.url
    item['file_name'] = response.url.split('/')[-1]
    item['spider_name'] = self.name
    item['publication_name'] = self.publication_name
    yield item
def parse_item(self, response):
    """Parse an article page; only og:type == 'article' pages yield items."""
    canonical_url = response.xpath(
        '//link[@rel="canonical"]/@href').extract_first()
    title = response.css('h2 > span').xpath('text()').extract_first()
    # NOTE(review): consider logging canonical_url instead of response.url.
    self.logger.info('%s %s', response.url, title)

    og_type = response.xpath(
        '//meta[@property="og:type"]/@content').extract_first()
    if og_type != 'article':
        return

    body_html = " ".join(response.css('article.article-full p').extract())
    byline = response.css(
        '.byline span[itemprop="author"] a ::text').extract_first()

    # Date has no time portion; timelib could recover one if needed.
    raw_date = response.xpath(
        '//meta[@itemprop="datePublished"]/@content').extract_first()
    published = SAST.localize(datetime.strptime(raw_date, '%Y-%m-%d'))

    if not body_html:
        self.logger.info("No body found for %s", response.url)
        return

    item = ScrapenewsItem()
    item['body_html'] = body_html
    item['title'] = title
    item['byline'] = byline
    item['published_at'] = published.isoformat()
    item['retrieved_at'] = datetime.utcnow().isoformat()
    item['url'] = canonical_url
    item['file_name'] = response.url.split('/')[-1]
    item['spider_name'] = self.name
    item['publication_name'] = self.publication_name
    yield item
def parse_item(self, response):
    """Parse an article page; only og:type == 'article' pages yield items."""
    canonical_url = response.xpath(
        '//link[@rel="canonical"]/@href').extract_first()
    title = response.xpath('//h1/text()').extract_first()
    # NOTE(review): consider logging canonical_url instead of response.url.
    self.logger.info('%s %s', response.url, title)

    og_type = response.xpath(
        '//meta[@property="og:type"]/@content').extract_first()
    if og_type != 'article':
        return

    article_body = response.css('div.post-content')
    body_html = " ".join(article_body.css('::text').extract())
    byline = response.css('span.author::text').extract_first().strip()

    # The first extracted line is a comment, so take the second entry.
    raw_date = response.css('span.create::text').extract()[1].strip()
    published = SAST.localize(
        datetime.strptime(raw_date, '%d %B %Y, %I:%M %p'))

    if not body_html:
        self.logger.info("No body found for %s", response.url)
        return

    item = ScrapenewsItem()
    item['body_html'] = body_html
    item['title'] = title.strip()
    item['byline'] = byline
    item['published_at'] = published.isoformat()
    item['retrieved_at'] = datetime.utcnow().isoformat()
    item['url'] = canonical_url
    # NOTE(review): file_name is derived from response.url, not canonical_url.
    item['file_name'] = response.url.split('/')[-2]
    item['spider_name'] = self.name
    item['publication_name'] = self.publication_name
    yield item
def parse_item(self, response):
    """Parse an article page; only og:type == 'article' pages yield items.

    Fix: the body paragraphs were selected with ``xpath('//p')`` relative to
    ``div.td-post-content``, but a leading ``//`` is absolute in Scrapy and
    matches every <p> in the whole document. ``.//p`` restricts the match to
    paragraphs inside the post-content div as intended.
    """
    canonical_url = response.xpath(
        '//link[@rel="canonical"]/@href').extract_first()
    title = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
    # NOTE(review): consider logging canonical_url instead of response.url.
    self.logger.info('%s %s', response.url, title)

    og_type = response.xpath(
        '//meta[@property="og:type"]/@content').extract_first()
    if og_type == 'article':
        article_body = response.css('div.td-post-content')
        # './/p' keeps the selection relative to the post-content div.
        body_html = " ".join(article_body.xpath('.//p').extract())
        # The author name is the third text node in the author block.
        byline = response.css('div.td-post-author-name').css(
            '::text').extract()[2]

        # e.g. u'2018-06-14T11:00:00+00:00' — strip the TZ offset and parse.
        publication_date_str = response.xpath(
            '//time/@datetime').extract_first()
        publication_date = datetime.strptime(publication_date_str[0:19],
                                             '%Y-%m-%dT%H:%M:%S')
        publication_date = SAST.localize(publication_date)

        if body_html:
            item = ScrapenewsItem()
            item['body_html'] = body_html
            item['title'] = title
            item['byline'] = byline
            item['published_at'] = publication_date.isoformat()
            item['retrieved_at'] = datetime.utcnow().isoformat()
            item['url'] = canonical_url
            item['file_name'] = response.url.split('/')[-2]
            item['spider_name'] = self.name
            item['publication_name'] = self.publication_name
            yield item
        else:
            self.logger.info("No body found for %s", response.url)
def parse_item(self, response):
    """Parse an article page, skipping premium (subscriber-only) content."""
    canonical_url = response.xpath(
        '//link[@rel="canonical"]/@href').extract_first()
    url = canonical_url if canonical_url else response.url

    title = response.css('h1.article-title-primary').xpath(
        'span/text()').extract_first()
    self.logger.info('%s %s', url, title)

    # Premium articles carry a dedicated alert box; skip them entirely.
    premium_heading = response.css('div.premium-alert').xpath(
        "h3/text()").extract_first()
    if premium_heading == 'This article is reserved for our subscribers.':
        self.logger.info("Ignoring premium content %s", url)
        return

    article_body = response.css('div.article-widget-text')
    if not article_body:
        self.logger.info("No body found for %s", response.url)
        return

    # Articles may be split over several text sections; glue them together.
    body_html = " ".join(article_body.extract())
    byline = response.css('span.heading-author').xpath(
        'text()').extract_first()
    raw_date = response.css('div.article-pub-date').xpath(
        'text()').extract_first().strip()
    published = SAST.localize(
        datetime.strptime(raw_date, '%d %B %Y - %H:%M'))

    item = ScrapenewsItem()
    item['body_html'] = body_html
    item['title'] = title
    item['byline'] = byline
    item['published_at'] = published.isoformat()
    item['retrieved_at'] = datetime.utcnow().isoformat()
    item['url'] = url
    item['file_name'] = url.split('/')[-2]
    item['spider_name'] = self.name

    # The first path segment maps to the publication brand.
    publication_urls = {
        'bd': 'Business Day',
        'fm': 'Financial Mail',
        'rdm': 'Rand Daily Mail',
        'bt': 'Business Times',
        'ft': 'Financial Times'
    }
    url_part = url.split('/')[3]
    item['publication_name'] = publication_urls.get(url_part, 'Business Day')
    yield item
def parse_item(self, response):
    """Parse an article page; only og:type == 'article' pages yield items."""
    # No rel="canonical" link on this site; og:url is the closest thing.
    og_url = response.xpath(
        '//meta[@property="og:url"]/@content').extract_first()
    title = response.xpath('//h1/text()').extract_first()
    # NOTE(review): consider logging og_url instead of response.url.
    self.logger.info('%s %s', response.url, title)

    og_type = response.xpath(
        '//meta[@property="og:type"]/@content').extract_first()
    if og_type != 'article':
        return

    subtitle = response.xpath('//p[@id="article_subtitle"]').css(
        '::text').extract_first()
    photo_caption = response.xpath(
        '//figcaption[@id="article_primary_image_caption"]/text()'
    ).extract_first()
    article_body = " ".join(
        response.xpath('//div[@id="article_body"]').css('::text').extract())

    # Concatenate whichever of subtitle / caption are present before the body.
    body_html = "".join(
        part for part in (subtitle, photo_caption, article_body) if part)

    byline = response.xpath('//a[@rel="author"]/text()').extract_first()
    raw_date = response.xpath('//time/@datetime').extract_first()
    published = SAST.localize(datetime.strptime(raw_date, '%Y-%m-%d'))

    if not body_html:
        self.logger.info("No body found for %s", response.url)
        return

    item = ScrapenewsItem()
    item['body_html'] = body_html
    item['title'] = title
    item['byline'] = byline
    item['published_at'] = published.isoformat()
    item['retrieved_at'] = datetime.utcnow().isoformat()
    item['url'] = og_url
    # NOTE(review): file_name is derived from response.url, not og_url.
    item['file_name'] = response.url.split('/')[-2]
    item['spider_name'] = self.name
    item['publication_name'] = self.publication_name
    yield item
def parse_item(self, response):
    """Parse an article page; only whitelisted news categories yield items."""
    # Only articles filed under "news-headlines" are actual news.
    if not response.css('.post-categories').xpath(
            'li/a[contains(@href, "news-headlines")]'):
        self.logger.info("Skipping non-news article %s", response.url)
        return

    canonical_url = response.xpath(
        '//link[@rel="canonical"]/@href').extract_first()
    title = response.xpath(
        '//h1[@class="entry-title"]/text()').extract_first()
    og_type = response.xpath(
        '//meta[@property="og:type"]/@content').extract_first()
    if og_type != 'article':
        return

    body_html = response.css('div.entry-content').extract_first()
    byline = response.css('div.author-name').css('::text').extract_first()

    # e.g. u'2018-06-14T11:00:00+00:00' — strip the TZ offset and parse.
    raw_date = response.xpath('//time/@datetime').extract_first()
    published = SAST.localize(
        datetime.strptime(raw_date[0:19], '%Y-%m-%dT%H:%M:%S'))

    if not body_html:
        self.logger.info("No body found for %s", canonical_url)
        return

    item = ScrapenewsItem()
    item['body_html'] = body_html
    item['title'] = title
    item['byline'] = byline
    item['published_at'] = published.isoformat()
    item['retrieved_at'] = datetime.utcnow().isoformat()
    item['url'] = canonical_url
    item['file_name'] = response.url.split('/')[-2]
    item['spider_name'] = self.name
    item['publication_name'] = self.publication_name
    yield item
def parse(self, response):
    """Parse an article page, skipping excluded sections and syndicated copy."""
    canonical_url = response.xpath(
        '//link[@rel="canonical"]/@href').extract_first()

    # Skip sections we deliberately ignore (e.g. sport, lifestyle).
    section = response.css('a.section').xpath('text()').extract_first()
    if section and section.lower() in IGNORE_SECTIONS:
        self.logger.info("Skipping %s because section is %s", canonical_url,
                         section)
        return

    # Syndicated articles end with a known attribution string; check the
    # last 20 characters of the plain text for any of them.
    body_html = "".join(response.css("#body_content p").extract())
    body_text = remove_tags(body_html)
    suffix = body_text[-20:]
    for marker in SKIP_STRINGS:
        if marker in suffix:
            self.logger.info("Skipping %s because suffix %r contains %r",
                             canonical_url, suffix, marker)
            return

    raw_date = response.xpath(
        '//meta[@name="publicationdate"]/@content').extract_first()
    published = SAST.localize(datetime.strptime(raw_date, '%d/%m/%Y'))

    item = ScrapenewsItem()
    item['body_html'] = response.css("#body_content").extract_first()
    item['title'] = response.xpath(
        '//meta[@name="title"]/@content').extract_first()
    item['byline'] = response.xpath(
        '//meta[@name="author"]/@content').extract_first()
    item['published_at'] = published.isoformat()
    item['retrieved_at'] = datetime.utcnow().isoformat()
    item['url'] = canonical_url
    item['file_name'] = response.url.split('/')[-1]
    item['spider_name'] = self.name
    item['publication_name'] = self.publication_name
    yield item
def parse(self, response):
    """Parse a /News/ article page and yield a populated ScrapenewsItem.

    Fix: removed a stray ``self.logger.info("")`` at the end of the method,
    which emitted an empty log record on every successful parse.
    """
    # Only URLs under /News/ are articles we care about.
    if '/News/' not in response.url:
        self.logger.info("Ignoring %s", response.url)
        return

    title = response.xpath(
        '//div[contains(@class, "article_details")]/h1/text()'
    ).extract_first()
    self.logger.info('%s %s', response.url, title)

    article_body = response.xpath('//article[@id="article-body"]')
    if article_body:
        body_html = article_body.extract_first()
        byline = response.xpath(
            '//div[contains(@class, "ByLineWidth")]/p/text()'
        ).extract_first()
        publication_date_str = response.xpath(
            '//span[@id="spnDate"]/text()').extract_first()
        # Accreditation link (e.g. a wire service) credited alongside us.
        accreditation = response.xpath(
            '//div[contains(@class, "ByLineWidth")]/div[contains(@class, "accreditation")]/a/@href'
        ).extract_first()
        publication_date = datetime.strptime(publication_date_str,
                                             '%Y-%m-%d %H:%M')
        publication_date = SAST.localize(publication_date)

        item = ScrapenewsItem()
        item['body_html'] = body_html
        item['title'] = title
        item['byline'] = byline
        item['published_at'] = publication_date.isoformat()
        item['retrieved_at'] = datetime.utcnow().isoformat()
        item['url'] = response.url
        item['file_name'] = response.url.split('/')[-1]
        item['spider_name'] = self.name
        item['publication_name'] = self.publication_name
        if accreditation:
            # accreditation is an href like '/wire-service'; drop the slash.
            item['publication_name'] += " with " + accreditation[1:]
        yield item
def parse_item(self, response):
    """Parse an article page; only og:type == 'article' pages yield items.

    Fix: the body paragraphs were selected with ``xpath('//p')`` relative to
    ``div.article-body``, but a leading ``//`` is absolute in Scrapy and
    matches every <p> in the whole document. ``.//p`` restricts the match to
    paragraphs inside the article body as intended.
    """
    canonical_url = response.xpath(
        '//link[@rel="canonical"]/@href').extract_first()
    title = response.xpath('//h1/text()').extract_first()
    # NOTE(review): consider logging canonical_url instead of response.url.
    self.logger.info('%s %s', response.url, title)

    og_type = response.xpath(
        '//meta[@property="og:type"]/@content').extract_first()
    if og_type == 'article':
        article_body = response.css('div.article-body')
        # './/p' keeps the selection relative to the article-body div.
        body_html = " ".join(
            article_body.xpath('.//p').css('::text').extract())
        byline = response.xpath(
            '//strong[@itemprop="name"]/text()').extract_first()

        # e.g. '2020-01-30T08:22:00.000Z' — strip millis/Z before parsing.
        publication_date_str = response.xpath(
            '//meta[@itemprop="datePublished"]/@content').extract_first()
        publication_date = datetime.strptime(publication_date_str[:19],
                                             '%Y-%m-%dT%H:%M:%S')
        publication_date = SAST.localize(publication_date)

        if body_html:
            item = ScrapenewsItem()
            item['body_html'] = body_html
            item['title'] = title
            item['byline'] = byline
            item['published_at'] = publication_date.isoformat()
            item['retrieved_at'] = datetime.utcnow().isoformat()
            item['url'] = canonical_url
            item['file_name'] = response.url.split('/')[-1]
            item['publication_name'] = self.publication_name
            item['spider_name'] = self.name
            yield item
        else:
            self.logger.info("No body found for %s", response.url)
def parse_item(self, response):
    """Parse an article page and yield a populated ScrapenewsItem."""
    title = response.css('header.article-header h1').xpath(
        'text()').extract_first()
    self.logger.info('%s %s', response.url, title)

    # Published date is stored verbatim as provided by the page.
    publication_date = response.css('.article-meta time').xpath(
        '@datetime').extract_first()
    body_html = response.css('.article-text').extract_first()

    if not body_html:
        self.logger.info("No body found for %s", response.url)
        return

    item = ScrapenewsItem()
    item['body_html'] = body_html
    item['title'] = title
    item['published_at'] = publication_date
    item['retrieved_at'] = datetime.utcnow().isoformat()
    item['url'] = response.url
    item['file_name'] = response.url.split('/')[-1]
    item['publication_name'] = self.publication_name
    item['spider_name'] = self.name
    yield item
def parse(self, response):
    """Parse an article page and yield a populated ScrapenewsItem."""
    canonical_url = response.xpath(
        '//link[@rel="canonical"]/@href').extract_first()
    title = response.xpath('//h1/span/text()').extract_first()
    self.logger.info('%s %s', response.url, title)

    article_body = response.css('div.article-widget-text')
    if not article_body:
        return

    # Articles may be split over several text sections; glue them together.
    body_html = " ".join(article_body.extract())

    # The byline text starts with "By "; drop that first word.
    author_parts = response.xpath(
        '//span[@class="heading-author"]/text()').extract()
    author_text = " ".join(author_parts)
    byline = " ".join(author_text.split(" ")[1:])

    # Raw date looks like '03 May 2018 - 19:17'.
    raw_date = response.css('div.article-pub-date').xpath(
        'text()').extract()[0].strip()
    published = SAST.localize(
        datetime.strptime(raw_date, '%d %B %Y - %H:%M'))

    item = ScrapenewsItem()
    item['body_html'] = body_html
    item['title'] = title
    item['byline'] = byline
    item['published_at'] = published.isoformat()
    item['retrieved_at'] = datetime.utcnow().isoformat()
    item['url'] = canonical_url
    item['file_name'] = response.url.split('/')[-2]
    item['spider_name'] = self.name
    item['publication_name'] = self.publication_name
    yield item