def parse(self, response):
    item = NewsItem()
    page_type = response.xpath('//meta[@property="og:type"]//@content').extract_first()
    if page_type is None or "article" not in page_type:
        return
    item['url'] = response.url
    item['date'] = parse(
        response.xpath('//*[@id="article-feed"]/article[1]//span[@class="timestamp"]').extract()[0],
        fuzzy=True).strftime("%Y-%m-%dT%H:%M:%S")
    try:
        item['author'] = " ".join(
            response.xpath('//*[@id="article-feed"]/article[1]//div[@class="author"]//text()')
            .extract()).strip()
    except IndexError:
        item['author'] = ''
    item['title'] = response.xpath('//meta[@property="og:title"]//@content').extract()[0].strip()
    item['description'] = response.xpath(
        '//meta[@property="og:description"]//@content').extract_first().rstrip()
    item['content'] = remove_unicode(' '.join(response.xpath(
        '//*[@id="article-feed"]/article[1]//*[@class="article-body"]//*[@itemprop="articleBody"]//text()'
    ).extract()).rstrip())
    yield item
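# These callbacks depend on a few names defined elsewhere in the project. A
# minimal sketch of the likely definitions, stated as assumptions: `parse` is
# taken to be dateutil's fuzzy date parser, NewsItem a plain Scrapy item, and
# remove_unicode an ASCII-folding helper mirroring the
# .encode('ascii', 'ignore') idiom used by the other spiders below.
import re
from dateutil.parser import parse  # assumed import
from scrapy import Item, Field

class NewsItem(Item):
    # hypothetical field set, inferred from the assignments in this file;
    # the real item very likely defines more fields (tags, category, ...)
    url = Field()
    date = Field()
    author = Field()
    title = Field()
    description = Field()
    content = Field()

def remove_unicode(text):
    # replace any non-ASCII character with a space
    return re.sub(r'[^\x00-\x7f]', ' ', text) if text else text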
def parse_item_page(self, response):
    item_data = {
        "title": remove_unicode(
            response.xpath('//meta[@property="og:title"]/@content').extract()[0].strip()),
        "author": " ".join(
            response.xpath('//span[@class="author"]//text()').extract()[1:-1]).strip(),
        "date": parse(
            response.xpath('//meta[@property="article:published_time"]/@content')
            .extract()[0].strip(),
            fuzzy=True).strftime("%Y-%m-%dT%H:%M:%S"),
        "description": remove_unicode(
            response.xpath('//meta[@property="og:description"]/@content').extract()[0].strip()),
        "content": self._get_content(response),
        "url": response.url,
    }
    yield NewsItem(**item_data)
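# self._get_content is called above but not defined in this section. A
# hypothetical sketch of what it plausibly does, by analogy with the other
# spiders here (the XPath is an assumption, not the site's actual markup):
def _get_content(self, response):
    texts = response.xpath('//div[@itemprop="articleBody"]//text()').extract()
    return remove_unicode(' '.join(texts).strip())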
def parse_item(self, response):
    super(NextBigWhatSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath(".//header[contains(@class, 'entry-header')]/h1/text()")
        details = tree.xpath('.//div[contains(@class, "herald-entry-content")]/p/text()')
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [item.strip().encode('ascii', 'ignore') for item in details if item.strip()])
            img_urls = tree.xpath(
                './/div[contains(@class, "herald-post-thumbnail herald-post-thumbnail-single")]/span/img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
            meta_result = self.get_meta(tree)
            if 'description' in meta_result:
                news_item['blurb'] = meta_result['description']
            return news_item
    except:
        pass
    return None
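# get_stripped_list and self.get_meta come from shared helpers / a base spider
# that are not shown here. Minimal sketches, assuming only the behaviour the
# call sites imply: get_stripped_list trims whitespace from each extracted
# value, and get_meta maps <meta> name/property attributes to content values.
def get_stripped_list(values):
    # keep non-empty entries, whitespace-trimmed
    return [v.strip() for v in values if v and v.strip()]

def get_meta(self, tree):  # sketch of the assumed base-spider method
    meta = {}
    for node in tree.xpath('//meta'):
        key = node.get('property') or node.get('name')
        if key and node.get('content') is not None:
            meta[key] = node.get('content')
    return meta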
def parse_item(self, response):
    super(FinancialExpressSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    self.log('==RESPONSE=================>>>>>>>>! %s' % response.request.headers['Referer'])
    referer = response.request.headers['Referer']
    news_item = NewsItem()
    try:
        title = tree.xpath(".//meta[@itemprop='headline']/@content")
        details = tree.xpath(".//div[@itemprop='articleBody']//p//text()")
        if title and details:
            news_item['source'] = self.name
            news_item['source_url'] = response.url.split('?')[0]
            news_item['crawled_date'] = datetime.now()
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [ele.strip().encode('ascii', 'ignore') for ele in details])
            img_urls = tree.xpath(
                ".//div[@itemprop='articleBody']//img[contains(@class,'size-full')]/@src")
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
                news_item['cover_image'] = img_urls[0]
            meta_result = self.get_meta(tree)
            if 'og:image' in meta_result:
                news_item['cover_image'] = meta_result['og:image']
            if 'og:description' in meta_result:
                news_item['blurb'] = meta_result['og:description']
                news_item['blurb'] = news_item['blurb'].strip().encode('ascii', 'ignore')
            if 'og:updated_time' in meta_result:
                news_item['published_date'] = datetime.strptime(
                    meta_result['og:updated_time'].split("+")[0], '%Y-%m-%dT%H:%M:%S')
            authors = tree.xpath(".//meta[@itemprop='author']/@content")
            if authors:
                news_item['author'] = get_stripped_list(authors)
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
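# `categories` is a module-level lookup most spiders below use to map the
# request's Referer back to a category and sub-category. A hypothetical
# example of its assumed shape (real labels and URLs will differ per project):
categories = [
    {
        'category': 'business',
        'subcategory': {
            'markets': ['http://example.com/markets'],
            'economy': ['http://example.com/economy'],
        },
    },
]
# sum(item['subcategory'].values(), []) flattens the URL lists, and the
# generator's Python 2 .next() call picks the first sub-category whose URL
# list contains the referer.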
def parse_item(self, response):
    filedir = self.pre_write_check()
    filename = os.path.join(filedir, md5(response.url).hexdigest())
    if not os.path.exists(filename):
        with open(filename, "wb") as html:
            html.write(response.body)
    else:
        print "skipped file {0}".format(filename)
        return None
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('.//h1[contains(@class,"post-tile entry-title")]/text()')
        details = tree.xpath('//div[contains(@class,"entry-content")]/p//text()')
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [ele.strip().encode('ascii', 'ignore') for ele in details])
            img_urls = tree.xpath('.//div[contains(@class,"feature-img")]/img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
                news_item['cover_image'] = img_urls[0]
            blurb = tree.xpath('.//div[@class="entry-content"]/p/em/text()')
            if blurb:
                news_item['blurb'] = blurb[0].strip().encode('ascii', 'ignore')
            # TODO: author, tags
            tags = tree.xpath(
                './/div[contains(@class,"mom-post-meta single-post-meta")]/span[3]/a//text()')
            if tags:
                news_item['tags'] = tags
            published_date = tree.xpath('.//span//time[contains(@class,"updated")]//text()')
            if published_date:
                news_item['published_date'] = datetime.strptime(
                    " ".join([item.strip().encode('ascii', 'ignore')
                              for item in published_date]),
                    '%B %d, %Y')
            author = tree.xpath('.//span[contains(@class,"fn")]/a/text()')
            if author:
                news_item['author'] = author
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
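# pre_write_check and md5 are assumed helpers: the spider caches each page on
# disk under an md5 of its URL so already-seen pages can be skipped. A minimal
# sketch, assuming hashlib's md5 and a cache directory named after the spider:
import os
from hashlib import md5

def pre_write_check(self):
    # create (if needed) and return the per-spider cache directory
    filedir = os.path.join('data', self.name)
    if not os.path.exists(filedir):
        os.makedirs(filedir)
    return filedir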
def parse_item(self, response):
    super(MoneycontrolSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('.//h1[contains(@class, "arti_title")]/text()')
        details = tree.xpath(
            './/div[contains(@class, "MT20")]//p//text()[not(ancestor::script)]')
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = '\t'.join(
                [item.strip().encode('ascii', 'ignore').decode('unicode_escape')
                 for item in details if item.strip()])
            img_urls = tree.xpath('.//table[contains(@class,"MR15")]//div/img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
                news_item['cover_image'] = img_urls[0]
            tags = tree.xpath('.//div[contains(@class, "tag_wrap MT20")]/a//text()')
            if tags:
                news_item['tags'] = get_stripped_list(tags)
            published_date = tree.xpath('.//p[contains(@class, "arttidate MT15")]//text()')
            if published_date:
                if '|' in published_date[0]:
                    news_item['published_date'] = datetime.strptime(
                        published_date[0].split('|')[0].strip().encode('ascii', 'ignore'),
                        '%b %d, %Y, %I.%M %p')
                else:
                    news_item['published_date'] = datetime.strptime(
                        published_date[0].strip().encode('ascii', 'ignore'),
                        '%b %d, %Y, %I.%M %p')
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
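# For reference, the Moneycontrol timestamp format above matches strings like
# "Jun 23, 2016, 05.30 PM". A quick sanity check with a hypothetical value:
from datetime import datetime
assert datetime.strptime('Jun 23, 2016, 05.30 PM',
                         '%b %d, %Y, %I.%M %p') == datetime(2016, 6, 23, 17, 30)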
def parse_item(self, response):
    super(EntrepreneurSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('.//h1[contains(@class,"headline")]//text()')
        details = tree.xpath('.//div[contains(@class,"bodycopy")]//p//text()')
        if title and details:
            news_item['source'] = self.name
            news_item['source_url'] = response.url.split('?')[0]
            news_item['crawled_date'] = datetime.now()
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [det.strip().encode('ascii', 'ignore') for det in details])
            img_urls = tree.xpath('.//div[contains(@class,"hero topimage")]/img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
                news_item['cover_image'] = img_urls[0]
            blurb = tree.xpath('.//div[contains(@class,"bodycopy")]/p/text()')
            news_item['blurb'] = " ".join(
                [short_blurb.strip().encode('ascii', 'ignore')
                 for short_blurb in blurb[0:1]])
            published_date = tree.xpath('.//time[contains(@itemprop,"datePublished")]//text()')
            if published_date:
                news_item['published_date'] = datetime.strptime(
                    published_date[0].strip(), '%B %d, %Y')
            tags = tree.xpath('.//div[contains(@class,"article-tags")]/a/text()')
            if tags:
                news_item['tags'] = get_stripped_list(tags)
            author = tree.xpath('.//div[contains(@itemprop,"name")]/text()')
            if author:
                news_item['author'] = get_stripped_list(author)
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
def parse_item(self, response):
    filedir = self.pre_write_check()
    filename = os.path.join(filedir, md5(response.url).hexdigest())
    with open(filename, "wb") as html:
        html.write(response.body)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath(
            './/div[contains(@class,"large-12 columns article-title")]//h1//text()')
        details = tree.xpath('//html/body/div/div/article/div//p//text()')
        if title and details:
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            details = [ele.strip().encode('ascii', 'ignore') for ele in details]
            news_item['details'] = "\t".join(details)
            img_urls = tree.xpath('.//img[contains(@class,"article-hero-img")]/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
            cover_image = tree.xpath('.//img[contains(@class,"article-hero-img")]//img/@src')
            if cover_image:
                news_item['cover_image'] = cover_image
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            author = tree.xpath('.//div[contains(@class,"author")]/h4/a/text()')
            news_item['author'] = author
            published_date = tree.xpath('.//div[contains(@class,"datetime")]/h2/span/text()')
            news_item['published_date'] = published_date
            news_item['tags'] = tree.xpath('.//div[contains(@class,"tags")]//a/text()')
            meta_result = self.get_meta(tree)
            if 'og:image' in meta_result:
                news_item['cover_image'] = meta_result['og:image']
            if 'og:description' in meta_result:
                news_item['blurb'] = meta_result['og:description']
                news_item['blurb'] = news_item['blurb'].strip().encode('ascii', 'ignore')
            return news_item
    except:
        pass
    return None
def parse_item(self, response): filedir = self.pre_write_check() filename = os.path.join(filedir, md5(response.url).hexdigest()) with open(filename, "wb") as html: html.write(response.body) htmlparser = etree.HTMLParser() tree = etree.parse(BytesIO(response.body), htmlparser) news_item = NewsItem() try: title = tree.xpath('.//h1[contains(@class,"arti_heading")]/text()') details = tree.xpath('.//div[@id=\'arti_content_n\']//p/text()') if title and details: news_item['title'] = title[0].strip().encode('ascii','ignore') details = [x.strip().encode('ascii','ignore') for x in details if x.strip()] details = "\t".join(details).strip() news_item['details'] = details news_item['source'] = self.name news_item['crawled_date'] = datetime.now() news_item['source_url'] = response.url img_urls = tree.xpath('.//div[@id=\'arti_content_n\']/p/strong/img/@src') if img_urls: news_item['img_urls'] = get_stripped_list(img_urls) news_item['cover_image'] = img_urls[0] tags = tree.xpath('.//div[contains(@id, "tags_div")]//a/text()') if tags: news_item['tags'] = get_stripped_list(tags) author = tree.xpath('.//span[contains(@class, "grey1")]/a/text()') authorname = tree.xpath('.//span[contains(@class, "grey1")]/text()') if author: author = [x.strip().encode('ascii','ignore')for x in author] author = " ".join(author).strip() news_item['author'] = get_stripped_list(author) if authorname: authorname = [x.strip().encode('ascii','ignore')for x in authorname] authorname = " ".join(authorname).strip() news_item['author'] = get_stripped_list(authorname) published_date = tree.xpath('.//div[contains(@class, "sm1 grey1")]/text()') if published_date: pub_date = published_date[0] news_item['published_date'] = datetime.strptime(pub_date.split('IST')[0].strip().encode('ascii','ignore') if 'IST' in pub_date else pub_date, '%B %d, %Y %H:%M') referer = response.request.headers['Referer'] for item in categories: if referer in sum(item['subcategory'].values(), []): news_item['category'] = item['category'] key = (key for key,value in item['subcategory'].items() if referer in value).next() news_item['sub_categories'] = [key] return news_item except Exception, e: self.log('==Exception=================>>>>>>>>! %r' % e)
def parse(self, response):
    item = NewsItem()
    lang = response.xpath(
        '//*[@id="responsive-news"]//meta[@property="og:locale"]//@content').extract_first()
    page_type = response.xpath(
        '//*[@id="responsive-news"]//meta[@property="og:type"]//@content').extract_first()
    if lang is None or "en" not in lang or page_type is None or "article" not in page_type:
        return
    item['url'] = response.url
    try:
        item['date'] = datetime.utcfromtimestamp(float(
            response.xpath(
                '//div[@class="story-body"]//div[contains(@class,"date date--v2")]//@data-seconds'
            ).extract_first())).strftime("%Y-%m-%dT%H:%M:%S")
    except TypeError:
        item['date'] = ''
    try:
        _author = response.xpath('//*//span[@class="byline__name"]//text()').extract_first()
        if _author is None:
            item['author'] = 'BBC News'
        else:
            _author_split = _author.split(" ")
            if _author_split[0] == "By":
                _author = " ".join(_author_split[1:])
            item['author'] = _author + " | BBC News"
        # (the article:author meta is not used here: it returns
        # https://www.facebook.com/bbcnews rather than a name)
    except IndexError:
        item['author'] = 'BBC News'
    item['title'] = response.xpath(
        '//*[@id="responsive-news"]//meta[@property="og:title"]//@content'
    ).extract_first().strip()
    item['description'] = response.xpath(
        '//*[@id="responsive-news"]//meta[@property="og:description"]//@content'
    ).extract_first().rstrip()
    item['content'] = remove_unicode(' '.join(
        response.xpath(
            '//div[@class="story-body"]//div[@property="articleBody"]//p//text()'
        ).extract()).rstrip())
    yield item
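# The BBC callback above converts the epoch seconds in @data-seconds into an
# ISO-like string. A quick illustration with a hypothetical timestamp:
from datetime import datetime
assert datetime.utcfromtimestamp(float("1466689742")).strftime(
    "%Y-%m-%dT%H:%M:%S") == "2016-06-23T13:49:02"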
def parse_item(self, response):
    super(SmallBizTrendsSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath(".//div[@class='post-inner']/h1/text()")
        details = tree.xpath('.//div[@class="entry"]/p/text()')
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().decode('unicode_escape').encode('ascii', 'ignore')
            news_item['details'] = '\t'.join(
                [item.strip().encode('ascii', 'ignore').decode('unicode_escape')
                 for item in details if item.strip()])
            if tree.xpath('.//span[@class="full-span-featured-image"]/span/img/@src'):
                news_item['img_urls'] = tree.xpath(
                    './/span[@class="full-span-featured-image"]/span/img/@src')
            elif tree.xpath('.//img[contains(@class,"size-full")]/@src'):
                news_item['img_urls'] = tree.xpath(
                    './/img[contains(@class,"size-full")]/@src')
            elif tree.xpath('.//img[contains(@class,"aligncenter")]/@src'):
                news_item['img_urls'] = tree.xpath(
                    './/img[contains(@class,"aligncenter")]/@src')
            meta_result = self.get_meta(tree)
            if 'description' in meta_result:
                news_item['blurb'] = meta_result['description']
            published_date = tree.xpath('.//span[contains(@class,"article-date")]/text()')
            if published_date:
                news_item['published_date'] = datetime.strptime(
                    published_date[0], '%b %d, %Y')
            author = tree.xpath('.//span[contains(@itemprop,"name")]/a/text()')
            if author:
                news_item['author'] = author
            return news_item
    except:
        pass
    return None
def parse_item(self, response):
    super(HuffingtonPostSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('.//h1[contains(@class,"title")]//text()')
        details = tree.xpath('.//div[contains(@class,"content")]//p//text()')
        if title and details:
            news_item['source'] = self.name
            news_item['source_url'] = response.url.split('?')[0]
            news_item['crawled_date'] = datetime.now()
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [ele.strip().encode('ascii', 'ignore') for ele in details])
            img_urls = tree.xpath('.//div[contains(@class,"top-media--image image")]/img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
            cover_image = tree.xpath('.//span[contains(@class,"img-caption")]//img/@src')
            if cover_image:
                news_item['cover_image'] = get_stripped_list(cover_image)[0]
            meta_result = self.get_meta(tree)
            if 'og:image' in meta_result:
                news_item['cover_image'] = meta_result['og:image']
            if 'og:description' in meta_result:
                news_item['blurb'] = meta_result['og:description']
                news_item['blurb'] = news_item['blurb'].strip().encode('ascii', 'ignore')
            published_date = tree.xpath('.//div[contains(@class,"timestamp")]/span/text()')
            if published_date:
                pub_date = published_date[0].strip()
                news_item['published_date'] = datetime.strptime(
                    pub_date.split('IST')[0].strip() if 'IST' in pub_date else pub_date,
                    '%d/%m/%Y %I:%M %p')
            author = tree.xpath('.//a[contains(@class,"author-card__details__name")]/text()')
            if author:
                news_item['author'] = author[0].strip().encode('ascii', 'ignore')
            tags = tree.xpath('.//div[contains(@class,"tag-cloud")]/a/text()')
            if tags:
                news_item['tags'] = [x.strip().encode('ascii', 'ignore') for x in tags]
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
def parse_item(self, response):
    super(VentureBeatSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('//h1[contains(@class,"article-title")]//text()')
        details = tree.xpath('//div[contains(@class,"article-content")]/p//text()')
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [ele.strip().encode('ascii', 'ignore') for ele in details])
            img_urls = tree.xpath('//div[contains(@class,"article-content")]//img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
                news_item['cover_image'] = img_urls[0]
            published_date = tree.xpath('.//time[contains(@class,"the-time")]/text()')
            if published_date:
                news_item['published_date'] = datetime.strptime(
                    published_date[0], '%B %d, %Y %I:%M %p')
            author = tree.xpath('.//a[contains(@class,"author url fn")]/text()')
            if author:
                news_item['author'] = get_stripped_list(author)
            tags = tree.xpath('.//div[contains(@class,"article-tags")]/a/text()')
            if tags:
                news_item['tags'] = get_stripped_list(tags)
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
        return None
def parse_item(self, response):
    super(NdtvSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('.//h1[@itemprop="headline"]//text()')
        details = tree.xpath('.//div[contains(@class, "pdl200")]//text()[not(ancestor::script)]')
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = '\t'.join(
                [item.strip().encode('ascii', 'ignore').decode('unicode_escape')
                 for item in details if item.strip()])
            img_urls = tree.xpath('.//div[contains(@class,"whosaid_top_mainimg_cont")]/img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
                news_item['cover_image'] = get_stripped_list(img_urls)[0]
            published_date = tree.xpath('.//div[contains(@class, "dateline")]/text()')
            if published_date:
                date_str = published_date[0].replace("(IST)", "").strip().split(":", 1)[1]
                news_item['published_date'] = parse(date_str)
            tags = tree.xpath('.//p[contains(@class, "alltags")]/a/text()')
            if tags:
                news_item['tags'] = get_stripped_list(tags)
            author = tree.xpath('.//div[contains(@class, "dateline")]/a/text()')
            if author:
                news_item['author'] = get_stripped_list(author)
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
def parse_item(self, response):
    super(Thehindubusiness, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('.//h1[@class="title"]/text()')
        details = tree.xpath('.//div[starts-with(@id,"content-body-14269002")]//p//text()')
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [x.strip().encode('ascii', 'ignore') for x in details]).strip()
            img_urls = tree.xpath('.//div[@class="img-container picture"]/img/@data-proxy-image')
            other_img_urls = tree.xpath('.//div[contains(@id,"hcenter")]/img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
            if other_img_urls:
                news_item['img_urls'] = get_stripped_list(other_img_urls)
            cover_image = tree.xpath('.//div[@class="img-container picture"]/img/@data-proxy-image')
            if cover_image:
                news_item['cover_image'] = cover_image[0].strip()
            tags = tree.xpath('.//div[contains(@id, "articleKeywords")]/p//a/text()')
            if tags:
                news_item['tags'] = get_stripped_list(tags)
            published_date = tree.xpath('.//div[@class="teaser-text update-time"]/span/none/text()')
            if published_date:
                date_str = published_date[0].replace("IST", "").strip()
                news_item['published_date'] = parse(date_str)
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
        return None
def parse_item(self, response):
    super(PandoSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('.//div[contains(@class,"shim")]/h1//text()')
        details = tree.xpath('.//div[contains(@class,"contains-copy excerpt")]//p//text()')
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [det.strip().encode('ascii', 'ignore') for det in details])
            img_urls = tree.xpath('.//p[contains(@id,"featured-image")]/img/@src')
            if img_urls:
                news_item['img_urls'] = img_urls
            blurb = tree.xpath('.//div[contains(@class,"contains-copy excerpt")]/p/text()')
            if blurb:
                news_item['blurb'] = " ".join(
                    [part.strip().encode('ascii', 'ignore') for part in blurb])
            cover_image = tree.xpath('.//p[contains(@id,"featured-image")]/img/@src')
            if cover_image:
                news_item['cover_image'] = cover_image
            published_date = tree.xpath('//*[@id="byline"]/span/text()')
            if published_date:
                news_item['published_date'] = datetime.strptime(
                    published_date[1].split('\n')[1].strip(), '%B %d, %Y')
            author = tree.xpath('.//p[contains(@id,"byline")]/span//a/text()')
            if author:
                news_item['author'] = author[1].split('By')[1].strip()
            return news_item
    except:
        pass
    return None
def parse(self, response): article = response.xpath('//article[@itemtype="https://schema.org/NewsArticle"]') if article is None: return item = NewsItem() item['url'] = article.xpath('//meta[@itemprop="url"]/@content').extract_first() if item['url'] is None: return title = article.xpath('//meta[@itemprop="headline"]/@content').extract_first() if title is None: return index = title.index(' - CNN') if index >= 0: title = title[0:index] item['title'] = remove_unicode(title) item['description'] = remove_unicode(article.xpath('//meta[@itemprop="description"]/@content').extract_first()) if item['description'] is None: return date = article.xpath('//meta[@itemprop="dateCreated"]/@content').extract_first() if date is None: return item['date'] = parse(date).strftime("%Y-%m-%dT%H:%M:%S") if item['date'] is None: return item['author'] = remove_unicode(article.xpath('//meta[@itemprop="author"]/@content').extract_first()) if item['author'] is None: return articleBody = response.xpath('//article[@itemprop="articleBody"]') if articleBody is None: return paragraphs = response.xpath('//div[@class="zn-body__paragraph speakable"]') paragraphs.extend(response.xpath('//div[@class="zn-body__paragraph"]')) if len(paragraphs) == 0: return content = [] for p in paragraphs: content.extend(p.xpath('string()').extract()) item['content'] = remove_unicode(' '.join(content)) yield item
def parse_item(self, response):
    super(ETSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('.//h1[contains(@class, "title")]/text()[1]')
        details = tree.xpath('.//div[@class="Normal"]//text()')
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().decode('unicode_escape').encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [item.strip().encode('ascii', 'ignore') for item in details if item.strip()])
            news_item['cover_image'] = ''
            news_item['blurb'] = ''
            news_item['img_urls'] = []
            img_urls = tree.xpath('.//figure/img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
            meta_result = self.get_meta(tree)
            if 'og:image' in meta_result:
                news_item['cover_image'] = meta_result['og:image']
            if 'og:description' in meta_result:
                news_item['blurb'] = meta_result['og:description']
                news_item['blurb'] = news_item['blurb'].decode('unicode_escape').encode('ascii', 'ignore')
            published_date = tree.xpath('.//div[contains(@class,"byline")]/text()')
            self.log('==Pub date=================>>>>>>>>! %r' % published_date)
            print "pb------------------->", published_date
            if published_date:
                news_item['author'] = published_date[0].split('|')[0].strip()
                date_str = (published_date[0].split(":")[1:])[0].replace("IST", "").strip()
                news_item['published_date'] = datetime.strptime(date_str, '%b %d, %Y, %I.%M %p')
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
def parse_item(self, response):
    super(DealCurrySpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath(".//h1/text()")
        details = tree.xpath('.//div[contains(@class, "articleSpacer")]/p//text()')
        if title and details:
            news_item['source_url'] = response.url.split('?')[0]
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [x.strip().encode('ascii', 'ignore') for x in details]).strip()
            tags = tree.xpath(
                './/div[contains(@style, "padding-bottom:10px")]'
                '/span[contains(@style, "color:#346f9a; float:left; text-align:left")]/a/text()')
            news_item['tags'] = tags[0].strip().encode('ascii', 'ignore')
            published_date = tree.xpath(
                ".//span[contains(@style, 'color:#6b6b6b;float:left; text-align:left; margin-left:5px')]/text()")
            news_item['published_date'] = datetime.strptime(
                published_date[0].encode('ascii', 'ignore'), '%d %B %Y')
            author = tree.xpath(
                './/div[contains(@style, "")]'
                '/span[contains(@style, "color:#6b6b6b; float:left; text-align:left;")]/text()')
            news_item['author'] = author[0].split('by')[1].strip().encode('ascii', 'ignore')
            img_urls = tree.xpath('.//div[contains(@style, "padding-bottom:10px")]/img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
            meta_result = self.get_meta(tree)
            if 'description' in meta_result:
                news_item['blurb'] = meta_result['description']
            return news_item
    except:
        pass
    return None
def parse(self, response):
    item_data = {
        "title": remove_unicode(
            response.xpath('//meta[@name="dc.title"]/@content').extract()[0].strip()),
        "author": " ".join(
            response.xpath('//*[@class="article-source"]//text()').extract()).strip(),
        "date": parse(
            response.xpath('//meta[@name="dc.date"]/@content').extract()[0],
            fuzzy=True).strftime("%Y-%m-%dT%H:%M:%S"),
        "description": remove_unicode(
            response.xpath('//meta[@name="dc.description"]/@content').extract()[0].strip()),
        "content": remove_unicode(
            ' '.join(response.xpath('//*[@class="article-body"]/p//text()').extract()).strip()),
        "url": response.url,
    }
    yield NewsItem(**item_data)
def parse_item(self, response):
    super(ReutersSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('.//h1[contains(@class,"article-headline")]/text()')
        details = tree.xpath('//*[@id="article-text"]//text()')
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [det.strip().encode('ascii', 'ignore') for det in details])
            img_urls = tree.xpath('.//div[contains(@class,"related-photo-container")]/img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
                news_item['cover_image'] = img_urls[0]
            blurb = tree.xpath('.//div[contains(@class,"related-photo-caption")]/text()')
            if blurb:
                news_item['blurb'] = " ".join(
                    [part.strip().encode('ascii', 'ignore') for part in blurb])
            published_date = tree.xpath('.//span[contains(@class,"timestamp")]//text()')
            if published_date:
                date_str = published_date[0].replace("|", "").replace("IST", "").strip()
                news_item['published_date'] = parse(date_str)
            author = tree.xpath('.//span[contains(@class,"byline")]/text()')
            if author:
                news_item['author'] = author[0].split('By')[1].strip()
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
def parse_one_news(self, response):
    """Parse a single qq.com news page into a NewsItem via its ItemLoader."""
    news_loader = NewsLoader(item=NewsItem(), response=response)
    title = response.xpath("/html/head/title/text()").extract()
    if title:
        news_loader.add_xpath('title', '/html/head/title/text()')
    else:
        news_loader.add_xpath(
            "title", "//div[@class='qq_article']/div[@class='hd']/h1/text()")
        logger.warning(
            "!!!! didn't get title from <head>, parsing <%s>'s body instead."
            % response.url)
    news_loader.add_value('rank', str(response.meta['rank']))
    news_loader.add_value('news_time', response.meta['news_time'])
    publisher = response.xpath(
        "string(//div[@class='a_Info']/span[@class='a_source'])").extract()
    if publisher and publisher[0]:
        news_loader.add_xpath(
            'publisher', "string(//div[@class='a_Info']/span[@class='a_source'])")
    else:
        news_loader.add_value("publisher", u"qq.com")
    news_loader.add_value("news_url", response.url)
    news_loader.add_xpath(
        'content', "//div[@id='Cnt-Main-Article-QQ']/p[not(style)]")
    news_loader.add_value('category', response.meta['category'])
    news_loader.add_value("site", u"qq.com")
    # resolve the cover here rather than leaving it to the pipeline
    cover = response.xpath(
        "//div[@id='Cnt-Main-Article-QQ']/p[not(style)]").xpath(
        ".//img/@src[starts-with(.,'http')]").extract()
    news_cover = cover[0] if cover else DEFAULT_NEWS_COVER
    news_loader.add_value("cover", news_cover)
    return news_loader.load_item()
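# NewsLoader is defined elsewhere in the project. A minimal sketch, assuming a
# standard Scrapy ItemLoader whose output processors reduce the extracted
# fragments; the processors shown are assumptions, not the project's actual
# configuration.
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Join

class NewsLoader(ItemLoader):
    # take the first extracted value for scalar fields,
    # join the article paragraphs for the content field
    default_output_processor = TakeFirst()
    content_out = Join('\n')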
def parse(self, response):
    article = response.xpath('/html/head/meta[@property="og:type" and @content="article"]')
    if not article:
        return
    item = NewsItem()
    item['url'] = response.xpath('//meta[@property="og:url"]/@content').extract_first()
    if item['url'] is None:
        return
    item['title'] = remove_unicode(
        response.xpath('//meta[@property="og:title"]/@content').extract_first())
    if item['title'] is None:
        return
    item['description'] = remove_unicode(
        response.xpath('//meta[@property="og:description"]/@content').extract_first())
    if item['description'] is None:
        return
    date = response.xpath('//*[@itemprop="datePublished"]/@content').extract_first()
    if date is None:
        return
    item['date'] = parse(date).strftime("%Y-%m-%dT%H:%M:%S")
    if item['date'] is None:
        return
    author = response.xpath('//*[@class="article-author"]')
    if not author:
        return
    authors = ' '.join(response.xpath(
        '//*[@class="byline__author-name" and @itemprop="name"]/@content').extract())
    item['author'] = remove_unicode(authors)
    if item['author'] is None:
        return
    articleBody = response.xpath('//article[@itemprop="articleBody"]')
    if not articleBody:
        return
    content = []
    paragraphs = articleBody.xpath('//div[@class="article-body-text component version-2"]//p')
    if len(paragraphs) == 0:
        return
    for p in paragraphs:
        content.extend(p.xpath('string()').extract())
    item['content'] = remove_unicode(' '.join(content))
    yield item
def parse_item(self, response):
    super(VccircleSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('//*[@id="block-system-main"]/div/div[2]/div[2]/h2/text()')
        details = tree.xpath(
            './/div[@class="vcc-snippet-body"]/p[@class="selectionShareable"]//text()')
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [x.strip().encode('ascii', 'ignore') for x in details]).strip()
            img_urls = tree.xpath('.//div[contains(@class,"field-item even")]/img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
                news_item['cover_image'] = img_urls[0]
            cover_image = tree.xpath('.//table[contains(@class,"MR15")]//div/img/@src')
            if cover_image:
                news_item['cover_image'] = cover_image[0]
            tags = tree.xpath('.//div[contains(@class, "content-tags")]//a/text()')
            if tags:
                news_item['tags'] = get_stripped_list(tags)
            author = tree.xpath('.//span[contains(@class, "byline_person")]/text()')
            if author:
                news_item['author'] = (author[0].split('by')[1].strip()
                                       if 'by' in author[0] else author[0].strip())
            published_date = tree.xpath('.//span[contains(@class, "date-display-single")]/text()')
            if published_date:
                news_item['published_date'] = datetime.strptime(
                    "".join(get_stripped_list(published_date)[0]),
                    '%A, %B %d, %Y - %I:%M')
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
def parse_item(self, response):
    super(BusinessStandardSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('.//h1[contains(@class,"headline")]//text()')
        details = tree.xpath(
            './/span[contains(@class,"p-content")]/div//text()[not(ancestor::script)]')
        if title and details:
            news_item['source'] = self.name
            news_item['source_url'] = response.url.split('?')[0]
            news_item['crawled_date'] = datetime.now()
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [item.strip().encode('ascii', 'ignore') for item in details])
            img_urls = tree.xpath('.//img[contains(@class,"imgCont")]/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
            published_date = tree.xpath('.//p[contains(@class,"fL")]//span//text()')
            if published_date:
                news_item['published_date'] = datetime.strptime(
                    published_date[3].split("\t")[0], '%B %d, %Y')
            related = tree.xpath('.//div[contains(@class,"readmore_tagBG")]//h2//a/text()')
            if related:
                news_item['tags'] = [item.strip() for item in related if item.strip()]
            cover_image = tree.xpath('.//img[contains(@class,"imgCont")]/@src')
            if cover_image:
                news_item['cover_image'] = cover_image
            return news_item
    except:
        pass
    return None
def parse_item(self, response):
    super(SMETimesSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('.//span[contains(@class,"blue-heading")]//text()')
        details = tree.xpath('//span[@class="text"]//text()')
        details = [ele.encode('ascii', 'ignore').replace("\n", "") for ele in details]
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(details)
            img_urls = tree.xpath('//span[contains(@class,"text")]//img/@src')
            if img_urls and img_urls[0].lower().find(self.name.lower()) != -1:
                news_item['img_urls'] = get_stripped_list(img_urls)
                news_item['cover_image'] = img_urls[0]
            published_date = tree.xpath('.//div[contains(@align,"justify")]/span/span//text()')
            if published_date:
                pub_date = published_date[0].split("|")[1]
                news_item['published_date'] = datetime.strptime(pub_date, ' %d %b, %Y')
            author = tree.xpath('.//div[contains(@align,"justify")]/span/span//text()')
            if author:
                news_item['author'] = author[0].split("|")[0].strip()
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            self.log('==Item=================>>>>>>>>! %r' % news_item)
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
def parse_item(self, response):
    super(TechCrunchSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('.//h1[contains(@class,"alpha tweet-title")]//text()')
        details = tree.xpath('.//div[contains(@class,"article-entry text")]//p//text()')
        if title and details:
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [det.strip().encode('ascii', 'ignore') for det in details])
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            img_urls = tree.xpath('.//div[contains(@class,"article-entry text")]/img/@src')
            if img_urls:
                news_item['img_urls'] = img_urls
            cover_image = tree.xpath('.//div[contains(@class,"article-entry text")]/img/@src')
            if cover_image:
                news_item['cover_image'] = cover_image[0]
            author = tree.xpath(
                '/html/body/div[4]/article/div/div[1]/div/header/div[2]/div[1]/a/text()')
            if author:
                news_item['author'] = author
            return news_item
    except:
        pass
    return None
def parse_item(self, response):
    super(SmeWebSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('.//h1[contains(@itemprop,"name")]/text()')
        details = tree.xpath('.//div[@class="article__body"]/p//text()')
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [x.strip().encode('ascii', 'ignore') for x in details]).strip()
            news_item['source_url'] = response.url.split('?')[0]
            img_urls = tree.xpath('.//a[contains(@class,"article__figure__link")]/img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
            meta_result = self.get_meta(tree)
            if 'og:image' in meta_result:
                news_item['cover_image'] = meta_result['og:image']
            if 'og:description' in meta_result:
                news_item['blurb'] = meta_result['og:description']
                news_item['blurb'] = news_item['blurb'].strip().encode('ascii', 'ignore')
            published_date = tree.xpath(
                './/span[contains(@class,"article__meta__info")]/time/text()')
            if published_date:
                news_item['published_date'] = datetime.strptime(
                    published_date[0].strip().encode('ascii', 'ignore'),
                    '%B %d %Y %I:%M %p')
            author = tree.xpath('.//span[contains(@class,"article__meta__value")]/text()')
            if author:
                author = author[0].strip()
                news_item['author'] = author.split('\n')[1].strip() if '\n' in author else author
            tags = tree.xpath('.//div[contains(@class,"article__tags-container")]/a/span/text()')
            if tags:
                news_item['tags'] = get_stripped_list(tags)
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
def parse_item(self, response):
    super(BusinessInsiderSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath(
            '//*[@id="Content"]/div[3]/div[3]/div[1]/div/div[1]/div/article/div[1]/h1//text()')
        details = tree.xpath(
            './/div[contains(@class,"hide_show_handler main_content")]//p//text()')
        if title and details:
            news_item['source'] = self.name
            news_item['source_url'] = response.url.split('?')[0]
            news_item['crawled_date'] = datetime.now()
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [item.strip().encode('ascii', 'ignore') for item in details])
            img_urls = tree.xpath('.//div[contains(@class,"MeetingImg blk")]/img/@src')
            img_url_list = []
            if img_urls:
                for img_url in img_urls:
                    img_url_list.append("http://www.businessinsider.in" + img_url)
                news_item['img_urls'] = get_stripped_list(img_url_list)
            published_date = tree.xpath(
                './/div[contains(@class,"ByLine")]//span[contains(@class,"Date")]//text()')
            if published_date:
                news_item['published_date'] = datetime.strptime(
                    get_stripped_list(published_date)[0], '%b %d, %Y, %I.%M %p')
            author = tree.xpath('.//a[contains(@class,"Name")]/text()')
            if author:
                news_item['author'] = get_stripped_list(author)
            tags = tree.xpath('.//span[contains(@class,"anchorLink")]/text()')
            more_tags = tree.xpath('.//div[contains(@id,"commentHash")]//a/text()')
            if tags:
                news_item['tags'] = get_stripped_list(tags)
            if more_tags:
                news_item['tags'] = get_stripped_list(more_tags)
            cover_image = tree.xpath('.//div[contains(@class,"MeetingImg blk")]/img/@src')
            if cover_image:
                news_item['cover_image'] = img_url_list[0]
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
        return None
def parse_item(self, response):
    super(MashableSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('//h1[contains(@class,"title")]//text()')
        details = tree.xpath('//div[contains(@class,"post-text")]/p//text()')
        detail = tree.xpath('//section[contains(@class,"article-content blueprint")]//p//text()')
        if title and (details or detail):
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            if details:
                news_item['details'] = "\t".join(
                    [ele.strip().encode('ascii', 'ignore') for ele in details])
            if detail:
                news_item['details'] = "\t".join(
                    [ele.strip().encode('ascii', 'ignore') for ele in detail])
            img_urls = tree.xpath('//div[contains(@id,"post-content")]//img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
            cover_image = tree.xpath('//div[contains(@id,"post-content")]//img/@src')
            if cover_image:
                news_item['cover_image'] = cover_image
            meta_result = self.get_meta(tree)
            if 'og:image' in meta_result:
                news_item['cover_image'] = meta_result['og:image']
            if 'og:description' in meta_result:
                news_item['blurb'] = meta_result['og:description']
                news_item['blurb'] = news_item['blurb'].strip().encode('ascii', 'ignore')
            author = tree.xpath('//span[contains(@class,"author_name")]/a/text()')
            if author:
                news_item['author'] = author
            tags = tree.xpath('//footer[contains(@class,"article-topics")]/a/text()')
            if tags:
                news_item['tags'] = tags
            return news_item
    except:
        pass
    return None