def parse_item(self, response):
    super(FinancialExpressSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    self.log('==RESPONSE=================>>>>>>>>! %s' % response.request.headers['Referer'])
    referer = response.request.headers['Referer']
    news_item = NewsItem()
    try:
        title = tree.xpath(".//meta[@itemprop='headline']/@content")
        details = tree.xpath(".//div[@itemprop='articleBody']//p//text()")
        # self.log('==Title=================>>>>>>>>! %s' % title[0])
        if title and details:
            news_item['source'] = self.name
            news_item['source_url'] = response.url.split('?')[0]
            news_item['crawled_date'] = datetime.now()
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [ele.strip().encode('ascii', 'ignore') for ele in details])
            img_urls = tree.xpath(
                ".//div[@itemprop='articleBody']//img[contains(@class,'size-full')]/@src")
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
                news_item['cover_image'] = img_urls[0]
            meta_result = self.get_meta(tree)
            if 'og:image' in meta_result:
                news_item['cover_image'] = meta_result['og:image']
            if 'og:description' in meta_result:
                news_item['blurb'] = meta_result['og:description']
                news_item['blurb'] = news_item['blurb'].strip().encode('ascii', 'ignore')
            if 'og:updated_time' in meta_result:
                news_item['published_date'] = datetime.strptime(
                    meta_result['og:updated_time'].split("+")[0],
                    '%Y-%m-%dT%H:%M:%S')
            authors = tree.xpath(".//meta[@itemprop='author']/@content")
            if authors:
                news_item['author'] = get_stripped_list(authors)
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
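# --- Helper sketch (illustrative, not part of the original spiders) ----------
# Every parse_item() in this section assumes that etree, BytesIO, datetime,
# NewsItem, categories, and a get_stripped_list() utility are imported
# elsewhere in the project (several spiders also call dateutil's parse()).
# As a minimal sketch, assuming the helper only strips whitespace and drops
# empty strings, it could look like this:

def get_stripped_list(values):
    # Strip surrounding whitespace from every extracted string and drop blanks.
    return [value.strip() for value in values if value and value.strip()]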
def parse_item(self, response):
    super(MoneycontrolSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('.//h1[contains(@class, "arti_title")]/text()')
        details = tree.xpath(
            './/div[contains(@class, "MT20")]//p//text()[not(ancestor::script)]')
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = '\t'.join([
                item.strip().encode('ascii', 'ignore').decode('unicode_escape')
                for item in details if item.strip()
            ])
            img_urls = tree.xpath('.//table[contains(@class,"MR15")]//div/img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
                news_item['cover_image'] = img_urls[0]
            tags = tree.xpath('.//div[contains(@class, "tag_wrap MT20")]/a//text()')
            if tags:
                news_item['tags'] = get_stripped_list(tags)
            published_date = tree.xpath('.//p[contains(@class, "arttidate MT15")]//text()')
            if published_date:
                if '|' in published_date[0]:
                    news_item['published_date'] = datetime.strptime(
                        published_date[0].split('|')[0].strip().encode('ascii', 'ignore'),
                        '%b %d, %Y, %I.%M %p')
                else:
                    news_item['published_date'] = datetime.strptime(
                        published_date[0].strip().encode('ascii', 'ignore'),
                        '%b %d, %Y, %I.%M %p')
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
def parse_item(self, response):
    super(EntrepreneurSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath(".//h1[contains(@class,'headline')]//text()")
        details = tree.xpath(".//div[contains(@class,'bodycopy')]//p//text()")
        if title and details:
            news_item['source'] = self.name
            news_item['source_url'] = response.url.split('?')[0]
            news_item['crawled_date'] = datetime.now()
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [det.strip().encode('ascii', 'ignore') for det in details])
            img_urls = tree.xpath(".//div[contains(@class,'hero topimage')]/img/@src")
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
                news_item['cover_image'] = img_urls[0]
            blurb = tree.xpath(".//div[contains(@class,'bodycopy')]/p/text()")
            news_item['blurb'] = " ".join([
                short_blurb.strip().encode('ascii', 'ignore')
                for short_blurb in blurb[0:1]
            ])
            published_date = tree.xpath(".//time[contains(@itemprop,'datePublished')]//text()")
            if published_date:
                news_item['published_date'] = datetime.strptime(
                    published_date[0].strip(), '%B %d, %Y')
            tags = tree.xpath(".//div[contains(@class,'article-tags')]/a/text()")
            if tags:
                news_item['tags'] = get_stripped_list(tags)
            author = tree.xpath(".//div[contains(@itemprop,'name')]/text()")
            if author:
                news_item['author'] = get_stripped_list(author)
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
def parse_item(self, response):
    filedir = self.pre_write_check()
    filename = os.path.join(filedir, md5(response.url).hexdigest())
    with open(filename, "wb") as html:
        html.write(response.body)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('.//h1[contains(@class,"arti_heading")]/text()')
        details = tree.xpath(".//div[@id='arti_content_n']//p/text()")
        if title and details:
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            details = [x.strip().encode('ascii', 'ignore') for x in details if x.strip()]
            details = "\t".join(details).strip()
            news_item['details'] = details
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url
            img_urls = tree.xpath(".//div[@id='arti_content_n']/p/strong/img/@src")
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
                news_item['cover_image'] = img_urls[0]
            tags = tree.xpath('.//div[contains(@id, "tags_div")]//a/text()')
            if tags:
                news_item['tags'] = get_stripped_list(tags)
            author = tree.xpath('.//span[contains(@class, "grey1")]/a/text()')
            authorname = tree.xpath('.//span[contains(@class, "grey1")]/text()')
            if author:
                author = [x.strip().encode('ascii', 'ignore') for x in author]
                author = " ".join(author).strip()
                news_item['author'] = get_stripped_list(author)
            if authorname:
                authorname = [x.strip().encode('ascii', 'ignore') for x in authorname]
                authorname = " ".join(authorname).strip()
                news_item['author'] = get_stripped_list(authorname)
            published_date = tree.xpath('.//div[contains(@class, "sm1 grey1")]/text()')
            if published_date:
                pub_date = published_date[0]
                news_item['published_date'] = datetime.strptime(
                    pub_date.split('IST')[0].strip().encode('ascii', 'ignore')
                    if 'IST' in pub_date else pub_date,
                    '%B %d, %Y %H:%M')
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
def parse_item(self, response):
    super(HuffingtonPostSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('.//h1[contains(@class,"title")]//text()')
        details = tree.xpath('.//div[contains(@class,"content")]//p//text()')
        if title and details:
            news_item['source'] = self.name
            news_item['source_url'] = response.url.split('?')[0]
            news_item['crawled_date'] = datetime.now()
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [ele.strip().encode('ascii', 'ignore') for ele in details])
            img_urls = tree.xpath('.//div[contains(@class,"top-media--image image")]/img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
            cover_image = tree.xpath('.//span[contains(@class,"img-caption")]//img/@src')
            if cover_image:
                news_item['cover_image'] = get_stripped_list(cover_image)[0]
            meta_result = self.get_meta(tree)
            if 'og:image' in meta_result:
                news_item['cover_image'] = meta_result['og:image']
            if 'og:description' in meta_result:
                news_item['blurb'] = meta_result['og:description']
                news_item['blurb'] = news_item['blurb'].strip().encode('ascii', 'ignore')
            published_date = tree.xpath('.//div[contains(@class,"timestamp")]/span/text()')
            if published_date:
                pub_date = published_date[0].strip()
                news_item['published_date'] = datetime.strptime(
                    pub_date.split('IST')[0].strip() if 'IST' in pub_date else pub_date,
                    '%d/%m/%Y %I:%M %p')
            author = tree.xpath('.//a[contains(@class,"author-card__details__name")]/text()')
            if author:
                news_item['author'] = author[0].strip().encode('ascii', 'ignore')
            tags = tree.xpath('.//div[contains(@class,"tag-cloud")]/a/text()')
            if tags:
                news_item['tags'] = [x.strip().encode('ascii', 'ignore') for x in tags]
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
def parse_item(self, response):
    super(VentureBeatSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath("//h1[contains(@class,'article-title')]//text()")
        details = tree.xpath('//div[contains(@class,"article-content")]/p//text()')
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [ele.strip().encode('ascii', 'ignore') for ele in details])
            img_urls = tree.xpath('//div[contains(@class,"article-content")]//img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
                news_item['cover_image'] = img_urls[0]
            published_date = tree.xpath('.//time[contains(@class,"the-time")]/text()')
            if published_date:
                news_item['published_date'] = datetime.strptime(
                    published_date[0], '%B %d, %Y %I:%M %p')
            author = tree.xpath('.//a[contains(@class,"author url fn")]/text()')
            if author:
                news_item['author'] = get_stripped_list(author)
            tags = tree.xpath('.//div[contains(@class,"article-tags")]/a/text()')
            if tags:
                news_item['tags'] = get_stripped_list(tags)
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    # Bind the exception so the log call below does not raise a NameError.
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
        return None
def parse_item(self, response):
    super(NdtvSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        # title = tree.xpath('.//div[contains(@class, "storytitle")]/h1/text()')
        title = tree.xpath('.//h1[@itemprop="headline"]//text()')
        details = tree.xpath('.//div[contains(@class, "pdl200")]//text()[not(ancestor::script)]')
        # details = tree.xpath('.//span[@itemprop="articleBody"]//text')
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = '\t'.join([
                item.strip().encode('ascii', 'ignore').decode('unicode_escape')
                for item in details if item.strip()
            ])
            # img_urls = tree.xpath('.//div[contains(@class,"storypicbig")]/img/@src')
            img_urls = tree.xpath('.//div[contains(@class,"whosaid_top_mainimg_cont")]/img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
                # cover_image = tree.xpath('.//table[contains(@class,"MR15")]//div/img/@src')
                news_item['cover_image'] = get_stripped_list(img_urls)[0]
            published_date = tree.xpath('.//div[contains(@class, "dateline")]/text()')
            # Only index into published_date once we know the xpath matched.
            if published_date:
                date_str = published_date[0].replace("(IST)", "").strip().split(":", 1)[1]
                news_item['published_date'] = parse(date_str)
            tags = tree.xpath('.//p[contains(@class, "alltags")]/a/text()')
            if tags:
                news_item['tags'] = get_stripped_list(tags)
            author = tree.xpath('.//div[contains(@class, "dateline")]/a/text()')
            if author:
                news_item['author'] = get_stripped_list(author)
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
def parse_item(self, response):
    super(Thehindubusiness, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        # title = tree.xpath(".//h1[@class='detail-title']/text()")
        title = tree.xpath(".//h1[@class='title']/text()")
        # details = tree.xpath(".//p[@class='body']/text()")
        details = tree.xpath('.//div[starts-with(@id,"content-body-14269002")]//p//text()')
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [x.strip().encode('ascii', 'ignore') for x in details]).strip()
            # img_urls = tree.xpath('.//div[contains(@class,"text-embed")]/img/@src')
            img_urls = tree.xpath('.//div[@class="img-container picture"]/img/@data-proxy-image')
            other_img_urls = tree.xpath('.//div[contains(@id,"hcenter")]/img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
            if other_img_urls:
                news_item['img_urls'] = get_stripped_list(other_img_urls)
            cover_image = tree.xpath('.//div[@class="img-container picture"]/img/@data-proxy-image')
            if cover_image:
                news_item['cover_image'] = cover_image[0].strip()
            tags = tree.xpath('.//div[contains(@id, "articleKeywords")]/p//a/text()')
            if tags:
                news_item['tags'] = get_stripped_list(tags)
            # published_date = tree.xpath('.//div[contains(@class, "artPubUpdate")]/text()')
            published_date = tree.xpath('.//div[@class="teaser-text update-time"]/span/none/text()')
            # Only index into published_date once we know the xpath matched.
            if published_date:
                date_str = published_date[0].replace("IST", "").strip()
                news_item['published_date'] = parse(date_str)
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    # Bind the exception so the log call below does not raise a NameError.
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
        return None
def parse_item(self, response):
    super(VccircleSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('//*[@id="block-system-main"]/div/div[2]/div[2]/h2/text()')
        # details = tree.xpath(".//div[@class='cont-text']/div//text()")
        details = tree.xpath(".//div[@class='vcc-snippet-body']/p[@class='selectionShareable']//text()")
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [x.strip().encode('ascii', 'ignore') for x in details]).strip()
            img_urls = tree.xpath('.//div[contains(@class,"field-item even")]/img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
                news_item['cover_image'] = img_urls[0]
            cover_image = tree.xpath('.//table[contains(@class,"MR15")]//div/img/@src')
            if cover_image:
                news_item['cover_image'] = cover_image[0]
            tags = tree.xpath('.//div[contains(@class, "content-tags")]//a/text()')
            if tags:
                news_item['tags'] = get_stripped_list(tags)
            author = tree.xpath('.//span[contains(@class, "byline_person")]/text()')
            if author:
                news_item['author'] = (author[0].split('by')[1].strip()
                                       if 'by' in author[0] else author[0].strip())
            published_date = tree.xpath('.//span[contains(@class, "date-display-single")]/text()')
            if published_date:
                news_item['published_date'] = datetime.strptime(
                    "".join(get_stripped_list(published_date)[0]),
                    '%A, %B %d, %Y - %I:%M')
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
def parse_item(self, response):
    filedir = self.pre_write_check()
    filename = os.path.join(filedir, md5(response.url).hexdigest())
    if not os.path.exists(filename):
        with open(filename, "wb") as html:
            html.write(response.body)
    else:
        print "skipped file {0}".format(filename)
        return None
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('.//h1[contains(@class,"post-tile entry-title")]/text()')
        details = tree.xpath('//div[contains(@class,"entry-content")]/p//text()')
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [ele.strip().encode('ascii', 'ignore') for ele in details])
            img_urls = tree.xpath(".//div[contains(@class,'feature-img')]/img/@src")
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
                news_item['cover_image'] = img_urls[0]
            blurb = tree.xpath(".//div[@class='entry-content']/p/em/text()")
            if blurb:
                news_item['blurb'] = blurb[0].strip().encode('ascii', 'ignore')
            ## TODO: Author, Tags
            tags = tree.xpath(".//div[contains(@class,'mom-post-meta single-post-meta')]/span[3]/a//text()")
            if tags:
                news_item['tags'] = tags
            published_date = tree.xpath(".//span//time[contains(@class,'updated')]//text()")
            if published_date:
                news_item['published_date'] = datetime.strptime(
                    " ".join([item.strip().encode('ascii', 'ignore')
                              for item in published_date]),
                    '%B %d, %Y')
            author = tree.xpath(".//span[contains(@class,'fn')]/a/text()")
            if author:
                news_item['author'] = author
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
def parse_item(self, response):
    super(ETSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('.//h1[contains(@class, "title")]/text()[1]')
        details = tree.xpath(".//div[@class='Normal']//text()")
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().decode('unicode_escape').encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [item.strip().encode('ascii', 'ignore') for item in details if item.strip()])
            news_item['cover_image'] = ''
            news_item['blurb'] = ''
            news_item['img_urls'] = []
            img_urls = tree.xpath('.//figure/img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
            meta_result = self.get_meta(tree)
            if 'og:image' in meta_result:
                news_item['cover_image'] = meta_result['og:image']
            if 'og:description' in meta_result:
                news_item['blurb'] = meta_result['og:description']
                news_item['blurb'] = news_item['blurb'].decode('unicode_escape').encode('ascii', 'ignore')
            published_date = tree.xpath(".//div[contains(@class,'byline')]/text()")
            self.log('==Pub date=================>>>>>>>>! %r' % published_date)
            if published_date:
                # The byline looks like "<author> | Updated: <date> IST".
                news_item['author'] = published_date[0].split('|')[0].strip()
                date_str = (published_date[0].split(":")[1:])[0].replace("IST", "").strip()
                news_item['published_date'] = datetime.strptime(date_str, '%b %d, %Y, %I.%M %p')
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
def parse_item(self, response):
    super(DealCurrySpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath(".//h1/text()")
        details = tree.xpath('.//div[contains(@class, "articleSpacer")]/p//text()')
        if title and details:
            news_item['source_url'] = response.url.split('?')[0]
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [x.strip().encode('ascii', 'ignore') for x in details]).strip()
            tags = tree.xpath(
                './/div[contains(@style, "padding-bottom:10px")]'
                '/span[contains(@style, "color:#346f9a; float:left; text-align:left")]/a/text()')
            news_item['tags'] = tags[0].strip().encode('ascii', 'ignore')
            published_date = tree.xpath(
                ".//span[contains(@style, 'color:#6b6b6b;float:left; text-align:left; margin-left:5px')]/text()")
            news_item['published_date'] = datetime.strptime(
                published_date[0].encode('ascii', 'ignore'), '%d %B %Y')
            author = tree.xpath(
                './/div[contains(@style, "")]'
                '/span[contains(@style, "color:#6b6b6b; float:left; text-align:left;")]/text()')
            news_item['author'] = author[0].split('by')[1].strip().encode('ascii', 'ignore')
            img_urls = tree.xpath('.//div[contains(@style, "padding-bottom:10px")]/img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
            meta_result = self.get_meta(tree)
            if 'description' in meta_result:
                news_item['blurb'] = meta_result['description']
            return news_item
    except:
        pass
    return None
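# --- get_meta() sketch (illustrative, assumption) -----------------------------
# Several spiders in this section call self.get_meta(tree) and read keys such
# as 'og:image', 'og:description', and 'description' from the result. The base
# class implementation is not shown here; a plausible sketch, assuming it just
# maps each <meta> tag's property or name to its content, could be:

def get_meta(self, tree):
    meta = {}
    for node in tree.xpath('//meta[@content]'):
        # Prefer the Open Graph "property" attribute, fall back to "name".
        key = node.get('property') or node.get('name')
        if key:
            meta[key] = node.get('content')
    return meta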
def parse_item(self, response):
    super(ReutersSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('.//h1[contains(@class,"article-headline")]/text()')
        details = tree.xpath('//*[@id="article-text"]//text()')
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [det.strip().encode('ascii', 'ignore') for det in details])
            img_urls = tree.xpath(".//div[contains(@class,'related-photo-container')]/img/@src")
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
                news_item['cover_image'] = img_urls[0]
            blurb = tree.xpath(".//div[contains(@class,'related-photo-caption')]/text()")
            if blurb:
                news_item['blurb'] = " ".join(
                    [b.strip().encode('ascii', 'ignore') for b in blurb])
            published_date = tree.xpath(".//span[contains(@class,'timestamp')]//text()")
            # Only index into published_date once we know the xpath matched.
            if published_date:
                date_str = published_date[0].replace("|", "").replace("IST", "").strip()
                news_item['published_date'] = parse(date_str)
            author = tree.xpath(".//span[contains(@class,'byline')]/text()")
            if author:
                news_item['author'] = author[0].split('By')[1].strip()
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
def parse_item(self, response):
    super(BusinessStandardSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath(".//h1[contains(@class,'headline')]//text()")
        details = tree.xpath(
            ".//span[contains(@class,'p-content')]/div//text()[not(ancestor::script)]")
        if title and details:
            news_item['source'] = self.name
            news_item['source_url'] = response.url.split('?')[0]
            news_item['crawled_date'] = datetime.now()
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [item.strip().encode('ascii', 'ignore') for item in details])
            img_urls = tree.xpath(".//img[contains(@class,'imgCont')]/@src")
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
            published_date = tree.xpath(".//p[contains(@class,'fL')]//span//text()")
            if published_date:
                news_item['published_date'] = datetime.strptime(
                    published_date[3].split("\t")[0], '%B %d, %Y')
            related = tree.xpath(".//div[contains(@class,'readmore_tagBG')]//h2//a/text()")
            if related:
                news_item['tags'] = [item.strip() for item in related if item.strip()]
            cover_image = tree.xpath(".//img[contains(@class,'imgCont')]/@src")
            if cover_image:
                # Store the first matching URL rather than the whole list.
                news_item['cover_image'] = cover_image[0]
            return news_item
    except:
        pass
    return None
def parse_item(self, response):
    super(SMETimesSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('.//span[contains(@class,"blue-heading")]//text()')
        details = tree.xpath('//span[@class="text"]//text()')
        details = [ele.encode('ascii', 'ignore').replace("\n", "") for ele in details]
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(details)
            img_urls = tree.xpath('//span[contains(@class,"text")]//img/@src')
            # Guard against an empty result before indexing the first URL.
            if img_urls and img_urls[0].lower().find(self.name.lower()) != -1:
                news_item['img_urls'] = get_stripped_list(img_urls)
                news_item['cover_image'] = img_urls[0]
            published_date = tree.xpath(".//div[contains(@align,'justify')]/span/span//text()")
            if published_date:
                pub_date = published_date[0].split("|")[1]
                news_item['published_date'] = datetime.strptime(pub_date, ' %d %b, %Y')
            author = tree.xpath(".//div[contains(@align,'justify')]/span/span//text()")
            if author:
                news_item['author'] = author[0].split("|")[0].strip()
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            self.log('==NewsItem=================>>>>>>>>! %r' % news_item)
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
def parse_item(self, response):
    super(YourStorySpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('.//h3[contains(@class,"title")]/text()')
        details = tree.xpath(".//div[@class='ys_post_content text']/p/text()")
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [x.strip().encode('ascii', 'ignore') for x in details]).strip()
            img_urls = tree.xpath(".//img[contains(@class,'size-full')]/@src")
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
            meta_result = self.get_meta(tree)
            if 'og:image' in meta_result:
                news_item['cover_image'] = meta_result['og:image']
            if 'og:description' in meta_result:
                news_item['blurb'] = meta_result['og:description']
                news_item['blurb'] = news_item['blurb'].strip().encode('ascii', 'ignore')
            author = tree.xpath('.//a[contains(@class, "postInfo color-ys")]/text()')
            if author:
                news_item['author'] = author[0].strip().encode('ascii', 'ignore')
            published_date = tree.xpath('.//p[contains(@class, "postInfo color-grey mt-5 fr")]/text()')
            if published_date:
                news_item['published_date'] = datetime.strptime(
                    published_date[0].split('\n')[1].strip(), '%d %B %Y')
            tags = tree.xpath('.//ul[contains(@class, "articleTags mt-5")]/li/a/text()')
            if tags:
                news_item['tags'] = [x.strip().encode('ascii', 'ignore') for x in tags]
            return news_item
    except:
        pass
    return None
def parse_item(self, response):
    super(SmeWebSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('.//h1[contains(@itemprop,"name")]/text()')
        details = tree.xpath('.//div[@class="article__body"]/p//text()')
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [x.strip().encode('ascii', 'ignore') for x in details]).strip()
            news_item['source_url'] = response.url.split('?')[0]
            img_urls = tree.xpath('.//a[contains(@class,"article__figure__link")]/img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
            meta_result = self.get_meta(tree)
            if 'og:image' in meta_result:
                news_item['cover_image'] = meta_result['og:image']
            if 'og:description' in meta_result:
                news_item['blurb'] = meta_result['og:description']
                news_item['blurb'] = news_item['blurb'].strip().encode('ascii', 'ignore')
            published_date = tree.xpath('.//span[contains(@class,"article__meta__info")]/time/text()')
            if published_date:
                news_item['published_date'] = datetime.strptime(
                    published_date[0].strip().encode('ascii', 'ignore'),
                    '%B %d %Y %I:%M %p')
            author = tree.xpath('.//span[contains(@class,"article__meta__value")]/text()')
            if author:
                author = author[0].strip()
                news_item['author'] = author.split('\n')[1].strip() if '\n' in author else author
            tags = tree.xpath('.//div[contains(@class,"article__tags-container")]/a/span/text()')
            if tags:
                news_item['tags'] = get_stripped_list(tags)
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
def parse_item(self, response):
    super(ForbesIndiaSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        # title = tree.xpath('.//div[contains(@class,"PT10 PB20")]/div/h1/text()')
        title = tree.xpath(".//div[contains(@class,'col-lg-9 col-md-8 col-sm-7')]/h1//text()")
        # details = tree.xpath('//*[@id="article"]/div[7]/p//text()')
        details = tree.xpath('.//div[contains(@class,"storydiv")]//text()')
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [ele.strip().encode('ascii', 'ignore') for ele in details])
            details_start_char = tree.xpath(".//span[contains(@class,'g-60-b')]/text()")
            if details_start_char:
                news_item['details'] = details_start_char[0] + news_item['details']
            # img_urls = tree.xpath(".//table[contains(@class,'PB20')]//img/@src")
            img_urls = tree.xpath('.//div[contains(@class,"artical-main-sec MT20 spacediv")]//img/@src')
            if not img_urls:
                img_urls = tree.xpath('.//*[@id="article"]//table[1]//tr[1]//img/@src')
            img_urls_p = tree.xpath(".//p[contains(@class,'padding-bottom:2px;')]//img/@src")
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
            elif img_urls_p:
                news_item['img_urls'] = get_stripped_list(img_urls_p)
            # cover_image = tree.xpath(".//table[contains(@class,'PB20')]//img/@src")
            cover_image = tree.xpath('.//div[contains(@class,"artical-main-sec MT20 spacediv")]//img/@src')
            if cover_image:
                news_item['cover_image'] = get_stripped_list(cover_image)[0]
            blurb = tree.xpath(".//div[@class='caption1 PT5 PB10']//text()")
            if blurb:
                news_item['blurb'] = blurb[0].strip().encode('ascii', 'ignore')
            # published_date_first = tree.xpath(".//div[contains(@class,'articlehd')]/span/text()")
            # published_date_second = tree.xpath(".//div[contains(@class,'date PB5')]/text()")
            published_date_first = tree.xpath('.//div[contains(@class,"update-date text-uppercase")]//text()')
            if published_date_first:
                news_item['published_date'] = datetime.strptime(
                    published_date_first[1].encode('ascii', 'ignore'), ' %b %d, %Y ')
            # author = tree.xpath(".//div[contains(@class,'author_name')]/a/text()")
            author = tree.xpath('.//div[contains(@class,"author-name text-uppercase")]//text()')
            author_name = tree.xpath(".//div[contains(@class,'byline1 PT5')]/a/text()")
            if author:
                news_item['author'] = get_stripped_list(author)
            if author_name:
                news_item['author'] = get_stripped_list(author_name)
            tags = tree.xpath(".//div[contains(@class,'link PT10')]/a/text()")
            if tags:
                news_item['tags'] = get_stripped_list(tags)
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
def parse_item(self, response):
    super(BloombergSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    self.log('==RESPONSE=================>>>>>>>>! %s' % response.request.headers['Referer'])
    news_item = NewsItem()
    try:
        # title = tree.xpath('.//span[contains(@class,"lede-headline__highlighted")]//text()')
        title = tree.xpath('.//span[contains(@class,"lede-text-only__highlight")]//text()')
        # details = tree.xpath('.//div[contains(@class,"article-body__content")]//p//text()')
        details = tree.xpath('.//div[contains(@class,"body-copy")]//p//text()')
        if title and details:
            news_item['source'] = self.name
            news_item['source_url'] = response.url.split('?')[0]
            news_item['crawled_date'] = datetime.now()
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [ele.strip().encode('ascii', 'ignore') for ele in details])
            # img_urls = tree.xpath('.//div[contains(@class,"inline-media__unlinked-image")]//img/@src')
            img_urls = tree.xpath('.//div[contains(@class,"lazy-img")]/img//@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
                news_item['cover_image'] = img_urls[0]
            meta_result = self.get_meta(tree)
            if 'og:image' in meta_result:
                news_item['cover_image'] = meta_result['og:image']
            if 'og:description' in meta_result:
                news_item['blurb'] = meta_result['og:description']
                news_item['blurb'] = news_item['blurb'].strip().encode('ascii', 'ignore')
            # published_date = tree.xpath('.//time[contains(@class,"published-at time-based")]/text()')
            published_date = tree.xpath('//*[@itemprop="datePublished"]//text()')
            if published_date:
                date_str = published_date[1].split('GMT')[0].strip()
                news_item['published_date'] = parse(date_str)
            # authors = tree.xpath('.//div[contains(@class,"author-byline")]/a/text()')
            authors = tree.xpath('.//div[contains(@class,"author")]//text()')
            if authors:
                news_item['author'] = get_stripped_list(authors)[0]
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
def parse_item(self, response):
    deny_xpaths = [
        'http://4.bp.blogspot.com/-chhNLDME0Yo/VZw_4EpmQ1I/AAAAAAAANcE/8WKzyR2Dhh8/s640/startup%2Breport%2Bindia.png',
        'http://3.bp.blogspot.com/-0GSnIzmvK84/UhtWvbUmd7I/AAAAAAAAFco/8rFc-kLEt0c/s1600/martjack+logo+st.gif',
        'http://4.bp.blogspot.com/-OeT3NW9TP1U/VgxZe7EgYCI/AAAAAAAAPSs/-zPCdu_ayZg/s1600/peppertap.jpg',
        'http://3.bp.blogspot.com/-lVFGg8e2EyA/UhgPxpABqdI/AAAAAAAAFX0/BaOb_gHYLCI/s1600/bigbasket.com+logo.png',
        'http://4.bp.blogspot.com/-KTNZQPHscyE/VjPvqSZgK9I/AAAAAAAAP4U/mAqBIH6ljr0/s320/advertise%2Bwith%2Bstartuptimes.png',
        'http://2.bp.blogspot.com/-ZBwO4Q4XxZc/VkLhwiuEPfI/AAAAAAAAQHk/CJALT6xKiks/s1600/ecommerce%2Breport%2Bindia%2B2025.png',
        'https://1.bp.blogspot.com/-X0_bk4Fw4LM/Vw2L0J3gCTI/AAAAAAAAR88/qrczkABYY9wZFX0Pyqkf9ty-ltc270DhgCLcB/s640/ambani%2Bson%2Bbefore%2Bafter%2Bpics.png',
        'http://1.bp.blogspot.com/-3a0VWriQezs/Ued5D94B7yI/AAAAAAAAEjU/kouuhO17mVA/s320/Startups+Request+for+Interview.png'
    ]
    super(StartupTimesSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath('.//h3[@class="post-title entry-title"]/a/text()')
        details = tree.xpath(
            ".//div[@dir='ltr']/text()[preceding-sibling::br and following-sibling::br]")
        if title and details:
            print self.name, "namee"
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = '\t'.join([
                item.strip().encode('ascii', 'ignore').decode('unicode_escape')
                for item in details if item.strip()
            ])
            img_urls = tree.xpath(".//div[contains(@style,'center')]/a/img/@src")
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
                news_item['cover_image'] = img_urls[0]
            meta_result = self.get_meta(tree)
            if 'description' in meta_result:
                news_item['blurb'] = meta_result['description']
                news_item['blurb'] = news_item['blurb'].strip().encode('ascii', 'ignore')
            ## TODO: author, tags, published_date
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
def parse_item(self, response):
    super(GigaomSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath(".//h1[contains(@class,'entry-title single-title')]//text()")
        detail = tree.xpath(".//section[contains(@class,'entry-content wrap cf')]//p//text()")
        if title and detail:
            news_item['source'] = self.name
            news_item['source_url'] = response.url.split('?')[0]
            news_item['crawled_date'] = datetime.now()
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [det.strip().encode('ascii', 'ignore') for det in detail])
            img_urls = tree.xpath(".//div[contains(@class,'featured-image')]/img/@src")
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
            cover_image = tree.xpath(".//h1[contains(@class,'entry-title single-title')]/img/@src")
            if cover_image:
                news_item['cover_image'] = get_stripped_list(cover_image)[0]
            author = tree.xpath(".//span[contains(@class,'entry-author author')]/a/text()")
            if author:
                news_item['author'] = get_stripped_list(author)
            published_date = tree.xpath(".//time[contains(@class,'updated entry-time')]/text()")
            # Only index into published_date once we know the xpath matched.
            if published_date:
                date_str = published_date[0].replace("-", "").replace("CST", "").strip()
                news_item['published_date'] = datetime.strptime(date_str, '%b %d, %Y %I:%M %p')
            tags = tree.xpath(".//span[contains(@itemprop,'keywords')]//text()")
            if tags:
                news_item['tags'] = [
                    det.strip().encode('ascii', 'ignore')
                    for det in [var for var in tags if var != ' ']
                ]
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
def parse_item(self, response):
    super(IAmWirepiderSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        # title = tree.xpath('.//div[@class="entry-header"]/h1/text()')
        title = tree.xpath('.//h1[@class="entry-title margin-bottom-30"]//text()')
        details = tree.xpath('.//div[@class="entry-content"]/p//text()')
        if title and details:
            news_item['source'] = self.name
            news_item['source_url'] = response.url.split('?')[0]
            news_item['crawled_date'] = datetime.now()
            news_item['title'] = title[0].strip().encode('ascii', 'ignore').decode('unicode_escape')
            news_item['details'] = '\t'.join([
                item.strip().encode('ascii', 'ignore').decode('unicode_escape')
                for item in details if item.strip()
            ])
            img_urls = tree.xpath(".//img[contains(@class,'wp')]/@src")
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
                news_item['cover_image'] = img_urls[0]
            # published_date = tree.xpath('.//time[contains(@class, "entry-date")]/text()')
            published_date = tree.xpath('.//time[contains(@class, "entry-date published")]//text()')
            if published_date:
                news_item['published_date'] = parse(get_stripped_list(published_date)[0])
            author = tree.xpath('.//a[contains(@class, "url fn n")]/text()')
            if author:
                news_item['author'] = get_stripped_list(author)
            tags = tree.xpath('.//ul[contains(@class, "tag-list")]/li/a/text()')
            if tags:
                news_item['tags'] = get_stripped_list(tags)
            meta_result = self.get_meta(tree)
            if 'description' in meta_result:
                news_item['blurb'] = meta_result['description']
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
def parse_item(self, response):
    super(StartOHolicsSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath(".//h1/text()")
        details = tree.xpath(".//span[contains(@style,'font-size:')]/text()")
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = '\t'.join([
                item.strip().encode('ascii', 'ignore').decode('unicode_escape')
                for item in details if item.strip()
            ])
            img_urls = tree.xpath(".//img[contains(@class,'wp-image')]/@src")
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
                news_item['cover_image'] = img_urls[0]
            meta_result = self.get_meta(tree)
            if 'description' in meta_result:
                news_item['blurb'] = meta_result['description']
            author = tree.xpath('.//li[contains(@class, "entry-author")]/a/text()')
            if author:
                news_item['author'] = get_stripped_list(author)
            published_date = tree.xpath('.//li[contains(@class, "date")]/text()')
            if published_date:
                news_item['published_date'] = datetime.strptime(
                    published_date[0].strip().encode('ascii', 'ignore'), '%d %b, %Y')
            tags = tree.xpath('.//p[contains(@class, "entry-tags")]/a/text()')
            if tags:
                news_item['tags'] = get_stripped_list(tags)
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
def parse_item(self, response):
    super(BusinessInsiderSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        # title = tree.xpath('//*[@id="Content"]/div[3]/div[3]/div[1]/div/div[1]/div/article/div[1]/h1/text()')
        title = tree.xpath('//*[@id="Content"]/div[3]/div[3]/div[1]/div/div[1]/div/article/div[1]/h1//text()')
        # details = tree.xpath(".//div[contains(@class,'section1')]//p//text()")
        details = tree.xpath('.//div[contains(@class,"hide_show_handler main_content")]//p//text()')
        if title and details:
            news_item['source'] = self.name
            news_item['source_url'] = response.url.split('?')[0]
            news_item['crawled_date'] = datetime.now()
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [item.strip().encode('ascii', 'ignore') for item in details])
            img_urls = tree.xpath(".//div[contains(@class,'MeetingImg blk')]/img/@src")
            img_url_list = []
            if img_urls:
                for img_url in img_urls:
                    img_url_list.append("http://www.businessinsider.in" + img_url)
                news_item['img_urls'] = get_stripped_list(img_url_list)
            published_date = tree.xpath(
                ".//div[contains(@class,'ByLine')]//span[contains(@class,'Date')]//text()")
            if published_date:
                news_item['published_date'] = datetime.strptime(
                    get_stripped_list(published_date)[0], '%b %d, %Y, %I.%M %p')
            author = tree.xpath(".//a[contains(@class,'Name')]/text()")
            if author:
                news_item['author'] = get_stripped_list(author)
            tags = tree.xpath(".//span[contains(@class,'anchorLink')]/text()")
            more_tags = tree.xpath(".//div[contains(@id,'commentHash')]//a/text()")
            if tags:
                news_item['tags'] = get_stripped_list(tags)
            if more_tags:
                news_item['tags'] = get_stripped_list(more_tags)
            cover_image = tree.xpath(".//div[contains(@class,'MeetingImg blk')]/img/@src")
            if cover_image:
                news_item['cover_image'] = img_url_list[0]
            referer = response.request.headers['Referer']
            for item in categories:
                if referer in sum(item['subcategory'].values(), []):
                    news_item['category'] = item['category']
                    key = (key for key, value in item['subcategory'].items()
                           if referer in value).next()
                    news_item['sub_categories'] = [key]
            return news_item
    # Bind the exception so the log call below does not raise a NameError.
    except Exception, e:
        self.log('==Exception=================>>>>>>>>! %r' % e)
        return None
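# --- Shared category lookup (suggested refactor, not in the original) --------
# Nearly every parse_item() above repeats the same loop that maps the request's
# Referer header onto a category and sub-category via the global `categories`
# list. A hedged sketch of a shared helper, assuming `categories` is a list of
# {'category': ..., 'subcategory': {name: [urls]}} dicts as those loops imply:

def resolve_category(referer, categories):
    for item in categories:
        if referer in sum(item['subcategory'].values(), []):
            sub = next(key for key, value in item['subcategory'].items()
                       if referer in value)
            return item['category'], [sub]
    return None, []

# Possible usage inside a spider:
#     news_item['category'], news_item['sub_categories'] = resolve_category(referer, categories)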