def getPageContent(self, response):
    try:
        data = ' '.join(response.xpath("//div[@id='article-body']/div/p/text()").extract())
        if not data:
            data = ' '.join(response.xpath("//section[@class='chapter']//text()").extract())
        if not data:
            data = ' '.join(response.xpath("//div[contains(@class,'-5s7sjXv')]/div/div/article/p/text()").extract())
        if not data:
            data = ' '.join(response.xpath("//div[contains(@class,'_1Joi0PLr')]//span/text()").extract())
        if not data:
            logger.error(__name__ + " Unable to Extract Content : " + response.url)
            data = 'Error'
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] Unable to Extract Content : " + str(e) + " : " + response.url)
        data = 'Error'
    return data
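# The four XPath attempts above form a fallback chain; the same pattern can be
# factored into a small helper. A minimal sketch -- the name first_nonempty and
# the xpaths argument are illustrative, not part of the original code:
def first_nonempty(response, xpaths):
    # Try each XPath in order; return the first non-empty joined result.
    for xp in xpaths:
        data = ' '.join(response.xpath(xp).extract())
        if data:
            return data
    return ''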
def get_logs_or_items(sql, sort, order, queries):
    items = {'total': 0, 'rows': []}
    # NOTE: sort and order are interpolated directly into the SQL string,
    # so they must be validated upstream.
    filter_str = ("SELECT * FROM res_main, res_count "
                  "ORDER BY res_main." + sort + " " + order + " "
                  "OFFSET %s FETCH NEXT %s ROWS ONLY;")
    sql += filter_str
    try:
        conn = postgresSQL()
        # Execute and fetch on the same cursor so fetchall() sees the results.
        conn.RealDictCursor.execute(sql, queries)
        data = conn.RealDictCursor.fetchall()
        for row in data:
            item = {}
            for column in row:
                if column == "total":
                    continue  # 'total' is the count column, not row payload
                item[column] = str(row[column])
            items['rows'].append(item)
        if data:
            items['total'] = data[0]['total']
        else:
            items['total'] = 0
    except Exception as e:
        items['total'] = 0
        logger.error(__name__ + " " + str(e))
    return json.dumps(items)
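# Sketch of the inputs get_logs_or_items appears to expect: `sql` carries a
# WITH clause defining the res_main and res_count CTEs, and `queries` fills the
# OFFSET / FETCH NEXT placeholders. The helper name, table name, and CTE bodies
# below are hypothetical stand-ins:
def build_log_query(page, per_page):
    sql = ("WITH res_main AS (SELECT * FROM logs), "
           "res_count AS (SELECT COUNT(*) AS total FROM logs) ")
    queries = ((page - 1) * per_page, per_page)  # (OFFSET, FETCH NEXT) values
    return sql, queries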
def parse_title(self, response):
    try:
        if 'gadgets.ndtv.com' in response.url:
            return response.xpath('//div[@class="lead_heading"]/h1/span/text()').extract_first().strip()
        elif ('www.ndtv.com' in response.url or 'food.ndtv.com' in response.url
                or 'sports.ndtv.com' in response.url or 'profit.ndtv.com' in response.url):
            return response.xpath('//h1[@itemprop="headline"]/text()').extract_first().strip()
        elif 'auto.ndtv.com' in response.url:
            return response.xpath('//h1[@class="article__headline"]/text()').extract_first()
        elif 'doctor.ndtv.com' in response.url:
            return response.xpath('//div[contains(@class, "article_heading")]/div[@class="__sslide"]/h1/text()').extract_first().strip()
        else:
            logger.error(__name__ + ' Unable to Extract Title ' + response.url)
            return None
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] Unable to Extract Title : " + str(e) + " : " + response.url)
        return None
def insertIntoNewsTable(self, item, log_id):
    # Insert item into NEWS_TABLE after all the processing.
    try:
        # Verify the Connection to Database
        if not self.checkConnection():
            return False
        # Prepare the Query
        postgresQuery = ("INSERT INTO " + DB_INFO['NEWS_TABLE'] +
                         " (title, content, image, link, newsDate, site_id, log_id, datescraped)"
                         " VALUES (%s, %s, %s, %s, %s, %s, %s, NOW())")
        # Execute the Query
        self.cursor.execute(postgresQuery, (item['title'], item['content'], item['image'],
                                            item['link'], item['newsDate'], item['source'], log_id))
        return True
    except psycopg2.Error as Error:
        # 23505 is PostgreSQL's unique_violation code: the link already exists
        if str(Error.pgcode) == '23505':
            pass
        else:
            logger.error(__name__ + " (" + str(Error.pgcode) + ") " + str(Error) +
                         " occurred at " + str(item.get('link')))
    except Exception as Error:
        logger.error(__name__ + " Error While Inserting to Database : " + str(Error))
    return False
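# Shape of the item insertIntoNewsTable expects -- the field names come from
# the execute() call above; the values here are illustrative only:
sample_item = {
    'title': 'Example headline',
    'content': 'Body text ...',
    'image': 'https://example.com/img.jpg',
    'link': 'https://example.com/story',
    'newsDate': '2018-04-21T09:16:00',
    'source': 105,  # numeric site id, as used elsewhere in this project
}
# insertIntoNewsTable(sample_item, log_id=1)  # True on insert, False on duplicate/error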
def parse_date(self, response):
    try:
        date = ''
        # [:-6] drops the trailing UTC offset (e.g. "+05:30") before parsing
        if 'www.ndtv.com' in response.url:
            date = response.xpath('//meta[@name="modified-date"]/@content').extract_first()[:-6]
        elif 'doctor.ndtv.com' in response.url or 'sports.ndtv.com' in response.url:
            date = response.xpath('//meta[@name="publish-date"]/@content').extract_first()[:-6]
        elif 'auto.ndtv.com' in response.url:
            date = response.xpath('//meta[@itemprop="datePublished"]/@content').extract_first()[:-6]
        elif 'food.ndtv.com' in response.url:
            date = response.xpath('//span[@itemprop="dateModified"]/@content').extract_first()[:-6]
        elif 'gadgets.ndtv.com' in response.url or 'profit.ndtv.com' in response.url:
            date = response.xpath('//meta[@name="publish-date"]/@content').extract_first()[:-6]
        else:
            logger.error(__name__ + ' Unable to Extract Date ' + response.url)
            return None
        date = parser.parse(date).strftime('%Y-%m-%dT%H:%M:%S')
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] Unable to extract Date : " + str(e) + " : " + response.url)
        return None
    return date
def connect(self):
    # Check if connection already exists
    if self.connection is not None:
        # If the connection is still open, return the current cursor
        if self.connection.closed == 0:
            return self.cursor
    # Create a New Connection
    try:
        # Connect to Database
        self.connection = psycopg2.connect(host=DB_INFO['HOST_NAME'],
                                           user=DB_INFO['USERNAME'],
                                           database=DB_INFO['DATABASE_NAME'],
                                           password=DB_INFO['PASSWORD'])
        # Enable AutoCommit
        self.connection.autocommit = True
        # Create a DictCursor and a RealDictCursor
        self.cursor = self.connection.cursor(cursor_factory=psycopg2.extras.DictCursor)
        self.RealDictCursor = self.connection.cursor(cursor_factory=rdc)
        logger.debug(__name__ + " Connected to Database")
        # Try Initializing the Database
        if not self.initilaize():
            logger.error(__name__ + " Database cannot be initialized automatically. Try it manually.")
        return self.cursor
    except Exception as e:
        logger.critical(__name__ + " Database Connection Error! Msg: " + str(e))
        if self.connection is not None:
            self.connection.close()
        self.connection = None
        return False
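# Usage sketch for connect(), assuming postgresSQL is the class that owns it
# (the name is taken from the conn = postgresSQL() call elsewhere in this code).
# Repeated calls reuse the open connection instead of reconnecting:
db = postgresSQL()
cursor = db.connect()  # DictCursor on success, False on failure
if cursor:
    cursor.execute("SELECT 1")
    print(cursor.fetchone())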
def parse_more_news(self, response):
    try:
        ajax_response = json.loads(response.text)
        self.news_id = ajax_response['min_news_id']
        html = Selector(text=ajax_response['html'])
        for news in html.css('div.news-card'):
            self.urls_parsed += 1
            try:
                item = ScrapenewsItem()
                # [23:-3] strips the background-image: url('...'); CSS wrapper
                item['image'] = news.css('div.news-card-image::attr(style)').extract_first()[23:-3]
                item['title'] = news.css('a.clickable>span::text').extract_first()
                item['content'] = news.css('div[itemprop*=articleBody]::text').extract_first()
                # [:-5] drops the trailing timezone suffix from the timestamp
                item['newsDate'] = news.css('span.time::attr(content)').extract_first()[:-5]
                item['link'] = news.css('div.read-more>a::attr(href)').extract_first()
                item['source'] = 105
                yield item
                self.urls_scraped += 1
            except Exception as e:
                logger.error(__name__ + " [UNHANDLED] Unable to Extract Data : " + str(e))
                self.urls_dropped += 1
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] " + str(e) + " for response url " + response.url)
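# Shape of the AJAX payload parse_more_news consumes -- the keys come from the
# json.loads() accesses above; the values here are illustrative only:
from scrapy.selector import Selector
ajax_response = {
    "min_news_id": "abc123-1",          # cursor for requesting the next page
    "html": "<div class='news-card'>...</div>",  # server-rendered card markup
}
cards = Selector(text=ajax_response["html"]).css('div.news-card')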
def getPageContent(self, response):
    try:
        data = ' '.join(response.xpath("//div[contains(@class,'io-article-body')]//text()").extract())
        if not data:
            data = ' '.join(response.xpath("//div[contains(@id,'slider0')]/p/text()").extract())
        if not data:
            data = response.xpath("//article//*[not(self::script) and not(self::style)]/text()").extract()
            # Removing all the blank spaces & joining list
            data = ' '.join([x for x in data if x != ' ' and x != u'\xa0'])
        if not data:
            logger.error(__name__ + " Unable to extract Content : " + response.url)
            data = 'Error'
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] Unable to extract Content : " + str(e) + " : " + response.url)
        data = 'Error'
    return data
def getPageDate(self, response):
    try:
        # split slices the datePublished value out of the ld+json blob;
        # [:19] keeps the "YYYY-MM-DDTHH:MM:SS" portion
        data = (str(response.xpath("//script[@type='application/ld+json']").extract_first())
                .split('datePublished":"', 1)[1])[:19]
    except (TypeError, IndexError):
        # This fallback works only on very specific articles.
        data = None
        scriptData = None
        scriptsList = response.xpath("/html/head/script[not(contains(@type,'text/javascript'))]")
        for script in scriptsList:
            try:
                scriptData = script.extract().split("<script>utag_data", 1)[1]
                break
            except IndexError:
                continue
        if scriptData is not None:
            data = scriptData.split('"publish_date":"', 1)[1].split("+", 1)[0]
        if data is None:
            logger.error(__name__ + " Unable to Extract Date : " + response.url)
            data = 'Error'
    except Exception as Error:
        logger.error(__name__ + " [UNHANDLED] Unable to Extract Date : " + str(Error) + " : " + response.url)
        data = 'Error'
    return data
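# A sketch of a sturdier alternative for the ld+json branch above: parse the
# script body as JSON instead of string-splitting. script_text is a stand-in
# for the extracted tag body:
import json
script_text = '{"@type": "NewsArticle", "datePublished": "2018-04-21T09:16:00+05:30"}'
date = json.loads(script_text).get("datePublished", "")[:19]
print(date)  # 2018-04-21T09:16:00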
def getPageDate(self, response):
    try:
        # rsplit trims the trailing tokens so only the date portion remains
        data = response.xpath("//span[@class='dattime']/text()").extract()[1].rsplit(' ', 3)[0]
    except Exception as Error:
        logger.error(__name__ + " [UNHANDLED] Unable to Extract Date : " + str(Error) + " : " + response.url)
        data = 'Error'
    return data
def getPageDate(self, response):
    try:
        data = response.xpath("/html/head/meta[@name='Last-Modified']/@content").extract_first()
    except Exception as Error:
        logger.error(__name__ + " [UNHANDLED] Unable to Extract Date : " + str(Error) + " : " + response.url)
        data = 'Error'
    return data
def getPageDate(self, response):
    try:
        data = response.xpath("//head/meta[@property='article:published_time']/@content").extract_first()
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] Unable to extract Date : " + str(e) + " : " + response.url)
        data = 'Error'
    return data
def process_date(self, date):
    """
    Parses the date and tries to convert it to a valid Python datetime object.
    Returns the formatted string.
    """
    try:
        parsed_date = parser.parse(date, ignoretz=False, fuzzy=True)
        return str(parsed_date)
    except Exception as e:
        logger.error(__name__ + " Unable to Parse Date (Input: " + str(date) + ") due to " + str(e))
        raise DropItem("Unable to Parse Date due to " + str(e))
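# Worked example of the fuzzy parsing process_date relies on (dateutil):
# fuzzy=True lets the parser skip surrounding non-date tokens.
from dateutil import parser
print(parser.parse("Updated: Apr 21, 2018, 09:16 PM", fuzzy=True, ignoretz=False))
# -> 2018-04-21 21:16:00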
def getPageDate(self, response):
    try:
        # rsplit drops the trailing UTC offset (e.g. "+05:30")
        data = response.xpath("//time/@datetime").extract_first().rsplit('+', 1)[0]
    except Exception as Error:
        logger.error(__name__ + " [UNHANDLED] Unable to extract Date : " + str(Error) + ' : ' + response.url)
        data = 'Error'
    return data
def getPageTitle(self, response):
    try:
        data = ' '.join(response.xpath("//h1[@itemprop='headline']/text()").extract_first().split())
    except Exception as Error:
        logger.error(__name__ + " [UNHANDLED] Unable to Extract Title : " + str(Error) + " : " + response.url)
        data = 'Error'
    return data
def getPageTitle(self, response):
    try:
        data = response.xpath('//h1[@class="arttitle"]/text()').extract_first()
        if data is None:
            logger.error(__name__ + " Unable to Extract Page Title: " + response.url)
            data = 'Error'
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] Unable to Extract Page Title : " + str(e) + " : " + response.url)
        data = 'Error'
    return data
def gettitle(self, response):
    try:
        data = response.xpath('//h1[contains(@class, "article-heading margin")]/text()').extract_first()
        if data is None:
            logger.error(__name__ + " Unable to Extract Title : " + response.url)
            data = 'Error'
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] Unable to Extract Title : " + str(e) + " : " + response.url)
        data = 'Error'
    return data
def getPageDate(self, response):
    try:
        # rsplit drops the trailing UTC offset (e.g. "+05:30")
        data = response.xpath("/html/head/meta[@property='article:published_time']/@content").extract_first().rsplit('+', 1)[0]
    except Exception as Error:
        logger.error(__name__ + " [UNHANDLED] Unable to Extract Date : " + str(Error) + ' : ' + response.url)
        data = 'Error'
    return data
def getPageContent(self, response):
    try:
        data = ' '.join(response.xpath("//div[@class='content']//*[not(self::script)]/text()").extract())
        if not data:
            logger.error(__name__ + " Unable to Extract Content : " + response.url)
            data = 'Error'
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] Unable to Extract Content : " + str(e) + " : " + response.url)
        data = 'Error'
    return data
def getPageTitle(self, response):
    try:
        data = response.xpath("head/title/text()").extract_first()
        if data is None:
            logger.error(__name__ + " Unable to extract page title " + response.url)
            data = 'Error'
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] Unable to extract page title : " + str(e) + " : " + response.url)
        data = 'Error'
    return data
def getimage(self, response):
    try:
        # scrapes image url
        data = response.xpath('//div[contains(@class, "field-item")]/img/@src').extract_first()
        if data is None:
            logger.error(__name__ + " Unable to Extract Image : " + response.url)
            data = 'Error'
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] Unable to Extract Image : " + str(e) + " : " + response.url)
        data = 'Error'
    return data
def getcontent(self, response):
    try:
        # extract() returns a list (never None), so join it and test for emptiness
        data = ' '.join(response.xpath('//div[contains(@class, "article")]/div[contains(@class, "field")]//p/text()').extract())
        if not data:
            logger.error(__name__ + " Unable to Extract Content : " + response.url)
            data = 'Error'
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] Unable to Extract Content : " + str(e) + " : " + response.url)
        data = 'Error'
    return data
def getPageImage(self, response):
    try:
        data = response.xpath('//div[@class="content"]/div/figure/img/@src').extract_first()
        if data is None:
            logger.error(__name__ + " Unable to Extract Image : " + response.url)
            data = 'Error'
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] Unable to Extract Image : " + str(e) + " : " + response.url)
        data = 'Error'
    return data
def getPageImage(self, response):
    try:
        data = response.xpath("//meta[@property='og:image']/@content").extract_first()
        if data is None:
            logger.error(__name__ + " Unable to Extract Image : " + response.url)
            data = 'Error'
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] Unable to extract Image : " + str(e) + " : " + response.url)
        data = 'Error'
    return data
def getPageDate(self, response):
    try:
        # rsplit drops the trailing UTC offset (e.g. "+05:30")
        data = response.xpath("//head/meta[@itemprop='datePublished']/@content").extract_first().rsplit('+', 1)[0]
    except Exception as Error:
        logger.error(__name__ + " [UNHANDLED] Unable to Extract Date : " + str(Error) + " : " + response.url)
        data = 'Error'
    return data
def getdatetime(self, response):
    try:
        # [10:-4] slices the date portion out of the scraped text
        raw = response.xpath('//span[contains(@class, "date")]/text()').extract_first()[10:-4]
        # Normalize the sliced date string to "%Y-%m-%d %H:%M:%S"
        data = datetime.strptime(raw, " %b %d, %Y, %H:%M %p").strftime("%Y-%m-%d %H:%M:%S")
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] Unable to Extract Date : " + str(e) + " : " + response.url)
        data = 'Error'
    return data
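# Worked example of the strptime/strftime round-trip above. Note that %p has
# no effect when the hour is parsed with %H (only %I honors AM/PM), so the
# hour value comes straight from the input:
from datetime import datetime
raw = " Apr 21, 2018, 09:16 PM"  # illustrative; shape assumed from the [10:-4] slice
print(datetime.strptime(raw, " %b %d, %Y, %H:%M %p").strftime("%Y-%m-%d %H:%M:%S"))
# -> 2018-04-21 09:16:00, not 21:16:00, because %H ignores the AM/PM marker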
def getlink(self, response):
    try:
        data = response.url
        # start_urls is a list, so use membership rather than equality
        if data in self.start_urls:
            logger.error(__name__ + " Unable to Extract Link : " + response.url)
            data = 'Error'
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] Unable to Extract Link : " + str(e) + " : " + response.url)
        data = 'Error'
    return data
def parse(self, response):
    try:
        newsContainer = response.xpath("//ul[@id='cagetory']/li[@class='clearfix']")
        for newsBox in newsContainer:
            link = newsBox.xpath('a/@href').extract_first()
            if not self.postgres.checkUrlExists(link):
                self.urls_parsed += 1
                yield scrapy.Request(url=link, callback=self.parse_article, errback=self.errorRequestHandler)
            else:
                self.urls_dropped += 1
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] " + str(e) + " for response url " + response.url)
def parse(self, response):
    try:
        newsContainer = response.xpath("//div[@class='singlesunday']")
        for newsBox in newsContainer:
            link = 'http://www.asianage.com' + newsBox.xpath('div/h2/a/@href').extract_first()
            if not self.postgres.checkUrlExists(link):
                self.urls_parsed += 1
                yield scrapy.Request(url=link, callback=self.parse_article, errback=self.errorRequestHandler)
            else:
                self.urls_dropped += 1
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] " + str(e) + " for response url " + response.url)
def getPageContent(self, response):
    try:
        data = ' '.join(response.xpath("//div[@id='storyBody']/p/text()").extract())
        if not data:
            data = ' '.join(response.xpath("//div[@id='storyBody']/p//text()").extract())
        if not data:
            logger.error(__name__ + " Unable to extract page content " + response.url)
            data = 'Error'
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] Unable to extract Content : " + str(e) + " : " + response.url)
        data = 'Error'
    return data