def process_feed(self):
    """Parse every feed in ``self._feed_list`` and store unseen articles.

    For each feed the parsed result's URL and version are logged. Every
    item of the feed is then reduced to a record holding its title,
    publication date, link, most used words of the HTML-stripped summary,
    the feed URL and the (empty) feed language. The record is hashed with
    SHA-256, appended to ``self._articles`` (which is reset for each feed)
    and, when ``self._mongodb_handle._not_processed`` accepts the hash,
    handed to ``self._mongodb_handle._insert_full``.

    Items that cannot be processed are logged and skipped, so a broken
    item is never recorded with fields left over from the previous item.
    """
    for f in self._feed_list:
        self._link = f
        self._fburned = feedparser.parse(self._link)

        # grab the details from the burned feed
        self._furl = self._fburned['url']
        self._fversion = self._fburned['version']
        self._flang = ""  # this variant records no language for the feed
        self._log.info(
            "Processing articles for %s: %s, %s" %
            (self._furl, str(self._fversion), self._flang.strip()))

        self._articles = []
        for i in self._fburned['items']:
            # Keep this exact construction (placeholders first, values
            # assigned afterwards): the digest below is taken over
            # str() of the dict, so its layout must not change.
            self._aframe = {
                'title': None,
                'date': None,
                'link': None,
                'keywords': [],
                'feed': None,
                'language': None
            }
            try:
                self._ititle = i['title']
                self._idate = i['published']
                self._ilink = i['link']
                # strip out HTML
                self._ctext = stripper(i['summary']).get_data()
                # summarize the article
                self._sum = summarize(self._ctext, self._kword_amt)
                self._kwords = self._sum.get_most_used_words()
            except Exception as e:
                # Report through the logger and skip the item. Falling
                # through would build a record from the previous item's
                # attributes (or fail on the very first item). ``get`` is
                # used because a missing title may be the failure itself.
                self._log.error(str(e))
                self._log.error("Failed to process %s" % i.get('title'))
                continue

            self._log.debug(
                "Processed article: \ntitle:%s \ndate:%s \nlink:%s \nkeywords:%s"
                % (self._ititle, self._idate, self._ilink, self._kwords))

            self._aframe['title'] = self._ititle
            self._aframe['date'] = self._idate
            self._aframe['link'] = self._ilink
            self._aframe['keywords'] = self._kwords
            self._aframe['feed'] = self._furl
            self._aframe['language'] = self._flang

            # hash the contents to check in DB; the digest is computed
            # before the 'hashed' key itself is added to the record
            hashed = hashlib.sha256(str(self._aframe)).hexdigest()
            self._aframe['hashed'] = hashed
            self._articles.append(self._aframe)

            if self._mongodb_handle._not_processed(hashed):
                self._log.info("Adding %s (%s)" % (self._ititle, hashed))
                self._mongodb_handle._insert_full(self._aframe)
def process_feed(self):
    """Process all feeds in ``self._feed_list`` and insert new articles.

    Each entry of ``self._feed_list`` is passed to ``feedparser.parse``.
    For every item of the parsed feed a record is built from the item's
    title, publication date and link, the most used words of its
    HTML-stripped summary, the feed URL and an empty language string.
    The record's SHA-256 digest is stored under ``'hashed'``, the record
    is appended to ``self._articles`` (reset at the start of each feed),
    and it is inserted through ``self._mongodb_handle._insert_full`` when
    ``self._mongodb_handle._not_processed`` returns a true value for the
    digest.

    An item that raises while being processed is logged and skipped
    instead of being stored with values from an earlier item.
    """
    for f in self._feed_list:
        self._link = f
        self._fburned = feedparser.parse(self._link)

        # grab the details from the burned feed
        self._furl = self._fburned['url']
        self._fversion = self._fburned['version']
        self._flang = ""  # no language is read for the feed here
        self._log.info("Processing articles for %s: %s, %s" %
                       (self._furl, str(self._fversion), self._flang.strip()))

        self._articles = []
        for i in self._fburned['items']:
            # The record is created with placeholders and filled in
            # later on purpose: the digest is taken over str() of this
            # dict, so the way it is built is kept unchanged.
            self._aframe = {'title': None, 'date': None, 'link': None,
                            'keywords': [], 'feed': None, 'language': None}
            try:
                self._ititle = i['title']
                self._idate = i['published']
                self._ilink = i['link']
                self._ctext = stripper(i['summary']).get_data()  # strip out HTML
                self._sum = summarize(self._ctext, self._kword_amt)  # summarize the article
                self._kwords = self._sum.get_most_used_words()
            except Exception as e:
                # Log the cause via the logger and move on to the next
                # item. Continuing below would reuse attributes set by a
                # previous item. The title is read with ``get`` since the
                # failure may have been the missing title.
                self._log.error(str(e))
                self._log.error("Failed to process %s" % i.get('title'))
                continue

            self._log.debug("Processed article: \ntitle:%s \ndate:%s \nlink:%s \nkeywords:%s" %
                            (self._ititle, self._idate, self._ilink, self._kwords))

            self._aframe['title'] = self._ititle
            self._aframe['date'] = self._idate
            self._aframe['link'] = self._ilink
            self._aframe['keywords'] = self._kwords
            self._aframe['feed'] = self._furl
            self._aframe['language'] = self._flang

            # hash the contents to check in DB (computed before the
            # 'hashed' key is added, so the key is not part of the digest)
            hashed = hashlib.sha256(str(self._aframe)).hexdigest()
            self._aframe['hashed'] = hashed
            self._articles.append(self._aframe)

            if self._mongodb_handle._not_processed(hashed):
                self._log.info("Adding %s (%s)" % (self._ititle, hashed))
                self._mongodb_handle._insert_full(self._aframe)
def process_feed(self):
    """Parse every ``"<link>,<language>"`` entry of ``self._feed_list``.

    Each entry is split into a feed link and a language, the link is
    parsed with ``feedparser.parse`` and every item of the feed is turned
    into a record with its title, publication date, link and the most
    used words of its HTML-stripped summary. The records are collected in
    ``self._articles`` and the feed URL, version, language and article
    list are stored in ``self._frame``. Both attributes are rebuilt for
    each feed, so after the call they describe the last feed processed.

    Items that cannot be processed are logged and skipped, so a broken
    item is never recorded with fields left over from the previous item.
    """
    for f in self._feed_list:
        self._frame = {'feed': None, 'version': None, 'language': None, 'articles': []}

        # Split on the last comma only, so a link that itself contains
        # commas is still separated correctly from its language.
        self._link, self._lang = f.rsplit(",", 1)
        self._lang = self._lang.strip()
        self._fburned = feedparser.parse(self._link)

        # grab the details from the burned feed
        self._furl = self._fburned['url']
        self._fversion = self._fburned['version']
        self._flang = self._lang
        self._log.debug("Processing articles for %s: %s, %s" %
                        (self._furl, str(self._fversion), self._flang.strip()))

        self._articles = []
        for i in self._fburned['items']:
            self._aframe = {'title': None, 'date': None, 'link': None, 'keywords': []}
            try:
                self._ititle = i['title']
                self._idate = i['published']
                self._ilink = i['link']
                self._ctext = stripper(i['summary']).get_data()  # strip out HTML
                self._sum = summarize(self._ctext, self._kword_amt)  # summarize the article
                self._kwords = self._sum.get_most_used_words()
            except Exception as e:
                # Record why the item failed and skip it. Falling through
                # would fill the record from the previous item's
                # attributes. ``get`` is used because the missing title
                # may be what raised in the first place.
                self._log.error(str(e))
                self._log.error("Failed to process %s" % i.get('title'))
                continue

            self._log.debug("Processed article: \ntitle:%s \ndate:%s \nlink:%s \nkeywords:%s" %
                            (self._ititle, self._idate, self._ilink, self._kwords))

            self._aframe['title'] = self._ititle
            self._aframe['date'] = self._idate
            self._aframe['link'] = self._ilink
            self._aframe['keywords'] = self._kwords
            self._articles.append(self._aframe)

        # NOTE(review): placed after the item loop so a feed without
        # items still gets its frame filled in — confirm this matches the
        # original layout.
        self._frame['feed'] = self._furl
        self._frame['version'] = self._fversion
        self._frame['language'] = self._flang
        self._frame['articles'] = self._articles