def ExtractArticleText(self, title):
    """Extract an article's text via its URL and store it on the archive row.

    Looks up the row of ``self.Articles`` whose ``title`` equals *title*,
    reads the row's ``url`` through a ``Reporter`` and saves the latin-1
    encoded result of ``report_news()`` in the row's ``TEXT`` attribute.
    The row is then flagged as text-archived, stamped with the current
    time, and the session is committed.

    The lookup is a single ``first()`` query, so a missing row is detected
    without a separate ``count()`` round-trip. If several rows share the
    title, the first one returned is used.

    Args:
        title: Title of the article to process.

    Returns:
        True once the text has been stored and committed; False when no
        article with that title exists or the text could not be encoded.
    """
    matching = self.session.query(self.Articles).filter(
        self.Articles.title == title)
    Artdata = matching.first()
    if Artdata is None:
        print('Article %s not found in archive database' % (title,))
        return False

    if not Artdata.ArchivedHTML:
        # Missing archived HTML is only reported; the text is read from the
        # article's URL below either way.
        print('Article %s HTML not archived, oh well...' % (title,))

    reporter = Reporter()
    reporter.read(url=Artdata.url)
    try:
        # Characters that latin-1 cannot represent are dropped ('ignore').
        text = reporter.report_news().encode('latin-1', 'ignore')
    except UnicodeDecodeError:
        # Under Python 2, encode() on a byte string implicitly decodes it as
        # ASCII first -- presumably the source of this error; confirm against
        # what report_news() returns.
        print('there is something gross in thier text...')
        return False

    Artdata.TEXT = text
    Artdata.ArchivedText = True
    Artdata.ArchiveTextTime = time.time()
    self.session.commit()
    return True
# Manual smoke test for Reporter: read one article URL and print whatever
# report_news() returns for it.
from reporter import Reporter

# Sample article URLs; only `test2` is read below.
test = 'http://www.washingtonpost.com/world/in-latvia-young-people-discover-new-passions-in-bad-economic-times/2013/07/29/ac638cac-efbf-11e2-8c36-0e868255a989_story.html'
test2 = 'http://blogs.hbr.org/2013/11/making-decisions-together-when-you-dont-agree-on-whats-important/'

my_reporter = Reporter()
my_reporter.read(url=test2)

# Single-argument print(...) emits the same output as the print statement.
print(my_reporter.report_news())