示例#1
0
 def ExtractArticleText( self, title ):
     """Extract the text of an archived article and store it on its row.

     The article row is looked up by ``title``. A ``Reporter`` is pointed at
     the row's ``url`` and the result of ``report_news()`` is saved in
     ``TEXT`` (encoded as latin-1, dropping unencodable characters). The row
     is then flagged with ``ArchivedText`` and stamped with
     ``ArchiveTextTime`` before the session is committed.

     Returns:
         True once the text has been stored and committed; False when no
         article with that title exists or when encoding the text raises
         ``UnicodeDecodeError``.
     """
     # One query instead of count() followed by one(): first() returns None
     # for an unknown title and does not raise when several rows share it.
     Artdata = self.session.query(self.Articles).filter(self.Articles.title==title).first()
     if Artdata is None:
         print('Article %s not found in archive database' % (title,))
         return False

     if not Artdata.ArchivedHTML:
         # Archiving the HTML on demand (self.ArchiveArticleHTML) used to be
         # attempted here but is disabled; extraction continues from the URL.
         print('Article %s HTML not archived, oh well...' % (title,))

     reporter = Reporter()
     reporter.read( url = Artdata.url )
     try:
         # NOTE(review): on Python 2, .encode() on a byte string performs an
         # implicit ASCII decode first, which is presumably where the
         # UnicodeDecodeError handled below comes from - confirm.
         Artdata.TEXT = reporter.report_news().encode('latin-1', 'ignore')
     except UnicodeDecodeError:
         print('there is something gross in thier text...')
         return False

     Artdata.ArchivedText = True
     Artdata.ArchiveTextTime = time.time()
     self.session.commit()
     return True
示例#2
0
from reporter import Reporter

# Sample article URLs for trying out the Reporter by hand.
# Only `test2` is fetched below; `test` is kept as an alternative input.
test = 'http://www.washingtonpost.com/world/in-latvia-young-people-discover-new-passions-in-bad-economic-times/2013/07/29/ac638cac-efbf-11e2-8c36-0e868255a989_story.html'
test2 = 'http://blogs.hbr.org/2013/11/making-decisions-together-when-you-dont-agree-on-whats-important/'

# Read the page at the chosen URL, then print whatever report_news() returns.
my_reporter = Reporter()
my_reporter.read(url=test2)
print(my_reporter.report_news())