예제 #1
0
 def parse_feed(self, entry):
     """Extract the list of articles from one feed.

     Downloads the feed, builds an ``Article`` for every feed entry,
     fetches the entry's page through ``self.htmlparser`` to obtain the
     article text, splits a trailing "- Source Name" credit off that text,
     and stores each article in ``self.db``.

     Args:
         entry: A 3-tuple ``(url, publisher, publisher_location)``; ``url``
             is the feed address, the other two values are copied onto
             every article created from the feed.

     Returns:
         The list of stored ``Article`` objects, in feed order. The list is
         empty when the feed could not be fetched.
     """
     (url, publisher, publisher_location) = entry
     articles = []

     try:
         c = urlopen(url)
     except URLError:
         # Nothing was downloaded, so there is nothing to parse: report the
         # failure and hand back the empty list instead of falling through
         # with ``c`` unbound.
         print('Failed to fetch ' + url)
         return articles
     try:
         feed = feedparser.parse(c)
     finally:
         # Release the connection once feedparser has consumed the response.
         c.close()

     # A trailing "- Word[,] Word ..." credit (one to seven alphabetic
     # words) anchored at the end of the text. Compiled once for all entries.
     source_re = re.compile(r'-\s*([a-zA-Z]+(,?\s+[a-zA-Z]+){0,6})$')
     # Same credit, plus an optional literal backslash-n and whitespace
     # in front of it, so the whole suffix can be stripped.
     source_suffix_re = re.compile(
         r'(\\n)?\s*-\s*([a-zA-Z]+(,?\s+[a-zA-Z]+){0,6})$')

     # Debugging tip: iterate over feed.entries[:1] to read just one entry.
     for e in feed.entries:
         image_link = None
         image_type = None
         for link in e.links:
             # If several enclosures are present the last one wins.
             if link.get('rel') == 'enclosure':
                 # ``type`` is optional on an enclosure link, so read the
                 # attributes with .get rather than indexing.
                 image_link = link.get('href')
                 image_type = link.get('type')
         article = Article(
             publisher=publisher,
             publisher_location=publisher_location,
             published_date=e.updated_parsed,
             title=e.title,
             link=e.link,
             image_link=image_link,
             image_type=image_type)
         # NOTE(review): assumes htmlparser.parse returns the page text as
         # a string -- confirm against the parser implementation.
         content = self.htmlparser.parse(e.link)
         m = source_re.search(content)
         if m:
             article.source = m.group(1)
         article.content = source_suffix_re.sub('', content)
         article.store(self.db)  # put article and word frequencies into couchdb
         articles.append(article)
     return articles