def task_rss(self, grab, task):
    """Extract summary information from a fetched RSS feed.

    Aggregates lowercase authors, tags and titles across all feed
    entries, records any feed-level parsing error, merges the result
    into ``task.data['content']`` and persists it via ``self.save_blog``.

    :param grab: Grab object holding the fetched feed response.
    :param task: crawler task; ``task.data`` was populated by a
        previous handler and is saved (with content attached) here.
    """
    # Lazy %-args: the message is only built if INFO is actually emitted.
    logger.info("Get rss: %s", grab.response.url)

    all_authors = set()
    all_tags = set()
    all_titles = []

    # BOM removal matters: feedparser can crash on a BOM-prefixed body
    # (see parse_feed in this module, which does the same).
    feed = feedparser.parse(remove_bom(grab.response.body))
    feed_entries = len(feed['entries'])

    for entry in feed['entries']:
        if 'author' in entry:
            all_authors.add(entry['author'].lower())
        if 'tags' in entry:
            # Skip empty/missing terms, then union into the running set.
            all_tags |= {
                tag['term'].lower()
                for tag in entry['tags']
                if tag['term']
            }
        if 'title' in entry:
            all_titles.append(entry['title'].lower())

    # Compute the error string once instead of testing the key twice.
    feed_parsing_error = ''
    if 'bozo_exception' in feed:
        feed_parsing_error = str(feed['bozo_exception'])
        logger.error("Error parsing feed: %s", feed_parsing_error)

    content = {
        'authors': list(all_authors),
        'tags': list(all_tags),
        'titles': list(all_titles),
        'entries': feed_entries,
        'error': feed_parsing_error,
    }

    # task.data was put there by a previous handler; attach results and save.
    data = task.data
    data['content'] = content
    self.save_blog(data)
def parse_feed(grab, teaser_size=1000):
    """Parse a feed fetched with Grab.

    Returns a dict with two keys:

    * ``feed`` -- the raw feedparser result
    * ``entries`` -- successfully parsed entries; entries that fail to
      parse are logged and skipped rather than aborting the whole feed
    """
    # Strip the BOM first -- without this, feedparser has been observed
    # to raise a SegmentationFault on some bodies.
    body = remove_bom(grab.response.body)
    parsed = feedparser.parse(body)

    parsed_entries = []
    for raw_entry in parsed.entries:
        try:
            parsed_entries.append(
                parse_entry(raw_entry, parsed, teaser_size=teaser_size))
        except Exception as ex:
            # Best effort: one broken entry must not kill the feed.
            log.error('Entry parsing error', exc_info=ex)

    return {
        'feed': parsed,
        'entries': parsed_entries,
    }