import hashlib
import logging
import time

import bleach
import feedparser
from django.utils.encoding import smart_str

log = logging.getLogger(__name__)

# get_feed_entries, parse_entry and the FeedEntry model are defined
# elsewhere in this app.


def parse_feed(feed_url, page):
    ids = []
    entries = get_feed_entries(feed_url)
    for entry in entries:
        parsed = parse_entry(entry)
        # Check for a failed parse before touching parsed['title'].
        if not parsed:
            log.warn('Parsing feed entry failed, continuing.')
            continue
        log.debug(parsed['title'])

        # The content may be a FeedParserDict wrapping the text in
        # 'value', or the text itself.
        body = parsed['content']
        if isinstance(body, feedparser.FeedParserDict):
            body = body.get('value')
        if not body:
            log.warn('Parsing feed entry failed - no body found.')
            continue

        # Strip all markup before hashing and storing the body.
        cleaned_body = smart_str(bleach.clean(body, tags=(), strip=True))
        try:
            # [Bugzilla-670890] The same article can be syndicated by
            # different sources; hashing the page along with the body
            # ensures a unique checksum per source.
            checksum = hashlib.md5(cleaned_body + page).hexdigest()
            exists = FeedEntry.objects.filter(checksum=checksum)
            if not exists:
                log.debug('Logging - %s' % parsed['title'])
                entry = FeedEntry(
                    title=parsed['title'].encode('utf-8'),
                    link=parsed['link'].encode('utf-8'),
                    body=cleaned_body,
                    page=page,
                    checksum=checksum,
                    created_on=time.strftime('%Y-%m-%d %H:%M:%S',
                                             parsed['updated']))
                entry.save()
                feed_id = entry.id
            else:
                # Already stored: keep a reference to its id so we still
                # know to display it.
                log.debug('Found a duplicate entry')
                feed_id = exists[0].id
            ids.append(feed_id)
        except Exception:
            log.warn('Encountered an error creating FeedEntry. Skipping.')
            continue
    log.debug(ids)
    return ids
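# parse_feed() leans on two helpers that are not shown here. The sketch
# below is an assumption about their behaviour, inferred from how the
# results are consumed above (feedparser fetching the feed; parse_entry
# normalising each entry into a dict with 'title', 'link', 'content' and
# 'updated' keys) - not the original implementation.


def get_feed_entries(feed_url):
    # Fetch and parse the feed; feedparser tolerates malformed input and
    # returns an empty entries list on hard failures.
    return feedparser.parse(feed_url).entries


def parse_entry(entry):
    # Return None for entries missing required fields, matching the
    # 'if not parsed' guard above.
    try:
        return {
            'title': entry.title,
            'link': entry.link,
            # entry.content, when present, is a list of FeedParserDicts;
            # fall back to the plain summary otherwise.
            'content': (entry.content[0] if 'content' in entry
                        else entry.get('summary', '')),
            # A time.struct_time, as time.strftime() above requires.
            'updated': entry.updated_parsed,
        }
    except AttributeError:
        return None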
def parse_feed(feed_url, page):
    # Variant that hashes the body alone - without the per-page component
    # added for [Bugzilla-670890] above - so the same article syndicated
    # to different pages shares a single checksum. It also strips HTML
    # comments explicitly when cleaning.
    ids = []
    entries = get_feed_entries(feed_url)
    for entry in entries:
        parsed = parse_entry(entry)
        if not parsed:
            log.warn('Parsing feed entry failed, continuing.')
            continue

        body = parsed['content']
        if isinstance(body, feedparser.FeedParserDict):
            body = body.get('value')
        if not body:
            log.warn('Parsing feed entry failed - no body found.')
            continue

        cleaned_body = smart_str(bleach.clean(body, tags=(), strip=True,
                                              strip_comments=True))
        try:
            checksum = hashlib.md5(cleaned_body).hexdigest()
            exists = FeedEntry.objects.filter(checksum=checksum)
            if not exists:
                entry = FeedEntry(
                    title=parsed['title'].encode('utf-8'),
                    link=parsed['link'].encode('utf-8'),
                    body=cleaned_body,
                    page=page,
                    checksum=checksum,
                    created_on=time.strftime('%Y-%m-%d %H:%M:%S',
                                             parsed['updated']))
                entry.save()
                ids.append(entry.id)
            else:
                ids.append(exists[0].id)
        except Exception:
            log.warn('Encountered an error creating FeedEntry. Skipping.')
            continue
    return ids
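# Usage sketch - the URL and page value are illustrative, not from the
# original source:
#
#     ids = parse_feed('https://blog.example.com/feed/', page='about')
#     entries_to_display = FeedEntry.objects.filter(id__in=ids)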