def process_parsed_feed(cls, parsed_feed, feed, overflow,
                        overflow_reason=OVERFLOW_REASON.BACKLOG):
    # Build a datastore key for each item in the parsed feed, keyed by guid.
    keys_by_guid = {guid_for_item(item): ndb.Key(cls, guid_for_item(item), parent=feed.key)
                    for item in parsed_feed.entries}
    entries = yield ndb.get_multi_async(keys_by_guid.values())
    old_guids = [x.key.id() for x in entries if x]
    new_guids = filter(lambda x: x not in old_guids, keys_by_guid.keys())

    # Create placeholder entities for the guids we haven't seen before.
    new_entries_by_guid = {x: cls(key=keys_by_guid.get(x), guid=x, creating=True)
                           for x in new_guids}
    yield ndb.put_multi_async(new_entries_by_guid.values())

    published = overflow
    futures = []
    for item in parsed_feed.entries:
        entry = new_entries_by_guid.get(guid_for_item(item))
        if not entry:
            continue
        futures.append((entry, prepare_entry_from_item(parsed_feed, item, feed, overflow,
                                                       overflow_reason, published)))

    # Fill in each placeholder entity from its prepared kwargs.
    for entry, future in futures:
        entry_kwargs = yield future
        if not entry_kwargs:
            continue

        entry_kwargs.pop('parent')
        entry_kwargs['creating'] = False
        entry.populate(**entry_kwargs)

    yield ndb.put_multi_async(new_entries_by_guid.values())

    raise ndb.Return((new_guids, old_guids))
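# Illustrative sketch only (not from the original module): one way the tasklet
# above might be driven from synchronous code. `Entry` is a hypothetical
# stand-in for whatever model class defines process_parsed_feed as an
# ndb.tasklet classmethod; `feed_xml` is raw feed bytes fetched elsewhere.
def example_process_feed(feed, feed_xml):
    import feedparser

    parsed_feed = feedparser.parse(feed_xml)
    future = Entry.process_parsed_feed(parsed_feed, feed, overflow=False)
    new_guids, old_guids = future.get_result()
    logger.info('Created %s new entries, %s already existed',
                len(new_guids), len(old_guids))
    return new_guids, old_guids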
def process_inbound_feed(self, parsed_feed, overflow=False):
    # Drop invalid items, then drop anything we have already seen.
    entries = filter_entries(parsed_feed.entries)
    entries = yield self.filter_entries(entries)
    if not entries:
        logger.info('prospective: Feed has seen all entries, nothing new %s %s',
                    self.feed_url, self.key.urlsafe())
        raise ndb.Return(([], []))

    # Entries are newest first, so the first guid is the newest one we have seen.
    last_guid = guid_for_item(entries[0])
    logger.info('prospective: entries before rss_items %s', len(entries))
    rss_items = map(lambda x: RssItem.from_rss_item(x, self), entries)
    logger.info('prospective: Processing inbound prospective search %s %s %s',
                self.feed_url, len(rss_items), self.key.urlsafe())
    for item in rss_items:
        for topic in self.topics:
            logger.info('prospective: matching %s %s', item, topic)
            result = prospective_search.match(
                item, topic, result_relative_url='/api/backend/queries/matched')
            logger.info('prospective: match returned %s', result)

    self.last_guid = last_guid
    yield self.put_async()

    raise ndb.Return(([], []))
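# Illustrative sketch only: prospective_search.match() above does not hand the
# matched subscriptions back directly; the service POSTs matches to
# result_relative_url ('/api/backend/queries/matched'). A minimal handler for
# that callback could look like this. The class name and webapp2 glue are
# assumptions; prospective_search.get_document() is the real decoding call.
import webapp2


class MatchedQueryHandler(webapp2.RequestHandler):
    def post(self):
        # Decode the matched RssItem document out of the callback POST body.
        matched_item = prospective_search.get_document(self.request)
        logger.info('prospective: matched item %s', matched_item)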
def filter_entries(self, items):
    # First time we see this feed: record the newest guid and bail out so we
    # don't flood subscribers with the entire backlog.
    if not self.last_guid:
        self.last_guid = guid_for_item(items[0])
        yield self.put_async()
        logger.info('prospective: Feed is brand new, recording latest guid and exiting %s %s',
                    self.feed_url, self.key.urlsafe())
        raise ndb.Return([])

    entries = []
    seen_guid = False
    # Iterate old to new; keep only items that come after the last seen guid.
    for entry in reversed(items):
        if seen_guid:
            entries.append(entry)
            logger.info('prospective: %s', guid_for_item(entry))

        if self.last_guid == guid_for_item(entry):
            seen_guid = True

    # Process newest to oldest.
    raise ndb.Return(list(reversed(entries)))
def prepare_entry_from_item_local(feed, item, published, added, overflow,
                                  overflow_reason, remote_fetch):
    title = prepare_title_from_item(item)
    link = iri_to_uri(get_link_for_item(feed, item))
    # We can only store a title up to 500 chars
    title = title[0:499]
    guid = guid_for_item(item)
    if not guid:
        logger.warn("Item found without guid, skipping item: %s", item)
        return

    if len(guid) > 500:
        logger.warn('Found a guid > 500 chars guid: %s item: %s', guid, item)
        return

    if not link:
        logger.warn("Item found without link, skipping item: %s", item)
        return

    if len(link) > 500:
        logger.warn('Found a link > 500 chars link: %s item: %s', link, item)
        return

    summary = item.get('summary', '')
    kwargs = dict(guid=guid, title=title, summary=summary, link=link,
                  published=published, overflow=overflow, overflow_reason=overflow_reason)

    if feed:
        kwargs['parent'] = feed.key
        if feed.language:
            kwargs['language'] = feed.language

    if 'tags' in item:
        kwargs['tags'] = filter(None, [x['term'] for x in item.tags])

    if 'author' in item and item.author:
        kwargs['author'] = item.author

    thumbnail = None
    try:
        # Honor the feed's blacklist of image-finding strategies.
        if 'rss' not in feed.image_strategy_blacklist:
            thumbnail = yield find_image_in_rss_item(item, remote_fetch)
        if not thumbnail and 'content' not in feed.image_strategy_blacklist:
            thumbnail = yield find_image_in_html(summary, remote_fetch)
    except Exception as e:
        logger.info("Exception while trying to find thumbnail %s", e)
        logger.exception(e)
def prepare_entry_from_item(rss_feed, item, feed, overflow=False, overflow_reason=None,
                            published=False):
    title_detail = item.get('title_detail')
    title = item.get('title', 'No Title')
    # If the title is HTML we need to decode it to some kind of usable text,
    # and we definitely need to decode any entities.
    if title_detail:
        if title_detail['type'] == u'text/html':
            title = BeautifulSoup(title).text

    link = iri_to_uri(get_link_for_item(feed, item))
    # We can only store a title up to 500 chars
    title = title[0:499]
    guid = guid_for_item(item)
    if not guid:
        logger.warn("Item found without guid, skipping item: %s", item)
        return

    if len(guid) > 500:
        logger.warn('Found a guid > 500 chars guid: %s item: %s', guid, item)
        return

    if not link:
        logger.warn("Item found without link, skipping item: %s", item)
        return

    if len(link) > 500:
        logger.warn('Found a link > 500 chars link: %s item: %s', link, item)
        return

    summary = item.get('summary', '')
    kwargs = dict(guid=guid, title=title, summary=summary, link=link,
                  published=published, overflow=overflow, overflow_reason=overflow_reason)
    if feed:
        kwargs['parent'] = feed.key

    # Pull page-level metadata (meta tags, etc.) for the link.
    page_data = yield get_meta_data_for_url(link)
    kwargs.update(page_data)

    thumbnail = None
    try:
        thumbnail = yield find_thumbnail(item, kwargs.get('meta_tags', {}),
                                         feed.image_strategy_blacklist)
        if thumbnail:
            kwargs.update(thumbnail)
    except Exception as e:
        logger.info("Exception while trying to find thumbnail %s", e)
        logger.exception(e)

    # Assumed final step: hand the prepared kwargs back to the calling tasklet.
    raise ndb.Return(kwargs)
def get_entries_by_guid(parsed_feed):
    return OrderedDict((guid_for_item(x), x)
                       for x in filter_entries(parsed_feed.entries))
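# Illustrative usage only: the helper above keys entries by guid while keeping
# feedparser's original entry order. The feed URL below is a placeholder.
def example_print_guids():
    import feedparser

    parsed_feed = feedparser.parse('http://example.com/feed.xml')
    for guid, item in get_entries_by_guid(parsed_feed).items():
        print guid, item.get('title', 'No Title')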