Exemplo n.º 1
0
    def process_parsed_feed(cls, parsed_feed, feed, overflow, overflow_reason=OVERFLOW_REASON.BACKLOG):
        """Persist any not-yet-seen entries of *parsed_feed* under *feed*.

        NDB tasklet-style generator (presumably decorated with @ndb.tasklet
        outside this view): yields async datastore futures and finishes with
        ``raise ndb.Return((new_guids, old_guids))``.

        Args:
            parsed_feed: feedparser-style result whose ``.entries`` are keyed
                by ``guid_for_item``.
            feed: parent feed entity; its key parents every entry key.
            overflow: flag copied onto each prepared entry; also reused as the
                ``published`` value below.
            overflow_reason: why entries are marked overflowed
                (defaults to BACKLOG).
        """
        # One datastore key per item, parented under the feed so entries
        # live in the feed's entity group.
        keys_by_guid = {guid_for_item(item): ndb.Key(cls, guid_for_item(item), parent=feed.key) for item in parsed_feed.entries}
        entries = yield ndb.get_multi_async(keys_by_guid.values())
        # get_multi_async returns None for missing keys; survivors are "old".
        old_guids = [x.key.id() for x in entries if x]
        new_guids = filter(lambda x: x not in old_guids, keys_by_guid.keys())
        # Phase 1: write stub entities with creating=True so the guid is
        # claimed before the (slow) content preparation below runs.
        new_entries_by_guid = {x: cls(key=keys_by_guid.get(x), guid=x, creating=True) for x in new_guids}
        new_entries = yield ndb.put_multi_async(new_entries_by_guid.values())

        # NOTE(review): `published` is aliased to `overflow` here — looks
        # deliberate (overflowed items count as already published), confirm.
        published = overflow
        futures = []
        for item in parsed_feed.entries:
            # Only items that got a stub above are prepared; existing
            # entries are left untouched.
            entry = new_entries_by_guid.get(guid_for_item(item))
            if not entry:
                continue

            futures.append((entry, prepare_entry_from_item(parsed_feed, item, feed, overflow, overflow_reason, published)))

        for entry, future in futures:
            entry_kwargs = yield future
            # prepare_entry_from_item returns falsy for unusable items
            # (missing/oversized guid or link); leave the stub as-is.
            if not entry_kwargs:
                continue

            # 'parent' is already baked into the stub's key; it is not a
            # model property, so strip it before populate().
            entry_kwargs.pop('parent')
            entry_kwargs['creating'] = False
            entry.populate(**entry_kwargs)

        # Phase 2: re-save all stubs, now populated with real content.
        saved_entries = yield ndb.put_multi_async(new_entries_by_guid.values())

        raise ndb.Return((new_guids, old_guids))
Exemplo n.º 2
0
    def process_inbound_feed(self, parsed_feed, overflow=False):
        """Run fresh feed entries through prospective search.

        Filters the parsed entries down to those newer than the last seen
        guid, matches each against every registered topic, then records the
        newest guid so the same entries are not reprocessed.
        """
        candidates = filter_entries(parsed_feed.entries)
        candidates = yield self.filter_entries(candidates)

        if not candidates:
            logger.info(
                'prospective: Feed has seen all entries nothing new %s %s',
                self.feed_url, self.key.urlsafe())
            raise ndb.Return(([], []))

        # Entries arrive newest-first; remember the newest for next time.
        newest_guid = guid_for_item(candidates[0])
        logger.info('prospective: entries before rss_items %s', len(candidates))
        rss_items = [RssItem.from_rss_item(entry, self) for entry in candidates]
        logger.info(
            'prospective: Processing inbound prospective search %s %s %s' %
            (self.feed_url, len(rss_items), self.key.urlsafe()))

        # Every item is matched against every subscribed topic.
        for rss_item in rss_items:
            for topic in self.topics:
                logger.info('prospective: matching %s %s' % (rss_item, topic))
                match_result = prospective_search.match(
                    rss_item,
                    topic,
                    result_relative_url='/api/backend/queries/matched')
                logger.info('What do we get back %s', match_result)

        self.last_guid = newest_guid
        yield self.put_async()
        raise ndb.Return(([], []))
Exemplo n.º 3
0
    def filter_entries(self, items):
        """Return the entries in *items* that arrived after ``self.last_guid``.

        *items* is expected newest-first; the result is also newest-first.
        On a brand-new feed (no ``last_guid``) the newest guid is recorded
        and nothing is returned, so initial ingest does not flood matching.
        """
        # Guard: an empty batch would otherwise raise IndexError on
        # items[0] below when last_guid is still unset.
        if not items:
            raise ndb.Return([])

        if not self.last_guid:
            self.last_guid = guid_for_item(items[0])
            yield self.put_async()
            logger.info('prospective: Feed is brand new recording latest guid and exiting %s %s', self.feed_url, self.key.urlsafe())
            raise ndb.Return([])

        entries = []
        seen_guid = False
        # iterate old to new; collect only items strictly after last_guid.
        for entry in reversed(items):
            if seen_guid:
                entries += [entry]
            logger.info('prospective: %s', guid_for_item(entry))
            if self.last_guid == guid_for_item(entry):
                seen_guid = True

        # NOTE(review): if last_guid no longer appears in the feed (guids
        # rotated out), seen_guid stays False and every entry is dropped —
        # confirm that is intended.
        # Process newest to oldest
        raise ndb.Return(list(reversed(entries)))
Exemplo n.º 4
0
    def filter_entries(self, items):
        """Yield-based tasklet returning entries newer than ``self.last_guid``.

        *items* is newest-first and so is the returned list. A feed without a
        recorded ``last_guid`` is treated as brand new: record the newest guid,
        persist, and return nothing.
        """
        # Fix: bail out on an empty batch; the original indexed items[0]
        # unconditionally in the brand-new-feed branch and would IndexError.
        if not items:
            raise ndb.Return([])

        if not self.last_guid:
            self.last_guid = guid_for_item(items[0])
            yield self.put_async()
            logger.info(
                'prospective: Feed is brand new recording latest guid and exiting %s %s',
                self.feed_url, self.key.urlsafe())
            raise ndb.Return([])

        entries = []
        seen_guid = False
        # iterate old to new
        for entry in reversed(items):
            if seen_guid:
                entries += [entry]
            logger.info('prospective: %s', guid_for_item(entry))
            if self.last_guid == guid_for_item(entry):
                seen_guid = True

        # Process newest to oldest
        raise ndb.Return(list(reversed(entries)))
Exemplo n.º 5
0
def prepare_entry_from_item_local(feed, item, published, added, overflow, overflow_reason, remote_fetch):
    """Build the kwargs for an entry entity from a single parsed feed item.

    NDB tasklet-style generator (yields image-lookup futures). Returns None
    (implicitly) for items with a missing or oversized guid/link.

    NOTE(review): this snippet ends right after the thumbnail lookup —
    `kwargs` and `thumbnail` are built but never returned here, which
    suggests the original function continues past this excerpt; confirm
    against the full source before editing.
    """

    title = prepare_title_from_item(item)
    link = iri_to_uri(get_link_for_item(feed, item))

    # We can only store a title up to 500 chars
    # NOTE(review): [0:499] keeps 499 chars, not 500 — off-by-one vs. the
    # comment above; confirm which limit the datastore property enforces.
    title = title[0:499]
    guid = guid_for_item(item)
    if len(guid) > 500:
        logger.warn('Found a guid > 500 chars link: %s item: %s', guid, item)
        return

    if not link:
        logger.warn("Item found without link skipping item: %s", item)
        return

    if len(link) > 500:
        logger.warn('Found a link > 500 chars link: %s item: %s', link, item)
        return

    if not guid:
        logger.warn("Item found without guid skipping item: %s", item)
        return

    summary = item.get('summary', '')
    kwargs = dict(guid=guid, title=title, summary=summary, link=link,
                  published=published, overflow=overflow, overflow_reason=overflow_reason)

    if feed:
        kwargs['parent'] = feed.key

    # NOTE(review): feed.language is read without the `if feed` guard used
    # just above — crashes if feed is None; confirm feed is always present.
    if feed.language:
        kwargs['language'] = feed.language

    if 'tags' in item:
        # Drop empty term strings.
        kwargs['tags'] = filter(None, [x['term'] for x in item.tags])

    if 'author' in item and item.author:
        kwargs['author'] = item.author

    thumbnail = None
    try:
        # Image lookups are best-effort: each strategy can be disabled per
        # feed via image_strategy_blacklist, and failures are only logged.
        if 'rss' not in feed.image_strategy_blacklist:
            thumbnail = yield find_image_in_rss_item(item, remote_fetch)

        if not thumbnail and 'content' not in feed.image_strategy_blacklist:
            thumbnail = yield find_image_in_html(summary, remote_fetch)

    except Exception, e:
        logger.info("Exception while trying to find thumbnail %s", e)
        logger.exception(e)
Exemplo n.º 6
0
def prepare_entry_from_item(rss_feed, item, feed, overflow=False, overflow_reason=None, published=False):
    """Assemble entry kwargs (title, link, guid, page metadata, thumbnail)
    from one parsed feed item.

    NDB tasklet-style generator: yields futures for page metadata and
    thumbnail discovery. Returns None (implicitly) for items with a missing
    or oversized guid/link.

    NOTE(review): the excerpt ends right after the thumbnail handling with
    no `ndb.Return(kwargs)` visible — the original function likely continues
    past this snippet; verify before modifying.
    """
    title_detail = item.get('title_detail')
    title = item.get('title', 'No Title')

    # If the title is HTML then we need to decode it to some kind of usable text
    # Definitely need to decode any entities
    if title_detail:
        if title_detail['type'] == u'text/html':
            title = BeautifulSoup(title).text

    link = iri_to_uri(get_link_for_item(feed, item))

    # We can only store a title up to 500 chars
    # NOTE(review): [0:499] actually keeps only 499 chars — off-by-one
    # relative to the comment; confirm the intended limit.
    title = title[0:499]
    guid = guid_for_item(item)
    if len(guid) > 500:
        logger.warn('Found a guid > 500 chars link: %s item: %s', guid, item)
        return

    if not link:
        logger.warn("Item found without link skipping item: %s", item)
        return

    if len(link) > 500:
        logger.warn('Found a link > 500 chars link: %s item: %s', link, item)
        return

    if not guid:
        logger.warn("Item found without guid skipping item: %s", item)
        return

    summary = item.get('summary', '')
    kwargs = dict(guid=guid, title=title, summary=summary, link=link,
                  published=published, overflow=overflow, overflow_reason=overflow_reason)

    if feed:
        kwargs['parent'] = feed.key

    # Scraped page metadata (e.g. meta tags) is merged over the base kwargs.
    page_data = yield get_meta_data_for_url(link)
    kwargs.update(page_data)

    thumbnail = None
    try:
        # Best-effort thumbnail discovery; failures are logged and ignored.
        thumbnail = yield find_thumbnail(item, kwargs.get('meta_tags', {}), feed.image_strategy_blacklist)
        if thumbnail:
            kwargs.update(thumbnail)
    except Exception, e:
        logger.info("Exception while trying to find thumbnail %s", e)
        logger.exception(e)
Exemplo n.º 7
0
    def process_inbound_feed(self, parsed_feed, overflow=False):
        """Match unseen feed entries against this feed's prospective-search
        topics, then persist the newest guid so they are processed once."""
        fresh = filter_entries(parsed_feed.entries)
        fresh = yield self.filter_entries(fresh)

        # Everything already seen: nothing to match.
        if not fresh:
            logger.info('prospective: Feed has seen all entries nothing new %s %s', self.feed_url, self.key.urlsafe())
            raise ndb.Return(([], []))

        # fresh is newest-first; index 0 is the guid to remember.
        newest = guid_for_item(fresh[0])
        logger.info('prospective: entries before rss_items %s', len(fresh))
        items = [RssItem.from_rss_item(e, self) for e in fresh]
        logger.info('prospective: Processing inbound prospective search %s %s %s' % (self.feed_url, len(items), self.key.urlsafe()))

        for rss_item in items:
            for topic in self.topics:
                logger.info('prospective: matching %s %s' % (rss_item, topic))
                match_result = prospective_search.match(rss_item, topic, result_relative_url='/api/backend/queries/matched')
                logger.info('What do we get back %s', match_result)

        self.last_guid = newest
        yield self.put_async()
        raise ndb.Return(([], []))
Exemplo n.º 8
0
def get_entries_by_guid(parsed_feed):
    """Map guid -> entry for the filtered feed entries, preserving order."""
    pairs = []
    for entry in filter_entries(parsed_feed.entries):
        pairs.append((guid_for_item(entry), entry))
    return OrderedDict(pairs)
Exemplo n.º 9
0
def get_entries_by_guid(parsed_feed):
    """Return an OrderedDict of guid -> entry for the filtered entries,
    keeping the feed's original ordering."""
    by_guid = OrderedDict()
    for entry in filter_entries(parsed_feed.entries):
        by_guid[guid_for_item(entry)] = entry
    return by_guid