def SetPageCategory(user, url, category_id, retries_left=10):
  """Moves the user's rating of a page into another category.

  If the PageRating entity cannot be loaded the call re-schedules itself via
  deferred.defer with one retry fewer; once retries_left reaches zero it gives
  up without raising. When the stored category already equals the requested
  one nothing is written. Otherwise the rating is saved and RatingAddedImpl is
  deferred with a countdown of DELAY_BEFORE_UPDATING_CONNECTIONS.

  Args:
    user: The user who owns the rating. Must be accepted by models.UserKey and
      provide user_id().
    url: The url of the rated page; it is the id of the PageRating key.
    category_id: The id of the new category, or None for no category.
    retries_left: How many more times to re-schedule when the PageRating is
      missing.
  """
  rating_key = ndb.Key(models.PageRating, url, parent=models.UserKey(user))
  page_rating = rating_key.get()
  if page_rating is None:
    # The PageRating may simply not be readable yet, so try again a few times
    # before giving up.
    if retries_left > 0:
      deferred.defer(
          SetPageCategory,
          user,
          url,
          category_id,
          retries_left - 1,
          _queue='default')
    return

  new_category = (None if category_id is None else models.CategoryKey(
      category_id, user))
  if page_rating.category == new_category:
    return

  page_rating.category = new_category
  page_rating.put()

  serialized_source = models.SerializeSource(
      models.Source(models.SOURCE_TYPE_USER, user.user_id(), category_id))
  deferred.defer(
      RatingAddedImpl,
      serialized_source,
      url,
      page_rating.rating,
      _countdown=DELAY_BEFORE_UPDATING_CONNECTIONS.total_seconds())
def SavePastRecommendations(user_id, time_period, recommendations):
  """Stores recommendations as uncommitted PastRecommendation entities.

  One entity is written per recommendation, keyed under the user by
  "<numeric time period>:<destination url>", and a deferred
  _CommitPastRecommendations task is scheduled with a countdown of
  TIME_TO_COMMIT_PAST_RECOMMENDATIONS.

  Args:
    user_id: The id of the user the recommendations were made for.
    time_period: The time period name; resolved through time_periods.Get.
    recommendations: The recommendations to save, in page order. Each must
      provide destination_url, item_id, weight and Serialize().
  """
  time_period_numeric = time_periods.Get(time_period)['numeric']
  user_key = models.UserKey(user_id)
  save_time = datetime.now()
  key_prefix = str(time_period_numeric) + ':'

  entities = []
  for index, recommendation in enumerate(recommendations):
    url = recommendation.destination_url
    entity_key = ndb.Key(
        models.PastRecommendation, key_prefix + url, parent=user_key)
    entities.append(
        models.PastRecommendation(
            key=entity_key,
            user_id=user_id,
            # Fall back to deriving the item id from the url when the
            # recommendation does not carry one.
            item_id=recommendation.item_id or items.UrlToItemId(url),
            url=url,
            weight=recommendation.weight,
            time_period_numeric=time_period_numeric,
            serialized_recommendation=recommendation.Serialize(),
            committed=False,
            date=save_time,
            index_within_page=index))
  ndb.put_multi(entities)

  # The newly saved recommendations get committed later by a deferred task
  # (the original comment says: after 30 minutes of inactivity).
  deferred.defer(
      _CommitPastRecommendations,
      user_id,
      time_period_numeric,
      save_time,
      _countdown=TIME_TO_COMMIT_PAST_RECOMMENDATIONS.total_seconds())
def AddRating(user, url, rating, source, category_id):
  """Records a user's rating of a page and registers any related feed.

  Args:
    user: The user adding the rating; must be accepted by models.UserKey.
    url: The url of the rated page.
    rating: The rating value.
    source: Passed through to models.AddRating.
    category_id: The id of the category the rating is filed under.

  Returns:
    The stats object returned by models.AddRating. Its 'own_feed' entry is set
    to the feed url when the page belongs to a feed or is itself a feed (the
    page's own canonical url wins when both apply).
  """
  user_id = models.UserKey(user).id()
  stats = models.AddRating(user_id, url, rating, source, category_id)
  page_info = models.GetPageInfo(url)

  # Collect (feed url, feed title) pairs to register, in registration order.
  feeds_to_register = []
  if page_info.feed_url:
    # The page belongs to a feed.
    feeds_to_register.append((page_info.feed_url, page_info.feed_title))
  if page_info.is_feed:
    # The page itself is a feed url.
    feeds_to_register.append((page_info.canonical_url, page_info.title))

  for feed_url, feed_title in feeds_to_register:
    UpdateFeed(feeds.AddFeed(feed_url, feed_title))
    stats['own_feed'] = feed_url

  user_source = models.Source(models.SOURCE_TYPE_USER, user_id, category_id)
  RatingAdded(user_source, url, rating)
  return stats
def RatingAddedImpl(source, url, rating):
  """Updates connection state after a rating has been added.

  Args:
    source: A serialized source, as accepted by models.DeserializeSource.
    url: The url of the rated page.
    rating: The rating value.
  """
  parsed_source = models.DeserializeSource(source)

  if parsed_source.source_type == models.SOURCE_TYPE_USER:
    deferred.defer(UpdatePopularPage, url, _queue='default')

    # If the user has since removed their vote or moved it to another category
    # we must not update the connection state here. For a changed category the
    # deferred task scheduled from SetPageCategory updates the connections.
    rating_key = ndb.Key(
        models.PageRating, url, parent=models.UserKey(parsed_source.source_id))
    page_rating = rating_key.get()
    if not page_rating or page_rating.category != parsed_source.CategoryKey():
      return

  trainer = connection_trainer.CreateTrainer()
  trainer.RecommendationAdded(parsed_source, url, rating)
  models.UpdateCachedConnectionInfo(parsed_source.source_id)
def _DeleteCategory(user_id):
  """Hands a batch of the user's Category keys to _DeleteAll.

  At most 500 keys are fetched per call. This function and user_id are passed
  along to _DeleteAll, presumably so it can be re-run for remaining entities
  (confirm against _DeleteAll).

  Args:
    user_id: The id of the user whose categories are being deleted.
  """
  user_key = models.UserKey(user_id)
  category_keys = models.Category.query(ancestor=user_key).fetch(
      keys_only=True, limit=500)
  _DeleteAll(category_keys, _DeleteCategory, user_id)
def _DeletePageRating(user_id):
  """Hands a batch of the user's PageRating keys to _DeleteAll.

  At most 500 keys are fetched per call. This function and user_id are passed
  along to _DeleteAll, presumably so it can be re-run for remaining entities
  (confirm against _DeleteAll).

  Args:
    user_id: The id of the user whose page ratings are being deleted.
  """
  user_key = models.UserKey(user_id)
  rating_keys = models.PageRating.query(ancestor=user_key).fetch(
      keys_only=True, limit=500)
  _DeleteAll(rating_keys, _DeletePageRating, user_id)
def _DeleteUser(user_id):
  """Deletes the entity stored under the key built by models.UserKey.

  Args:
    user_id: The id of the user to delete.
  """
  user_key = models.UserKey(user_id)
  user_key.delete()
def _GetRecommendedFeedItems(user, since_time, category_id, any_category,
                             connection_version, connection_active_days,
                             external_connections):
  """Collects recent items from the feeds the user is connected to.

  Written in the ndb tasklet style: it yields futures and delivers its result
  by raising ndb.Return.

  Args:
    user: The user we are finding feed items for.
    since_time: Connections last updated before this time are skipped, and it
      is passed to feeds.GetBulkItemIdsAsync.
    category_id: The subscriber category to restrict connections to. Only used
      when any_category is false.
    any_category: Whether connections from all subscriber categories count.
    connection_version: The version of connections to query.
    connection_active_days: If truthy, only connections with this active_days
      value are queried.
    external_connections: If truthy, the feed connections among these are used
      and Datastore is not queried for connections.

  Returns:
    Via ndb.Return, a (feed_items, feed_url_to_connection) tuple. feed_items
    is the concatenation of the item lists returned by
    feeds.GetBulkItemIdsAsync; feed_url_to_connection maps a feed url to the
    connection dict built for it here.
  """
  subscriber_id = models.UserKey(user).id()
  if any_category:
    subscriber_category = None
  else:
    subscriber_category = models.CategoryKey(category_id, user)

  def FetchConnectionsAsync(connection_version=connection_version,
                            subscriber_id=subscriber_id,
                            count=400,
                            any_category=any_category,
                            negative=False):
    """Returns a future for this subscriber's feed connections."""
    if external_connections:
      # Nothing to fetch: the caller substitutes external_connections.
      empty_future = ndb.Future()
      empty_future.set_result([])
      return empty_future

    connection_model = models.Connection
    projection = ['publisher_id', 'weight', 'updated_datetime']
    query = connection_model.query(
        connection_model.version == connection_version,
        connection_model.publisher_type == models.SOURCE_TYPE_FEED,
        connection_model.subscriber_id == subscriber_id)
    if negative:
      query = query.filter(connection_model.weight < 0).order(
          connection_model.weight)
    else:
      query = query.filter(connection_model.weight > 0).order(
          -connection_model.weight)

    # A property that has an equality filter cannot also be projected:
    # BadRequestError: Cannot use projection on a property with an equality
    # filter.
    # So the subscriber category is either projected or filtered on, never
    # both.
    if any_category:
      projection.append('subscriber_category')
    else:
      query = query.filter(
          connection_model.subscriber_category == subscriber_category)

    # Negative connections are not filtered by active_days because the
    # "active_days" field is not updated for negative connections.
    if connection_active_days and not negative:
      query = query.filter(
          connection_model.active_days == connection_active_days)
    return query.fetch_async(count, projection=projection)

  def ToDict(connection):
    """Converts a (possibly projected) connection into a plain dict."""
    if any_category:
      category = connection.subscriber_category
      key_category_id = connection.SubscriberCategoryId()
    else:
      category = subscriber_category
      key_category_id = models.GetCategoryId(subscriber_category)
    # connection.KeyComponents() is not used because a projected connection
    # lacks some of the fields that KeyComponents() accesses.
    key_components = models.ConnectionKeyComponents(
        models.SOURCE_TYPE_FEED, connection.publisher_id, None, subscriber_id,
        key_category_id, connection_version)
    return {
        'weight': connection.weight,
        'category': category,
        'updated_datetime': connection.updated_datetime,
        'publisher_id': connection.publisher_id,
        'key_components': key_components
    }

  fetched_connections = yield FetchConnectionsAsync()
  if external_connections:
    fetched_connections = [
        c for c in external_connections
        if c.publisher_type == models.SOURCE_TYPE_FEED
    ]
  connection_dicts = [ToDict(c) for c in fetched_connections]

  feed_url_to_connection = {}
  feed_urls = []
  for connection in connection_dicts:
    # The connection weight is updated each time new items are added to the
    # feed, so a feed whose connection was last updated before the time period
    # of interest cannot have items worth looking up.
    if connection['updated_datetime'] < since_time:
      continue
    feed_url = connection['publisher_id']
    existing = feed_url_to_connection.get(feed_url)
    if existing is None:
      feed_url_to_connection[feed_url] = connection
      feed_urls.append(feed_url)
    else:
      # Several connections to the same feed: accumulate their weights on the
      # first one seen.
      existing['weight'] += connection['weight']

  feed_url_to_items = yield feeds.GetBulkItemIdsAsync(feed_urls, since_time)
  feed_items = []
  for _, item_list in feed_url_to_items.iteritems():
    feed_items.extend(item_list)
  raise ndb.Return((feed_items, feed_url_to_connection))
def RecommendationsOnDemand(
    user,
    time_period,
    category_id,
    any_category,
    include_popular,
    limit,
    connection_version,
    decay_rate=1,
    source_type=SOURCE_TYPE_ANY,
    exclude_urls=frozenset(),
    save_past_recommendations=False,
    exclude_past_recommendations=False,
    exclude_past_recommendations_from_all_time_periods=False,
    external_connections=None,
    exclude_rated_items=True,
    diversify=False):
  """Calculates recommendations for a user on demand.

  The recommendations are calculated from raw user ratings, feed items and the
  top connections of this user to other users and other feeds.

  Args:
    user: The user we are making recommendations for.
    time_period: The time period that the user is interested in.
    category_id: The category that user wants recommendations for.
    any_category: Whether the user wants recommendations for all categories.
    include_popular: Whether to include recommendations from other users that
      this user is not connected to.
    limit: Pagination size.
    connection_version: The version of connections to use.
    decay_rate: How much to penalize older items from the same source.
    source_type: Whether to return recommendations only from feeds, users or
      both.
    exclude_urls: The set of items that should not be returned.
    save_past_recommendations: Whether to save the returned recommendations as
      past recommendations.
    exclude_past_recommendations: Whether to exclude past committed
      recommendations.
    exclude_past_recommendations_from_all_time_periods: Whether to exclude
      past recommendations that were shown for other time periods. Otherwise
      only excludes past recommendations that were shown for the same time
      period as time_period.
    external_connections: If not None (and not empty), these models.Connection
      objects will be used instead of getting them from Datastore for this
      user.
    exclude_rated_items: Whether to exclude items already rated by the user
      from the returned recommendations.
    diversify: If True, then recommendations from the same sources will be
      forced to be separated with recommendations from other sources.

  Returns:
    A list of recommendations.
  """
  exclude_item_ids = set(items.UrlsToItemIds(exclude_urls).values())
  subscriber_id = models.UserKey(user).id()
  now = datetime.now()
  since_time = _GetSinceTime(time_period, now)

  # Kick off all asynchronous lookups first so that they can overlap with the
  # synchronous ratings query below.
  past_recommendation_item_ids_future = None
  past_recommendation_item_ids = frozenset()
  if exclude_past_recommendations:
    past_recommendation_item_ids_future = (
        past_recommendations.GetPastRecommendationItemIdsAsync(
            subscriber_id,
            (None if exclude_past_recommendations_from_all_time_periods else
             time_period)))

  if exclude_rated_items:
    recent_rated_item_ids_future = models.GetRatedItemIdsAsync(subscriber_id)
  else:
    recent_rated_item_ids_future = ndb.Future()
    recent_rated_item_ids_future.set_result([])

  # Pick the first active_days bucket that covers the requested time span.
  connection_active_days = None
  for days in models.CONNECTION_ALL_ACTIVE_DAYS:
    if timedelta(days=days) >= now - since_time:
      connection_active_days = days
      break

  if source_type == SOURCE_TYPE_FEED or source_type == SOURCE_TYPE_ANY:
    feed_info_future = _GetRecommendedFeedItems(
        user, since_time, category_id, any_category, connection_version,
        connection_active_days, external_connections)
  else:
    # Same shape as the result of _GetRecommendedFeedItems:
    # (feed_items list, feed_url_to_connection dict).
    feed_info_future = ndb.Future()
    feed_info_future.set_result(([], {}))

  def GetConnections(connection_version=connection_version):
    if external_connections:
      promise = ndb.Future()
      promise.set_result([])
      return promise
    query = models.Connection.query(
        models.Connection.publisher_type == models.SOURCE_TYPE_USER,
        models.Connection.subscriber_id == subscriber_id,
        models.Connection.version == connection_version).order(
            -models.Connection.weight)
    if connection_active_days:
      query = query.filter(
          models.Connection.active_days == connection_active_days)
    if not any_category:
      subscriber_category = models.CategoryKey(category_id, user)
      query = query.filter(
          models.Connection.subscriber_category == subscriber_category)
    return query.fetch_async(100)

  connections_future = GetConnections()

  query = models.PageRating.query(
      projection=['item_id', 'user_id', 'rating', 'category', 'date'])
  if since_time != datetime.min:
    query = query.filter(models.PageRating.date > since_time)
  user_ratings = query.order(-models.PageRating.date).fetch(1000)

  if past_recommendation_item_ids_future:
    past_recommendation_item_ids = (
        past_recommendation_item_ids_future.get_result())

  # First pass over the ratings: find which (publisher, category) sources have
  # candidate ratings, and tag each candidate rating with its source.
  positive_sources = set()
  negative_sources = set()
  for r in user_ratings:
    if r.rating == 0:
      continue
    if r.item_id in past_recommendation_item_ids:
      continue
    if r.item_id in exclude_item_ids:
      continue
    publisher_id = r.key.parent().id()
    if publisher_id == subscriber_id:
      continue
    source = (publisher_id, models.GetCategoryId(r.category))
    if r.rating < 0:
      negative_sources.add(source)
    else:
      positive_sources.add(source)
    r.rating_source = source

  positive_source_to_connection = {}
  negative_source_to_connection = {}
  connections = connections_future.get_result()
  if external_connections:
    connections = [
        c for c in external_connections
        if c.publisher_type == models.SOURCE_TYPE_USER
    ]
  for connection in connections:
    source = (connection.publisher_id, connection.PublisherCategoryId())
    # The user may be subscribed to the same source from multiple collections.
    # We only count the strongest connection here.
    if (connection.positive and source in positive_sources and
        source not in positive_source_to_connection):
      positive_source_to_connection[source] = connection
    if (not connection.positive and source in negative_sources and
        source not in negative_source_to_connection):
      negative_source_to_connection[source] = connection

  # Maps (item_id, category_id) to a
  # (recommendation, top_sources, source_users, feed_connections) tuple.
  item_id_to_recommendation = {}
  recent_rated_item_ids = set(recent_rated_item_ids_future.get_result())
  nominal_weight = NOMINAL_USER_VOTE_WEIGHT if include_popular else 0
  # Keyed by (user_id, category_id).
  seen_items_from_user = {}
  # Second pass over the ratings: accumulate weights per recommended item.
  # Every rating that gets past the skips below was tagged with rating_source
  # in the first pass, because these skip conditions are a superset of those.
  for r in user_ratings:
    item_id = r.item_id
    assert item_id
    if r.rating == 0:
      continue
    if item_id in past_recommendation_item_ids:
      continue
    if item_id in exclude_item_ids:
      continue
    if item_id in recent_rated_item_ids:
      continue
    if r.key.parent().id() == subscriber_id:
      continue
    if r.rating > 0:
      connection = positive_source_to_connection.get(r.rating_source, None)
    else:
      connection = negative_source_to_connection.get(r.rating_source, None)
    # A separate local name is used so that the category_id parameter (which
    # the GetConnections closure reads) is never overwritten.
    if connection:
      recommendation_category_id = connection.SubscriberCategoryId()
      category = connection.subscriber_category
    else:
      recommendation_category_id = None
      category = None
    key = (item_id, recommendation_category_id)
    if key in item_id_to_recommendation:
      (recommendation, top_sources, source_users,
       feed_connections) = item_id_to_recommendation[key]
    else:
      recommendation = models.Recommendation(
          item_id=item_id,
          source_category=category,
          first_seen_datetime=r.date)
      top_sources = {}
      source_users = {}
      feed_connections = []
      item_id_to_recommendation[key] = (recommendation, top_sources,
                                        source_users, feed_connections)
    recommendation.first_seen_datetime = min(recommendation.first_seen_datetime,
                                             r.date)
    weight = nominal_weight
    if connection:
      connection_weight = connection.weight
      connection_top_sources = connection.top_sources
      publisher_id = connection.publisher_id
      if r.rating > 0 and connection_weight > 0:
        for source in connection_top_sources:
          if source.url not in top_sources:
            top_sources[source.url] = models.RecommendationSourcePage(
                source.url)
          top_sources[source.url].weight += connection_weight
          top_sources[source.url].user_count += 1
        # NOTE(review): the nesting of the next two statements could not be
        # determined from the whitespace-collapsed source; they are assumed to
        # belong to the positive-rating branch. Confirm against history.
        source_users[publisher_id] = connection_weight
        recommendation.connection_key_components.append(
            connection.KeyComponents())
      # Skip positive recommendations if the publisher has no positive
      # recommendations in common with the subscriber
      # (ie, num_shared_items == 0).
      if not (r.rating > 0 and connection.num_shared_items == 0):
        if decay_rate < 1:
          publisher_key = (publisher_id, connection.PublisherCategoryId())
          seen_items = seen_items_from_user.get(publisher_key, 0)
          seen_items_from_user[publisher_key] = seen_items + 1
          # We are processing user ratings in most-recent first order. The most
          # recent rated item gets the full weight of the connection, the older
          # ones get progressively smaller weight.
          # We apply the decay rate only to the earned connection weight and
          # leave the nominal weight alone. That way a new user will see purely
          # popularity based ranking where each rating has the same weight, no
          # matter who those ratings came from.
          connection_weight *= decay_rate ** seen_items
        weight += connection_weight
    if weight > 0:
      recommendation.weight += r.rating * weight
      if r.rating > 0:
        recommendation.user_count += 1

  (feed_items, feed_url_to_connection) = feed_info_future.get_result()
  seen_items_from_feed = {}
  if decay_rate < 1:
    # We need to sort so that the decay rate is applied from most recent items
    # to less recent items.
    feed_items.sort(key=lambda item: item.published_date, reverse=True)
  for item in feed_items:
    item_id = item.item_id
    if item_id in recent_rated_item_ids:
      continue
    if item_id in past_recommendation_item_ids:
      continue
    connection = feed_url_to_connection.get(item.feed_url, None)
    if not connection:
      continue
    seen_items = 0
    # We need to count urls in the exclude list before we ignore them.
    if decay_rate < 1:
      seen_items = seen_items_from_feed.get(item.feed_url, 0)
      seen_items_from_feed[item.feed_url] = seen_items + 1
    if item_id in exclude_item_ids:
      continue
    category = connection['category']
    key = (item_id, models.GetCategoryId(category))
    if key in item_id_to_recommendation:
      (recommendation, top_sources, source_users,
       feed_connections) = item_id_to_recommendation[key]
    else:
      recommendation = models.Recommendation(
          item_id=item_id,
          source_category=category,
          first_seen_datetime=item.published_date)
      top_sources = {}
      source_users = {}
      feed_connections = []
      item_id_to_recommendation[key] = (recommendation, top_sources,
                                        source_users, feed_connections)
    recommendation.first_seen_datetime = min(recommendation.first_seen_datetime,
                                             item.published_date)
    weight = connection['weight']
    if decay_rate < 1:
      weight = connection['weight'] * (decay_rate ** seen_items)
    feed_connections.append(connection)
    recommendation.weight += weight
    recommendation.connection_key_components.append(
        connection['key_components'])

  # Fill in the per-recommendation source summaries.
  for (recommendation, top_sources, source_users,
       feed_connections) in item_id_to_recommendation.itervalues():
    recommendation.source_count = len(top_sources)
    recommendation.top_sources = sorted(
        top_sources.values(), key=lambda v: v.weight,
        reverse=True)[:MAX_TOP_SOURCES]
    feed_connections = sorted(
        feed_connections, key=lambda c: c['weight'], reverse=True)
    unique_feed_urls = set(
        url_util.DeduplicateUrls([c['publisher_id'] for c in feed_connections]))
    recommendation.top_feed_urls = _GetTopFeedUrls(feed_connections,
                                                   unique_feed_urls)
    recommendation.feed_count = len(unique_feed_urls)

  result = [
      r for (r, _, _, _) in item_id_to_recommendation.values() if r.weight > 0
  ]
  result.sort(key=lambda v: (v.weight, v.first_seen_datetime), reverse=True)
  seen = set()
  seen_add = seen.add
  # Remove duplicate items that may have been recommended under different
  # categories. The list is already sorted, so the highest-weighted entry of
  # each item is the one kept (seen_add returns None, hence the "or" trick).
  result = [r for r in result if not (r.item_id in seen or seen_add(r.item_id))]
  if diversify:
    result = _DiversifyByKey(result, limit, lambda r: r.ConnectionsHash())
  result = result[:limit]
  # The recommendations only have item_id populated. We need to add
  # destination_url.
  item_id_to_url = items.ItemIdsToUrls([r.item_id for r in result])
  for r in result:
    r.destination_url = item_id_to_url.get(r.item_id, '#invalid_item')
  if save_past_recommendations:
    past_recommendations.SavePastRecommendations(subscriber_id, time_period,
                                                 result)
  return models.DecorateRecommendations(subscriber_id, result)