def recommend_k_items(
    self, test, top_k=10, sort_top_k=True, remove_seen=False, normalize=False
):
    """Recommend top K items for all users which are in the test set.

    Args:
        test (pd.DataFrame): users to test
        top_k (int): number of top items to recommend
        sort_top_k (bool): flag to sort top k results
        remove_seen (bool): flag to remove items seen in training from recommendation
        normalize (bool): flag passed through to ``score`` to normalize predictions
            before ranking

    Returns:
        pd.DataFrame: top k recommendation items for each user
    """
    test_scores = self.score(test, remove_seen=remove_seen, normalize=normalize)

    top_items, top_scores = get_top_k_scored_items(
        scores=test_scores, top_k=top_k, sort_top_k=sort_top_k
    )

    df = pd.DataFrame(
        {
            self.col_user: np.repeat(
                test[self.col_user].drop_duplicates().values, top_items.shape[1]
            ),
            self.col_item: [self.index2item[item] for item in top_items.flatten()],
            self.col_prediction: top_scores.flatten(),
        }
    )

    # drop invalid items: rows scored -inf (e.g. items masked out by score())
    return df.replace(-np.inf, np.nan).dropna()
def get_popularity_based_topk(self, top_k=10, sort_top_k=False):
    """Get top K most frequently occurring items across all users.

    Args:
        top_k (int): number of top items to recommend
        sort_top_k (bool): flag to sort top k results

    Returns:
        pd.DataFrame: top k most popular items
    """
    # Treat global item frequencies as the score row of a single pseudo-user.
    test_scores = np.array([self.item_frequencies])

    logger.info('Getting top K')
    top_items, top_scores = get_top_k_scored_items(
        scores=test_scores, top_k=top_k, sort_top_k=sort_top_k
    )

    return pd.DataFrame(
        {
            self.col_item: [self.index2item[i] for i in top_items.flatten()],
            self.col_prediction: top_scores.flatten(),
        }
    )
def test_get_top_k_scored_items(scores):
    # Ask for the 3 best items per row, sorted by descending score.
    top_items, top_scores = get_top_k_scored_items(
        scores=scores, top_k=3, sort_top_k=True
    )

    expected_items = np.array([[4, 3, 2], [0, 1, 2], [1, 3, 2]])
    expected_scores = np.array([[5, 4, 3], [5, 4, 3], [5, 4, 3]])
    assert np.array_equal(top_items, expected_items)
    assert np.array_equal(top_scores, expected_scores)
def recommend_k_items(self, test, top_k=10, sort_top_k=True, remove_seen=True, use_id=False):
    """Recommend top K items for all users in the test set.

    Args:
        test (pd.DataFrame): Test data.
        top_k (int): Number of top items to recommend.
        sort_top_k (bool): Flag to sort top k results.
        remove_seen (bool): Flag to remove items seen in training from recommendation.
        use_id (bool): If True, ``test`` already holds internal ids and no mapping
            through ``data.user2id`` / ``data.id2item`` is applied.

    Returns:
        pd.DataFrame: Top k recommendation items for each user.
    """
    data = self.data
    if use_id:
        user_ids = np.array(test[data.col_user].unique())
    else:
        # Map raw user ids to the internal indices expected by score().
        user_ids = np.array([data.user2id[x] for x in test[data.col_user].unique()])

    test_scores = self.score(user_ids, remove_seen=remove_seen)

    top_items, top_scores = get_top_k_scored_items(
        scores=test_scores, top_k=top_k, sort_top_k=sort_top_k
    )

    df = pd.DataFrame(
        {
            data.col_user: np.repeat(
                test[data.col_user].drop_duplicates().values, top_items.shape[1]
            ),
            data.col_item: top_items.flatten()
            if use_id
            else [data.id2item[item] for item in top_items.flatten()],
            data.col_prediction: top_scores.flatten(),
        }
    )

    # drop invalid items: rows scored -inf (e.g. seen items masked by score())
    return df.replace(-np.inf, np.nan).dropna()
def get_item_based_topk(self, items, top_k=10, sort_top_k=True):
    """Get top K similar items to provided seed items based on similarity metric defined.

    This method takes a set of items and uses the similarity matrix fit during
    training to recommend the items most similar to that set. This allows
    recommendations for cold-users (unseen during training); note - the model
    is not updated.

    The following options are possible based on information provided in the
    items input:
    1. Single user or seed of items: only item column (ratings are assumed to be 1)
    2. Single user or seed of items w/ ratings: item column and rating column
    3. Separate users or seeds of items: item and user column (user ids are only
       used to separate item sets)
    4. Separate users or seeds of items with ratings: item, user and rating
       columns provided

    Args:
        items (pd.DataFrame): DataFrame with item, user (optional), and rating
            (optional) columns
        top_k (int): number of top items to recommend
        sort_top_k (bool): flag to sort top k results

    Returns:
        pd.DataFrame: sorted top k recommendation items
    """
    # convert item ids to indices
    item_ids = items[self.col_item].map(self.item2index)

    # if no ratings were provided assume they are all 1
    ratings = (
        items[self.col_rating]
        if self.col_rating in items.columns
        else pd.Series(np.ones_like(item_ids))
    )

    # create local map of user ids
    if self.col_user in items.columns:
        test_users = items[self.col_user]
        user2index = {u: i for i, u in enumerate(items[self.col_user].unique())}
        user_ids = test_users.map(user2index)
    else:
        # if no user column exists assume all entries are for a single user
        test_users = pd.Series(np.zeros_like(item_ids))
        user_ids = test_users

    n_users = user_ids.drop_duplicates().shape[0]

    # generate pseudo user affinity using seed items
    pseudo_affinity = sparse.coo_matrix(
        (ratings, (user_ids, item_ids)), shape=(n_users, self.n_items)
    ).tocsr()

    # calculate raw scores with a matrix multiplication
    test_scores = pseudo_affinity.dot(self.item_similarity)

    # remove items in the seed set so recommended items are novel
    test_scores[user_ids, item_ids] = -np.inf

    top_items, top_scores = get_top_k_scored_items(
        scores=test_scores, top_k=top_k, sort_top_k=sort_top_k
    )

    df = pd.DataFrame(
        {
            self.col_user: np.repeat(
                test_users.drop_duplicates().values, top_items.shape[1]
            ),
            self.col_item: [self.index2item[i] for i in top_items.flatten()],
            self.col_prediction: top_scores.flatten(),
        }
    )

    # drop invalid items
    return df.replace(-np.inf, np.nan).dropna()
def test_get_top_k_scored_items():
    scores = np.array([[1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [1, 5, 3, 4, 2]])

    # Top-3 per row, sorted by descending score.
    top_items, top_scores = get_top_k_scored_items(
        scores=scores, top_k=3, sort_top_k=True
    )

    expected_items = [[4, 3, 2], [0, 1, 2], [1, 3, 2]]
    expected_scores = [[5, 4, 3], [5, 4, 3], [5, 4, 3]]
    assert np.array_equal(top_items, np.array(expected_items))
    assert np.array_equal(top_scores, np.array(expected_scores))