def recommend_k_items(
        self, test, top_k=10, sort_top_k=True, remove_seen=False, normalize=False
    ):
        """Recommend top K items for all users which are in the test set

        Args:
            test (pd.DataFrame): users to test
            top_k (int): number of top items to recommend
            sort_top_k (bool): flag to sort top k results
            remove_seen (bool): flag to remove items seen in training from recommendation

        Returns:
            pd.DataFrame: top k recommendation items for each user
        """

        test_scores = self.score(test, remove_seen=remove_seen, normalize=normalize)

        top_items, top_scores = get_top_k_scored_items(
            scores=test_scores, top_k=top_k, sort_top_k=sort_top_k
        )

        df = pd.DataFrame(
            {
                self.col_user: np.repeat(
                    test[self.col_user].drop_duplicates().values, top_items.shape[1]
                ),
                self.col_item: [self.index2item[item] for item in top_items.flatten()],
                self.col_prediction: top_scores.flatten(),
            }
        )

        # drop invalid items
        return df.replace(-np.inf, np.nan).dropna()
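
# A small self-contained sketch of the DataFrame assembly used above (illustrative
# names only, not part of the original code): the per-user top_items / top_scores
# arrays are flattened into a long (user, item, prediction) frame, and rows scored
# -inf (e.g. items masked by remove_seen) become NaN and are dropped.
import numpy as np
import pandas as pd

index2item = {0: "a", 1: "b", 2: "c"}
users = np.array([10, 20])                           # unique users, in score-row order
top_items = np.array([[2, 0], [1, 2]])               # top-2 item indices per user
top_scores = np.array([[0.9, 0.5], [-np.inf, 0.7]])  # -inf marks an already-seen item

df = pd.DataFrame(
    {
        "userID": np.repeat(users, top_items.shape[1]),
        "itemID": [index2item[i] for i in top_items.flatten()],
        "prediction": top_scores.flatten(),
    }
)
print(df.replace(-np.inf, np.nan).dropna())          # the seen item for user 20 is dropped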
    def get_popularity_based_topk(self, top_k=10, sort_top_k=False):
        """Get top K most frequently occurring items across all users

        Args:
            top_k (int): number of top items to recommend
            sort_top_k (bool): flag to sort top k results

        Returns:
            pd.DataFrame: top k most popular items
        """

        test_scores = np.array([self.item_frequencies])

        logger.info('Getting top K')
        top_items, top_scores = get_top_k_scored_items(
            scores=test_scores, top_k=top_k, sort_top_k=sort_top_k
        )

        return pd.DataFrame(
            {
                self.col_item: [
                    self.index2item[item] for item in top_items.flatten()
                ],
                self.col_prediction: top_scores.flatten(),
            }
        )
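
# A minimal sketch of the idea behind get_popularity_based_topk, assuming
# item_frequencies holds per-item occurrence counts from training (all values
# below are illustrative): the most popular items are simply the top_k largest counts.
import numpy as np

item_frequencies = np.array([3, 10, 1, 7])   # assumed counts for item indices 0..3
top_k = 2
top = np.argsort(-item_frequencies)[:top_k]  # indices of the most frequent items
print(top, item_frequencies[top])            # -> [1 3] [10  7]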
def test_get_top_k_scored_items(scores):
    top_items, top_scores = get_top_k_scored_items(
        scores=scores, top_k=3, sort_top_k=True
    )

    assert np.array_equal(top_items, np.array([[4, 3, 2], [0, 1, 2], [1, 3, 2]]))
    assert np.array_equal(top_scores, np.array([[5, 4, 3], [5, 4, 3], [5, 4, 3]]))
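
# The `scores` argument above is assumed to come from a pytest fixture. A fixture
# like the one below reproduces the expected arrays in the assertions (it matches
# the inline data in the later test), but it is a guess, not the original fixture.
import numpy as np
import pytest


@pytest.fixture
def scores():
    return np.array([[1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [1, 5, 3, 4, 2]])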
    def recommend_k_items(self,
                          test,
                          top_k=10,
                          sort_top_k=True,
                          remove_seen=True,
                          use_id=False):
        """Recommend top K items for all users in the test set.

        Args:
            test (pd.DataFrame): Test data.
            top_k (int): Number of top items to recommend.
            sort_top_k (bool): Flag to sort top k results.
            remove_seen (bool): Flag to remove items seen in training from recommendation.

        Returns:
            pd.DataFrame: Top k recommendation items for each user.

        """
        data = self.data
        if not use_id:
            user_ids = np.array(
                [data.user2id[x] for x in test[data.col_user].unique()])
        else:
            user_ids = np.array(test[data.col_user].unique())

        test_scores = self.score(user_ids, remove_seen=remove_seen)

        top_items, top_scores = get_top_k_scored_items(scores=test_scores,
                                                       top_k=top_k,
                                                       sort_top_k=sort_top_k)

        df = pd.DataFrame({
            data.col_user: np.repeat(
                test[data.col_user].drop_duplicates().values, top_items.shape[1]
            ),
            data.col_item: top_items.flatten()
            if use_id
            else [data.id2item[item] for item in top_items.flatten()],
            data.col_prediction: top_scores.flatten(),
        })

        return df.replace(-np.inf, np.nan).dropna()
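
# A self-contained sketch of the id handling above (the dictionaries stand in for
# the model's `data` object and are illustrative): with use_id=False the raw user
# ids are mapped to internal indices via user2id before scoring, and the recommended
# item indices are mapped back to raw ids via id2item.
import numpy as np

user2id = {"u1": 0, "u2": 1}
id2item = {0: "itemA", 1: "itemB", 2: "itemC"}

raw_users = ["u2", "u1"]
user_ids = np.array([user2id[u] for u in raw_users])     # internal indices used for scoring
top_items = np.array([[2, 0], [1, 2]])                   # internal item indices from scoring
print([[id2item[i] for i in row] for row in top_items])  # back to raw item ids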
    def get_item_based_topk(self, items, top_k=10, sort_top_k=True):
        """Get top K similar items to provided seed items based on similarity metric defined.
        This method will take a set of items and use them to recommend the most similar items to that set
        based on the similarity matrix fit during training.
        This allows recommendations for cold-users (unseen during training), note - the model is not updated.

        The following options are possible based on information provided in the items input:
        1. Single user or seed of items: only item column (ratings are assumed to be 1)
        2. Single user or seed of items w/ ratings: item column and rating column
        3. Separate users or seeds of items: item and user column (user ids are only used to separate item sets)
        4. Separate users or seeds of items with ratings: item, user and rating columns provided

        Args:
            items (pd.DataFrame): DataFrame with item, user (optional), and rating (optional) columns
            top_k (int): number of top items to recommend
            sort_top_k (bool): flag to sort top k results

        Returns:
            pd.DataFrame: sorted top k recommendation items
        """

        # convert item ids to indices
        item_ids = items[self.col_item].map(self.item2index)

        # if no ratings were provided assume they are all 1
        if self.col_rating in items.columns:
            ratings = items[self.col_rating]
        else:
            ratings = pd.Series(np.ones_like(item_ids))

        # create local map of user ids
        if self.col_user in items.columns:
            test_users = items[self.col_user]
            user2index = {x[1]: x[0] for x in enumerate(items[self.col_user].unique())}
            user_ids = test_users.map(user2index)
        else:
            # if no user column exists assume all entries are for a single user
            test_users = pd.Series(np.zeros_like(item_ids))
            user_ids = test_users
        n_users = user_ids.drop_duplicates().shape[0]

        # generate pseudo user affinity using seed items
        pseudo_affinity = sparse.coo_matrix(
            (ratings, (user_ids, item_ids)), shape=(n_users, self.n_items)
        ).tocsr()

        # calculate raw scores with a matrix multiplication
        test_scores = pseudo_affinity.dot(self.item_similarity)

        # remove items in the seed set so recommended items are novel
        test_scores[user_ids, item_ids] = -np.inf

        top_items, top_scores = get_top_k_scored_items(
            scores=test_scores, top_k=top_k, sort_top_k=sort_top_k
        )

        df = pd.DataFrame(
            {
                self.col_user: np.repeat(
                    test_users.drop_duplicates().values, top_items.shape[1]
                ),
                self.col_item: [self.index2item[item] for item in top_items.flatten()],
                self.col_prediction: top_scores.flatten(),
            }
        )

        # drop invalid items
        return df.replace(-np.inf, np.nan).dropna()
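
# A self-contained sketch of the pseudo-affinity trick above (all values are
# illustrative): the seed items for each local user go into a sparse user-by-item
# matrix, that matrix is multiplied by the item-item similarity matrix, and the
# seed items themselves are masked with -inf so they are not recommended back.
import numpy as np
from scipy import sparse

item_similarity = np.array(
    [[1.0, 0.2, 0.6],
     [0.2, 1.0, 0.1],
     [0.6, 0.1, 1.0]]
)
user_ids = np.array([0, 0])      # one local user with two seed items
item_ids = np.array([0, 2])      # seed item indices
ratings = np.array([1.0, 1.0])   # ratings assumed to be 1 when none are provided

pseudo_affinity = sparse.coo_matrix(
    (ratings, (user_ids, item_ids)), shape=(1, 3)
).tocsr()

scores = pseudo_affinity.dot(item_similarity)  # dense result: raw scores per item
scores[user_ids, item_ids] = -np.inf           # never recommend the seed items back
print(scores)                                  # only item 1 keeps a finite score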
def test_get_top_k_scored_items():
    scores = np.array([[1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [1, 5, 3, 4, 2]])
    top_items, top_scores = get_top_k_scored_items(scores=scores, top_k=3, sort_top_k=True)

    assert np.array_equal(top_items, np.array([[4, 3, 2], [0, 1, 2], [1, 3, 2]]))
    assert np.array_equal(top_scores, np.array([[5, 4, 3], [5, 4, 3], [5, 4, 3]]))
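
# A minimal sketch of a helper with the behavior the tests above expect; the real
# get_top_k_scored_items in the library may differ (e.g. input validation or
# sparse-matrix handling), so treat this as an assumption, not the actual implementation.
import numpy as np


def get_top_k_scored_items_sketch(scores, top_k, sort_top_k=False):
    # indices of the top_k scores in each row (unordered within the top k)
    top_items = np.argpartition(scores, -top_k, axis=1)[:, -top_k:]
    top_scores = np.take_along_axis(scores, top_items, axis=1)
    if sort_top_k:
        # order each row by descending score
        order = np.argsort(-top_scores, axis=1)
        top_items = np.take_along_axis(top_items, order, axis=1)
        top_scores = np.take_along_axis(top_scores, order, axis=1)
    return top_items, top_scores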