예제 #1
0
    def predict_for_user(self, user, items=None, ratings=None):
        """
        Predict scores for ``user`` from the user's own ratings of similar items.

        For every requested item the items the user has a non-zero score for
        are ranked by their similarity to it, optionally filtered by
        ``self.min_threshold``, truncated to ``self.max_nn`` and handed to
        ``_nn_score``.

        :param user: user id, resolved with ``self.users.get_loc``
        :param items: a list of item ids to score; all of ``self.items`` when
            omitted
        :param ratings: unused by this implementation
        :return: ``pd.Series`` keyed by item id; NaN for ids that are missing
            from ``self.items`` and for items with fewer than ``self.min_nn``
            usable neighbors
        """
        if items is not None:
            items = np.array(items)
        else:
            items = self.items.values
        valid_mask = self.items.get_indexer(items) >= 0

        min_threshold = self.min_threshold
        min_nn = self.min_nn
        max_nn = self.max_nn
        item_sim = self.item_sim_matrix
        result = dict()
        if not valid_mask.all():
            self.log.debug("user %s: %s are not valid", user,
                           items[~valid_mask])
            for e in items[~valid_mask]:
                result[e] = np.nan

        items = items[valid_mask]
        upos = self.users.get_loc(user)
        item_bias = None
        if self.bias is not None:
            item_bias = self.bias.get_item_bias()

        assert self.rmat.getformat() == 'csr'
        item_scores = _get_xs(self.rmat, upos)
        # Narrow down to the items that were rated.  np.argwhere returns one
        # row per hit (shape (k, 1) for a 1-D input), so it is flattened to a
        # plain index vector; otherwise the argsort below would sort along
        # the length-1 axis and never reorder anything.
        # assumes _get_xs returns a 1-D vector -- TODO confirm
        valid_item_index = np.argwhere(item_scores != 0).flatten()
        for item in items:
            ipos = self.items.get_loc(item)

            # rated items ordered by descending similarity to this item
            sorted_idx = np.argsort(item_sim[ipos, valid_item_index])[::-1]
            item_idx = valid_item_index[sorted_idx]

            # similarities need to exceed min_threshold
            if min_threshold is not None:
                item_idx = item_idx[item_sim[ipos, item_idx] > min_threshold]
            if len(item_idx) < min_nn:
                self.log.debug(
                    "item %s does not have enough neighbors (%s < %s)", item,
                    len(item_idx), min_nn)
                result[item] = np.nan
                continue

            # keep only the max_nn most similar neighbors
            item_idx = item_idx[:max_nn]
            result[item] = _nn_score(item_scores, item_sim, ipos, item_idx,
                                     item_bias)

        df = pd.Series(result)
        return df
예제 #2
0
    def predict_for_user(self, user, items=None, ratings=None):
        """
        Return the scores already stored in ``self.rmat`` for ``user``.

        :param user: user id, resolved with ``self.users.get_loc``
        :param items: a list of item ids to report; all of ``self.items``
            when omitted
        :param ratings: unused by this implementation
        :return: whatever ``scores_to_series`` builds from the user's score
            vector, ``self.items`` and the requested items
        """
        user_pos = self.users.get_loc(user)
        stored = _get_xs(self.rmat, user_pos)
        # A zero entry is reported as NaN (presumably zero means "no rating";
        # verify against how rmat is built).
        zero_entries = stored == 0
        stored[zero_entries] = np.nan

        requested = self.items.values if items is None else np.array(items)
        return scores_to_series(stored, self.items, requested)
예제 #3
0
    def select(self, user, candidates=None):
        """
        Pick the items that have a non-zero entry for ``user`` in ``self.rmat``.

        NOTE(review): the previous description of this method said it returns
        items *not* reviewed by the user, but the code keeps the non-zero
        entries -- confirm which of the two is intended.

        :param user: user id
        :param candidates: a list or np.array of items; when given, the
            result is restricted to these items
        :return: np.array of the selected item ids
        """
        user_pos = self.users.get_loc(user)
        user_ratings = _get_xs(self.rmat, user_pos)
        rated_pos = np.argwhere(user_ratings != 0).ravel()

        if candidates is not None:
            # Positions of the candidates inside self.items; the intersection
            # keeps only the positions that are also rated.
            candidate_pos = self.items.get_indexer(np.array(candidates))
            rated_pos = np.intersect1d(rated_pos, candidate_pos)

        return np.array(self.items[rated_pos])
예제 #4
0
    def predict_for_user(self, user, items=None, ratings=None):
        """
        Predict scores for ``user`` from the ratings of similar users.

        Users are ranked once by their similarity to ``user`` (optionally
        filtered by ``self.min_threshold``). For every requested item the
        ranked users with a non-zero score for that item are truncated to
        ``self.max_nn`` and handed to ``_nn_score``.

        :param user: user id, resolved with ``self.users.get_loc``
        :param items: a list of item ids; all of ``self.items`` when omitted
        :param ratings: unused by this implementation
        :return: ``pd.Series`` keyed by item id; NaN for ids that are missing
            from ``self.items`` and for items with fewer than ``self.min_nn``
            usable neighbors
        """

        min_threshold = self.min_threshold
        min_nn = self.min_nn
        max_nn = self.max_nn
        user_sim = self.user_sim_matrix

        result = dict()
        if items is not None:
            items = np.array(items)
        else:
            items = self.items.values

        # ids that are not in self.items get NaN right away
        valid_mask = self.items.get_indexer(items) >= 0
        for e in items[~valid_mask]:
            result[e] = np.nan

        items = items[valid_mask]
        upos = self.users.get_loc(user)

        # user positions ordered by descending similarity to this user
        full_user_idx = np.argsort(user_sim[upos, :])[::-1]

        # similarities need to exceed min_threshold
        if min_threshold is not None:
            full_user_idx = full_user_idx[user_sim[upos, full_user_idx] > min_threshold]

        u_bias = None
        if self.bias is not None:
            u_bias = self.bias.get_user_bias()

        # loop-invariant check, done once instead of once per item
        assert self.rmat.getformat() == 'csc'
        for item in items:
            ipos = self.items.get_loc(item)
            user_scores = _get_xs(self.rmat, ipos)
            # narrow down to users who rated the item
            user_idx = full_user_idx[user_scores[full_user_idx] != 0]
            user_idx = user_idx[:max_nn]
            if len(user_idx) < min_nn:
                self.log.debug("user %s does not have enough neighbors (%s < %s)", user, len(user_idx), min_nn)
                result[item] = np.nan
                # skip scoring, otherwise the NaN would be overwritten below
                continue

            result[item] = _nn_score(user_scores, user_sim, upos, user_idx, u_bias)

        df = pd.Series(result)
        return df