示例#1
0
    def fit(self,
            topK=50,
            shrink=100,
            similarity='cosine',
            feature_weighting="none",
            **similarity_args):
        """Compute the top-K similarity matrix and store it in ``self.W_sparse``.

        Parameters
        ----------
        topK : int
            Number of neighbours to keep. It is incremented by one before use
            (and stored incremented in ``self.topK``) because the similarity
            call also returns the self similarity, which is zeroed afterwards.
        shrink : int or float
            Shrink term forwarded to the similarity function.
        similarity : str
            One of "cosine", "jaccard", "dice", "tversky" or "splus".
        feature_weighting : str
            Must belong to ``self.FEATURE_WEIGHTING_VALUES``. "BM25" and
            "TF-IDF" replace ``self.URM_train`` with its re-weighted version
            before the similarity is computed; other accepted values leave it
            unchanged.
        **similarity_args
            Extra keyword arguments forwarded to the similarity function.

        Raises
        ------
        ValueError
            If ``feature_weighting`` or ``similarity`` is not recognized.
        """
        # Accepted `similarity` value -> name of the function in `sim`.
        # The function is looked up lazily so that validation does not depend
        # on the similarity backend.
        similarity_functions = {
            "cosine": "cosine",
            "jaccard": "jaccard",
            "dice": "dice",
            "tversky": "tversky",
            "splus": "s_plus",
        }

        # Similaripy returns also self similarity, which will be set to 0 afterwards
        topK += 1
        self.topK = topK
        self.shrink = shrink

        if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
            raise ValueError(
                "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
                .format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))

        # Reject an unknown similarity before self.URM_train is re-weighted.
        if similarity not in similarity_functions:
            raise ValueError(
                "Unknown value '{}' for similarity".format(similarity))

        if feature_weighting == "BM25":
            self.URM_train = self.URM_train.astype(np.float32)
            self.URM_train = okapi_BM_25(self.URM_train.T).T
            self.URM_train = check_matrix(self.URM_train, 'csr')

        elif feature_weighting == "TF-IDF":
            self.URM_train = self.URM_train.astype(np.float32)
            self.URM_train = TF_IDF(self.URM_train.T).T
            self.URM_train = check_matrix(self.URM_train, 'csr')

        compute_similarity = getattr(sim, similarity_functions[similarity])
        self.W_sparse = compute_similarity(self.URM_train,
                                           k=topK,
                                           shrink=shrink,
                                           **similarity_args)

        # Drop the self similarity that motivated the extra neighbour above.
        self.W_sparse.setdiag(0)
        self.W_sparse = self.W_sparse.transpose().tocsr()
    def fit(self,
            topK=50,
            shrink=100,
            similarity='cosine',
            feature_weighting="none"):
        """Compute a top-K similarity matrix from ``self.ICM_train``.

        The result is transposed, converted to CSR and stored in
        ``self.W_sparse``.

        Parameters
        ----------
        topK : int
            Number of neighbours, forwarded as ``k``; stored in ``self.topK``.
        shrink : int or float
            Shrink term forwarded to the similarity function; stored in
            ``self.shrink``.
        similarity : str
            One of "cosine", "s_plus", "dice", "rp3beta", "p3alpha" or
            "jaccard". "rp3beta" is always run with alpha=0.3 and beta=0.61.
        feature_weighting : str
            "bm25", "bm25plus" or "tfidf" replace ``self.ICM_train`` with the
            output of the matching ``similaripy.normalization`` function.
            Any other value leaves ``self.ICM_train`` unchanged.

        Raises
        ------
        ValueError
            If ``similarity`` is not one of the supported values.
        """
        # Accepted `similarity` value -> (function name in `similaripy`,
        # metric-specific keyword arguments).
        similarity_options = {
            "cosine": ("cosine", {}),
            "s_plus": ("s_plus", {}),
            "dice": ("dice", {}),
            "rp3beta": ("rp3beta", {"alpha": 0.3, "beta": 0.61}),
            "p3alpha": ("p3alpha", {}),
            "jaccard": ("jaccard", {}),
        }

        self.topK = topK
        self.shrink = shrink

        # Fail early with a clear message instead of reaching the end of the
        # method with no similarity matrix computed.
        if similarity not in similarity_options:
            raise ValueError(
                "Unknown value '{}' for similarity".format(similarity))

        if feature_weighting in ("bm25", "bm25plus", "tfidf"):
            normalize = getattr(similaripy.normalization, feature_weighting)
            self.ICM_train = normalize(self.ICM_train)

        function_name, extra_args = similarity_options[similarity]
        compute_similarity = getattr(similaripy, function_name)
        similarity_matrix = compute_similarity(self.ICM_train,
                                               k=self.topK,
                                               shrink=self.shrink,
                                               binary=False,
                                               verbose=False,
                                               **extra_args)

        self.W_sparse = similarity_matrix.transpose().tocsr()
        self.W_sparse = check_matrix(self.W_sparse, format='csr')
    def fit(self, topK=50, shrink=100, similarity='cosine', pre_normalization="none", post_normalization="none", **similarity_args):
        """Compute a top-K similarity matrix from ``self.URM_train``.

        The result is transposed, converted to CSR and stored in
        ``self.W_sparse``.

        Parameters
        ----------
        topK : int
            Number of neighbours, forwarded as ``k``; stored in ``self.topK``.
        shrink : int or float
            Shrink term forwarded to the similarity function; stored in
            ``self.shrink``.
        similarity : str
            One of "cosine", "s_plus", "dice", "rp3beta", "asym" or "jaccard".
            "rp3beta" always uses alpha=0.3 and beta=0.61; "asym" always uses
            alpha=0.5.
        pre_normalization : str
            "bm25plus" or "tfidf" normalize a copy of the interactions used
            for the similarity computation; ``self.URM_train`` itself is not
            reassigned by this step. Any other value uses ``self.URM_train``
            as is.
        post_normalization : str
            "bm25plus_once", "bm25plus_twice", "tfidf" or "bm25" replace
            ``self.URM_train`` with a normalized version after the similarity
            has been computed, so the similarity matrix itself is unaffected.
            Any other value leaves it unchanged.
        **similarity_args
            Accepted but not used by this method.

        Raises
        ------
        ValueError
            If ``similarity`` is not one of the supported values.
        """
        # Accepted `similarity` value -> (function name in `similaripy`,
        # metric-specific keyword arguments).
        similarity_options = {
            "cosine": ("cosine", {}),
            "s_plus": ("s_plus", {}),
            "dice": ("dice", {}),
            "rp3beta": ("rp3beta", {"alpha": 0.3, "beta": 0.61}),
            "asym": ("asymmetric_cosine", {"alpha": 0.5}),
            "jaccard": ("jaccard", {}),
        }

        self.topK = topK
        self.shrink = shrink

        # Fail early with a clear message instead of running every
        # normalization step and only then hitting an unbound local.
        if similarity not in similarity_options:
            raise ValueError(
                "Unknown value '{}' for similarity".format(similarity))

        #ucm = sps.load_npz("FULL_UCM.npz")
        interactions = self.URM_train

        if pre_normalization == "bm25plus":
            interactions = similaripy.normalization.bm25plus(self.URM_train, axis=1, k1=1.2, b=0.75, delta=0.85, tf_mode="raw", idf_mode="bm25", inplace=False)
        if pre_normalization == "tfidf":
            interactions = similaripy.normalization.tfidf(self.URM_train, axis=1)

        #interactions = sps.hstack((interactions, ucm))

        function_name, extra_args = similarity_options[similarity]
        compute_similarity = getattr(similaripy, function_name)
        similarity_matrix = compute_similarity(interactions, k=self.topK, shrink=self.shrink, binary=False, verbose=False, **extra_args)
        similarity_matrix = similarity_matrix.transpose().tocsr()

        # Note the post-normalization delta (0.8) differs from the
        # pre-normalization one (0.85).
        if post_normalization in ("bm25plus_once", "bm25plus_twice"):
            passes = 2 if post_normalization == "bm25plus_twice" else 1
            for _ in range(passes):
                self.URM_train = similaripy.normalization.bm25plus(self.URM_train, axis=1, k1=1.2, b=0.75, delta=0.8, tf_mode='raw', idf_mode='bm25', inplace=False)
        if post_normalization == "tfidf":
            self.URM_train = similaripy.normalization.tfidf(self.URM_train, axis=1)
        if post_normalization == "bm25":
            self.URM_train = similaripy.normalization.bm25(self.URM_train, axis=1)

        self.W_sparse = similarity_matrix
        self.W_sparse = check_matrix(self.W_sparse, format='csr')
示例#4
0
    def fit(self,
            matrix,
            k,
            distance,
            shrink=0,
            threshold=0,
            implicit=True,
            alpha=None,
            beta=None,
            l=None,
            c=None,
            verbose=False):
        """
        Initialize the model and compute the similarity matrix S with a distance metric.
        Access the similarity matrix using: self._sim_matrix

        Parameters
        ----------
        matrix : csr_matrix
            A sparse matrix. For example, it can be the URM of shape (number_users, number_items).
        k : int
            K nearest neighbour to consider.
        distance : str
            One of the supported distance metrics, check collaborative_filtering_base constants.
        shrink : float, optional
            Shrink term used in the normalization
        threshold: float, optional
            All the values under this value are cut from the final result
        implicit: bool, optional
            If true, treat the URM as implicit, otherwise consider explicit ratings (real values) in the URM
        alpha: float, optional, included in [0,1]
            Required by asymmetric cosine, tversky, p3alpha, rp3beta and s_plus.
        beta: float, optional, included in [0,1]
            Required by tversky, rp3beta and s_plus.
        l: float, optional, balance coefficient used in s_plus distance, included in [0,1]
        c: float, optional, cosine coefficient, included in [0,1]
        verbose: bool, optional
            Accepted but not used by this method.

        Returns
        -------
        The computed similarity matrix (also stored in self._sim_matrix), or
        None when a required parameter is missing or out of range or the
        distance metric is unknown. In that case an error is logged and
        self._sim_matrix is not modified.
        """
        # Presence checks must look at the caller's raw values: once None has
        # been replaced by the -1 sentinel below they could never trigger.
        if distance == self.SIM_P3ALPHA and alpha is None:
            log.error('Invalid parameter alpha in p3alpha similarity')
            return None
        if distance == self.SIM_RP3BETA and (alpha is None or beta is None):
            log.error('Invalid parameter alpha/beta in rp3beta similarity')
            return None

        # -1 stands for "not provided" so the range checks below reject it.
        alpha = -1 if alpha is None else alpha
        beta = -1 if beta is None else beta
        l = -1 if l is None else l
        c = -1 if c is None else c

        if distance == self.SIM_ASYMCOSINE and not (0 <= alpha <= 1):
            log.error(
                'Invalid parameter alpha in asymmetric cosine similarity!')
            return None
        if distance == self.SIM_TVERSKY and not (0 <= alpha <= 1
                                                 and 0 <= beta <= 1):
            log.error('Invalid parameter alpha/beta in tversky similarity!')
            return None
        if distance == self.SIM_SPLUS and not (0 <= l <= 1 and 0 <= c <= 1
                                               and 0 <= alpha <= 1
                                               and 0 <= beta <= 1):
            log.error('Invalid parameter alpha/beta/l/c in s_plus similarity')
            return None

        # Keyword arguments shared by every similarity function.
        common = dict(k=k,
                      shrink=shrink,
                      threshold=threshold,
                      binary=implicit)

        # compute and stores the similarity matrix using one of the distance metric: S = R•R'
        if distance == self.SIM_COSINE:
            self._sim_matrix = sim.cosine(matrix, **common)
        elif distance == self.SIM_ASYMCOSINE:
            self._sim_matrix = sim.asymmetric_cosine(matrix,
                                                     alpha=alpha,
                                                     **common)
        elif distance == self.SIM_JACCARD:
            self._sim_matrix = sim.jaccard(matrix, **common)
        elif distance == self.SIM_DICE:
            self._sim_matrix = sim.dice(matrix, **common)
        elif distance == self.SIM_TVERSKY:
            self._sim_matrix = sim.tversky(matrix,
                                           alpha=alpha,
                                           beta=beta,
                                           **common)
        elif distance == self.SIM_P3ALPHA:
            self._sim_matrix = sim.p3alpha(matrix, alpha=alpha, **common)
        elif distance == self.SIM_RP3BETA:
            self._sim_matrix = sim.rp3beta(matrix,
                                           alpha=alpha,
                                           beta=beta,
                                           **common)
        elif distance == self.SIM_SPLUS:
            self._sim_matrix = sim.s_plus(matrix,
                                          l=l,
                                          t1=alpha,
                                          t2=beta,
                                          c=c,
                                          **common)
        else:
            # Do not fall through to returning self._sim_matrix: it would be
            # either missing or the result of a previous, unrelated fit.
            log.error('Invalid distance metric: {}'.format(distance))
            return None
        #self.SIM_DOTPRODUCT: sim.dot_product(matrix, k=k, shrink=shrink, threshold=threshold, binary=implicit)
        return self._sim_matrix
示例#5
0
def similarity(matrix,
               k=100,
               sim_type='cosine',
               shrink=0,
               threshold=0,
               implicit=True,
               alpha=None,
               beta=None,
               l=None,
               c=None):
    """Compute a top-k similarity on the transpose of ``matrix``.

    ``matrix.T`` is handed to the function of ``sim`` selected by
    ``sim_type`` together with ``k``, ``shrink``, ``threshold`` and
    ``binary=implicit``.

    Supported ``sim_type`` values: 'cosine', 'asymcosine', 'jaccard',
    'tversky', 'p3alpha', 'rp3beta', 'splus' and 'dice'. ``alpha`` is
    additionally forwarded for 'asymcosine' and 'p3alpha'; ``alpha`` and
    ``beta`` for 'tversky' and 'rp3beta'. ``l`` and ``c`` are accepted but
    not forwarded to any function.

    Returns the value produced by the selected function. For an
    unrecognized ``sim_type`` an error line is printed and None is returned.
    """
    transposed = matrix.T

    # Arguments every similarity function receives.
    shared = {
        'k': k,
        'shrink': shrink,
        'threshold': threshold,
        'binary': implicit,
    }

    # (sim_type value, function name in `sim`, metric-specific arguments),
    # tried in this order.
    recipes = (
        ('cosine', 'cosine', {}),
        ('asymcosine', 'asymmetric_cosine', {'alpha': alpha}),
        ('jaccard', 'jaccard', {}),
        ('tversky', 'tversky', {'alpha': alpha, 'beta': beta}),
        ('p3alpha', 'p3alpha', {'alpha': alpha}),
        ('rp3beta', 'rp3beta', {'alpha': alpha, 'beta': beta}),
        ('splus', 's_plus', {}),  # l, t1=alpha, t2=beta, c are not forwarded
        ('dice', 'dice', {}),
    )

    for type_name, function_name, specific in recipes:
        if sim_type == type_name:
            compute = getattr(sim, function_name)
            return compute(transposed, **shared, **specific)

    print('Error wrong distance metric')
    return None
    def fit(self):
        """Compute the similarity matrix from the instance configuration.

        Reads ``self.matrix``, ``self.k``, ``self.distance``, ``self.shrink``,
        ``self.threshold``, ``self.implicit`` and the metric parameters
        ``self.alpha``, ``self.beta``, ``self.l`` and ``self.c``. The result
        is stored in ``self._sim_matrix``; for the s_plus metric it is
        additionally L2-normalized along axis 0.

        The metric parameters are validated on local copies and are not
        written back to the instance, so a value left as None is still
        detected as missing on a later call.

        Returns
        -------
        The computed similarity matrix, or None when a required parameter is
        missing or out of range or the distance metric is unknown. In that
        case an error is logged and ``self._sim_matrix`` is not modified.
        """
        alpha, beta, l, c = self.alpha, self.beta, self.l, self.c

        # Presence checks must look at the configured values: once None has
        # been replaced by the -1 sentinel below they could never trigger.
        if self.distance == self.SIM_P3ALPHA and alpha is None:
            log.error('Invalid parameter alpha in p3alpha Similarity_MFD')
            return None
        if self.distance == self.SIM_RP3BETA and (alpha is None
                                                  or beta is None):
            log.error('Invalid parameter alpha/beta in rp3beta Similarity_MFD')
            return None

        # -1 stands for "not provided" so the range checks below reject it.
        alpha = -1 if alpha is None else alpha
        beta = -1 if beta is None else beta
        l = -1 if l is None else l
        c = -1 if c is None else c

        if self.distance == self.SIM_ASYMCOSINE and not (0 <= alpha <= 1):
            log.error(
                'Invalid parameter alpha in asymmetric cosine Similarity_MFD!')
            return None
        if self.distance == self.SIM_TVERSKY and not (0 <= alpha <= 1
                                                      and 0 <= beta <= 1):
            log.error(
                'Invalid parameter alpha/beta in tversky Similarity_MFD!')
            return None
        if self.distance == self.SIM_SPLUS and not (
                0 <= l <= 1 and 0 <= c <= 1 and 0 <= alpha <= 1
                and 0 <= beta <= 1):
            log.error(
                'Invalid parameter alpha/beta/l/c in s_plus Similarity_MFD')
            return None

        # Keyword arguments shared by every similarity function.
        common = dict(k=self.k,
                      shrink=self.shrink,
                      threshold=self.threshold,
                      binary=self.implicit)

        # compute and stores the Similarity_MFD matrix using one of the distance metric: S = R•R'
        if self.distance == self.SIM_COSINE:
            self._sim_matrix = sim.cosine(self.matrix, **common)
        elif self.distance == self.SIM_ASYMCOSINE:
            self._sim_matrix = sim.asymmetric_cosine(self.matrix,
                                                     alpha=alpha,
                                                     **common)
        elif self.distance == self.SIM_JACCARD:
            self._sim_matrix = sim.jaccard(self.matrix, **common)
        elif self.distance == self.SIM_DICE:
            self._sim_matrix = sim.dice(self.matrix, **common)
        elif self.distance == self.SIM_TVERSKY:
            self._sim_matrix = sim.tversky(self.matrix,
                                           alpha=alpha,
                                           beta=beta,
                                           **common)
        elif self.distance == self.SIM_P3ALPHA:
            self._sim_matrix = sim.p3alpha(self.matrix, alpha=alpha, **common)
        elif self.distance == self.SIM_RP3BETA:
            self._sim_matrix = sim.rp3beta(self.matrix,
                                           alpha=alpha,
                                           beta=beta,
                                           **common)
        elif self.distance == self.SIM_SPLUS:
            raw_similarity = sim.s_plus(self.matrix,
                                        l=l,
                                        t1=alpha,
                                        t2=beta,
                                        c=c,
                                        **common)
            self._sim_matrix = prep.normalize(raw_similarity,
                                              norm='l2',
                                              axis=0)
        else:
            # Do not fall through to returning self._sim_matrix: it would be
            # either missing or the result of a previous, unrelated fit.
            log.error('Invalid distance metric: {}'.format(self.distance))
            return None
        return self._sim_matrix