def fit(self, topK=50, shrink=100, similarity='cosine', feature_weighting="none", **similarity_args):
    """Fit the model by building ``self.W_sparse`` from ``self.URM_train``.

    Parameters
    ----------
    topK : int
        Number of neighbours to keep. One extra neighbour is requested from
        the similarity library because its result also contains the self
        similarity, which is zeroed afterwards. ``self.topK`` stores the
        incremented value.
    shrink : number
        Shrink term forwarded to the similarity function; stored in
        ``self.shrink``.
    similarity : str
        One of ``"cosine"``, ``"jaccard"``, ``"dice"``, ``"tversky"`` or
        ``"splus"``.
    feature_weighting : str
        Must be contained in ``self.FEATURE_WEIGHTING_VALUES``. ``"BM25"``
        and ``"TF-IDF"`` re-weight ``self.URM_train`` before the similarity
        is computed; any other accepted value leaves it untouched.
    **similarity_args
        Extra keyword arguments forwarded unchanged to the similarity
        function.

    Raises
    ------
    ValueError
        If ``feature_weighting`` or ``similarity`` is not recognized.
    """
    # Similaripy returns also self similarity, which will be set to 0 afterwards
    topK += 1
    self.topK = topK
    self.shrink = shrink

    if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
        raise ValueError(
            "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
            .format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))

    # NOTE: the weighted matrix overwrites self.URM_train, so calling fit()
    # again with a weighting re-weights the already weighted matrix.
    if feature_weighting == "BM25":
        self.URM_train = self.URM_train.astype(np.float32)
        self.URM_train = okapi_BM_25(self.URM_train.T).T
        self.URM_train = check_matrix(self.URM_train, 'csr')
    elif feature_weighting == "TF-IDF":
        self.URM_train = self.URM_train.astype(np.float32)
        self.URM_train = TF_IDF(self.URM_train.T).T
        self.URM_train = check_matrix(self.URM_train, 'csr')

    if similarity == "cosine":
        self.W_sparse = sim.cosine(self.URM_train, k=topK, shrink=shrink, **similarity_args)
    elif similarity == "jaccard":
        self.W_sparse = sim.jaccard(self.URM_train, k=topK, shrink=shrink, **similarity_args)
    elif similarity == "dice":
        self.W_sparse = sim.dice(self.URM_train, k=topK, shrink=shrink, **similarity_args)
    elif similarity == "tversky":
        # This branch used to test "jaccard" a second time, which made the
        # tversky similarity unreachable.
        self.W_sparse = sim.tversky(self.URM_train, k=topK, shrink=shrink, **similarity_args)
    elif similarity == "splus":
        self.W_sparse = sim.s_plus(self.URM_train, k=topK, shrink=shrink, **similarity_args)
    else:
        raise ValueError(
            "Unknown value '{}' for similarity".format(similarity))

    # Drop the self similarity, then store the transposed result as CSR.
    self.W_sparse.setdiag(0)
    self.W_sparse = self.W_sparse.transpose().tocsr()
def fit(self, topK=50, shrink=100, similarity='cosine', feature_weighting="none"):
    """Fit the model by building ``self.W_sparse`` from ``self.ICM_train``.

    Parameters
    ----------
    topK : int
        Number of neighbours, forwarded as ``k``; stored in ``self.topK``.
    shrink : number
        Shrink term forwarded to the similarity function; stored in
        ``self.shrink``.
    similarity : str
        One of ``"cosine"``, ``"s_plus"``, ``"dice"``, ``"rp3beta"``,
        ``"p3alpha"`` or ``"jaccard"``.
    feature_weighting : str
        ``"bm25"``, ``"bm25plus"`` or ``"tfidf"`` normalize
        ``self.ICM_train`` first; any other value leaves it untouched.

    Raises
    ------
    ValueError
        If ``similarity`` is not one of the supported names. Previously this
        surfaced as an ``UnboundLocalError`` on ``similarity_matrix``.
    """
    self.topK = topK
    self.shrink = shrink

    # NOTE: the normalized matrix overwrites self.ICM_train, so calling fit()
    # again with a weighting normalizes the already normalized matrix.
    if feature_weighting == "bm25":
        self.ICM_train = similaripy.normalization.bm25(self.ICM_train)
    elif feature_weighting == "bm25plus":
        self.ICM_train = similaripy.normalization.bm25plus(self.ICM_train)
    elif feature_weighting == "tfidf":
        self.ICM_train = similaripy.normalization.tfidf(self.ICM_train)

    # Keyword arguments shared by every similarity call below.
    common = dict(k=self.topK, shrink=self.shrink, binary=False, verbose=False)

    if similarity == "cosine":
        similarity_matrix = similaripy.cosine(self.ICM_train, **common)
    elif similarity == "s_plus":
        similarity_matrix = similaripy.s_plus(self.ICM_train, **common)
    elif similarity == "dice":
        similarity_matrix = similaripy.dice(self.ICM_train, **common)
    elif similarity == "rp3beta":
        # alpha/beta are fixed constants here; they are not exposed as parameters.
        similarity_matrix = similaripy.rp3beta(self.ICM_train, alpha=0.3, beta=0.61, **common)
    elif similarity == "p3alpha":
        similarity_matrix = similaripy.p3alpha(self.ICM_train, **common)
    elif similarity == "jaccard":
        similarity_matrix = similaripy.jaccard(self.ICM_train, **common)
    else:
        raise ValueError(
            "Unknown value '{}' for similarity".format(similarity))

    self.W_sparse = similarity_matrix.transpose().tocsr()
    self.W_sparse = check_matrix(self.W_sparse, format='csr')
def fit(self, topK=50, shrink=100, similarity='cosine', pre_normalization="none", post_normalization="none", **similarity_args):
    """Fit the model by building ``self.W_sparse`` from ``self.URM_train``.

    Parameters
    ----------
    topK : int
        Number of neighbours, forwarded as ``k``; stored in ``self.topK``.
    shrink : number
        Shrink term forwarded to the similarity function; stored in
        ``self.shrink``.
    similarity : str
        One of ``"cosine"``, ``"s_plus"``, ``"dice"``, ``"rp3beta"``,
        ``"asym"`` or ``"jaccard"``.
    pre_normalization : str
        ``"bm25plus"`` or ``"tfidf"`` normalize a copy of ``self.URM_train``
        that is used only for the similarity computation; any other value
        uses ``self.URM_train`` as is.
    post_normalization : str
        ``"bm25plus_once"``, ``"bm25plus_twice"``, ``"tfidf"`` or ``"bm25"``
        overwrite ``self.URM_train`` with a normalized version after the
        similarity has been computed; any other value leaves it untouched.
    **similarity_args
        Accepted but currently not forwarded to the similarity function.

    Raises
    ------
    ValueError
        If ``similarity`` is not one of the supported names. Previously this
        surfaced as an ``UnboundLocalError`` after ``self.URM_train`` had
        already been post-normalized.
    """
    self.topK = topK
    self.shrink = shrink

    # NOTE: an earlier experiment stacked a UCM loaded from "FULL_UCM.npz"
    # onto the interactions (sps.hstack); that step is disabled.
    interactions = self.URM_train
    if pre_normalization == "bm25plus":
        interactions = similaripy.normalization.bm25plus(
            self.URM_train, axis=1, k1=1.2, b=0.75, delta=0.85,
            tf_mode="raw", idf_mode="bm25", inplace=False)
    elif pre_normalization == "tfidf":
        interactions = similaripy.normalization.tfidf(self.URM_train, axis=1)

    # Keyword arguments shared by every similarity call below.
    common = dict(k=self.topK, shrink=self.shrink, binary=False, verbose=False)

    if similarity == "cosine":
        similarity_matrix = similaripy.cosine(interactions, **common)
    elif similarity == "s_plus":
        similarity_matrix = similaripy.s_plus(interactions, **common)
    elif similarity == "dice":
        similarity_matrix = similaripy.dice(interactions, **common)
    elif similarity == "rp3beta":
        # alpha/beta are fixed constants here; they are not exposed as parameters.
        similarity_matrix = similaripy.rp3beta(interactions, alpha=0.3, beta=0.61, **common)
    elif similarity == "asym":
        similarity_matrix = similaripy.asymmetric_cosine(interactions, alpha=0.5, **common)
    elif similarity == "jaccard":
        similarity_matrix = similaripy.jaccard(interactions, **common)
    else:
        raise ValueError(
            "Unknown value '{}' for similarity".format(similarity))

    # Every branch used to transpose and convert on its own; do it once here.
    similarity_matrix = similarity_matrix.transpose().tocsr()

    # Post-normalization rewrites self.URM_train itself, not the similarity.
    if post_normalization in ("bm25plus_once", "bm25plus_twice"):
        passes = 1 if post_normalization == "bm25plus_once" else 2
        for _ in range(passes):
            self.URM_train = similaripy.normalization.bm25plus(
                self.URM_train, axis=1, k1=1.2, b=0.75, delta=0.8,
                tf_mode='raw', idf_mode='bm25', inplace=False)
    elif post_normalization == "tfidf":
        self.URM_train = similaripy.normalization.tfidf(self.URM_train, axis=1)
    elif post_normalization == "bm25":
        self.URM_train = similaripy.normalization.bm25(self.URM_train, axis=1)

    self.W_sparse = similarity_matrix
    self.W_sparse = check_matrix(self.W_sparse, format='csr')
def fit(self, topK=50, shrink=100, similarity='cosine', normalization="none", feature_weighting="none", rp3_alpha=0.5, rp3_beta=0.5):
    """Fit the model on the transposed URM stacked horizontally with the ICM.

    Parameters
    ----------
    topK : int
        Number of neighbours, forwarded as ``k``; stored in ``self.topK``.
    shrink : number
        Shrink term forwarded to the similarity function; stored in
        ``self.shrink``.
    similarity : str
        One of ``"cosine"``, ``"dice"`` or ``"rp3beta"``.
    normalization : str
        ``"bm25plus"`` overwrites ``self.URM_train`` with its bm25plus
        normalization; any other value leaves it untouched.
    feature_weighting : str
        ``"bm25"`` applies bm25 normalization to the loaded ICM; any other
        value leaves it untouched.
    rp3_alpha, rp3_beta : float
        ``alpha`` / ``beta`` forwarded to ``similaripy.rp3beta``; only used
        when ``similarity == "rp3beta"``.

    Raises
    ------
    ValueError
        If ``similarity`` is not one of the supported names. Previously
        ``self.W_sparse`` was left unset (or stale from an earlier fit) and
        then passed to ``check_matrix``.
    """
    self.topK = topK
    self.shrink = shrink

    # Validate before the ICM is loaded so a bad argument fails immediately
    # and without touching self.URM_train.
    if similarity not in ("cosine", "dice", "rp3beta"):
        raise ValueError(
            "Unknown value '{}' for similarity".format(similarity))

    reader = DataReader()
    icm = reader.load_icm()

    if normalization == "bm25plus":
        self.URM_train = similaripy.normalization.bm25plus(self.URM_train, axis=1)
    if feature_weighting == "bm25":
        icm = similaripy.normalization.bm25(icm, axis=1)

    matrix = sps.hstack((self.URM_train.transpose().tocsr(), icm))

    # Keyword arguments shared by every similarity call below.
    common = dict(k=self.topK, shrink=self.shrink, binary=False, threshold=0)

    if similarity == "cosine":
        self.W_sparse = similaripy.cosine(matrix, **common)
    elif similarity == "dice":
        self.W_sparse = similaripy.dice(matrix, **common)
    elif similarity == "rp3beta":
        self.W_sparse = similaripy.rp3beta(matrix, alpha=rp3_alpha, beta=rp3_beta, **common)

    self.W_sparse = check_matrix(self.W_sparse, format='csr')
def check_similarity(m, k, rtol=0.0001, full=False):
    """Compare the ``sim`` similarity functions with the ``py_*`` versions.

    Every similarity is computed twice on ``m`` with ``k`` neighbours: once
    through ``sim`` (labelled "cython" below) and once through the matching
    ``py_*`` function (labelled "python"). The ``check_sum`` values of each
    pair must agree within ``rtol``. With ``full=True``, ``check_full`` must
    additionally return 0 for each pair.

    Raises
    ------
    AssertionError
        Raised through ``np.testing`` with the message ``"<name> error"``
        for the first pair that does not match.
    """
    # cython
    library_results = [
        ('dot', sim.dot_product(m, k=k)),
        ('cosine', sim.cosine(m, k=k)),
        ('asy_cosine', sim.asymmetric_cosine(m, alpha=0.2, k=k)),
        ('jaccard', sim.jaccard(m, k=k)),
        ('dice', sim.dice(m, k=k)),
        ('tversky', sim.tversky(m, alpha=0.8, beta=0.4, k=k)),
        ('p3alpha', sim.p3alpha(m, alpha=0.8, k=k)),
        ('rp3beta', sim.rp3beta(m, alpha=0.8, beta=0.4, k=k)),
    ]

    # python (same order as the list above)
    reference_results = [
        py_dot(m, k),
        py_cosine(m, k).tocsr(),
        py_asy_cosine(m, 0.2, k=k),
        py_jaccard(m, k),
        py_dice(m, k),
        py_tversky(m, alpha=0.8, beta=0.4, k=k),
        py_p3alpha(m, alpha=0.8, k=k),
        py_rp3beta(m, alpha=0.8, beta=0.4, k=k),
    ]

    cases = [
        (label, computed, expected)
        for (label, computed), expected in zip(library_results, reference_results)
    ]

    # test
    for label, computed, expected in cases:
        np.testing.assert_allclose(
            check_sum(computed), check_sum(expected),
            rtol=rtol, err_msg='{} error'.format(label))

    # test full rows
    if full:
        for label, computed, expected in cases:
            np.testing.assert_(
                check_full(computed, expected, rtol) == 0,
                msg='{} error'.format(label))

    return
def fit(self, matrix, k, distance, shrink=0, threshold=0, implicit=True, alpha=None, beta=None, l=None, c=None, verbose=False):
    """
    Initialize the model and compute the similarity matrix S with a distance metric.
    Access the similarity matrix using: self._sim_matrix

    Parameters
    ----------
    matrix : csr_matrix
        A sparse matrix. For example, it can be the URM of shape (number_users, number_items).
    k : int
        K nearest neighbour to consider.
    distance : str
        One of the supported distance metrics, check collaborative_filtering_base constants.
    shrink : float, optional
        Shrink term used in the normalization
    threshold : float, optional
        All the values under this value are cut from the final result
    implicit : bool, optional
        If true, treat the URM as implicit, otherwise consider explicit ratings (real values) in the URM
    alpha : float, optional, included in [0,1]
        Required for asymmetric cosine, tversky, p3alpha, rp3beta and s_plus.
    beta : float, optional, included in [0,1]
        Required for tversky, rp3beta and s_plus.
    l : float, optional, balance coefficient used in s_plus distance, included in [0,1]
    c : float, optional, cosine coefficient, included in [0,1]
    verbose : bool, optional
        Accepted but not used by this method.

    Returns
    -------
    The computed similarity matrix (also stored in ``self._sim_matrix``), or
    ``None`` when a parameter or the distance metric is invalid; in that case
    an error is logged and ``self._sim_matrix`` is left untouched.
    """
    # Presence checks have to look at the raw arguments: once None has been
    # replaced by the -1 sentinel below, an "is None" test can never succeed.
    if distance == self.SIM_P3ALPHA and alpha is None:
        log.error('Invalid parameter alpha in p3alpha similarity')
        return
    # rp3beta uses both parameters, so either one missing is an error.
    if distance == self.SIM_RP3BETA and (alpha is None or beta is None):
        log.error('Invalid parameter alpha/beta in rp3beta similarity')
        return

    # -1 lies outside [0, 1], so a missing value fails the range checks below.
    alpha = -1 if alpha is None else alpha
    beta = -1 if beta is None else beta
    l = -1 if l is None else l
    c = -1 if c is None else c

    if distance == self.SIM_ASYMCOSINE and not (0 <= alpha <= 1):
        log.error(
            'Invalid parameter alpha in asymmetric cosine similarity!')
        return
    if distance == self.SIM_TVERSKY and not (0 <= alpha <= 1 and 0 <= beta <= 1):
        log.error('Invalid parameter alpha/beta in tversky similarity!')
        return
    if distance == self.SIM_SPLUS and not (0 <= l <= 1 and 0 <= c <= 1 and 0 <= alpha <= 1 and 0 <= beta <= 1):
        log.error('Invalid parameter alpha/beta/l/c in s_plus similarity')
        return

    # Keyword arguments shared by every similarity call below.
    common = dict(k=k, shrink=shrink, threshold=threshold, binary=implicit)

    # compute and stores the similarity matrix using one of the distance metric: S = R•R'
    if distance == self.SIM_COSINE:
        self._sim_matrix = sim.cosine(matrix, **common)
    elif distance == self.SIM_ASYMCOSINE:
        self._sim_matrix = sim.asymmetric_cosine(matrix, alpha=alpha, **common)
    elif distance == self.SIM_JACCARD:
        self._sim_matrix = sim.jaccard(matrix, **common)
    elif distance == self.SIM_DICE:
        self._sim_matrix = sim.dice(matrix, **common)
    elif distance == self.SIM_TVERSKY:
        self._sim_matrix = sim.tversky(matrix, alpha=alpha, beta=beta, **common)
    elif distance == self.SIM_P3ALPHA:
        self._sim_matrix = sim.p3alpha(matrix, alpha=alpha, **common)
    elif distance == self.SIM_RP3BETA:
        self._sim_matrix = sim.rp3beta(matrix, alpha=alpha, beta=beta, **common)
    elif distance == self.SIM_SPLUS:
        self._sim_matrix = sim.s_plus(matrix, l=l, t1=alpha, t2=beta, c=c, **common)
    else:
        # NOTE: sim.dot_product is not wired in as a distance here.
        log.error('Invalid distance metric: {}'.format(distance))
        # Do not fall through to the return below: self._sim_matrix may be
        # unset or hold the result of an earlier fit.
        return

    return self._sim_matrix
def similarity(matrix, k=100, sim_type='cosine', shrink=0, threshold=0, implicit=True, alpha=None, beta=None, l=None, c=None):
    """Compute a top-``k`` similarity on the transpose of ``matrix``.

    ``sim_type`` selects the ``sim`` function: ``'cosine'``,
    ``'asymcosine'``, ``'jaccard'``, ``'tversky'``, ``'p3alpha'``,
    ``'rp3beta'``, ``'splus'`` or ``'dice'``. ``k``, ``shrink`` and
    ``threshold`` are always forwarded, and ``implicit`` is forwarded as
    ``binary``. ``alpha`` / ``beta`` are forwarded only to the functions
    listed with them in the table below. ``l`` and ``c`` are accepted but
    never forwarded.

    Returns the result of the selected function. For an unknown ``sim_type``
    a message is printed and ``None`` is returned.
    """
    transposed = matrix.T
    shared = {'k': k, 'shrink': shrink, 'threshold': threshold, 'binary': implicit}

    # (similarity type, name of the function in `sim`, extra keyword arguments)
    # The function is looked up only for the matching type.
    routes = (
        ('cosine', 'cosine', {}),
        ('asymcosine', 'asymmetric_cosine', {'alpha': alpha}),
        ('jaccard', 'jaccard', {}),
        ('tversky', 'tversky', {'alpha': alpha, 'beta': beta}),
        ('p3alpha', 'p3alpha', {'alpha': alpha}),
        ('rp3beta', 'rp3beta', {'alpha': alpha, 'beta': beta}),
        # s_plus is called without l / t1 / t2 / c.
        ('splus', 's_plus', {}),
        ('dice', 'dice', {}),
    )

    for type_name, function_name, extra in routes:
        if sim_type == type_name:
            return getattr(sim, function_name)(transposed, **shared, **extra)

    print('Error wrong distance metric')
    return None
def fit(self): self.alpha = -1 if self.alpha is None else self.alpha self.beta = -1 if self.beta is None else self.beta self.l = -1 if self.l is None else self.l self.c = -1 if self.c is None else self.c if self.distance == self.SIM_ASYMCOSINE and not (0 <= self.alpha <= 1): log.error( 'Invalid parameter alpha in asymmetric cosine Similarity_MFD!') return if self.distance == self.SIM_TVERSKY and not (0 <= self.alpha <= 1 and 0 <= self.beta <= 1): log.error( 'Invalid parameter alpha/beta in tversky Similarity_MFD!') return if self.distance == self.SIM_P3ALPHA and self.alpha is None: log.error('Invalid parameter alpha in p3alpha Similarity_MFD') return if self.distance == self.SIM_RP3BETA and self.alpha is None and self.beta is None: log.error('Invalid parameter alpha/beta in rp3beta Similarity_MFD') return if self.distance == self.SIM_SPLUS and not ( 0 <= self.l <= 1 and 0 <= self.c <= 1 and 0 <= self.alpha <= 1 and 0 <= self.beta <= 1): log.error( 'Invalid parameter alpha/beta/l/c in s_plus Similarity_MFD') return # compute and stores the Similarity_MFD matrix using one of the distance metric: S = R•R' if self.distance == self.SIM_COSINE: self._sim_matrix = sim.cosine(self.matrix, k=self.k, shrink=self.shrink, threshold=self.threshold, binary=self.implicit) elif self.distance == self.SIM_ASYMCOSINE: self._sim_matrix = sim.asymmetric_cosine(self.matrix, k=self.k, shrink=self.shrink, threshold=self.threshold, binary=self.implicit, alpha=self.alpha) elif self.distance == self.SIM_JACCARD: self._sim_matrix = sim.jaccard(self.matrix, k=self.k, shrink=self.shrink, threshold=self.threshold, binary=self.implicit) elif self.distance == self.SIM_DICE: self._sim_matrix = sim.dice(self.matrix, k=self.k, shrink=self.shrink, threshold=self.threshold, binary=self.implicit) elif self.distance == self.SIM_TVERSKY: self._sim_matrix = sim.tversky(self.matrix, k=self.k, shrink=self.shrink, threshold=self.threshold, binary=self.implicit, alpha=self.alpha, beta=self.beta) elif 
self.distance == self.SIM_P3ALPHA: self._sim_matrix = sim.p3alpha(self.matrix, k=self.k, shrink=self.shrink, threshold=self.threshold, binary=self.implicit, alpha=self.alpha) elif self.distance == self.SIM_RP3BETA: self._sim_matrix = sim.rp3beta(self.matrix, k=self.k, shrink=self.shrink, threshold=self.threshold, binary=self.implicit, alpha=self.alpha, beta=self.beta) elif self.distance == self.SIM_SPLUS: self._sim_matrix = prep.normalize(sim.s_plus( self.matrix, k=self.k, shrink=self.shrink, threshold=self.threshold, binary=self.implicit, l=self.l, t1=self.alpha, t2=self.beta, c=self.c), norm='l2', axis=0) else: log.error('Invalid distance metric: {}'.format(self.distance)) return self._sim_matrix