def get_S_incremental_and_set_W(self):

    self.S_incremental = self.cythonEpoch.get_S()

    if self.train_with_sparse_weights:
        self.W_sparse = self.S_incremental
        self.W_sparse = check_matrix(self.W_sparse, format='csr')
    else:
        self.W_sparse = similarityMatrixTopK(self.S_incremental, k=self.topK)
        self.W_sparse = check_matrix(self.W_sparse, format='csr')
def fit(self, topK=100, alpha=0.5):

    self.topK = topK
    self.alpha = alpha

    W_sparse = self.Similarity_1 * self.alpha + self.Similarity_2 * (1 - self.alpha)

    self.W_sparse = similarityMatrixTopK(W_sparse, k=self.topK)
    self.W_sparse = check_matrix(self.W_sparse, format='csr')
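# Minimal standalone sketch (toy data, not part of the original class) of what the
# hybrid fit() above computes: a convex combination of two precomputed similarity
# matrices. The repository prunes the result with similarityMatrixTopK; here a plain
# numpy/scipy per-row top-k is used as a simplified stand-in so the snippet stays
# self-contained. Matrix sizes, density and alpha are illustrative assumptions.
import numpy as np
import scipy.sparse as sps

Similarity_1 = sps.random(50, 50, density=0.1, format='csr', random_state=0)
Similarity_2 = sps.random(50, 50, density=0.1, format='csr', random_state=1)
alpha = 0.7

# Convex combination, as in fit()
W_blend = (Similarity_1 * alpha + Similarity_2 * (1 - alpha)).tocsr()

# Keep only the k largest entries of each row (simplified stand-in for similarityMatrixTopK)
k = 10
for row in range(W_blend.shape[0]):
    start, end = W_blend.indptr[row], W_blend.indptr[row + 1]
    row_values = W_blend.data[start:end]
    if row_values.size > k:
        # Zero out everything below the k-th largest value of this row
        smallest_positions = np.argsort(row_values)[:-k]
        row_values[smallest_positions] = 0.0
W_blend.eliminate_zeros()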
def fit(self, W_sparse, selectTopK=False, topK=100):

    assert W_sparse.shape[0] == W_sparse.shape[1], \
        "ItemKNNCustomSimilarityRecommender: W_sparse matrix is not square. Current shape is {}".format(W_sparse.shape)

    assert self.URM_train.shape[1] == W_sparse.shape[0], \
        "ItemKNNCustomSimilarityRecommender: URM_train and W_sparse matrices are not consistent. " \
        "The number of columns in URM_train must be equal to the rows in W_sparse. " \
        "Current shapes are: URM_train {}, W_sparse {}".format(self.URM_train.shape, W_sparse.shape)

    if selectTopK:
        W_sparse = similarityMatrixTopK(W_sparse, k=topK)

    self.W_sparse = check_matrix(W_sparse, format='csr')
def fit(self, topK=50, shrink=100, similarity='cosine', normalize=True,
        feature_weighting="none", **similarity_args):

    self.topK = topK
    self.topComputeK = topK + len(self.cold_users)
    self.shrink = shrink

    if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
        raise ValueError("Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
                         .format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))

    if feature_weighting == "BM25":
        self.UCM_train = self.UCM_train.astype(np.float32)
        self.UCM_train = okapi_BM_25(self.UCM_train)

    elif feature_weighting == "TF-IDF":
        self.UCM_train = self.UCM_train.astype(np.float32)
        self.UCM_train = TF_IDF(self.UCM_train)

    similarity = Compute_Similarity(self.UCM_train.T, shrink=shrink, topK=self.topComputeK,
                                    normalize=normalize, similarity=similarity, **similarity_args)

    self.W_sparse = similarity.compute_similarity()
    self.W_sparse = self.W_sparse.tocsc()

    # In CSC format, data[indptr[j]:indptr[j+1]] addresses column j:
    # zero out each cold user's column so it never contributes to the scores
    for user in self.cold_users:
        self.W_sparse.data[self.W_sparse.indptr[user]:self.W_sparse.indptr[user + 1]] = 0

    self.W_sparse.eliminate_zeros()

    self.W_sparse = self.W_sparse.tocsr()
    self.W_sparse = similarityMatrixTopK(self.W_sparse, k=self.topK).tocsr()
    self.W_sparse = check_matrix(self.W_sparse, format='csr')

    # Add identity matrix to the recommender
    self.recommender.W_sparse = self.recommender.W_sparse + sps.identity(self.recommender.W_sparse.shape[0],
                                                                         format="csr")
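# Small sketch (toy data, an assumption for illustration) of why the cold-user loop
# above converts W_sparse to CSC first: in CSC format, data[indptr[j]:indptr[j+1]]
# addresses column j, so the assignment zeroes that user's whole column in place.
import numpy as np
import scipy.sparse as sps

W = sps.csc_matrix(np.arange(1, 10, dtype=np.float32).reshape(3, 3))
cold_user = 1

# Zero out column `cold_user` in place, then drop the explicit zeros
W.data[W.indptr[cold_user]:W.indptr[cold_user + 1]] = 0
W.eliminate_zeros()

print(W.toarray())
# [[1. 0. 3.]
#  [4. 0. 6.]
#  [7. 0. 9.]]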
def fit(self, topK=100, alpha1=1, alpha2=1, alpha3=1, alpha4=1):

    self.topK = topK

    alpha_list = [alpha1, alpha2, alpha3, alpha4]

    if len(alpha_list) != len(self.W_sparse_list):
        raise RuntimeError("Number of weights does not match the number of similarity matrices: "
                           "{} expected, {} found".format(len(self.W_sparse_list), len(alpha_list)))

    # Weighted sum of the similarity matrices
    self.W_sparse = alpha_list[0] * self.W_sparse_list[0]
    for i in range(1, len(alpha_list)):
        self.W_sparse += alpha_list[i] * self.W_sparse_list[i]

    self.W_sparse = similarityMatrixTopK(self.W_sparse, k=self.topK).tocsr()
def fit(self, topK=100, alpha=1., min_rating=0, implicit=False, normalize_similarity=False):

    self.topK = topK
    self.alpha = alpha
    self.min_rating = min_rating
    self.implicit = implicit
    self.normalize_similarity = normalize_similarity

    # if X.dtype != np.float32:
    #     print("P3ALPHA fit: For memory usage reasons, we suggest to use np.float32 as dtype for the dataset")

    if self.min_rating > 0:
        self.URM_train.data[self.URM_train.data < self.min_rating] = 0
        self.URM_train.eliminate_zeros()

    if self.implicit:
        self.URM_train.data = np.ones(self.URM_train.data.size, dtype=np.float32)

    # Build the adjacency matrix of the joint user-item graph:
    # [[user_W_sparse, URM_train], [URM_train.T, item_W_sparse]]
    self.Pui = sps.hstack([self.user_W_sparse, self.URM_train], format="csr")
    self.Piu = sps.hstack([self.URM_train.T, self.item_W_sparse], format="csr")
    self.P = sps.vstack([self.Pui, self.Piu], format="csr")

    # Pui is the row-normalized adjacency matrix
    Pui = normalize(self.P.copy(), norm='l1', axis=1)

    # Piu is the column-normalized, "boolean" adjacency matrix
    X_bool = self.P.copy()
    X_bool.data = np.ones(X_bool.data.size, np.float32)
    Piu = normalize(X_bool, norm='l1', axis=0)
    del X_bool

    # Alpha power
    if self.alpha != 1.:
        Pui = Pui.power(self.alpha)
        Piu = Piu.power(self.alpha)

    # Final matrix is computed as Pui * Piu * Pui
    # Multiplication unpacked for memory usage reasons
    block_dim = 200
    d_t = Piu

    # Use arrays as they reduce memory requirements compared to lists
    dataBlock = 10000000

    rows = np.zeros(dataBlock, dtype=np.int32)
    cols = np.zeros(dataBlock, dtype=np.int32)
    values = np.zeros(dataBlock, dtype=np.float32)

    numCells = 0

    start_time = time.time()
    start_time_printBatch = start_time

    for current_block_start_row in range(0, Pui.shape[1], block_dim):

        if current_block_start_row + block_dim > Pui.shape[1]:
            block_dim = Pui.shape[1] - current_block_start_row

        similarity_block = d_t[current_block_start_row:current_block_start_row + block_dim, :] * Pui
        similarity_block = similarity_block.toarray()

        for row_in_block in range(block_dim):
            row_data = similarity_block[row_in_block, :]
            row_data[current_block_start_row + row_in_block] = 0

            best = row_data.argsort()[::-1][:self.topK]

            notZerosMask = row_data[best] != 0.0

            values_to_add = row_data[best][notZerosMask]
            cols_to_add = best[notZerosMask]

            for index in range(len(values_to_add)):

                if numCells == len(rows):
                    rows = np.concatenate((rows, np.zeros(dataBlock, dtype=np.int32)))
                    cols = np.concatenate((cols, np.zeros(dataBlock, dtype=np.int32)))
                    values = np.concatenate((values, np.zeros(dataBlock, dtype=np.float32)))

                rows[numCells] = current_block_start_row + row_in_block
                cols[numCells] = cols_to_add[index]
                values[numCells] = values_to_add[index]

                numCells += 1

        if time.time() - start_time_printBatch > 60:
            self._print("Processed {} ( {:.2f}% ) in {:.2f} minutes. Rows per second: {:.0f}".format(
                current_block_start_row,
                100.0 * float(current_block_start_row) / Pui.shape[1],
                (time.time() - start_time) / 60,
                float(current_block_start_row) / (time.time() - start_time)))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_printBatch = time.time()

    # Keep only the cells that were actually filled
    rows = rows[:numCells]
    cols = cols[:numCells]
    values = values[:numCells]

    self.W_sparse = sps.csr_matrix((values, (rows, cols)), shape=self.P.shape)

    if self.normalize_similarity:
        self.W_sparse = normalize(self.W_sparse, norm='l1', axis=1)

    if self.topK != False:
        self.W_sparse = similarityMatrixTopK(self.W_sparse, k=self.topK)

    self.W_sparse = check_matrix(self.W_sparse, format='csr')
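# Small sketch (toy adjacency matrix, an assumption for illustration) of the
# random-walk product that the blocked loop above accumulates one group of rows at a
# time (each block of d_t = Piu multiplied by Pui): row-normalize the adjacency matrix,
# column-normalize its "boolean" version, raise both to the power alpha, then take the
# product, zeroing the diagonal as the loop does for each row.
import numpy as np
import scipy.sparse as sps
from sklearn.preprocessing import normalize

P = sps.random(6, 6, density=0.4, format='csr', random_state=0)
alpha = 0.8

Pui = normalize(P, norm='l1', axis=1)        # row-normalized transition probabilities

P_bool = P.copy()
P_bool.data = np.ones_like(P_bool.data)
Piu = normalize(P_bool, norm='l1', axis=0)   # column-normalized "boolean" transitions

Pui = Pui.power(alpha)
Piu = Piu.power(alpha)

# Dense equivalent of the block-wise accumulation (feasible only for toy sizes)
W_dense = (Piu * Pui).toarray()
np.fill_diagonal(W_dense, 0.0)               # self-similarity is removed in the loop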
def _reconcile_model(self, num_models):

    self.W_sparse = self.W_sparse / num_models

    if self.topK >= 0:
        self.W_sparse = similarityMatrixTopK(self.W_sparse, k=self.topK).tocsr()
def fit(self, alpha=1., beta=0.6, min_rating=0, topK=100, implicit=False, normalize_similarity=True):

    self.alpha = alpha
    self.beta = beta
    self.min_rating = min_rating
    self.topK = topK
    self.implicit = implicit
    self.normalize_similarity = normalize_similarity

    # if X.dtype != np.float32:
    #     print("RP3beta fit: For memory usage reasons, we suggest to use np.float32 as dtype for the dataset")

    if self.min_rating > 0:
        self.URM_train.data[self.URM_train.data < self.min_rating] = 0
        self.URM_train.eliminate_zeros()

    if self.implicit:
        self.URM_train.data = np.ones(self.URM_train.data.size, dtype=np.float32)

    # Pui is the row-normalized URM, extended with the TF-IDF weighted UCM
    Pui_raw = sps.hstack([self.URM_train, self.UCM_train], format="csr")
    Pui_raw = TF_IDF(Pui_raw).tocsr()
    Pui = normalize(Pui_raw, norm='l1', axis=1)

    # Piu is the column-normalized, "boolean" URM transposed
    # X_bool = self.URM_train.transpose(copy=True)
    X_bool = Pui_raw.transpose(copy=True)
    X_bool.data = np.ones(X_bool.data.size, np.float32)

    # Taking the degree of each item to penalize top popular items
    # Some rows might be zero, make sure their degree remains zero
    X_bool_sum = np.array(X_bool.sum(axis=1)).ravel()

    degree = np.zeros(Pui_raw.shape[1])

    nonZeroMask = X_bool_sum != 0.0
    degree[nonZeroMask] = np.power(X_bool_sum[nonZeroMask], -self.beta)

    # ATTENTION: axis is still 1 because the matrix was transposed before the normalization
    Piu = normalize(X_bool, norm='l1', axis=1)
    del X_bool

    # Alpha power
    if self.alpha != 1.:
        Pui = Pui.power(self.alpha)
        Piu = Piu.power(self.alpha)

    # Final matrix is computed as Pui * Piu * Pui
    # Multiplication unpacked for memory usage reasons
    block_dim = 200
    d_t = Piu

    # Use arrays as they reduce memory requirements compared to lists
    dataBlock = 10000000

    rows = np.zeros(dataBlock, dtype=np.int32)
    cols = np.zeros(dataBlock, dtype=np.int32)
    values = np.zeros(dataBlock, dtype=np.float32)

    numCells = 0

    start_time = time.time()
    start_time_printBatch = start_time

    for current_block_start_row in range(0, Pui.shape[1], block_dim):

        if current_block_start_row + block_dim > Pui.shape[1]:
            block_dim = Pui.shape[1] - current_block_start_row

        similarity_block = d_t[current_block_start_row:current_block_start_row + block_dim, :] * Pui
        similarity_block = similarity_block.toarray()

        for row_in_block in range(block_dim):
            # Apply the degree-based popularity penalization to the row
            row_data = np.multiply(similarity_block[row_in_block, :], degree)
            row_data[current_block_start_row + row_in_block] = 0

            best = row_data.argsort()[::-1][:self.topK]

            notZerosMask = row_data[best] != 0.0

            values_to_add = row_data[best][notZerosMask]
            cols_to_add = best[notZerosMask]

            for index in range(len(values_to_add)):

                if numCells == len(rows):
                    rows = np.concatenate((rows, np.zeros(dataBlock, dtype=np.int32)))
                    cols = np.concatenate((cols, np.zeros(dataBlock, dtype=np.int32)))
                    values = np.concatenate((values, np.zeros(dataBlock, dtype=np.float32)))

                rows[numCells] = current_block_start_row + row_in_block
                cols[numCells] = cols_to_add[index]
                values[numCells] = values_to_add[index]

                numCells += 1

        if time.time() - start_time_printBatch > 60:
            self._print("Processed {} ( {:.2f}% ) in {:.2f} minutes. Rows per second: {:.0f}".format(
                current_block_start_row,
                100.0 * float(current_block_start_row) / Pui.shape[1],
                (time.time() - start_time) / 60,
                float(current_block_start_row) / (time.time() - start_time)))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_printBatch = time.time()

    self.W_sparse = sps.csr_matrix((values[:numCells], (rows[:numCells], cols[:numCells])),
                                   shape=(Pui.shape[1], Pui.shape[1]))

    if self.normalize_similarity:
        self.W_sparse = normalize(self.W_sparse, norm='l1', axis=1)

    if self.topK != False:
        self.W_sparse = similarityMatrixTopK(self.W_sparse, k=self.topK)

    self.W_sparse = check_matrix(self.W_sparse, format='csr')
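# Small sketch (toy numbers, an assumption for illustration) of the popularity
# penalization used in the fit() above: each column j of the random-walk similarity
# is damped by degree[j] = popularity_j ** (-beta), so very popular items are pushed
# down the ranking while zero-degree items stay at zero.
import numpy as np

beta = 0.6
item_popularity = np.array([100.0, 10.0, 1.0, 0.0])   # interactions per item

degree = np.zeros_like(item_popularity)
nonzero = item_popularity != 0.0
degree[nonzero] = np.power(item_popularity[nonzero], -beta)

similarity_row = np.array([0.5, 0.5, 0.5, 0.5])
penalized_row = np.multiply(similarity_row, degree)
# The most popular item gets the smallest penalized score; the zero-degree item stays at 0.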