def fit(self, topK=50, shrink=100, similarity='cosine', normalize=True, feature_weighting="none", **similarity_args):

    self.topK = topK
    self.shrink = shrink

    if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
        raise ValueError("Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'".format(
            self.FEATURE_WEIGHTING_VALUES, feature_weighting))

    # Feature weighting is applied to the transposed URM, then transposed back
    if feature_weighting == "BM25":
        self.URM_train = self.URM_train.astype(np.float32)
        self.URM_train = okapi_BM_25(self.URM_train.T).T
        self.URM_train = check_matrix(self.URM_train, 'csr')

    elif feature_weighting == "TF-IDF":
        self.URM_train = self.URM_train.astype(np.float32)
        self.URM_train = TF_IDF(self.URM_train.T).T
        self.URM_train = check_matrix(self.URM_train, 'csr')

    similarity = Compute_Similarity(self.URM_train, shrink=shrink, topK=topK, normalize=normalize,
                                    similarity=similarity, **similarity_args)

    self.W_sparse = similarity.compute_similarity()
    self.W_sparse = check_matrix(self.W_sparse, format='csr')
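# Usage sketch (hypothetical names): assuming the fit above belongs to an
# item-based KNN recommender class, e.g. an ItemKNNCFRecommender built on a
# sparse URM_train, a typical call could be:
#
#   recommender = ItemKNNCFRecommender(URM_train)
#   recommender.fit(topK=50, shrink=100, similarity='cosine', feature_weighting="TF-IDF")
#   # recommender.W_sparse now holds the item-item similarity in CSR format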
def __init__(self, URM_train, ICM, S_matrix_target):
    super(CFW_D_Similarity_Linalg, self).__init__(URM_train)

    if URM_train.shape[1] != ICM.shape[0]:
        raise ValueError("Number of items not consistent. URM contains {} but ICM contains {}".format(
            URM_train.shape[1], ICM.shape[0]))

    if S_matrix_target.shape[0] != S_matrix_target.shape[1]:
        raise ValueError("Item similarity matrix is not square: rows are {}, columns are {}".format(
            S_matrix_target.shape[0], S_matrix_target.shape[1]))

    if S_matrix_target.shape[0] != ICM.shape[0]:
        raise ValueError("Number of items not consistent. S_matrix contains {} but ICM contains {}".format(
            S_matrix_target.shape[0], ICM.shape[0]))

    self.S_matrix_target = check_matrix(S_matrix_target, 'csr')
    self.ICM = check_matrix(ICM, 'csr')

    self.n_items = self.URM_train.shape[1]
    self.n_users = self.URM_train.shape[0]
    self.n_features = self.ICM.shape[1]

    self.sparse_weights = True
def __init__(self, URM_train, Similarity_1, Similarity_2, verbose=True):
    super(ItemKNNSimilarityHybridRecommender, self).__init__(URM_train, verbose=verbose)

    if Similarity_1.shape != Similarity_2.shape:
        raise ValueError("ItemKNNSimilarityHybridRecommender: similarities have different size, S1 is {}, S2 is {}".format(
            Similarity_1.shape, Similarity_2.shape))

    # CSR is faster during evaluation
    self.Similarity_1 = check_matrix(Similarity_1.copy(), 'csr')
    self.Similarity_2 = check_matrix(Similarity_2.copy(), 'csr')
def __init__(self, URM_train, verbose=True):
    super(BaseRecommender, self).__init__()

    self.URM_train = check_matrix(URM_train.copy(), 'csr', dtype=np.float32)
    self.URM_train.eliminate_zeros()

    self.n_users, self.n_items = self.URM_train.shape
    self.verbose = verbose

    self.filterTopPop = False
    self.filterTopPop_ItemsID = np.array([], dtype=int)

    self.items_to_ignore_flag = False
    self.items_to_ignore_ID = np.array([], dtype=int)

    # Users (and items, on the CSC indptr) with no interactions have
    # consecutive equal indptr values, hence a zero difference
    self._cold_user_mask = np.ediff1d(self.URM_train.indptr) == 0

    if self._cold_user_mask.any():
        self._print("URM Detected {} ({:.2f} %) cold users.".format(
            self._cold_user_mask.sum(), self._cold_user_mask.sum() / self.n_users * 100))

    self._cold_item_mask = np.ediff1d(self.URM_train.tocsc().indptr) == 0

    if self._cold_item_mask.any():
        self._print("URM Detected {} ({:.2f} %) cold items.".format(
            self._cold_item_mask.sum(), self._cold_item_mask.sum() / self.n_items * 100))
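# Note on the cold-user detection above: in a CSR matrix, indptr[u] and
# indptr[u + 1] delimit the stored entries of row u, so np.ediff1d(indptr)
# gives the interaction count per user and a zero marks a cold user.
# A minimal self-contained check (hypothetical toy data):
#
#   import numpy as np
#   import scipy.sparse as sps
#
#   urm = sps.csr_matrix(np.array([[1, 0], [0, 0]]))  # user 1 has no interactions
#   assert (np.ediff1d(urm.indptr) == 0).tolist() == [False, True]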
def fit(self, topK=100, alpha=0.5):
    self.topK = topK
    self.alpha = alpha

    self.W_sparse = self.Similarity_1 * self.alpha + self.Similarity_2 * (1 - self.alpha)

    self.W_sparse = similarityMatrixTopK(self.W_sparse, k=self.topK)
    self.W_sparse = check_matrix(self.W_sparse, format='csr')
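# The fit above computes the convex combination
# W = alpha * Similarity_1 + (1 - alpha) * Similarity_2,
# re-sparsified to the topK entries per item. Usage sketch (hypothetical
# variable names), assuming S_itemknn and S_slim are item-item similarity
# matrices of identical shape:
#
#   hybrid = ItemKNNSimilarityHybridRecommender(URM_train, S_itemknn, S_slim)
#   hybrid.fit(topK=100, alpha=0.7)  # 70% weight on S_itemknn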
def _remove_seen_on_scores(self, user_id, scores):

    assert self.URM_train.getformat() == "csr", \
        "Recommender_Base_Class: URM_train is not CSR, this will cause errors in filtering seen items"

    # In CSR format, indptr delimits the items the user has already interacted with
    seen = self.URM_train.indices[self.URM_train.indptr[user_id]:self.URM_train.indptr[user_id + 1]]

    scores[seen] = -np.inf
    return scores
def fit(self, l1_ratio=0.1, positive_only=True, topK=100, workers=multiprocessing.cpu_count()):

    assert l1_ratio >= 0 and l1_ratio <= 1, "SLIM_ElasticNet: l1_ratio must be between 0 and 1, provided value was {}".format(l1_ratio)

    self.l1_ratio = l1_ratio
    self.positive_only = positive_only
    self.topK = topK
    self.workers = workers

    self.URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)
    n_items = self.URM_train.shape[1]

    # fit the items' factors in parallel

    # partial object binding the fixed part of the input to the function
    _pfit = partial(self._partial_fit, X=self.URM_train, topK=self.topK)

    # create a pool with the given number of worker processes
    pool = Pool(processes=self.workers)

    # run the pool, passing the function (with the fixed part of its input)
    # and the remaining, variable parameter
    res = pool.map(_pfit, np.arange(n_items))

    # res contains a vector of (values, rows, cols) tuples
    values, rows, cols = [], [], []
    for values_, rows_, cols_ in res:
        values.extend(values_)
        rows.extend(rows_)
        cols.extend(cols_)

    # generate the sparse weight matrix
    self.W_sparse = sps.csr_matrix((values, (rows, cols)), shape=(n_items, n_items), dtype=np.float32)
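# A minimal sketch of the partial/Pool pattern used above, with a toy function
# standing in for self._partial_fit (hypothetical example, not part of this class):
#
#   from functools import partial
#   from multiprocessing import Pool
#
#   def work(item, X):                    # plays the role of _partial_fit(item, X, topK)
#       return X[item] * 2
#
#   bound = partial(work, X=[1, 2, 3])    # pre-bind the shared, fixed argument
#   with Pool(processes=2) as pool:
#       results = pool.map(bound, range(3))   # only the item index varies -> [2, 4, 6]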
def fit(self, topK=100, alpha=1., min_rating=0, implicit=False, normalize_similarity=False):

    self.topK = topK
    self.alpha = alpha
    self.min_rating = min_rating
    self.implicit = implicit
    self.normalize_similarity = normalize_similarity

    # if X.dtype != np.float32:
    #     print("P3ALPHA fit: For memory usage reasons, we suggest to use np.float32 as dtype for the dataset")

    if self.min_rating > 0:
        self.URM_train.data[self.URM_train.data < self.min_rating] = 0
        self.URM_train.eliminate_zeros()
        if self.implicit:
            self.URM_train.data = np.ones(self.URM_train.data.size, dtype=np.float32)

    # Pui is the row-normalized urm
    Pui = normalize(self.URM_train, norm='l1', axis=1)

    # Piu is the column-normalized, "boolean" urm transposed
    X_bool = self.URM_train.transpose(copy=True)
    X_bool.data = np.ones(X_bool.data.size, np.float32)

    # ATTENTION: axis is still 1 because the matrix was transposed before normalization
    Piu = normalize(X_bool, norm='l1', axis=1)
    del X_bool

    # Alpha power
    if self.alpha != 1.:
        Pui = Pui.power(self.alpha)
        Piu = Piu.power(self.alpha)

    # Final matrix is computed as Pui * Piu * Pui
    # Multiplication unpacked for memory usage reasons
    block_dim = 200
    d_t = Piu

    # Use arrays as they reduce memory requirements compared to lists
    dataBlock = 10000000

    rows = np.zeros(dataBlock, dtype=np.int32)
    cols = np.zeros(dataBlock, dtype=np.int32)
    values = np.zeros(dataBlock, dtype=np.float32)

    numCells = 0

    start_time = time.time()
    start_time_printBatch = start_time

    for current_block_start_row in range(0, Pui.shape[1], block_dim):

        if current_block_start_row + block_dim > Pui.shape[1]:
            block_dim = Pui.shape[1] - current_block_start_row

        similarity_block = d_t[current_block_start_row:current_block_start_row + block_dim, :] * Pui
        similarity_block = similarity_block.toarray()

        for row_in_block in range(block_dim):
            row_data = similarity_block[row_in_block, :]
            row_data[current_block_start_row + row_in_block] = 0

            best = row_data.argsort()[::-1][:self.topK]

            notZerosMask = row_data[best] != 0.0

            values_to_add = row_data[best][notZerosMask]
            cols_to_add = best[notZerosMask]

            for index in range(len(values_to_add)):

                if numCells == len(rows):
                    rows = np.concatenate((rows, np.zeros(dataBlock, dtype=np.int32)))
                    cols = np.concatenate((cols, np.zeros(dataBlock, dtype=np.int32)))
                    values = np.concatenate((values, np.zeros(dataBlock, dtype=np.float32)))

                rows[numCells] = current_block_start_row + row_in_block
                cols[numCells] = cols_to_add[index]
                values[numCells] = values_to_add[index]

                numCells += 1

        if time.time() - start_time_printBatch > 60:
            self._print("Processed {} ( {:.2f}% ) in {:.2f} minutes. Rows per second: {:.0f}".format(
                current_block_start_row,
                100.0 * float(current_block_start_row) / Pui.shape[1],
                (time.time() - start_time) / 60,
                float(current_block_start_row) / (time.time() - start_time)))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_printBatch = time.time()

    self.W_sparse = sps.csr_matrix((values[:numCells], (rows[:numCells], cols[:numCells])),
                                   shape=(Pui.shape[1], Pui.shape[1]))

    if self.normalize_similarity:
        self.W_sparse = normalize(self.W_sparse, norm='l1', axis=1)

    if self.topK != False:
        self.W_sparse = similarityMatrixTopK(self.W_sparse, k=self.topK)

    self.W_sparse = check_matrix(self.W_sparse, format='csr')
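# Interpretation of the computation above: Pui and Piu are the transition
# probabilities of the random walk user -> item -> user -> item, so the blocks
# assembled into W_sparse are rows of the item-item matrix
#
#   W = Piu.power(alpha) * Pui.power(alpha)
#
# (the item-item part of Pui * Piu * Pui), with the power applied elementwise.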
def _generateTrainData_low_ram(self):

    print(self.RECOMMENDER_NAME + ": Generating train data")

    start_time_batch = time.time()

    # Here only the structure of the similarity is important
    self.similarity = Compute_Similarity(self.ICM.T, shrink=0, topK=self.topK, normalize=False)
    S_matrix_contentKNN = self.similarity.compute_similarity()
    S_matrix_contentKNN = check_matrix(S_matrix_contentKNN, "csr")

    self._writeLog(self.RECOMMENDER_NAME + ": Collaborative S density: {:.2E}, nonzero cells {}".format(
        self.S_matrix_target.nnz / self.S_matrix_target.shape[0] ** 2, self.S_matrix_target.nnz))

    self._writeLog(self.RECOMMENDER_NAME + ": Content S density: {:.2E}, nonzero cells {}".format(
        S_matrix_contentKNN.nnz / S_matrix_contentKNN.shape[0] ** 2, S_matrix_contentKNN.nnz))

    if self.normalize_similarity:
        # Compute the square root of the sum of squared features
        sum_of_squared_features = np.array(self.ICM.T.power(2).sum(axis=0)).ravel()
        sum_of_squared_features = np.sqrt(sum_of_squared_features)

    num_common_coordinates = 0

    estimated_n_samples = int(S_matrix_contentKNN.nnz * (1 + self.add_zeros_quota) * 1.2)

    self.row_list = np.zeros(estimated_n_samples, dtype=np.int32)
    self.col_list = np.zeros(estimated_n_samples, dtype=np.int32)
    self.data_list = np.zeros(estimated_n_samples, dtype=np.float64)

    num_samples = 0

    for row_index in range(self.n_items):

        start_pos_content = S_matrix_contentKNN.indptr[row_index]
        end_pos_content = S_matrix_contentKNN.indptr[row_index + 1]

        content_coordinates = S_matrix_contentKNN.indices[start_pos_content:end_pos_content]

        start_pos_target = self.S_matrix_target.indptr[row_index]
        end_pos_target = self.S_matrix_target.indptr[row_index + 1]

        target_coordinates = self.S_matrix_target.indices[start_pos_target:end_pos_target]

        # Check whether the content coordinate is associated to a non-zero target value
        # If true, the content coordinate has a collaborative non-zero value
        # If false, the content coordinate has a collaborative zero value
        is_common = np.in1d(content_coordinates, target_coordinates)

        num_common_in_current_row = is_common.sum()
        num_common_coordinates += num_common_in_current_row

        for index in range(len(is_common)):

            if num_samples == estimated_n_samples:
                dataBlock = 1000000
                self.row_list = np.concatenate((self.row_list, np.zeros(dataBlock, dtype=np.int32)))
                self.col_list = np.concatenate((self.col_list, np.zeros(dataBlock, dtype=np.int32)))
                self.data_list = np.concatenate((self.data_list, np.zeros(dataBlock, dtype=np.float64)))

            if is_common[index]:
                # If the cell exists in the target matrix, add its value
                # Otherwise it will remain zero with a certain probability

                col_index = content_coordinates[index]

                self.row_list[num_samples] = row_index
                self.col_list[num_samples] = col_index

                new_data_value = self.S_matrix_target[row_index, col_index]

                if self.normalize_similarity:
                    new_data_value *= sum_of_squared_features[row_index] * sum_of_squared_features[col_index]

                self.data_list[num_samples] = new_data_value

                num_samples += 1

            elif np.random.rand() <= self.add_zeros_quota:
                col_index = content_coordinates[index]

                self.row_list[num_samples] = row_index
                self.col_list[num_samples] = col_index
                self.data_list[num_samples] = 0.0

                num_samples += 1

        if time.time() - start_time_batch > 30 or num_samples == S_matrix_contentKNN.nnz * (1 + self.add_zeros_quota):

            print(self.RECOMMENDER_NAME + ": Generating train data. Sample {} ( {:.2f} %)".format(
                num_samples, num_samples / (S_matrix_contentKNN.nnz * (1 + self.add_zeros_quota)) * 100))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_batch = time.time()

    self._writeLog(self.RECOMMENDER_NAME + ": Content S structure has {} out of {} ( {:.2f}%) nonzero collaborative cells".format(
        num_common_coordinates, S_matrix_contentKNN.nnz, num_common_coordinates / S_matrix_contentKNN.nnz * 100))

    # Discard the unused cells at the end of the arrays
    self.row_list = self.row_list[:num_samples]
    self.col_list = self.col_list[:num_samples]
    self.data_list = self.data_list[:num_samples]

    data_nnz = sum(np.array(self.data_list) != 0)
    data_sum = sum(self.data_list)

    collaborative_nnz = self.S_matrix_target.nnz
    collaborative_sum = sum(self.S_matrix_target.data)

    self._writeLog(self.RECOMMENDER_NAME + ": Nonzero collaborative cell sum is: {:.2E}, average is: {:.2E}, "
                   "average over all collaborative data is {:.2E}".format(
                       data_sum, data_sum / data_nnz, collaborative_sum / collaborative_nnz))
def fit(self, l1_ratio=0.1, alpha=1.0, positive_only=True, topK=100):

    # alpha controls the overall penalty strength, l1_ratio the L1/L2 split:
    # a * L1 + b * L2, where alpha = a + b and l1_ratio = a / (a + b)
    assert l1_ratio >= 0 and l1_ratio <= 1, "{}: l1_ratio must be between 0 and 1, provided value was {}".format(
        self.RECOMMENDER_NAME, l1_ratio)

    self.l1_ratio = l1_ratio
    self.positive_only = positive_only
    self.topK = topK

    # Display ConvergenceWarning only once and not for every item it occurs
    # warnings.simplefilter("once", category=ConvergenceWarning)

    # initialize the ElasticNet model
    self.model = ElasticNet(alpha=alpha,
                            l1_ratio=self.l1_ratio,
                            positive=self.positive_only,
                            fit_intercept=False,
                            copy_X=False,
                            precompute=True,
                            selection='random',
                            max_iter=100,
                            tol=1e-4)

    URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)

    n_items = URM_train.shape[1]

    # Use arrays as they reduce memory requirements compared to lists
    dataBlock = 10000000

    rows = np.zeros(dataBlock, dtype=np.int32)
    cols = np.zeros(dataBlock, dtype=np.int32)
    values = np.zeros(dataBlock, dtype=np.float32)

    numCells = 0

    start_time = time.time()
    start_time_printBatch = start_time

    # fit each item's factors sequentially (not in parallel)
    for currentItem in range(n_items):

        # get the target column
        y = URM_train[:, currentItem].toarray()

        # set the j-th column of X to zero
        start_pos = URM_train.indptr[currentItem]
        end_pos = URM_train.indptr[currentItem + 1]

        current_item_data_backup = URM_train.data[start_pos:end_pos].copy()
        URM_train.data[start_pos:end_pos] = 0.0

        # fit one ElasticNet model per column
        self.model.fit(URM_train, y)

        # self.model.coef_ contains the coefficients of the ElasticNet model
        # let's keep only the non-zero values

        # Select topK values
        # Sorting is done in three steps. Faster than plain np.argsort for a higher number of items
        # - Partition the data to extract the set of relevant items
        # - Sort only the relevant items
        # - Get the original item index
        nonzero_model_coef_index = self.model.sparse_coef_.indices
        nonzero_model_coef_value = self.model.sparse_coef_.data

        local_topK = min(len(nonzero_model_coef_value) - 1, self.topK)

        relevant_items_partition = (-nonzero_model_coef_value).argpartition(local_topK)[0:local_topK]
        relevant_items_partition_sorting = np.argsort(-nonzero_model_coef_value[relevant_items_partition])
        ranking = relevant_items_partition[relevant_items_partition_sorting]

        for index in range(len(ranking)):

            if numCells == len(rows):
                rows = np.concatenate((rows, np.zeros(dataBlock, dtype=np.int32)))
                cols = np.concatenate((cols, np.zeros(dataBlock, dtype=np.int32)))
                values = np.concatenate((values, np.zeros(dataBlock, dtype=np.float32)))

            rows[numCells] = nonzero_model_coef_index[ranking[index]]
            cols[numCells] = currentItem
            values[numCells] = nonzero_model_coef_value[ranking[index]]

            numCells += 1

        # finally, restore the original values of the j-th column
        URM_train.data[start_pos:end_pos] = current_item_data_backup

        elapsed_time = time.time() - start_time
        new_time_value, new_time_unit = seconds_to_biggest_unit(elapsed_time)

        if time.time() - start_time_printBatch > 300 or currentItem == n_items - 1:
            self._print("Processed {} ( {:.2f}% ) in {:.2f} {}. Items per second: {:.2f}".format(
                currentItem + 1,
                100.0 * float(currentItem + 1) / n_items,
                new_time_value,
                new_time_unit,
                float(currentItem) / elapsed_time))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_printBatch = time.time()

    # generate the sparse weight matrix
    self.W_sparse = sps.csr_matrix((values[:numCells], (rows[:numCells], cols[:numCells])),
                                   shape=(n_items, n_items),
                                   dtype=np.float32)
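# Usage sketch (hypothetical class name): assuming the fit above belongs to a
# SLIM ElasticNet recommender initialized with a sparse URM, e.g.:
#
#   recommender = SLIMElasticNetRecommender(URM_train)
#   recommender.fit(l1_ratio=0.1, alpha=1e-4, topK=100)
#   # predicted scores for a user are then URM_train[user_id] * recommender.W_sparse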
def __init__(self, URM_train, Recommender_1, Recommender_2):
    super(ItemKNNScoresHybridRecommender, self).__init__(URM_train)

    self.URM_train = check_matrix(URM_train.copy(), 'csr')
    self.Recommender_1 = Recommender_1
    self.Recommender_2 = Recommender_2
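# The class above only stores the two fitted recommenders; a minimal sketch of
# how their scores could be blended (hypothetical methods, not shown in this
# file), assuming both recommenders expose the framework's _compute_item_score:
#
#   def fit(self, alpha=0.5):
#       self.alpha = alpha
#
#   def _compute_item_score(self, user_id_array, items_to_compute=None):
#       scores_1 = self.Recommender_1._compute_item_score(user_id_array, items_to_compute)
#       scores_2 = self.Recommender_2._compute_item_score(user_id_array, items_to_compute)
#       return self.alpha * scores_1 + (1 - self.alpha) * scores_2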