def __init__(self, URM_train, ICM, S_matrix_target):
    super(HP3_Similarity_Cython, self).__init__(URM_train)

    if URM_train.shape[1] != ICM.shape[0]:
        raise ValueError("Number of items not consistent. URM contains {} but ICM contains {}".format(
            URM_train.shape[1], ICM.shape[0]))

    if S_matrix_target.shape[0] != S_matrix_target.shape[1]:
        raise ValueError("Item similarity matrix is not square: rows are {}, columns are {}".format(
            S_matrix_target.shape[0], S_matrix_target.shape[1]))

    if S_matrix_target.shape[0] != ICM.shape[0]:
        raise ValueError("Number of items not consistent. S_matrix contains {} but ICM contains {}".format(
            S_matrix_target.shape[0], ICM.shape[0]))

    self.S_matrix_target = check_matrix(S_matrix_target, 'csr')
    self.ICM = check_matrix(ICM, 'csr')

    self.n_features = self.ICM.shape[1]

    # Feature weights: current epoch and best found so far
    self.D_incremental = np.ones(self.n_features, dtype=np.float64)
    self.D_best = self.D_incremental.copy()
def __init__(self, URM_train, ICM, S_matrix_target):
    super(CFW_DVV_Similarity_Cython, self).__init__(URM_train)

    self.ICM = check_matrix(ICM, 'csr')
    self.n_features = self.ICM.shape[1]

    self.S_matrix_target = check_matrix(S_matrix_target, 'csr')
def fit(self, topK=None, l2_norm=1e3, normalize_matrix=False, verbose=True):
    self.verbose = verbose

    start_time = time.time()
    self._print("Fitting model... ")

    if normalize_matrix:
        # Normalize rows and then columns
        self.URM_train = normalize(self.URM_train, norm='l2', axis=1)
        self.URM_train = normalize(self.URM_train, norm='l2', axis=0)
        self.URM_train = sps.csr_matrix(self.URM_train)

    # Gram matrix is X^T X, compute dot product
    similarity = Compute_Similarity(self.URM_train, shrink=0, topK=self.URM_train.shape[1],
                                    normalize=False, similarity="cosine")
    gram_matrix = similarity.compute_similarity().toarray()

    diag_indices = np.diag_indices(gram_matrix.shape[0])

    # The Compute_Similarity object ensures the diagonal of the similarity matrix is zero.
    # In this case we need the diagonal as well, which is just the item popularity
    item_popularity = np.ediff1d(self.URM_train.tocsc().indptr)
    gram_matrix[diag_indices] = item_popularity + l2_norm

    P = np.linalg.inv(gram_matrix)

    B = P / (-np.diag(P))
    B[diag_indices] = 0.0

    new_time_value, new_time_unit = seconds_to_biggest_unit(time.time() - start_time)
    self._print("Fitting model... done in {:.2f} {}".format(new_time_value, new_time_unit))

    # Check if the matrix should be saved in a sparse or dense format.
    # The matrix is considered sparse, regardless of the topK, if the quota of
    # nonzero cells is below sparse_threshold_quota
    if topK is not None:
        B = similarityMatrixTopK(B, k=topK, verbose=False)

    if self._is_content_sparse_check(B):
        self._print("Detected model matrix to be sparse, changing format.")
        self.W_sparse = check_matrix(B, format='csr', dtype=np.float32)
    else:
        self.W_sparse = check_matrix(B, format='npy', dtype=np.float32)
        self._W_sparse_format_checked = True
        self._compute_item_score = self._compute_score_W_dense
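
# Illustrative sketch (not part of the class above): the closed-form step
# reproduced with plain NumPy on a toy URM. All names here (toy_urm, l2_norm)
# are hypothetical and chosen only for the example.
import numpy as np
import scipy.sparse as sps

toy_urm = sps.csr_matrix(np.array([[1, 0, 1],
                                   [1, 1, 0],
                                   [0, 1, 1]], dtype=np.float32))

l2_norm = 10.0
gram = (toy_urm.T @ toy_urm).toarray()   # item-item Gram matrix X^T X
diag_idx = np.diag_indices(gram.shape[0])
gram[diag_idx] += l2_norm                # ridge term on the diagonal

P = np.linalg.inv(gram)
B = P / (-np.diag(P))                    # B[i, j] = -P[i, j] / P[j, j]
B[diag_idx] = 0.0                        # items must not recommend themselves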
def get_S_incremental_and_set_W(self):
    self.S_incremental = self.cythonEpoch.get_S()

    if self.train_with_sparse_weights:
        self.W_sparse = self.S_incremental
        self.W_sparse = check_matrix(self.W_sparse, format='csr')
    else:
        self.W_sparse = similarityMatrixTopK(self.S_incremental, k=self.topK)
        self.W_sparse = check_matrix(self.W_sparse, format='csr')
def __init__(self, URM_train, Similarity_1, Similarity_2, verbose=True):
    super(ItemKNNSimilarityHybridRecommender, self).__init__(URM_train, verbose=verbose)

    if Similarity_1.shape != Similarity_2.shape:
        raise ValueError("ItemKNNSimilarityHybridRecommender: similarities have different size, S1 is {}, S2 is {}".format(
            Similarity_1.shape, Similarity_2.shape))

    # CSR is faster during evaluation
    self.Similarity_1 = check_matrix(Similarity_1.copy(), 'csr')
    self.Similarity_2 = check_matrix(Similarity_2.copy(), 'csr')
def fit(self, lambda_user=10, lambda_item=25):
    self.lambda_user = lambda_user
    self.lambda_item = lambda_item
    self.n_items = self.URM_train.shape[1]

    # convert to csc matrix for faster column-wise sum
    self.URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)

    # 1) global average
    self.mu = self.URM_train.data.sum(dtype=np.float32) / self.URM_train.data.shape[0]

    # 2) item average bias
    # compute the number of non-zero elements for each column
    col_nnz = np.diff(self.URM_train.indptr)

    # it is equivalent to:
    # col_nnz = X.indptr[1:] - X.indptr[:-1]
    # and it is **much faster** than
    # col_nnz = (X != 0).sum(axis=0)

    URM_train_unbiased = self.URM_train.copy()
    URM_train_unbiased.data -= self.mu
    self.item_bias = URM_train_unbiased.sum(axis=0) / (col_nnz + self.lambda_item)
    self.item_bias = np.asarray(self.item_bias).ravel()  # converts 2-d matrix to 1-d array without any copy

    # 3) user average bias
    # NOTE: the user bias is *useless* for the sake of ranking items. We just show it here for educational purposes.

    # first subtract the item biases from each column:
    # repeat each element of the item bias vector a number of times equal to col_nnz
    # and subtract it from the data vector
    URM_train_unbiased.data -= np.repeat(self.item_bias, col_nnz)

    # now convert the csc matrix to csr for efficient row-wise computation
    URM_train_unbiased_csr = URM_train_unbiased.tocsr()
    row_nnz = np.diff(URM_train_unbiased_csr.indptr)

    # finally, let's compute the bias
    self.user_bias = URM_train_unbiased_csr.sum(axis=1).ravel() / (row_nnz + self.lambda_user)

    # 4) precompute the item ranking by using the item bias only
    # the global average and user bias won't change the ranking, so there is no need to use them
    # self.item_ranking = np.argsort(self.bi)[::-1]

    self.URM_train = check_matrix(self.URM_train, 'csr', dtype=np.float32)
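
# Illustrative sketch (hypothetical toy data): the shrunk item-bias step above
# in isolation, using np.diff(indptr) to count ratings per column of a CSC matrix.
import numpy as np
import scipy.sparse as sps

toy_urm = sps.csc_matrix(np.array([[5, 0, 3],
                                   [4, 2, 0],
                                   [0, 1, 4]], dtype=np.float32))

lambda_item = 25
mu = toy_urm.data.sum() / toy_urm.data.shape[0]   # global average of observed ratings
col_nnz = np.diff(toy_urm.indptr)                 # number of ratings per item

unbiased = toy_urm.copy()
unbiased.data -= mu                               # remove the global average
item_bias = np.asarray(unbiased.sum(axis=0)).ravel() / (col_nnz + lambda_item)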
def applyAdjustedCosine(self):
    """
    Remove from every data point the average for the corresponding row
    :return:
    """
    self.dataMatrix = check_matrix(self.dataMatrix, 'csr')

    interactionsPerRow = np.diff(self.dataMatrix.indptr)
    nonzeroRows = interactionsPerRow > 0
    sumPerRow = np.asarray(self.dataMatrix.sum(axis=1)).ravel()

    rowAverage = np.zeros_like(sumPerRow)
    rowAverage[nonzeroRows] = sumPerRow[nonzeroRows] / interactionsPerRow[nonzeroRows]

    # Split in blocks to avoid duplicating the whole data structure
    start_row = 0
    end_row = 0
    blockSize = 1000

    while end_row < self.n_rows:
        end_row = min(self.n_rows, end_row + blockSize)

        self.dataMatrix.data[self.dataMatrix.indptr[start_row]:self.dataMatrix.indptr[end_row]] -= \
            np.repeat(rowAverage[start_row:end_row], interactionsPerRow[start_row:end_row])

        start_row += blockSize
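
# Illustrative sketch (toy data): the np.repeat trick used above subtracts each
# row's average from all of its stored values in one vectorized operation,
# because CSR stores row data contiguously.
import numpy as np
import scipy.sparse as sps

m = sps.csr_matrix(np.array([[2, 0, 4],
                             [0, 3, 0]], dtype=np.float64))

row_nnz = np.diff(m.indptr)
row_avg = np.zeros(m.shape[0])
nonzero_rows = row_nnz > 0
row_avg[nonzero_rows] = np.asarray(m.sum(axis=1)).ravel()[nonzero_rows] / row_nnz[nonzero_rows]

# Repeat each average once per stored element of its row, then subtract in place
m.data -= np.repeat(row_avg, row_nnz)   # row 0 becomes [-1, 1], row 1 becomes [0]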
def set_URM_train(self, URM_train_new, estimate_item_similarity_for_cold_users=False, **kwargs):
    """
    :param URM_train_new:
    :param estimate_item_similarity_for_cold_users: Set to TRUE if you want to estimate the USER_factors for cold users
    :param kwargs:
    :return:
    """
    assert self.URM_train.shape == URM_train_new.shape, \
        "{}: set_URM_train old and new URM train have different shapes".format(self.RECOMMENDER_NAME)

    if len(kwargs) > 0:
        self._print("set_URM_train keyword arguments not supported for this recommender class. Received: {}".format(kwargs))

    self.URM_train = check_matrix(URM_train_new.copy(), 'csr', dtype=np.float32)
    self.URM_train.eliminate_zeros()

    # No need to ever use a knn model
    self._cold_user_KNN_model_available = False
    self._cold_user_mask = np.ediff1d(self.URM_train.indptr) == 0

    if estimate_item_similarity_for_cold_users:
        self._print("Estimating USER_factors for cold users...")
        self.USER_factors = self._estimate_user_factors(self.ITEM_factors_Y_best)
        self._print("Estimating USER_factors for cold users... done!")
def applyPearsonCorrelation(self):
    """
    Remove from every data point the average for the corresponding column
    :return:
    """
    self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

    interactionsPerCol = np.diff(self.dataMatrix.indptr)
    nonzeroCols = interactionsPerCol > 0
    sumPerCol = np.asarray(self.dataMatrix.sum(axis=0)).ravel()

    colAverage = np.zeros_like(sumPerCol)
    colAverage[nonzeroCols] = sumPerCol[nonzeroCols] / interactionsPerCol[nonzeroCols]

    # Split in blocks to avoid duplicating the whole data structure
    start_col = 0
    end_col = 0
    blockSize = 1000

    while end_col < self.n_columns:
        end_col = min(self.n_columns, end_col + blockSize)

        self.dataMatrix.data[self.dataMatrix.indptr[start_col]:self.dataMatrix.indptr[end_col]] -= \
            np.repeat(colAverage[start_col:end_col], interactionsPerCol[start_col:end_col])

        start_col += blockSize
def compute_W_sparse(self, model_to_use="best"):
    if model_to_use == "last":
        feature_weights = self.D_incremental
    elif model_to_use == "best":
        feature_weights = self.D_best
    else:
        assert False, "{}: compute_W_sparse, 'model_to_use' parameter not recognized".format(self.RECOMMENDER_NAME)

    block_dim = 300

    # Weight the ICM columns by the learned feature weights
    d_t = self.ICM * sps.diags([feature_weights.squeeze()], [0])
    icm_t = self.ICM.astype(bool).T

    indptr, indices, data = [0], [], []

    # Compute the similarity in blocks of rows, keeping only the topK entries per row
    for r in range(0, self.n_items, block_dim):
        if r + block_dim > self.n_items:
            block_dim = self.n_items - r

        sim = d_t[r:r + block_dim, :] * icm_t

        for s in range(block_dim):
            row = sim[s].toarray().ravel()
            row[r + s] = 0  # zero out self-similarity

            best = row.argsort()[::-1][:self.topK]
            indices.extend(best)
            indptr.append(len(indices))
            data.extend(row[best].flatten().tolist())

    self.W_sparse = normalize(sps.csr_matrix((data, indices, indptr), shape=(self.n_items, self.n_items)),
                              norm="l1", axis=1)
    self.W_sparse = check_matrix(self.W_sparse, format='csr')
def __init__(self, URM_train, verbose=True):
    super(BaseRecommender, self).__init__()

    self.URM_train = check_matrix(URM_train.copy(), 'csr', dtype=np.float32)
    self.URM_train.eliminate_zeros()

    self.n_users, self.n_items = self.URM_train.shape
    self.verbose = verbose

    self.filterTopPop = False
    self.filterTopPop_ItemsID = np.array([], dtype=int)

    self.items_to_ignore_flag = False
    self.items_to_ignore_ID = np.array([], dtype=int)

    self._cold_user_mask = np.ediff1d(self.URM_train.indptr) == 0

    if self._cold_user_mask.any():
        self._print("URM Detected {} ({:4.1f}%) users with no interactions.".format(
            self._cold_user_mask.sum(), self._cold_user_mask.sum() / self.n_users * 100))

    self._cold_item_mask = np.ediff1d(self.URM_train.tocsc().indptr) == 0

    if self._cold_item_mask.any():
        self._print("URM Detected {} ({:4.1f}%) items with no interactions.".format(
            self._cold_item_mask.sum(), self._cold_item_mask.sum() / self.n_items * 100))
def fit(self, topK=50, shrink=100, similarity='cosine', normalize=True, feature_weighting="none", **similarity_args):
    self.topK = topK
    self.shrink = shrink

    if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
        raise ValueError("Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'".format(
            self.FEATURE_WEIGHTING_VALUES, feature_weighting))

    if feature_weighting == "BM25":
        self.UCM_train = self.UCM_train.astype(np.float32)
        self.UCM_train = okapi_BM_25(self.UCM_train)
    elif feature_weighting == "TF-IDF":
        self.UCM_train = self.UCM_train.astype(np.float32)
        self.UCM_train = TF_IDF(self.UCM_train)

    similarity = Compute_Similarity(self.UCM_train.T, shrink=shrink, topK=topK, normalize=normalize,
                                    similarity=similarity, **similarity_args)

    self.W_sparse = similarity.compute_similarity()
    self.W_sparse = check_matrix(self.W_sparse, format='csr')
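
# Illustrative sketch: okapi_BM_25 and TF_IDF are repo-internal helpers not shown
# here. The intent of the TF-IDF branch can be approximated with scikit-learn's
# TfidfTransformer (an assumption, not the repo's exact implementation).
import numpy as np
import scipy.sparse as sps
from sklearn.feature_extraction.text import TfidfTransformer

toy_ucm = sps.csr_matrix(np.array([[1, 0, 1, 1],
                                   [1, 1, 0, 0],
                                   [0, 1, 1, 0]], dtype=np.float32))

# Downweights features shared by many users, upweights rare ones
toy_ucm_weighted = TfidfTransformer().fit_transform(toy_ucm)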
def remove_empty_rows_and_cols(URM, ICM=None):
    URM = check_matrix(URM, "csr")
    rows = URM.indptr
    numRatings = np.ediff1d(rows)
    user_mask = numRatings >= 1

    URM = URM[user_mask, :]

    cols = URM.tocsc().indptr
    numRatings = np.ediff1d(cols)
    item_mask = numRatings >= 1

    URM = URM[:, item_mask]

    removedUsers = np.arange(0, len(user_mask))[np.logical_not(user_mask)]
    removedItems = np.arange(0, len(item_mask))[np.logical_not(item_mask)]

    if ICM is not None:
        ICM = ICM[item_mask, :]
        return URM.tocsr(), ICM.tocsr(), removedUsers, removedItems

    return URM.tocsr(), removedUsers, removedItems
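
# Illustrative sketch (toy data): the masking logic above in isolation. Empty
# rows/columns are found from indptr differences and sliced away.
import numpy as np
import scipy.sparse as sps

toy_urm = sps.csr_matrix(np.array([[1, 0, 0],
                                   [0, 0, 0],    # empty user, will be removed
                                   [0, 2, 0]], dtype=np.float32))

user_mask = np.ediff1d(toy_urm.indptr) >= 1           # users with at least one rating
toy_urm = toy_urm[user_mask, :]

item_mask = np.ediff1d(toy_urm.tocsc().indptr) >= 1   # items with at least one rating
toy_urm = toy_urm[:, item_mask]                       # the empty third column is dropped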
def __init__(self, URM_train, ICM):
    super(FBSM_Rating_Cython, self).__init__(URM_train)

    self.n_items_icm, self.n_features = ICM.shape
    self.ICM = check_matrix(ICM, 'csr')
def _add_zeros_in_train_data_row_wise(self):
    """
    This function uses a set of tuples to ensure the zero elements to be added are not already existent
    :return:
    """
    if self.verbose:
        print(self.RECOMMENDER_NAME + ": Adding zeros in train data...")

    self.S_matrix_target = check_matrix(self.S_matrix_target, "csr")

    numSamples = self.S_matrix_target.nnz
    n_items = self.S_matrix_target.shape[0]

    zeros_to_add_global = int(numSamples * self.add_zeros_quota)
    zeros_added_global = 0

    if zeros_to_add_global + numSamples >= n_items ** 2:
        raise ValueError(self.RECOMMENDER_NAME + ": Too many zeros to add, not enough unique coordinates in matrix")

    zeros_to_add_per_item = int(zeros_to_add_global / self.n_items)

    while zeros_added_global < zeros_to_add_global:

        for current_item_row in range(self.n_items):

            start_pos = self.S_matrix_target.indptr[current_item_row]
            end_pos = self.S_matrix_target.indptr[current_item_row + 1]

            nonzero_coordinates = set(self.S_matrix_target.indices[start_pos:end_pos])

            zeros_added_per_item = 0

            while zeros_added_per_item < zeros_to_add_per_item and zeros_added_global < zeros_to_add_global:

                new_coordinate = np.random.randint(0, n_items)

                if new_coordinate not in nonzero_coordinates:
                    nonzero_coordinates.add(new_coordinate)

                    self.row_list[numSamples + zeros_added_global] = current_item_row
                    self.col_list[numSamples + zeros_added_global] = new_coordinate
                    self.data_list[numSamples + zeros_added_global] = 0.0

                    zeros_added_per_item += 1
                    zeros_added_global += 1

    if self.verbose:
        print("Added: {} zeros. Average per item is: {} ".format(zeros_added_global, zeros_to_add_per_item))
        print(self.RECOMMENDER_NAME + ": Added zeros, data points are {}".format(len(self.data_list)))
def _build_confidence_matrix(self, confidence_scaling):
    if confidence_scaling == 'linear':
        self.C = self._linear_scaling_confidence()
    else:
        self.C = self._log_scaling_confidence()

    self.C_csc = check_matrix(self.C.copy(), format="csc", dtype=np.float32)
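
# _linear_scaling_confidence is not shown here. In the standard implicit-ALS
# formulation (Hu, Koren, Volinsky) the linear confidence is C = 1 + alpha * R;
# a sketch under that assumption, keeping the matrix sparse by scaling only the
# stored interactions (implicit zeros keep confidence 1 inside the solver):
import numpy as np

def linear_scaling_confidence_sketch(urm, alpha=40.0):
    C = urm.copy().astype(np.float32)
    C.data = 1.0 + alpha * C.data
    return C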
def __init__(self, URM_recommendations_items):
    super(PredefinedListRecommender, self).__init__()

    # CSR allows fast row-wise access to each user's recommendation list
    self.URM_recommendations = check_matrix(URM_recommendations_items, 'csr', dtype=int)

    self.URM_train = sps.csr_matrix(self.URM_recommendations.shape)
def __init__(self, URM_train, ICM=None, UCM=None):
    super(SVDFeature, self).__init__()

    self.URM_train = check_matrix(URM_train, "csr")
    self.ICM = ICM
    self.UCM = UCM
    self.n_users, self.n_items = URM_train.shape
    self.normalize = False
def fit(self, topK=100, alpha=0.5):
    self.topK = topK
    self.alpha = alpha

    W_sparse = self.Similarity_1 * self.alpha + self.Similarity_2 * (1 - self.alpha)

    self.W_sparse = similarityMatrixTopK(W_sparse, k=self.topK)
    self.W_sparse = check_matrix(self.W_sparse, format='csr')
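
# Illustrative sketch (toy data): the convex combination above in isolation.
# The element-wise blend keeps the result sparse; topK filtering is then applied
# by the repo's similarityMatrixTopK helper.
import numpy as np
import scipy.sparse as sps

S1 = sps.csr_matrix(np.array([[0.0, 0.8], [0.8, 0.0]]))
S2 = sps.csr_matrix(np.array([[0.0, 0.2], [0.2, 0.0]]))

alpha = 0.5
W = S1 * alpha + S2 * (1 - alpha)   # W[0, 1] == 0.5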
def compute_W_sparse(self):
    self.similarity = Compute_Similarity(self.ICM.T, shrink=0, topK=self.topK,
                                         normalize=self.normalize_similarity, row_weights=self.D_best)

    self.W_sparse = self.similarity.compute_similarity()
    self.W_sparse = check_matrix(self.W_sparse, format='csr')
def __init__(self, URM_train, ICM_train, S_matrix_target):
    super(CFW_D_Similarity_Linalg, self).__init__(URM_train, ICM_train)

    if URM_train.shape[1] != ICM_train.shape[0]:
        raise ValueError("Number of items not consistent. URM contains {} but ICM contains {}".format(
            URM_train.shape[1], ICM_train.shape[0]))

    if S_matrix_target.shape[0] != S_matrix_target.shape[1]:
        raise ValueError("Item similarity matrix is not square: rows are {}, columns are {}".format(
            S_matrix_target.shape[0], S_matrix_target.shape[1]))

    if S_matrix_target.shape[0] != ICM_train.shape[0]:
        raise ValueError("Number of items not consistent. S_matrix contains {} but ICM contains {}".format(
            S_matrix_target.shape[0], ICM_train.shape[0]))

    self.S_matrix_target = check_matrix(S_matrix_target, 'csr')
    self.ICM = check_matrix(ICM_train, 'csr')

    self.n_features = self.ICM.shape[1]
def _write_feature_format_file(self):
    self.n_item_features = self.n_items
    if self.ICM is not None:
        self.ICM = check_matrix(self.ICM, "csr")
        self.n_item_features += self.ICM.shape[1]

    self.n_user_features = self.n_users
    if self.UCM is not None:
        self.UCM = check_matrix(self.UCM, "csr")
        self.n_user_features += self.UCM.shape[1]

    nnz_rows, nnz_cols = self.URM_train.nonzero()

    with open(self.temp_file_folder + self.FILE_MODEL_NAME, "w") as fileout:
        for i in tqdm(range(len(nnz_rows))):
            userid, itemid = nnz_rows[i], nnz_cols[i]
            output = self._get_feature_format(userid, itemid)
            print(output, file=fileout)
def fit(self, W_sparse, selectTopK=False, topK=100):
    assert W_sparse.shape[0] == W_sparse.shape[1], \
        "ItemKNNCustomSimilarityRecommender: W_sparse matrix is not square. Current shape is {}".format(W_sparse.shape)

    assert self.URM_train.shape[1] == W_sparse.shape[0], \
        "ItemKNNCustomSimilarityRecommender: URM_train and W_sparse matrices are not consistent. " \
        "The number of columns in URM_train must be equal to the rows in W_sparse. " \
        "Current shapes are: URM_train {}, W_sparse {}".format(self.URM_train.shape, W_sparse.shape)

    if selectTopK:
        W_sparse = similarityMatrixTopK(W_sparse, k=topK)

    self.W_sparse = check_matrix(W_sparse, format='csr')
def remove_features(ICM, min_occurrence=5, max_percentage_occurrence=0.30, reconcile_mapper=None):
    """
    The function removes the features that occur in fewer than min_occurrence items
    or in more than max_percentage_occurrence of the items.
    The shape of the ICM is reduced by deleting those features.
    :param ICM:
    :param min_occurrence:
    :param max_percentage_occurrence:
    :param reconcile_mapper: DICT mapper [token] -> index
    :return: ICM
    :return: deletedFeatures
    :return: DICT mapper [token] -> index
    """
    ICM = check_matrix(ICM, 'csc')

    n_items = ICM.shape[0]

    cols = ICM.indptr
    numOccurrences = np.ediff1d(cols)

    feature_mask = np.logical_and(numOccurrences >= min_occurrence,
                                  numOccurrences <= n_items * max_percentage_occurrence)

    ICM = ICM[:, feature_mask]

    deletedFeatures = np.arange(0, len(feature_mask))[np.logical_not(feature_mask)]

    print("RemoveFeatures: removed {} features with less than {} occurrences, "
          "removed {} features with more than {} occurrences".format(
              sum(numOccurrences < min_occurrence), min_occurrence,
              sum(numOccurrences > n_items * max_percentage_occurrence),
              int(n_items * max_percentage_occurrence)))

    if reconcile_mapper is not None:
        reconcile_mapper = reconcile_mapper_with_removed_tokens(reconcile_mapper, deletedFeatures)
        return ICM, deletedFeatures, reconcile_mapper

    return ICM, deletedFeatures
def compute_W_sparse(self, model_to_use="best"):
    if model_to_use == "last":
        feature_weights = self.D_incremental
    elif model_to_use == "best":
        feature_weights = self.D_best
    else:
        assert False, "{}: compute_W_sparse, 'model_to_use' parameter not recognized".format(self.RECOMMENDER_NAME)

    self.similarity = Compute_Similarity(self.ICM.T, shrink=0, topK=self.topK,
                                         normalize=self.normalize_similarity, row_weights=feature_weights)

    self.W_sparse = self.similarity.compute_similarity()
    self.W_sparse = check_matrix(self.W_sparse, format='csr')
def __init__(self, URM_train, UCM_train, verbose=True):
    super(BaseUserCBFRecommender, self).__init__(URM_train, verbose=verbose)

    assert self.n_users == UCM_train.shape[0], \
        "{}: URM_train has {} users but UCM_train has {}".format(
            self.RECOMMENDER_NAME, self.n_users, UCM_train.shape[0])

    self.UCM_train = check_matrix(UCM_train.copy(), 'csr', dtype=np.float32)
    self.UCM_train.eliminate_zeros()

    _, self.n_features = self.UCM_train.shape

    self._cold_user_CBF_mask = np.ediff1d(self.UCM_train.indptr) == 0

    if self._cold_user_CBF_mask.any():
        print("{}: UCM Detected {} ({:4.1f}%) cold users.".format(
            self.RECOMMENDER_NAME, self._cold_user_CBF_mask.sum(),
            self._cold_user_CBF_mask.sum() / self.n_users * 100))
def set_URM_train(self, URM_train_new, **kwargs):
    assert self.URM_train.shape == URM_train_new.shape, \
        "{}: set_URM_train old and new URM train have different shapes".format(self.RECOMMENDER_NAME)

    if len(kwargs) > 0:
        self._print("set_URM_train keyword arguments not supported for this recommender class. Received: {}".format(kwargs))

    self.URM_train = check_matrix(URM_train_new.copy(), 'csr', dtype=np.float32)
    self.URM_train.eliminate_zeros()

    self._cold_user_mask = np.ediff1d(self.URM_train.indptr) == 0

    if self._cold_user_mask.any():
        self._print("Detected {} ({:4.1f}%) users with no interactions.".format(
            self._cold_user_mask.sum(),
            self._cold_user_mask.sum() / len(self._cold_user_mask) * 100))
def fit(self, alpha=1., beta=0.6, min_rating=0, topK=100, implicit=False, normalize_similarity=True):
    self.alpha = alpha
    self.beta = beta
    self.min_rating = min_rating
    self.topK = topK
    self.implicit = implicit
    self.normalize_similarity = normalize_similarity

    # if X.dtype != np.float32:
    #     print("RP3beta fit: For memory usage reasons, we suggest to use np.float32 as dtype for the dataset")

    if self.min_rating > 0:
        self.URM_train.data[self.URM_train.data < self.min_rating] = 0
        self.URM_train.eliminate_zeros()
        if self.implicit:
            self.URM_train.data = np.ones(self.URM_train.data.size, dtype=np.float32)

    # Pui is the row-normalized urm
    Pui = normalize(self.URM_train, norm='l1', axis=1)

    # Piu is the column-normalized, "boolean" urm transposed
    X_bool = self.URM_train.transpose(copy=True)
    X_bool.data = np.ones(X_bool.data.size, np.float32)

    # Taking the degree of each item to penalize top popular.
    # Some rows might be zero, make sure their degree remains zero
    X_bool_sum = np.array(X_bool.sum(axis=1)).ravel()

    degree = np.zeros(self.URM_train.shape[1])

    nonZeroMask = X_bool_sum != 0.0
    degree[nonZeroMask] = np.power(X_bool_sum[nonZeroMask], -self.beta)

    # ATTENTION: axis is still 1 because I transposed before the normalization
    Piu = normalize(X_bool, norm='l1', axis=1)
    del X_bool

    # Alpha power
    if self.alpha != 1.:
        Pui = Pui.power(self.alpha)
        Piu = Piu.power(self.alpha)

    # Final matrix is computed as Pui * Piu * Pui
    # Multiplication unpacked for memory usage reasons
    block_dim = 200
    d_t = Piu

    # Use array as it reduces memory requirements compared to lists
    dataBlock = 10000000

    rows = np.zeros(dataBlock, dtype=np.int32)
    cols = np.zeros(dataBlock, dtype=np.int32)
    values = np.zeros(dataBlock, dtype=np.float32)

    numCells = 0

    start_time = time.time()
    start_time_printBatch = start_time

    for current_block_start_row in range(0, Pui.shape[1], block_dim):

        if current_block_start_row + block_dim > Pui.shape[1]:
            block_dim = Pui.shape[1] - current_block_start_row

        similarity_block = d_t[current_block_start_row:current_block_start_row + block_dim, :] * Pui
        similarity_block = similarity_block.toarray()

        for row_in_block in range(block_dim):
            row_data = np.multiply(similarity_block[row_in_block, :], degree)
            row_data[current_block_start_row + row_in_block] = 0

            best = row_data.argsort()[::-1][:self.topK]

            notZerosMask = row_data[best] != 0.0

            values_to_add = row_data[best][notZerosMask]
            cols_to_add = best[notZerosMask]

            for index in range(len(values_to_add)):

                if numCells == len(rows):
                    rows = np.concatenate((rows, np.zeros(dataBlock, dtype=np.int32)))
                    cols = np.concatenate((cols, np.zeros(dataBlock, dtype=np.int32)))
                    values = np.concatenate((values, np.zeros(dataBlock, dtype=np.float32)))

                rows[numCells] = current_block_start_row + row_in_block
                cols[numCells] = cols_to_add[index]
                values[numCells] = values_to_add[index]

                numCells += 1

        if time.time() - start_time_printBatch > 300:
            new_time_value, new_time_unit = seconds_to_biggest_unit(time.time() - start_time)

            self._print("Similarity column {} ({:4.1f}%), {:.2f} column/sec. Elapsed time {:.2f} {}".format(
                current_block_start_row + block_dim,
                100.0 * float(current_block_start_row + block_dim) / Pui.shape[1],
                float(current_block_start_row + block_dim) / (time.time() - start_time),
                new_time_value, new_time_unit))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_printBatch = time.time()

    self.W_sparse = sps.csr_matrix((values[:numCells], (rows[:numCells], cols[:numCells])),
                                   shape=(Pui.shape[1], Pui.shape[1]))

    if self.normalize_similarity:
        self.W_sparse = normalize(self.W_sparse, norm='l1', axis=1)

    if self.topK != False:
        self.W_sparse = similarityMatrixTopK(self.W_sparse, k=self.topK)

    self.W_sparse = check_matrix(self.W_sparse, format='csr')
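
# Illustrative sketch (toy data): the two-hop random-walk product above,
# condensed without the blocked/topK machinery. Not the class's exact output,
# only the core W = (Piu @ Pui) * popularity^-beta computation.
import numpy as np
import scipy.sparse as sps
from sklearn.preprocessing import normalize

toy_urm = sps.csr_matrix(np.array([[1, 1, 0],
                                   [0, 1, 1]], dtype=np.float32))
beta = 0.6

Pui = normalize(toy_urm, norm='l1', axis=1)      # user -> item transition probabilities
X_bool = toy_urm.T.astype(np.float32)
X_bool.data = np.ones_like(X_bool.data)          # "boolean" transposed urm
Piu = normalize(X_bool, norm='l1', axis=1)       # item -> user transition probabilities

degree = np.asarray(X_bool.sum(axis=1)).ravel()  # item popularity
penalty = np.zeros_like(degree)
mask = degree != 0
penalty[mask] = np.power(degree[mask], -beta)

W = (Piu @ Pui).toarray() * penalty              # scale column j by popularity(j)^-beta
np.fill_diagonal(W, 0.0)                         # remove self-similarity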
def fit(self, l1_ratio=0.1, alpha=1.0, positive_only=True, topK=100):
    assert 0 <= l1_ratio <= 1, "{}: l1_ratio must be between 0 and 1, provided value was {}".format(
        self.RECOMMENDER_NAME, l1_ratio)

    self.l1_ratio = l1_ratio
    self.positive_only = positive_only
    self.topK = topK

    # initialize the ElasticNet model
    self.model = ElasticNet(alpha=alpha,
                            l1_ratio=self.l1_ratio,
                            positive=self.positive_only,
                            fit_intercept=False,
                            copy_X=False,
                            precompute=True,
                            selection='random',
                            max_iter=100,
                            tol=1e-4)

    URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)

    n_items = URM_train.shape[1]

    # Use array as it reduces memory requirements compared to lists
    dataBlock = 10000000

    rows = np.zeros(dataBlock, dtype=np.int32)
    cols = np.zeros(dataBlock, dtype=np.int32)
    values = np.zeros(dataBlock, dtype=np.float32)

    numCells = 0

    start_time = time.time()
    start_time_printBatch = start_time

    # fit each item's factors sequentially (not in parallel)
    for currentItem in range(n_items):

        # get the target column
        y = URM_train[:, currentItem].toarray()

        # set the j-th column of X to zero
        start_pos = URM_train.indptr[currentItem]
        end_pos = URM_train.indptr[currentItem + 1]

        current_item_data_backup = URM_train.data[start_pos:end_pos].copy()
        URM_train.data[start_pos:end_pos] = 0.0

        # fit one ElasticNet model per column
        self.model.fit(URM_train, y)

        # self.model.coef_ contains the coefficients of the ElasticNet model
        # let's keep only the non-zero values

        # Select topK values.
        # Sorting is done in three steps. Faster than plain np.argsort for a higher number of items:
        # - Partition the data to extract the set of relevant items
        # - Sort only the relevant items
        # - Get the original item index
        nonzero_model_coef_index = self.model.sparse_coef_.indices
        nonzero_model_coef_value = self.model.sparse_coef_.data

        local_topK = min(len(nonzero_model_coef_value) - 1, self.topK)

        relevant_items_partition = (-nonzero_model_coef_value).argpartition(local_topK)[0:local_topK]
        relevant_items_partition_sorting = np.argsort(-nonzero_model_coef_value[relevant_items_partition])
        ranking = relevant_items_partition[relevant_items_partition_sorting]

        for index in range(len(ranking)):

            if numCells == len(rows):
                rows = np.concatenate((rows, np.zeros(dataBlock, dtype=np.int32)))
                cols = np.concatenate((cols, np.zeros(dataBlock, dtype=np.int32)))
                values = np.concatenate((values, np.zeros(dataBlock, dtype=np.float32)))

            rows[numCells] = nonzero_model_coef_index[ranking[index]]
            cols[numCells] = currentItem
            values[numCells] = nonzero_model_coef_value[ranking[index]]

            numCells += 1

        # finally, replace the original values of the j-th column
        URM_train.data[start_pos:end_pos] = current_item_data_backup

        elapsed_time = time.time() - start_time
        new_time_value, new_time_unit = seconds_to_biggest_unit(elapsed_time)

        if time.time() - start_time_printBatch > 300 or currentItem == n_items - 1:
            self._print("Processed {} ({:4.1f}%) in {:.2f} {}. Items per second: {:.2f}".format(
                currentItem + 1,
                100.0 * float(currentItem + 1) / n_items,
                new_time_value, new_time_unit,
                float(currentItem) / elapsed_time))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_printBatch = time.time()

    # generate the sparse weight matrix
    self.W_sparse = sps.csr_matrix((values[:numCells], (rows[:numCells], cols[:numCells])),
                                   shape=(n_items, n_items), dtype=np.float32)
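
# Illustrative sketch (toy data): one iteration of the per-item loop above in
# isolation. The target column is zeroed so an item cannot be predicted from
# itself, one ElasticNet is fit, then the column is restored.
import numpy as np
import scipy.sparse as sps
from sklearn.linear_model import ElasticNet

toy_urm = sps.csc_matrix(np.array([[1, 0, 1],
                                   [1, 1, 0],
                                   [0, 1, 1]], dtype=np.float32))
j = 0
y = toy_urm[:, j].toarray().ravel()

start, end = toy_urm.indptr[j], toy_urm.indptr[j + 1]
backup = toy_urm.data[start:end].copy()
toy_urm.data[start:end] = 0.0                    # zero out column j

model = ElasticNet(alpha=1e-4, l1_ratio=0.1, positive=True, fit_intercept=False)
model.fit(toy_urm, y)
item_weights = model.sparse_coef_                # sparse row of learned item-item weights

toy_urm.data[start:end] = backup                 # restore the column for the next item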
def _generate_train_data(self):
    if self.verbose:
        print(self.RECOMMENDER_NAME + ": Generating train data")

    start_time_batch = time.time()

    # Here only the structure is important
    self.compute_W_sparse()
    S_matrix_contentKNN = check_matrix(self.W_sparse, "csr")

    self.write_log("Collaborative S density: {:.2E}, nonzero cells {}".format(
        self.S_matrix_target.nnz / self.S_matrix_target.shape[0] ** 2, self.S_matrix_target.nnz))

    self.write_log("Content S density: {:.2E}, nonzero cells {}".format(
        S_matrix_contentKNN.nnz / S_matrix_contentKNN.shape[0] ** 2, S_matrix_contentKNN.nnz))

    num_common_coordinates = 0

    estimated_n_samples = int(S_matrix_contentKNN.nnz * (1 + self.add_zeros_quota) * 1.2)

    self.row_list = np.zeros(estimated_n_samples, dtype=np.int32)
    self.col_list = np.zeros(estimated_n_samples, dtype=np.int32)
    self.data_list = np.zeros(estimated_n_samples, dtype=np.float64)

    num_samples = 0

    for row_index in range(self.n_items):

        start_pos_content = S_matrix_contentKNN.indptr[row_index]
        end_pos_content = S_matrix_contentKNN.indptr[row_index + 1]

        content_coordinates = S_matrix_contentKNN.indices[start_pos_content:end_pos_content]

        start_pos_target = self.S_matrix_target.indptr[row_index]
        end_pos_target = self.S_matrix_target.indptr[row_index + 1]

        target_coordinates = self.S_matrix_target.indices[start_pos_target:end_pos_target]

        # Check whether the content coordinate is associated to a non-zero target value.
        # If true, the content coordinate has a collaborative non-zero value;
        # if false, the content coordinate has a collaborative zero value
        is_common = np.in1d(content_coordinates, target_coordinates)

        num_common_in_current_row = is_common.sum()
        num_common_coordinates += num_common_in_current_row

        for index in range(len(is_common)):

            if num_samples == estimated_n_samples:
                dataBlock = 1000000
                self.row_list = np.concatenate((self.row_list, np.zeros(dataBlock, dtype=np.int32)))
                self.col_list = np.concatenate((self.col_list, np.zeros(dataBlock, dtype=np.int32)))
                self.data_list = np.concatenate((self.data_list, np.zeros(dataBlock, dtype=np.float64)))

            if is_common[index]:
                # If the cell exists in the target matrix, add its value.
                # Otherwise it will remain zero with a certain probability
                col_index = content_coordinates[index]

                self.row_list[num_samples] = row_index
                self.col_list[num_samples] = col_index
                self.data_list[num_samples] = self.S_matrix_target[row_index, col_index]

                num_samples += 1

            elif np.random.rand() <= self.add_zeros_quota:
                col_index = content_coordinates[index]

                self.row_list[num_samples] = row_index
                self.col_list[num_samples] = col_index
                self.data_list[num_samples] = 0.0

                num_samples += 1

        if self.verbose and (time.time() - start_time_batch > 30 or
                             num_samples == S_matrix_contentKNN.nnz * (1 + self.add_zeros_quota)):
            print(self.RECOMMENDER_NAME + ": Generating train data. Sample {} ({:4.1f}%) ".format(
                num_samples, num_samples / (S_matrix_contentKNN.nnz * (1 + self.add_zeros_quota)) * 100))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_batch = time.time()

    self.write_log("Content S structure has {} out of {} ({:4.1f}%) nonzero collaborative cells".format(
        num_common_coordinates, S_matrix_contentKNN.nnz,
        num_common_coordinates / S_matrix_contentKNN.nnz * 100))

    # Discard the unused cells at the end of the preallocated arrays
    self.row_list = self.row_list[:num_samples]
    self.col_list = self.col_list[:num_samples]
    self.data_list = self.data_list[:num_samples]

    data_nnz = sum(np.array(self.data_list) != 0)
    data_sum = sum(self.data_list)

    collaborative_nnz = self.S_matrix_target.nnz
    collaborative_sum = sum(self.S_matrix_target.data)

    self.write_log("Nonzero collaborative cell sum is: {:.2E}, average is: {:.2E}, "
                   "average over all collaborative data is {:.2E}".format(
                       data_sum, data_sum / data_nnz, collaborative_sum / collaborative_nnz))