def __init__(self, URM_train, UCM, S_matrix_target):
    super(User_CFW_D_Similarity_Linalg, self).__init__(URM_train)

    if URM_train.shape[0] != UCM.shape[0]:
        raise ValueError(
            "Number of users not consistent. URM contains {} but UCM contains {}"
            .format(URM_train.shape[0], UCM.shape[0]))

    if S_matrix_target.shape[0] != S_matrix_target.shape[1]:
        raise ValueError(
            "User similarity matrix is not square: rows are {}, columns are {}"
            .format(S_matrix_target.shape[0], S_matrix_target.shape[1]))

    if S_matrix_target.shape[0] != UCM.shape[0]:
        raise ValueError(
            "Number of users not consistent. S_matrix contains {} but UCM contains {}"
            .format(S_matrix_target.shape[0], UCM.shape[0]))

    self.S_matrix_target = check_matrix(S_matrix_target, 'csr')
    self.UCM = check_matrix(UCM, 'csr')

    self.n_items = self.URM_train.shape[1]
    self.n_users = self.URM_train.shape[0]
    self.n_features = self.UCM.shape[1]

    self.sparse_weights = True
def fit(self, topK=50, shrink=100, similarity='cosine', normalize=True,
        feature_weighting="none", **similarity_args):

    self.topK = topK
    self.shrink = shrink

    if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
        raise ValueError(
            "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
            .format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))

    if feature_weighting == "BM25":
        self.URM_train = self.URM_train.astype(np.float32)
        self.URM_train = okapi_BM_25(self.URM_train)
        self.URM_train = check_matrix(self.URM_train, 'csr')
    elif feature_weighting == "TF-IDF":
        self.URM_train = self.URM_train.astype(np.float32)
        self.URM_train = TF_IDF(self.URM_train)
        self.URM_train = check_matrix(self.URM_train, 'csr')

    similarity = Compute_Similarity(self.URM_train.T, shrink=shrink, topK=topK,
                                    normalize=normalize, similarity=similarity,
                                    **similarity_args)

    self.W_sparse = similarity.compute_similarity()
    self.W_sparse = check_matrix(self.W_sparse, format='csr')
def fit(self, topK=50, shrink=100, similarity='cosine', normalize=True,
        feature_weighting="none", interactions_feature_weighting="none",
        **similarity_args):

    if interactions_feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
        raise ValueError(
            "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
            .format(self.FEATURE_WEIGHTING_VALUES, interactions_feature_weighting))

    if interactions_feature_weighting == "BM25":
        self.URM_train = self.URM_train.astype(np.float32)
        self.URM_train = okapi_BM_25(self.URM_train.T).T
        self.URM_train = check_matrix(self.URM_train, 'csr')
    elif interactions_feature_weighting == "TF-IDF":
        self.URM_train = self.URM_train.astype(np.float32)
        self.URM_train = TF_IDF(self.URM_train.T).T
        self.URM_train = check_matrix(self.URM_train, 'csr')

    super().fit(topK=topK, shrink=shrink, similarity=similarity, normalize=normalize,
                feature_weighting=feature_weighting, **similarity_args)
def get_S_incremental_and_set_W(self):

    self.S_incremental = self.cythonEpoch.get_S()

    if self.train_with_sparse_weights:
        self.W_sparse = self.S_incremental
        self.W_sparse = check_matrix(self.W_sparse, format='csr')
    else:
        self.W_sparse = similarityMatrixTopK(self.S_incremental, k=self.topK)
        self.W_sparse = check_matrix(self.W_sparse, format='csr')
def __init__(self, URM_train, Similarity_1, Similarity_2, verbose=True):
    super(ItemKNNSimilarityHybridRecommender, self).__init__(URM_train, verbose=verbose)

    if Similarity_1.shape != Similarity_2.shape:
        raise ValueError(
            "ItemKNNSimilarityHybridRecommender: similarities have different size, S1 is {}, S2 is {}"
            .format(Similarity_1.shape, Similarity_2.shape))

    # CSR is faster during evaluation
    self.Similarity_1 = check_matrix(Similarity_1.copy(), 'csr')
    self.Similarity_2 = check_matrix(Similarity_2.copy(), 'csr')
def fit(self, lambda_user=10, lambda_item=25):

    self.lambda_user = lambda_user
    self.lambda_item = lambda_item
    self.n_items = self.URM_train.shape[1]

    # convert to csc matrix for faster column-wise sum
    self.URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)

    # 1) global average
    self.mu = self.URM_train.data.sum(dtype=np.float32) / self.URM_train.data.shape[0]

    # 2) item average bias
    # compute the number of non-zero elements for each column
    col_nnz = np.diff(self.URM_train.indptr)

    # it is equivalent to:
    # col_nnz = X.indptr[1:] - X.indptr[:-1]
    # and it is **much faster** than
    # col_nnz = (X != 0).sum(axis=0)

    URM_train_unbiased = self.URM_train.copy()
    URM_train_unbiased.data -= self.mu
    self.item_bias = URM_train_unbiased.sum(axis=0) / (col_nnz + self.lambda_item)
    self.item_bias = np.asarray(self.item_bias).ravel()  # converts 2-d matrix to 1-d array without any copy

    # 3) user average bias
    # NOTE: the user bias is *useless* for the sake of ranking items. We just show it here for educational purposes.

    # first subtract the item biases from each column
    # then repeat each element of the item bias vector a number of times equal to col_nnz
    # and subtract it from the data vector
    URM_train_unbiased.data -= np.repeat(self.item_bias, col_nnz)

    # now convert the csc matrix to csr for efficient row-wise computation
    URM_train_unbiased_csr = URM_train_unbiased.tocsr()
    row_nnz = np.diff(URM_train_unbiased_csr.indptr)
    # finally, let's compute the bias
    self.user_bias = URM_train_unbiased_csr.sum(axis=1).ravel() / (row_nnz + self.lambda_user)

    # 4) precompute the item ranking by using the item bias only
    # the global average and user bias won't change the ranking, so there is no need to use them
    #self.item_ranking = np.argsort(self.bi)[::-1]

    self.URM_train = check_matrix(self.URM_train, 'csr', dtype=np.float32)
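# Toy walk-through (not part of the class above) of the regularized item-bias
# computation in fit(): residuals w.r.t. the global average are shrunk towards
# zero by lambda_item. Values here are arbitrary and purely illustrative.
import numpy as np
import scipy.sparse as sps

URM_toy = sps.csc_matrix(np.array([[5.0, 0.0],
                                   [3.0, 4.0],
                                   [0.0, 2.0]]))
mu = URM_toy.data.sum(dtype=np.float32) / URM_toy.data.shape[0]   # global average = 3.5
col_nnz = np.diff(URM_toy.indptr)                                 # ratings per item = [2 2]

URM_unbiased = URM_toy.copy()
URM_unbiased.data -= mu
item_bias = np.asarray(URM_unbiased.sum(axis=0)).ravel() / (col_nnz + 25)
# item_bias ~ [ 0.037 -0.037]: close to zero because the regularizer dominates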
def applyPearsonCorrelation(self):
    """
    Remove from every data point the average for the corresponding column
    :return:
    """

    self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

    interactionsPerCol = np.diff(self.dataMatrix.indptr)

    nonzeroCols = interactionsPerCol > 0
    sumPerCol = np.asarray(self.dataMatrix.sum(axis=0)).ravel()

    colAverage = np.zeros_like(sumPerCol)
    colAverage[nonzeroCols] = sumPerCol[nonzeroCols] / interactionsPerCol[nonzeroCols]

    # Split in blocks to avoid duplicating the whole data structure
    start_col = 0
    end_col = 0
    blockSize = 1000

    while end_col < self.n_columns:
        end_col = min(self.n_columns, end_col + blockSize)

        self.dataMatrix.data[self.dataMatrix.indptr[start_col]:self.dataMatrix.indptr[end_col]] -= \
            np.repeat(colAverage[start_col:end_col], interactionsPerCol[start_col:end_col])

        start_col += blockSize
def applyAdjustedCosine(self):
    """
    Remove from every data point the average for the corresponding row
    :return:
    """

    self.dataMatrix = check_matrix(self.dataMatrix, 'csr')

    interactionsPerRow = np.diff(self.dataMatrix.indptr)

    nonzeroRows = interactionsPerRow > 0
    sumPerRow = np.asarray(self.dataMatrix.sum(axis=1)).ravel()

    rowAverage = np.zeros_like(sumPerRow)
    rowAverage[nonzeroRows] = sumPerRow[nonzeroRows] / interactionsPerRow[nonzeroRows]

    # Split in blocks to avoid duplicating the whole data structure
    start_row = 0
    end_row = 0
    blockSize = 1000

    while end_row < self.n_rows:
        end_row = min(self.n_rows, end_row + blockSize)

        self.dataMatrix.data[self.dataMatrix.indptr[start_row]:self.dataMatrix.indptr[end_row]] -= \
            np.repeat(rowAverage[start_row:end_row], interactionsPerRow[start_row:end_row])

        start_row += blockSize
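# Minimal sketch of the np.repeat trick used by both centering methods above:
# one average per stored value, subtracted in place without densifying the
# matrix. Toy data only; explicit zeros are assumed already eliminated.
import numpy as np
import scipy.sparse as sps

X = sps.csr_matrix(np.array([[4.0, 2.0, 0.0],
                             [0.0, 5.0, 1.0]]))
nnz_per_row = np.diff(X.indptr)                             # [2 2]
row_avg = np.asarray(X.sum(axis=1)).ravel() / nnz_per_row   # [3. 3.]

X.data -= np.repeat(row_avg, nnz_per_row)   # aligns one average per stored value
print(X.toarray())   # [[ 1. -1.  0.]  [ 0.  2. -2.]]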
def __init__(self, URM_train, verbose=True):

    super(BaseRecommender, self).__init__()

    self.URM_train = check_matrix(URM_train.copy(), 'csr', dtype=np.float32)
    self.URM_train.eliminate_zeros()

    self.n_users, self.n_items = self.URM_train.shape
    self.verbose = verbose

    self.filterTopPop = False
    self.filterTopPop_ItemsID = np.array([], dtype=int)

    self.items_to_ignore_flag = False
    self.items_to_ignore_ID = np.array([], dtype=int)

    self._cold_user_mask = np.ediff1d(self.URM_train.indptr) == 0

    if self._cold_user_mask.any():
        self._print("URM Detected {} ({:.2f} %) cold users.".format(
            self._cold_user_mask.sum(), self._cold_user_mask.sum() / self.n_users * 100))

    self._cold_item_mask = np.ediff1d(self.URM_train.tocsc().indptr) == 0

    if self._cold_item_mask.any():
        self._print("URM Detected {} ({:.2f} %) cold items.".format(
            self._cold_item_mask.sum(), self._cold_item_mask.sum() / self.n_items * 100))
def remove_empty_rows_and_cols(URM, ICM=None):

    URM = check_matrix(URM, "csr")
    rows = URM.indptr
    numRatings = np.ediff1d(rows)
    user_mask = numRatings >= 1

    URM = URM[user_mask, :]

    cols = URM.tocsc().indptr
    numRatings = np.ediff1d(cols)
    item_mask = numRatings >= 1

    URM = URM[:, item_mask]

    removedUsers = np.arange(0, len(user_mask))[np.logical_not(user_mask)]
    removedItems = np.arange(0, len(item_mask))[np.logical_not(item_mask)]

    if ICM is not None:
        ICM = ICM[item_mask, :]
        return URM.tocsr(), ICM.tocsr(), removedUsers, removedItems

    return URM.tocsr(), removedUsers, removedItems
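# Hedged usage sketch for remove_empty_rows_and_cols on a toy URM with one
# empty user (row 1) and one empty item (column 0); assumes check_matrix from
# this repo is importable alongside the function above.
import numpy as np
import scipy.sparse as sps

URM_toy = sps.csr_matrix(np.array([[0.0, 1.0, 0.0],
                                   [0.0, 0.0, 0.0],
                                   [0.0, 3.0, 2.0]]))

URM_clean, removed_users, removed_items = remove_empty_rows_and_cols(URM_toy)
print(URM_clean.shape)   # (2, 2)
print(removed_users)     # [1]
print(removed_items)     # [0]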
def _build_confidence_matrix(self, confidence_scaling):

    if confidence_scaling == 'linear':
        self.C = self._linear_scaling_confidence()
    else:
        self.C = self._log_scaling_confidence()

    self.C_iu = check_matrix(self.C.transpose().copy(), format="csr", dtype=np.float32)
def __init__(self, URM_recommendations_items):
    super(PredefinedListRecommender, self).__init__()

    # convert to csr matrix for faster row-wise access
    self.URM_recommendations = check_matrix(URM_recommendations_items, 'csr', dtype=int)

    self.URM_train = sps.csr_matrix(self.URM_recommendations.shape)
def fit(self, topK=100, alpha=0.5):

    self.topK = topK
    self.alpha = alpha

    W_sparse = self.Similarity_1 * self.alpha + self.Similarity_2 * (1 - self.alpha)

    self.W_sparse = similarityMatrixTopK(W_sparse, k=self.topK)
    self.W_sparse = check_matrix(self.W_sparse, format='csr')
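# Small numeric check of the convex combination in fit() above, on two aligned
# 2x2 toy similarity matrices with alpha=0.7 (illustrative values only).
import numpy as np
import scipy.sparse as sps

S1 = sps.csr_matrix(np.array([[0.0, 1.0], [1.0, 0.0]]))
S2 = sps.csr_matrix(np.array([[0.0, 0.4], [0.2, 0.0]]))

W = S1 * 0.7 + S2 * (1 - 0.7)
print(W.toarray())   # [[0.   0.82]  [0.76 0.  ]]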
def apply_feature_weighting(matrix, feature_weighting="none"):
    from course_lib.Base.IR_feature_weighting import okapi_BM_25, TF_IDF
    from course_lib.Base.Recommender_utils import check_matrix

    FEATURE_WEIGHTING_VALUES = ["BM25", "TF-IDF", "none"]

    if feature_weighting not in FEATURE_WEIGHTING_VALUES:
        raise ValueError(
            "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
            .format(FEATURE_WEIGHTING_VALUES, feature_weighting))

    if feature_weighting == "BM25":
        matrix = matrix.astype(np.float32)
        matrix = okapi_BM_25(matrix)
        matrix = check_matrix(matrix, 'csr')
    elif feature_weighting == "TF-IDF":
        matrix = matrix.astype(np.float32)
        matrix = TF_IDF(matrix)
        matrix = check_matrix(matrix, 'csr')

    return matrix
def precompute_best_item_indices(self, URM: sps.csr_matrix):
    URM = URM.copy()
    if self.feature_weighting == "BM25":
        URM = URM.astype(np.float32)
        URM = okapi_BM_25(URM)
        URM = check_matrix(URM, 'csr')
    elif self.feature_weighting == "TF-IDF":
        URM = URM.astype(np.float32)
        URM = TF_IDF(URM)
        URM = check_matrix(URM, 'csr')

    similarity = Compute_Similarity(URM, shrink=self.shrink, topK=self.topK,
                                    normalize=self.normalize, similarity="cosine")
    similarity_matrix = similarity.compute_similarity()

    self.sorted_indices = np.array(np.argsort(-similarity_matrix.todense(), axis=1))
def test_mse(self):
    curr_item = 0
    URM_train = check_matrix(self.urm1, 'csc', dtype=np.float32)
    target_column = URM_train[:, curr_item].toarray()

    start_pos = URM_train.indptr[curr_item]
    end_pos = URM_train.indptr[curr_item + 1]
    URM_train.data[start_pos: end_pos] = 0.0

    loss = MSELoss(only_positive=False)
    qubo = loss.get_qubo_problem(urm=URM_train, target_column=target_column)

    self.assertEqual(qubo.tolist(), [[0, 0, 0], [0, -7, 10], [0, 10, -9]])
def test_non_zero_sim_norm_mse(self):
    curr_item = 0
    URM_train = check_matrix(self.urm1, 'csc', dtype=np.float32)
    target_column = URM_train[:, curr_item].toarray()

    start_pos = URM_train.indptr[curr_item]
    end_pos = URM_train.indptr[curr_item + 1]
    URM_train.data[start_pos: end_pos] = 0.0

    loss = NormMSELoss(only_positive=True, is_simplified=True)
    qubo = loss.get_qubo_problem(urm=URM_train, target_column=target_column)

    self.assertEqual(qubo.tolist(), [[0, 0, 0], [-16, -8, -6], [-26, -16, -9]])
def fit(self, W_sparse, selectTopK=False, topK=100):

    assert W_sparse.shape[0] == W_sparse.shape[1], \
        "ItemKNNCustomSimilarityRecommender: W_sparse matrix is not square. Current shape is {}".format(W_sparse.shape)

    assert self.URM_train.shape[1] == W_sparse.shape[0], \
        "ItemKNNCustomSimilarityRecommender: URM_train and W_sparse matrices are not consistent. " \
        "The number of columns in URM_train must be equal to the rows in W_sparse. " \
        "Current shapes are: URM_train {}, W_sparse {}".format(self.URM_train.shape, W_sparse.shape)

    if selectTopK:
        W_sparse = similarityMatrixTopK(W_sparse, k=topK)

    self.W_sparse = check_matrix(W_sparse, format='csr')
def fit(self, topK=50, shrink=100, normalize=True, feature_weighting="none"):

    self.topK = topK
    self.shrink = shrink

    if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
        raise ValueError(
            "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
            .format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))

    if feature_weighting == "BM25":
        self.URM_train = self.URM_train.astype(np.float32)
        self.URM_train = okapi_BM_25(self.URM_train.T).T
        self.URM_train = check_matrix(self.URM_train, 'csr')
    elif feature_weighting == "TF-IDF":
        self.URM_train = self.URM_train.astype(np.float32)
        self.URM_train = TF_IDF(self.URM_train.T).T
        self.URM_train = check_matrix(self.URM_train, 'csr')

    denominator = 1 if shrink == 0 else shrink
    self.W_sparse = self.URM_train.T.dot(self.URM_train) * (1 / denominator)

    if self.topK >= 0:
        self.W_sparse = userSimilarityMatrixTopK(self.W_sparse, k=self.topK).tocsr()

    if normalize:
        self.W_sparse = normalize_sk(self.W_sparse, norm="l2", axis=1)

    self.W_sparse = check_matrix(self.W_sparse, format='csr')
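# Toy check of the shrunk dot-product similarity in fit() above: the raw
# co-occurrence counts of URM.T.dot(URM) are uniformly scaled by 1/shrink
# before top-K selection and normalization (illustrative values only).
import numpy as np
import scipy.sparse as sps

URM = sps.csr_matrix(np.array([[1.0, 1.0],
                               [1.0, 0.0]]))
shrink = 100

W = URM.T.dot(URM) * (1 / shrink)
print(W.toarray())   # [[0.02 0.01]  [0.01 0.01]]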
def fit(self, topK=50, shrink=100, similarity='cosine', normalize=True,
        feature_weighting="none", **similarity_args):

    self.topK = topK
    self.topComputeK = topK + len(self.cold_users)
    self.shrink = shrink

    if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
        raise ValueError(
            "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
            .format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))

    if feature_weighting == "BM25":
        self.UCM_train = self.UCM_train.astype(np.float32)
        self.UCM_train = okapi_BM_25(self.UCM_train)
    elif feature_weighting == "TF-IDF":
        self.UCM_train = self.UCM_train.astype(np.float32)
        self.UCM_train = TF_IDF(self.UCM_train)

    similarity = Compute_Similarity(self.UCM_train.T, shrink=shrink, topK=self.topComputeK,
                                    normalize=normalize, similarity=similarity,
                                    **similarity_args)

    self.W_sparse = similarity.compute_similarity()
    self.W_sparse = self.W_sparse.tocsc()

    # zero out the columns of cold users so they are never selected as neighbors
    for user in self.cold_users:
        self.W_sparse.data[self.W_sparse.indptr[user]:self.W_sparse.indptr[user + 1]] = 0
    self.W_sparse.eliminate_zeros()

    self.W_sparse = self.W_sparse.tocsr()
    self.W_sparse = similarityMatrixTopK(self.W_sparse, k=self.topK).tocsr()
    self.W_sparse = check_matrix(self.W_sparse, format='csr')

    # Add identity matrix to the recommender
    self.recommender.W_sparse = self.recommender.W_sparse + sps.identity(
        self.recommender.W_sparse.shape[0], format="csr")
def userSimilarityMatrixTopK(user_weights, k=100):
    """
    The function selects the TopK most similar elements, column-wise

    :param user_weights:
    :param k:
    :return:
    """

    assert user_weights.shape[0] == user_weights.shape[1], "selectTopK: UserWeights is not a square matrix"

    n_users = user_weights.shape[1]
    k = min(k, n_users)

    # iterate over each column and keep only the top-k similar items
    data, rows_indices, cols_indptr = [], [], []

    user_weights = check_matrix(user_weights, format='csc', dtype=np.float32)

    for user_idx in range(n_users):
        cols_indptr.append(len(data))

        start_position = user_weights.indptr[user_idx]
        end_position = user_weights.indptr[user_idx + 1]

        column_data = user_weights.data[start_position:end_position]
        column_row_index = user_weights.indices[start_position:end_position]

        if min(k, column_data.size) == k:
            top_k_idx = np.argpartition(-column_data, kth=min(k, column_data.size - 1))[:k]
        else:
            # fewer than k entries in this column: keep them all
            top_k_idx = np.ones(column_data.size, dtype=bool)

        data.extend(column_data[top_k_idx])
        rows_indices.extend(column_row_index[top_k_idx])

    cols_indptr.append(len(data))

    # The matrix is built column-wise, hence the CSC constructor;
    # callers convert to CSR (faster during testing) where needed
    W_sparse = sps.csc_matrix((data, rows_indices, cols_indptr),
                              shape=(n_users, n_users), dtype=np.float32)

    return W_sparse
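# Hedged usage sketch for userSimilarityMatrixTopK: keep the single strongest
# entry per column of a 3x3 toy similarity matrix (assumes check_matrix from
# this repo is importable alongside the function above).
import numpy as np
import scipy.sparse as sps

S = sps.csc_matrix(np.array([[0.0, 0.9, 0.1],
                             [0.5, 0.0, 0.8],
                             [0.3, 0.2, 0.0]]))

W = userSimilarityMatrixTopK(S, k=1)
print(W.toarray())
# Only the largest value of each column survives:
# [[0.  0.9 0. ]  [0.5 0.  0.8]  [0.  0.  0. ]]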
def fit(self, l1_ratio=0.1, positive_only=True, topK=100, workers=multiprocessing.cpu_count()):

    assert 0 <= l1_ratio <= 1, \
        "SLIM_ElasticNet: l1_ratio must be between 0 and 1, provided value was {}".format(l1_ratio)

    self.l1_ratio = l1_ratio
    self.positive_only = positive_only
    self.topK = topK
    self.workers = workers

    self.URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)
    n_items = self.URM_train.shape[1]

    # fit item's factors in parallel

    # partial object referring to the function, with part of its input predefined
    _pfit = partial(self._partial_fit, X=self.URM_train, topK=self.topK)

    # create a pool with a given number of processes
    pool = Pool(processes=self.workers)

    # start the pool passing the function (with the fixed part of the input)
    # and the remaining, variable parameter
    res = pool.map(_pfit, np.arange(n_items))

    # res contains a vector of (values, rows, cols) tuples
    values, rows, cols = [], [], []
    for values_, rows_, cols_ in res:
        values.extend(values_)
        rows.extend(rows_)
        cols.extend(cols_)

    # generate the sparse weight matrix
    self.W_sparse = sps.csr_matrix((values, (rows, cols)),
                                   shape=(n_items, n_items), dtype=np.float32)
def removeFeatures(ICM, minOccurrence=5, maxPercOccurrence=0.30, reconcile_mapper=None):
    """
    The function eliminates the values associated to features occurring in fewer than the minimum number of items
    or in more than the maximum percentage of items. The shape of the ICM is reduced by deleting those features.

    :param ICM:
    :param minOccurrence:
    :param maxPercOccurrence:
    :param reconcile_mapper: DICT mapper [token] -> index
    :return: ICM
    :return: deletedFeatures
    :return: DICT mapper [token] -> index
    """

    ICM = check_matrix(ICM, 'csc')

    n_items = ICM.shape[0]

    cols = ICM.indptr
    numOccurrences = np.ediff1d(cols)

    feature_mask = np.logical_and(numOccurrences >= minOccurrence,
                                  numOccurrences <= n_items * maxPercOccurrence)

    ICM = ICM[:, feature_mask]

    deletedFeatures = np.arange(0, len(feature_mask))[np.logical_not(feature_mask)]

    print("RemoveFeatures: removed {} features with less than {} occurrences, "
          "removed {} features with more than {} occurrences".format(
              sum(numOccurrences < minOccurrence), minOccurrence,
              sum(numOccurrences > n_items * maxPercOccurrence), int(n_items * maxPercOccurrence)))

    if reconcile_mapper is not None:
        reconcile_mapper = reconcile_mapper_with_removed_tokens(reconcile_mapper, deletedFeatures)
        return ICM, deletedFeatures, reconcile_mapper

    return ICM, deletedFeatures
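# Hedged usage sketch for removeFeatures on a toy ICM: feature 0 occurs in all
# 4 items (above maxPercOccurrence=0.5), feature 2 occurs once (below
# minOccurrence=2), so only feature 1 survives.
import numpy as np
import scipy.sparse as sps

ICM_toy = sps.csc_matrix(np.array([[1.0, 1.0, 0.0],
                                   [1.0, 0.0, 1.0],
                                   [1.0, 1.0, 0.0],
                                   [1.0, 0.0, 0.0]]))

ICM_clean, deleted = removeFeatures(ICM_toy, minOccurrence=2, maxPercOccurrence=0.5)
print(ICM_clean.shape)   # (4, 1)
print(deleted)           # [0 2]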
def fit(self, agg_strategy="FIRST", filter_sample_method="NONE", topK=5, alpha_multiplier=0, constraint_multiplier=1, chain_multiplier=1, filter_items_method="NONE", filter_items_n=100, num_reads=100, **filter_items_parameters): """ It fits the data (i.e. URM_train) by solving an optimization problem for each item. Each optimization problem is generated from the URM_train without the target column and the target column by means of transformation to a QUBO based on "transform_fn" with some regulators; then it is solved by a solver given at the initialization of the class. Then by using the samples collected from the solver, it builds the item-similarity matrix. :param agg_strategy: the post-processing aggregation to be used on the samples :param filter_sample_method: the filter technique used before the post-processing aggregation :param topK: a regulator number that indicates the number of selected variables forced during the optimization :param alpha_multiplier: a multiplier number applied on the constraint of the sparsity regulator term :param constraint_multiplier: a multiplier number applied on the constraint strength of the variable selection regulator :param chain_multiplier: a multiplier number applied on the chain strength of the embedding :param filter_items_method: name of the filtering method to select a set of items for the resolution of the optimization problem :param filter_items_n: number of items to be selected by the filtering method :param num_reads: number of samples to compute from the solver :param filter_items_parameters: other parameters regarding the filter items method """ self._check_fit_parameters(agg_strategy, filter_items_method, filter_sample_method) if filter_items_method == "COSINE": self.FILTER_ITEMS_METHODS["COSINE"] = ItemSelectorByCosineSimilarity(**filter_items_parameters) URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32) n_items = URM_train.shape[1] item_pop = np.array((URM_train > 0).sum(axis=0)).flatten() # Need a labeling of variables to order the variables from 0 to n_items. 
With variable leading zeros based on # the highest number of digits leading_zeros = len(str(n_items - 1)) variables = ["a{:0{}d}".format(i, leading_zeros) for i in range(n_items)] if self.to_resume: start_item = self.df_responses[self.ITEM_ID_COLUMN_NAME].max() else: self.df_responses = pd.DataFrame() start_item = 0 self.FILTER_ITEMS_METHODS[filter_items_method].precompute_best_item_indices(URM_train) matrix_builder = IncrementalSparseMatrix(n_rows=n_items, n_cols=n_items) for curr_item in tqdm(range(start_item, n_items), desc="%s: Computing W_sparse matrix" % self.RECOMMENDER_NAME): # get the target column target_column = URM_train[:, curr_item].toarray() # set the "curr_item"-th column of URM_train to zero start_pos = URM_train.indptr[curr_item] end_pos = URM_train.indptr[curr_item + 1] current_item_data_backup = URM_train.data[start_pos: end_pos].copy() URM_train.data[start_pos: end_pos] = 0.0 # select items to be used in the QUBO optimization problem URM = URM_train.copy() URM, mapping_array = self.FILTER_ITEMS_METHODS[filter_items_method].filter_items(URM, target_column, curr_item, filter_items_n) n_variables = len(mapping_array) # get BQM/QUBO problem for the current item qubo = self.LOSSES[self.obj_function].get_qubo_problem(URM, target_column) qubo = qubo + (np.log1p(item_pop[curr_item]) ** 2 + 1) * alpha_multiplier * (np.max(qubo) - np.min(qubo)) \ * np.identity(n_variables) if topK > -1: constraint_strength = max(self.MIN_CONSTRAINT_STRENGTH, constraint_multiplier * (np.max(qubo) - np.min(qubo))) # avoid using the "combinations" function of dimod in order to speed up the computation qubo += -2 * constraint_strength * topK * np.identity(n_variables) + constraint_strength * np.ones( (n_variables, n_variables)) # Generation of the BQM with qubo in a quicker way checked with some performance measuring. On a test of # 2000 n_items, this method is quicker w.r.t. 
from_numpy_matrix function of dimod bqm = dimod.BinaryQuadraticModel.empty(dimod.BINARY) bqm.add_variables_from(dict(zip(variables, np.diag(qubo)))) for i in range(n_variables): values = np.array(qubo[i, i + 1:]).flatten() + np.array(qubo[i + 1:, i]).flatten() keys = [(variables[i], variables[j]) for j in range(i + 1, n_variables)] bqm.add_interactions_from(dict(zip(keys, values))) self._print("The BQM for item {} is {}".format(curr_item, bqm)) # solve the problem with the solver try: if ("child_properties" in self.solver.properties and self.solver.properties["child_properties"]["category"] == "qpu") \ or "qpu_properties" in self.solver.properties: chain_strength = max(self.MIN_CONSTRAINT_STRENGTH, chain_multiplier * (np.max(qubo) - np.min(qubo))) response = self.solver.sample(bqm, chain_strength=chain_strength, num_reads=num_reads) self._print("Break chain percentage of item {} is {}" .format(curr_item, list(response.data(fields=["chain_break_fraction"])))) self._print("Timing of QPU is %s" % response.info["timing"]) else: response = self.solver.sample(bqm, num_reads=num_reads) self._print("The response for item {} is {}".format(curr_item, response.aggregate())) except OSError as err: traceback.print_exc() raise err # save response in self.responses if self.do_save_responses is True; otherwise apply post-processing # and put the results in the matrix builder response_df = response.to_pandas_dataframe() response_df[self.ITEM_ID_COLUMN_NAME] = curr_item if self.do_save_responses: self.df_responses = self.df_responses.append(response_df, ignore_index=True) self.mapping_matrix.append(mapping_array) else: self.df_responses = self.df_responses.reindex(sorted(self.df_responses.columns), axis=1) self.add_sample_responses_to_matrix_builder(matrix_builder, agg_strategy, filter_sample_method, response_df, curr_item, mapping_array) # restore URM_train URM_train.data[start_pos:end_pos] = current_item_data_backup if self.do_save_responses: self.df_responses = self.df_responses.reindex(sorted(self.df_responses.columns), axis=1) self.W_sparse = self.build_similarity_matrix(self.df_responses, agg_strategy, filter_sample_method, self.mapping_matrix) else: self.W_sparse = matrix_builder.get_SparseMatrix()
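# Illustrative check (not part of the recommender) of the cardinality penalty
# used in fit() above: for binary variables, "-2*s*topK*I + s*ones" adds the
# energy s*((sum_i x_i - topK)^2 - topK^2), so assignments selecting exactly
# topK variables are favored. Pure numpy, brute force over all assignments.
import numpy as np
from itertools import product

n, k, s = 3, 2, 1.0
Q = -2 * s * k * np.identity(n) + s * np.ones((n, n))

for bits in product([0, 1], repeat=n):
    x = np.array(bits, dtype=float)
    energy = x @ Q @ x
    assert np.isclose(energy, s * ((x.sum() - k) ** 2 - k ** 2))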
def fit(self, topK=100, alpha=1., min_rating=0, implicit=False, normalize_similarity=False):

    self.topK = topK
    self.alpha = alpha
    self.min_rating = min_rating
    self.implicit = implicit
    self.normalize_similarity = normalize_similarity

    # if X.dtype != np.float32:
    #     print("P3ALPHA fit: For memory usage reasons, we suggest to use np.float32 as dtype for the dataset")

    if self.min_rating > 0:
        self.URM_train.data[self.URM_train.data < self.min_rating] = 0
        self.URM_train.eliminate_zeros()
        if self.implicit:
            self.URM_train.data = np.ones(self.URM_train.data.size, dtype=np.float32)

    self.Pui = sps.hstack([self.user_W_sparse, self.URM_train], format="csr")
    self.Piu = sps.hstack([self.URM_train.T, self.item_W_sparse], format="csr")
    self.P = sps.vstack([self.Pui, self.Piu], format="csr")

    # Pui is the row-normalized urm
    Pui = normalize(self.P.copy(), norm='l1', axis=1)

    # Piu is the column-normalized matrix
    X_bool = self.P.copy()
    X_bool.data = np.ones(X_bool.data.size, np.float32)
    Piu = normalize(X_bool, norm='l1', axis=0)
    del X_bool

    # Alpha power
    if self.alpha != 1.:
        Pui = Pui.power(self.alpha)
        Piu = Piu.power(self.alpha)

    # Final matrix is computed as Pui * Piu * Pui
    # Multiplication unpacked for memory usage reasons
    block_dim = 200
    d_t = Piu

    # Use array as it reduces memory requirements compared to lists
    dataBlock = 10000000

    rows = np.zeros(dataBlock, dtype=np.int32)
    cols = np.zeros(dataBlock, dtype=np.int32)
    values = np.zeros(dataBlock, dtype=np.float32)

    numCells = 0

    start_time = time.time()
    start_time_printBatch = start_time

    for current_block_start_row in range(0, Pui.shape[1], block_dim):

        if current_block_start_row + block_dim > Pui.shape[1]:
            block_dim = Pui.shape[1] - current_block_start_row

        similarity_block = d_t[current_block_start_row:current_block_start_row + block_dim, :] * Pui
        similarity_block = similarity_block.toarray()

        for row_in_block in range(block_dim):
            row_data = similarity_block[row_in_block, :]
            row_data[current_block_start_row + row_in_block] = 0

            best = row_data.argsort()[::-1][:self.topK]

            notZerosMask = row_data[best] != 0.0

            values_to_add = row_data[best][notZerosMask]
            cols_to_add = best[notZerosMask]

            for index in range(len(values_to_add)):

                if numCells == len(rows):
                    rows = np.concatenate((rows, np.zeros(dataBlock, dtype=np.int32)))
                    cols = np.concatenate((cols, np.zeros(dataBlock, dtype=np.int32)))
                    values = np.concatenate((values, np.zeros(dataBlock, dtype=np.float32)))

                rows[numCells] = current_block_start_row + row_in_block
                cols[numCells] = cols_to_add[index]
                values[numCells] = values_to_add[index]

                numCells += 1

        if time.time() - start_time_printBatch > 60:
            self._print(
                "Processed {} ( {:.2f}% ) in {:.2f} minutes. Rows per second: {:.0f}".format(
                    current_block_start_row,
                    100.0 * float(current_block_start_row) / Pui.shape[1],
                    (time.time() - start_time) / 60,
                    float(current_block_start_row) / (time.time() - start_time)))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_printBatch = time.time()

    # Set rows, cols, values
    rows = rows[:numCells]
    cols = cols[:numCells]
    values = values[:numCells]

    self.W_sparse = sps.csr_matrix((values, (rows, cols)), shape=self.P.shape)

    if self.normalize_similarity:
        self.W_sparse = normalize(self.W_sparse, norm='l1', axis=1)

    if self.topK != False:
        self.W_sparse = similarityMatrixTopK(self.W_sparse, k=self.topK)

    self.W_sparse = check_matrix(self.W_sparse, format='csr')
def fit(self, user_topK=50, user_shrink=100, user_similarity_type='cosine', user_normalize=True,
        user_feature_weighting="none", user_asymmetric_alpha=0.5,
        item_topK=50, item_shrink=100, item_similarity_type='cosine', item_normalize=True,
        item_feature_weighting="none", item_asymmetric_alpha=0.5,
        interactions_feature_weighting="none"):

    if interactions_feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
        raise ValueError(
            "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
            .format(self.FEATURE_WEIGHTING_VALUES, interactions_feature_weighting))

    if interactions_feature_weighting == "BM25":
        self.URM_train = self.URM_train.astype(np.float32)
        self.URM_train = okapi_BM_25(self.URM_train)
        self.URM_train = check_matrix(self.URM_train, 'csr')
    elif interactions_feature_weighting == "TF-IDF":
        self.URM_train = self.URM_train.astype(np.float32)
        self.URM_train = TF_IDF(self.URM_train)
        self.URM_train = check_matrix(self.URM_train, 'csr')

    # User Similarity Computation
    self.user_topK = user_topK
    self.user_shrink = user_shrink

    if user_feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
        raise ValueError(
            "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
            .format(self.FEATURE_WEIGHTING_VALUES, user_feature_weighting))

    if user_feature_weighting == "BM25":
        self.UCM_train = self.UCM_train.astype(np.float32)
        self.UCM_train = okapi_BM_25(self.UCM_train)
    elif user_feature_weighting == "TF-IDF":
        self.UCM_train = self.UCM_train.astype(np.float32)
        self.UCM_train = TF_IDF(self.UCM_train)

    kwargs = {"asymmetric_alpha": user_asymmetric_alpha}
    user_similarity_compute = Compute_Similarity(self.UCM_train.T, shrink=user_shrink,
                                                 topK=user_topK, normalize=user_normalize,
                                                 similarity=user_similarity_type, **kwargs)

    self.user_W_sparse = user_similarity_compute.compute_similarity()
    self.user_W_sparse = check_matrix(self.user_W_sparse, format='csr')

    # Item Similarity Computation
    self.item_topK = item_topK
    self.item_shrink = item_shrink

    if item_feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
        raise ValueError(
            "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
            .format(self.FEATURE_WEIGHTING_VALUES, item_feature_weighting))

    if item_feature_weighting == "BM25":
        self.ICM_train = self.ICM_train.astype(np.float32)
        self.ICM_train = okapi_BM_25(self.ICM_train)
    elif item_feature_weighting == "TF-IDF":
        self.ICM_train = self.ICM_train.astype(np.float32)
        self.ICM_train = TF_IDF(self.ICM_train)

    kwargs = {"asymmetric_alpha": item_asymmetric_alpha}
    item_similarity_compute = Compute_Similarity(self.ICM_train.T, shrink=item_shrink,
                                                 topK=item_topK, normalize=item_normalize,
                                                 similarity=item_similarity_type, **kwargs)

    self.item_W_sparse = item_similarity_compute.compute_similarity()
    self.item_W_sparse = check_matrix(self.item_W_sparse, format='csr')
def fit(self, alpha=1., beta=0.6, min_rating=0, topK=100, implicit=False, normalize_similarity=True):

    self.alpha = alpha
    self.beta = beta
    self.min_rating = min_rating
    self.topK = topK
    self.implicit = implicit
    self.normalize_similarity = normalize_similarity

    # if X.dtype != np.float32:
    #     print("RP3beta fit: For memory usage reasons, we suggest to use np.float32 as dtype for the dataset")

    if self.min_rating > 0:
        self.URM_train.data[self.URM_train.data < self.min_rating] = 0
        self.URM_train.eliminate_zeros()
        if self.implicit:
            self.URM_train.data = np.ones(self.URM_train.data.size, dtype=np.float32)

    # Pui is the row-normalized urm
    Pui_raw = sps.hstack([self.URM_train, self.UCM_train], format="csr")
    Pui_raw = TF_IDF(Pui_raw).tocsr()
    Pui = normalize(Pui_raw, norm='l1', axis=1)

    # Piu is the column-normalized, "boolean" urm transposed
    # X_bool = self.URM_train.transpose(copy=True)
    X_bool = Pui_raw.transpose(copy=True)
    X_bool.data = np.ones(X_bool.data.size, np.float32)

    # Taking the degree of each item to penalize top popular
    # Some rows might be zero, make sure their degree remains zero
    X_bool_sum = np.array(X_bool.sum(axis=1)).ravel()
    degree = np.zeros(Pui_raw.shape[1])

    nonZeroMask = X_bool_sum != 0.0
    degree[nonZeroMask] = np.power(X_bool_sum[nonZeroMask], -self.beta)

    # ATTENTION: axis is still 1 because I transposed before the normalization
    Piu = normalize(X_bool, norm='l1', axis=1)
    del X_bool

    # Alpha power
    if self.alpha != 1.:
        Pui = Pui.power(self.alpha)
        Piu = Piu.power(self.alpha)

    # Final matrix is computed as Pui * Piu * Pui
    # Multiplication unpacked for memory usage reasons
    block_dim = 200
    d_t = Piu

    # Use array as it reduces memory requirements compared to lists
    dataBlock = 10000000

    rows = np.zeros(dataBlock, dtype=np.int32)
    cols = np.zeros(dataBlock, dtype=np.int32)
    values = np.zeros(dataBlock, dtype=np.float32)

    numCells = 0

    start_time = time.time()
    start_time_printBatch = start_time

    for current_block_start_row in range(0, Pui.shape[1], block_dim):

        if current_block_start_row + block_dim > Pui.shape[1]:
            block_dim = Pui.shape[1] - current_block_start_row

        similarity_block = d_t[current_block_start_row:current_block_start_row + block_dim, :] * Pui
        similarity_block = similarity_block.toarray()

        for row_in_block in range(block_dim):
            row_data = np.multiply(similarity_block[row_in_block, :], degree)
            row_data[current_block_start_row + row_in_block] = 0

            best = row_data.argsort()[::-1][:self.topK]

            notZerosMask = row_data[best] != 0.0

            values_to_add = row_data[best][notZerosMask]
            cols_to_add = best[notZerosMask]

            for index in range(len(values_to_add)):

                if numCells == len(rows):
                    rows = np.concatenate((rows, np.zeros(dataBlock, dtype=np.int32)))
                    cols = np.concatenate((cols, np.zeros(dataBlock, dtype=np.int32)))
                    values = np.concatenate((values, np.zeros(dataBlock, dtype=np.float32)))

                rows[numCells] = current_block_start_row + row_in_block
                cols[numCells] = cols_to_add[index]
                values[numCells] = values_to_add[index]

                numCells += 1

        if time.time() - start_time_printBatch > 60:
            self._print(
                "Processed {} ( {:.2f}% ) in {:.2f} minutes. Rows per second: {:.0f}".format(
                    current_block_start_row,
                    100.0 * float(current_block_start_row) / Pui.shape[1],
                    (time.time() - start_time) / 60,
                    float(current_block_start_row) / (time.time() - start_time)))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_printBatch = time.time()

    self.W_sparse = sps.csr_matrix((values[:numCells], (rows[:numCells], cols[:numCells])),
                                   shape=(Pui.shape[1], Pui.shape[1]))

    if self.normalize_similarity:
        self.W_sparse = normalize(self.W_sparse, norm='l1', axis=1)

    if self.topK != False:
        self.W_sparse = similarityMatrixTopK(self.W_sparse, k=self.topK)

    self.W_sparse = check_matrix(self.W_sparse, format='csr')
def _log_scaling_confidence(self):

    C = check_matrix(self.URM_train, format="csr", dtype=np.float32)
    C.data = self.alpha * np.log(1.0 + C.data / self.epsilon)

    return C
def _linear_scaling_confidence(self):

    C = check_matrix(self.URM_train, format="csr", dtype=np.float32)
    C.data = self.alpha * C.data

    return C
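# Illustrative comparison (toy values, arbitrary alpha and epsilon) of the two
# confidence transforms above: the linear variant grows without bound with the
# raw interaction counts, while the log variant compresses heavy counts.
import numpy as np

counts = np.array([1.0, 5.0, 50.0], dtype=np.float32)
alpha, epsilon = 40.0, 1.0

linear_conf = alpha * counts                        # [  40.  200. 2000.]
log_conf = alpha * np.log(1.0 + counts / epsilon)   # ~[ 27.7  71.7 157.3]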
def compute_similarity(self, start_col=None, end_col=None, block_size=100):
    """
    Compute the similarity for the given dataset

    :param start_col: column to begin with
    :param end_col: column to stop before, end_col is excluded
    :return:
    """

    values = []
    rows = []
    cols = []

    start_time = time.time()
    start_time_print_batch = start_time
    processedItems = 0

    if self.adjusted_cosine:
        self.applyAdjustedCosine()
    elif self.pearson_correlation:
        self.applyPearsonCorrelation()
    elif self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient:
        self.useOnlyBooleanInteractions()

    # We explore the matrix column-wise
    self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

    # Compute sum of squared values to be used in normalization
    sumOfSquared = np.array(self.dataMatrix.power(2).sum(axis=0)).ravel()

    # Tanimoto does not require the square root to be applied
    if not (self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient):
        sumOfSquared = np.sqrt(sumOfSquared)

    if self.asymmetric_cosine:
        sumOfSquared_to_1_minus_alpha = np.power(sumOfSquared, 2 * (1 - self.asymmetric_alpha))
        sumOfSquared_to_alpha = np.power(sumOfSquared, 2 * self.asymmetric_alpha)

    self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

    start_col_local = 0
    end_col_local = self.n_columns

    if start_col is not None and start_col > 0 and start_col < self.n_columns:
        start_col_local = start_col

    if end_col is not None and end_col > start_col_local and end_col < self.n_columns:
        end_col_local = end_col

    start_col_block = start_col_local

    this_block_size = 0

    # Compute all similarities for each item using vectorization
    while start_col_block < end_col_local:

        end_col_block = min(start_col_block + block_size, end_col_local)
        this_block_size = end_col_block - start_col_block

        # All data points for a given item
        item_data = self.dataMatrix[:, start_col_block:end_col_block]
        item_data = item_data.toarray().squeeze()

        # If only 1 feature avoid last dimension to disappear
        if item_data.ndim == 1:
            item_data = np.atleast_2d(item_data)

        if self.use_row_weights:
            this_block_weights = self.dataMatrix_weighted.T.dot(item_data)
        else:
            # Compute item similarities
            this_block_weights = self.dataMatrix.T.dot(item_data)

        for col_index_in_block in range(this_block_size):

            if this_block_size == 1:
                this_column_weights = this_block_weights
            else:
                this_column_weights = this_block_weights[:, col_index_in_block]

            columnIndex = col_index_in_block + start_col_block
            this_column_weights[columnIndex] = 0.0

            # Apply normalization and shrinkage, ensure denominator != 0
            if self.normalize:
                if self.asymmetric_cosine:
                    denominator = sumOfSquared_to_alpha[columnIndex] * sumOfSquared_to_1_minus_alpha + self.shrink + 1e-6
                else:
                    denominator = sumOfSquared[columnIndex] * sumOfSquared + self.shrink + 1e-6

                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            # Apply the specific denominator for Tanimoto
            elif self.tanimoto_coefficient:
                denominator = sumOfSquared[columnIndex] + sumOfSquared - this_column_weights + self.shrink + 1e-6
                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            elif self.dice_coefficient:
                denominator = sumOfSquared[columnIndex] + sumOfSquared + self.shrink + 1e-6
                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            elif self.tversky_coefficient:
                denominator = this_column_weights + \
                    (sumOfSquared[columnIndex] - this_column_weights) * self.tversky_alpha + \
                    (sumOfSquared - this_column_weights) * self.tversky_beta + self.shrink + 1e-6
                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            # If no normalization or tanimoto is selected, apply only shrink
            elif self.shrink != 0:
                this_column_weights = this_column_weights / self.shrink

            #this_column_weights = this_column_weights.toarray().ravel()

            # Sort indices and select TopK
            # Sorting is done in three steps. Faster than plain np.argsort for a higher number of items
            # - Partition the data to extract the set of relevant items
            # - Sort only the relevant items
            # - Get the original item index
            relevant_items_partition = (-this_column_weights).argpartition(self.TopK - 1)[0:self.TopK]
            relevant_items_partition_sorting = np.argsort(-this_column_weights[relevant_items_partition])
            top_k_idx = relevant_items_partition[relevant_items_partition_sorting]

            # Incrementally build sparse matrix, do not add zeros
            notZerosMask = this_column_weights[top_k_idx] != 0.0
            numNotZeros = np.sum(notZerosMask)

            values.extend(this_column_weights[top_k_idx][notZerosMask])
            rows.extend(top_k_idx[notZerosMask])
            cols.extend(np.ones(numNotZeros) * columnIndex)

        # Add previous block size
        processedItems += this_block_size

        if time.time() - start_time_print_batch >= 30 or end_col_block == end_col_local:
            columnPerSec = processedItems / (time.time() - start_time + 1e-9)

            print("Similarity column {} ( {:2.0f} % ), {:.2f} column/sec, elapsed time {:.2f} min".format(
                processedItems, processedItems / (end_col_local - start_col_local) * 100,
                columnPerSec, (time.time() - start_time) / 60))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_print_batch = time.time()

        start_col_block += block_size

    # End while on columns

    W_sparse = sps.csr_matrix((values, (rows, cols)),
                              shape=(self.n_columns, self.n_columns),
                              dtype=np.float32)

    return W_sparse
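# Hedged usage sketch: the constructor call below matches how Compute_Similarity
# is instantiated elsewhere in this repo; the toy URM and parameter values are
# illustrative only, not a definitive usage pattern.
import numpy as np
import scipy.sparse as sps

URM_toy = sps.csr_matrix(np.array([[1.0, 1.0, 0.0],
                                   [0.0, 1.0, 1.0],
                                   [1.0, 0.0, 1.0]]))

similarity = Compute_Similarity(URM_toy, shrink=0, topK=2,
                                normalize=True, similarity="cosine")
W_sparse = similarity.compute_similarity()   # (3, 3) CSR, at most 2 entries per column
print(W_sparse.toarray())   # off-diagonal cosines ~0.5 for this URM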