def format_URM_positive_user_compressed(URM: csr_matrix):
    """
    Format the positive interactions of a URM in the way required by the FM model.
    Here, user information is grouped by item, meaning that:

    - There are #warm_items rows
    - There are #users + #items + 1 columns
    - There are #interactions + (#warm_items * 2) non-zero entries

    Each row represents a warm item, and all users that interacted with that
    item are stored in that row.

    :param URM: URM to be preprocessed
    :return: preprocessed URM in sparse CSR format
    """
    warm_items_mask = np.ediff1d(URM.tocsc().indptr) > 0
    warm_items = np.arange(URM.shape[1])[warm_items_mask]

    new_train = URM.copy().tocoo()
    fm_matrix = coo_matrix((warm_items.size, URM.shape[0] + URM.shape[1] + 1),
                           dtype=np.int8)

    # Index offset of the item columns
    item_offset = URM.shape[0]

    # Set up initial vectors; row/col hold indices and must be integer-typed
    row_v = np.zeros(new_train.data.size + (warm_items.size * 2), dtype=np.int32)
    col_v = np.zeros(new_train.data.size + (warm_items.size * 2), dtype=np.int32)
    data_v = np.zeros(new_train.data.size + (warm_items.size * 2), dtype=np.int8)

    # For each warm item, fill in its row
    j = 0  # Write index into the vectors
    URM_train_csc = URM.copy().tocsc()
    for i, item in enumerate(warm_items):
        # Find all users who liked that item
        users_who_liked_item = URM_train_csc[:, item].indices
        offset = users_who_liked_item.size
        if offset > 0:
            # One-hot columns of all users who interacted with the item
            col_v[j:j + offset] = users_who_liked_item
            row_v[j:j + offset] = i
            data_v[j:j + offset] = 1

            # One-hot column of the item itself
            col_v[j + offset] = item + item_offset
            row_v[j + offset] = i
            data_v[j + offset] = 1

            # The last column holds the (implicit) rating
            col_v[j + offset + 1] = fm_matrix.shape[1] - 1
            row_v[j + offset + 1] = i
            data_v[j + offset + 1] = 1

            j = j + offset + 2
        else:
            raise RuntimeError("Illegal state: warm item with no interactions")

    # Set the new information
    fm_matrix.row = row_v
    fm_matrix.col = col_v
    fm_matrix.data = data_v

    return fm_matrix.tocsr()
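
# A minimal sketch of the layout produced above (hypothetical demo, not part
# of the original module): a 2-user x 2-item URM with interactions (u0,i0),
# (u1,i0), (u1,i1) has two warm items, so the FM matrix has 2 rows and
# 2 + 2 + 1 = 5 columns.
def _demo_format_URM_positive_user_compressed():
    URM = csr_matrix(np.array([[1, 0],
                               [1, 1]]))
    fm = format_URM_positive_user_compressed(URM)
    # Row 0 (item 0): user cols 0 and 1, item col 2, rating col 4
    # Row 1 (item 1): user col 1, item col 3, rating col 4
    expected = np.array([[1, 1, 1, 0, 1],
                         [0, 1, 0, 1, 1]], dtype=np.int8)
    assert np.array_equal(fm.toarray(), expected)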
def search_hyperparameter_to_recommenders(urm_train_split: csr_matrix,
                                          urm_validation_split: csr_matrix,
                                          urm_test_split: csr_matrix,
                                          urm_impressions: csr_matrix,
                                          recommender: Type[BaseRecommender]):
    URM_train = urm_train_split.copy()
    URM_validation = urm_validation_split.copy()
    URM_test = urm_test_split.copy()
    URM_impressions = urm_impressions.copy()

    if any(not isspmatrix_csr(split)
           for split in [URM_train, URM_validation, URM_test, URM_impressions]):
        raise ValueError("The matrices are not all CSR matrices.")

    assert_implicit_data([URM_train, URM_validation, URM_test])
    assert_disjoint_matrices([URM_train, URM_validation, URM_test])

    # The Random recommender is evaluated sequentially; everything else in parallel
    if recommender.RECOMMENDER_NAME == Random.RECOMMENDER_NAME:
        evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10],
                                                parallel=False)
        evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[5, 10, 20],
                                          parallel=False)
    else:
        evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10],
                                                parallel=True, num_workers=NUM_WORKERS)
        evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[5, 10, 20],
                                          parallel=True, num_workers=NUM_WORKERS)

    runParameterSearch_Collaborative_partial = partial(
        runParameterSearch_Collaborative,
        URM_train=URM_train,
        URM_train_last_test=URM_train + URM_validation,
        metric_to_optimize=METRIC_TO_OPTIMIZE,
        evaluator_validation_earlystopping=evaluator_validation,
        evaluator_validation=evaluator_validation,
        evaluator_test=evaluator_test,
        output_folder_path=EXPERIMENTS_FOLDER_PATH,
        parallelizeKNN=False,
        allow_weighting=True,
        resume_from_saved=True,
        n_cases=NUM_CASES,
        n_random_starts=NUM_RANDOM_STARTS,
        URM_impressions=URM_impressions)

    try:
        runParameterSearch_Collaborative_partial(recommender)
    except Exception as e:
        logging.exception(f"Exception on recommender {recommender}: {e}")
def _distance_to_connectivity(distances: csr_matrix, *,
                              max_value: float = None) -> csr_matrix:
    """Get a weighted adjacency matrix from a distance matrix.

    Distances are stored with an offset of 1 so that zeros can be kept in the
    sparse structure: a stored value of 1 corresponds to an actual distance of
    0, which maps to a connectivity of 1; a stored value of 0 corresponds to
    an actual distance of infinity, which maps to a connectivity of 0.

    Parameters
    ----------
    distances
        sparse distance matrix
    max_value
        Used to normalize the distances, i.e. distances are divided by this
        value. If not specified, the maximum of the input matrix is used.
    """
    if not isinstance(distances, csr_matrix):
        raise ValueError("Distance matrix must be in CSR format.")

    if max_value is None:
        max_value = np.max(distances)

    connectivities = distances.copy()
    d = connectivities.data - 1

    # The sparsity structure stays the same; we can safely change the data only
    connectivities.data = (max_value - d) / max_value
    connectivities.eliminate_zeros()
    return connectivities
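
# A minimal sketch (hypothetical demo, not part of the original module)
# illustrating the offset encoding: stored 1 -> distance 0 -> connectivity 1.
def _demo_distance_to_connectivity():
    # Stored values 1 and 3 encode actual distances 0 and 2
    distances = csr_matrix(np.array([[0.0, 1.0],
                                     [3.0, 0.0]]))
    conn = _distance_to_connectivity(distances)
    # max_value = 3, so connectivities are (3 - 0) / 3 = 1 and (3 - 2) / 3 = 1/3
    np.testing.assert_allclose(conn.toarray(), np.array([[0.0, 1.0],
                                                         [1.0 / 3.0, 0.0]]))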
def sparse_clip(x: csr_matrix, min_, max_, inplace=False):
    # Clip the stored (non-zero) values of a CSR matrix to [min_, max_]
    if inplace:
        out = x
    else:
        out = x.copy()
    out.data = np.clip(x.data, min_, max_)
    return out
def dense_sparse_mul(a: np.ndarray, b: csr_matrix, inplace=False):
    # Scale each stored value of `b` by the entry of the dense vector `a`
    # indexed by its column (column-wise scaling of a CSR matrix)
    if inplace:
        out = b
    else:
        out = b.copy()
    out.data *= a[b.indices]
    return out
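
# A small usage sketch (hypothetical demo) for the two helpers above.
def _demo_sparse_helpers():
    b = csr_matrix(np.array([[0.0, 2.0],
                             [5.0, 0.0]]))
    clipped = sparse_clip(b, 0.0, 3.0)  # stored values [2, 5] become [2, 3]
    scaled = dense_sparse_mul(np.array([10.0, 100.0]), b)
    # Column 1 entry 2 -> 200, column 0 entry 5 -> 50
    assert scaled[0, 1] == 200.0 and scaled[1, 0] == 50.0
    assert clipped[1, 0] == 3.0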
def sparse_reshape(adj_matrix: sp.csr_matrix, shape: tuple = None) -> sp.csr_matrix:
    """
    Reshape (pad) a sparse adjacency matrix to a larger shape.

    Parameters
    ----------
    adj_matrix: Scipy sparse matrix
        The adjacency matrix to reshape.
    shape: tuple
        The new shape; it must be at least as large as the current one in
        both dimensions. If None, a copy of the input is returned.

    Returns
    -------
    A Scipy sparse CSR matrix with the requested shape.

    See also
    ----------
    graphgallery.functional.SparseReshape
    """
    if shape is None:
        return adj_matrix.copy()

    M1, N1 = shape
    M2, N2 = adj_matrix.shape
    assert (M1 >= M2) and (N1 >= N2)
    edge_index, edge_weight = sparse_adj_to_edge(adj_matrix)
    return sp.csr_matrix((edge_weight, edge_index), shape=shape)
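
# A minimal usage sketch (hypothetical demo; assumes sparse_adj_to_edge
# returns a (row, col) index pair and the corresponding edge weights, as in
# graphgallery): pad a 2x2 adjacency matrix to 3x3.
def _demo_sparse_reshape():
    adj = sp.csr_matrix(np.array([[0, 1],
                                  [1, 0]]))
    padded = sparse_reshape(adj, shape=(3, 3))
    assert padded.shape == (3, 3)
    # Existing edges are preserved; the new row/column stay empty
    assert padded[0, 1] == 1 and padded[2, 2] == 0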
def augmentURM(cls, URM_train: csr_matrix, W_sparse: csr_matrix,
               threshold_interactions: int, threshold_similarity: float):
    """
    Augmentation of the URM train: for each pair of users with enough common
    interactions and a high enough similarity, a new row merging their
    interaction profiles is added.

    :param URM_train: URM train that will be augmented
    :param W_sparse: similarity matrix used to filter candidate pairs
    :param threshold_interactions: minimum number of common interactions a
    pair of users must have for a new row to be inserted into the URM train
    :param threshold_similarity: minimum similarity (in W_sparse) a pair of
    users must have for a new row to be inserted
    :return: a csr_matrix with interactions augmented according to the thresholds
    """
    print("Augmenting URM")
    URM_train = URM_train.copy()

    # Count common interactions between each pair of users
    count_W_sparse = URM_train.dot(URM_train.transpose())

    # Select candidate pairs
    print("Selecting new candidates")
    users = np.arange(URM_train.shape[0])
    new_rows_list = []
    for i in range(users.size):
        if i % 5000 == 0:
            print("{} of {} users done".format(i, users.size))
        candidates = count_W_sparse[i].indices  # candidate users
        data = count_W_sparse[i].data  # common-interaction counts
        for j, candidate in enumerate(candidates):
            if candidate > i and data[j] > threshold_interactions \
                    and W_sparse[i, candidate] > threshold_similarity:
                new_rows_list.append([i, candidate])
    print("Candidate list size: {}".format(len(new_rows_list)))

    # Build the new matrix; stacking once at the end avoids the quadratic
    # cost of repeated vstack calls
    print("Creating new URM...", end="")
    new_rows = []
    for first, second in new_rows_list:
        new_row = csr_matrix(URM_train[[first, second]].sum(axis=0))
        new_row.data[new_row.data > 1] = 1  # keep the matrix implicit
        new_rows.append(new_row)
    if len(new_rows) == 0:
        new_URM = URM_train
    else:
        new_URM = vstack([URM_train] + new_rows, format="csr")
    print("Done")
    return new_URM
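
# A small usage sketch (hypothetical demo; augmentURM is called here as a
# plain function, passing None for cls): two users sharing two items, with
# similarity above the threshold, produce one merged row.
def _demo_augmentURM():
    URM_train = csr_matrix(np.array([[1, 1, 0],
                                     [1, 1, 1]]))
    W_sparse = csr_matrix(np.array([[0.0, 0.9],
                                    [0.9, 0.0]]))
    augmented = augmentURM(None, URM_train, W_sparse,
                           threshold_interactions=1, threshold_similarity=0.5)
    # The merged row is the (binarized) union of the two user profiles
    assert augmented.shape == (3, 3)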
def print_results(urm_test_split: csr_matrix):
    urm_test = urm_test_split.copy()

    n_test_users = np.sum(np.ediff1d(urm_test.indptr) >= 1)

    result_loader = ResultFolderLoader(EXPERIMENTS_FOLDER_PATH,
                                       base_algorithm_list=None,
                                       other_algorithm_list=None,
                                       KNN_similarity_list=KNN_SIMILARITY_LIST,
                                       ICM_names_list=None,
                                       UCM_names_list=None)

    article_metrics_latex_results_filename = os.path.join(
        RESULTS_EXPORT_FOLDER_PATH, "article_metrics_latex_results.txt")
    result_loader.generate_latex_results(article_metrics_latex_results_filename,
                                         metrics_list=["RECALL", "MAP"],
                                         cutoffs_list=METRICS_CUTOFF_TO_REPORT_LIST,
                                         table_title=None,
                                         highlight_best=True)

    beyond_accuracy_metrics_latex_results_filename = os.path.join(
        RESULTS_EXPORT_FOLDER_PATH, "beyond_accuracy_metrics_latex_results.txt")
    result_loader.generate_latex_results(beyond_accuracy_metrics_latex_results_filename,
                                         metrics_list=["DIVERSITY_MEAN_INTER_LIST",
                                                       "DIVERSITY_HERFINDAHL",
                                                       "COVERAGE_ITEM",
                                                       "DIVERSITY_GINI",
                                                       "SHANNON_ENTROPY"],
                                         cutoffs_list=OTHERS_CUTOFF_TO_REPORT_LIST,
                                         table_title=None,
                                         highlight_best=True)

    all_metrics_latex_results_filename = os.path.join(
        RESULTS_EXPORT_FOLDER_PATH, "all_metrics_latex_results.txt")
    result_loader.generate_latex_results(all_metrics_latex_results_filename,
                                         metrics_list=["PRECISION", "RECALL", "MAP",
                                                       "MRR", "NDCG", "F1",
                                                       "HIT_RATE", "ARHR", "NOVELTY",
                                                       "DIVERSITY_MEAN_INTER_LIST",
                                                       "DIVERSITY_HERFINDAHL",
                                                       "COVERAGE_ITEM",
                                                       "DIVERSITY_GINI",
                                                       "SHANNON_ENTROPY"],
                                         cutoffs_list=OTHERS_CUTOFF_TO_REPORT_LIST,
                                         table_title=None,
                                         highlight_best=True)

    time_latex_results_filename = os.path.join(
        RESULTS_EXPORT_FOLDER_PATH, "time_latex_results.txt")
    result_loader.generate_latex_time_statistics(time_latex_results_filename,
                                                 n_evaluation_users=n_test_users,
                                                 table_title=None)
def _scale_X(cls, X: sps.csr_matrix, scheme: IALSConfigScaling,
             epsilon: float) -> sps.csr_matrix:
    if scheme is IALSConfigScaling.none:
        return X
    else:
        # Log-confidence scaling: x -> log(1 + x / epsilon)
        X_ret: sps.csr_matrix = X.copy()
        X_ret.data = np.log(1 + X_ret.data / epsilon)
        return X_ret
def __init__(self, URM: sp.csr_matrix, ICM, exclude_seen=True):
    if not sp.isspmatrix_csr(URM):
        raise TypeError(f"Expected a CSR matrix, got {type(URM)}")
    self.URM = URM.copy()
    self.ICM = ICM.copy()
    self.predicted_URM = None
    self.exclude_seen = exclude_seen
    self.recommendations = None
def format_URM_positive_non_compressed(URM: csr_matrix):
    """
    Format the positive interactions of a URM in the way required by the FM model.

    - There are #num_ratings rows
    - The last column holds the ratings (for an implicit dataset it is simply
      a column full of 1s)
    - Each row has 3 non-zero entries: one for the user, one for the item, and
      one for the rating
    - Only positive samples are encoded here

    Note: this method works only for implicit datasets.

    :param URM: URM to be preprocessed
    :return: csr_matrix containing the URM preprocessed in the described way
    """
    new_train = URM.copy().tocoo()
    fm_matrix = sps.coo_matrix((URM.data.size, URM.shape[0] + URM.shape[1] + 1),
                               dtype=np.int8)

    # Index offset of the item columns
    item_offset = URM.shape[0]

    # Index of the last column (the rating column)
    last_col = URM.shape[0] + URM.shape[1]

    # Set up initial vectors; row/col hold indices and must be integer-typed
    row_v = np.zeros(new_train.data.size * 3, dtype=np.int32)  # (i, i, i) repeated for each interaction
    col_v = np.zeros(new_train.data.size * 3, dtype=np.int32)  # This is the "harder" one to set
    data_v = np.ones(new_train.data.size * 3, dtype=np.int8)  # Already all 1s, nothing to be added

    # Set the row vector
    for i in range(new_train.data.size):
        row_v[3 * i] = i
        row_v[(3 * i) + 1] = i
        row_v[(3 * i) + 2] = i

    # Set the col vector
    for i in range(new_train.data.size):
        # Retrieve the interaction
        user = new_train.row[i]
        item = new_train.col[i]

        # Shift the item index into the item column block
        col_v[3 * i] = user
        col_v[(3 * i) + 1] = item + item_offset
        col_v[(3 * i) + 2] = last_col

    # Set the new information
    fm_matrix.row = row_v
    fm_matrix.col = col_v
    fm_matrix.data = data_v

    return fm_matrix.tocsr()
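
# A minimal sketch (hypothetical demo) of the one-interaction-per-row layout:
# each of the three interactions of a 2x2 URM becomes a row with exactly three
# 1s (user column, item column, rating column).
def _demo_format_URM_positive_non_compressed():
    URM = csr_matrix(np.array([[1, 0],
                               [1, 1]]))
    fm = format_URM_positive_non_compressed(URM)
    assert fm.shape == (3, 5)
    # Every row has exactly three non-zeros
    assert np.all(np.ediff1d(fm.indptr) == 3)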
def get_sorted_best_item_indices(self, URM: sps.csr_matrix,
                                 target_column: np.ndarray,
                                 item_idx: int) -> np.ndarray:
    if self.sorted_indices is None:
        # No precomputed order: rank items by column variance, computed as
        # E[X^2] - E[X]^2
        c_URM = URM.copy()
        c_URM.data **= 2
        variances = np.array(c_URM.mean(axis=0)
                             - np.power(URM.mean(axis=0), 2)).flatten()
        sorted_indices = np.argsort(variances)[::-1]
        return sorted_indices
    return self.sorted_indices
def test_jaccard(X: sps.csr_matrix) -> None:
    rec = JaccardKNNRecommender(X, shrinkage=0, top_k=X.shape[1], n_threads=1)
    rec.learn()
    sim = rec.W.toarray()

    # Manual Jaccard similarity on the binarized matrix
    X_bin = X.copy()
    X_bin.sort_indices()
    X_bin.data[:] = 1
    manual = X_bin.T.toarray()  # I x U
    norm = manual.sum(axis=1)
    manual = manual.dot(manual.T)  # intersection sizes
    denom = norm[:, None] + norm[None, :] - manual + 1e-6  # union sizes
    denom[denom <= 1e-10] = 1e-10
    manual = manual / denom
    np.fill_diagonal(manual, 0)
    np.testing.assert_allclose(sim, manual)
def normalize_vectors(mx: sparse.csr_matrix, axis: int) -> sparse.csr_matrix:
    """Normalizes vectors, i.e. divides each vector by its Euclidean norm.

    Parameter `axis` can be 0 (column vectors) or 1 (row vectors).

    :param mx: sparse matrix
    :param axis: 0 or 1
    :return: sparse matrix
    """
    if axis not in {0, 1}:
        raise ValueError('Axis must be either 0 or 1.')
    mx = mx.copy().astype(np.float64)

    # Euclidean norm of each vector along the requested axis
    mx_norms = mx.copy()
    mx_norms.data **= 2
    mx_norms = mx_norms.sum(axis=axis).A.flatten() ** 0.5

    # Broadcast each norm onto the non-zero entries of its vector
    mx_norms = mx_norms[mx.nonzero()[1 - axis]]
    mx.data /= mx_norms
    return mx
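
# A small usage sketch (hypothetical demo): after row normalization every
# non-empty row has unit Euclidean norm.
def _demo_normalize_vectors():
    mx = sparse.csr_matrix(np.array([[3.0, 4.0],
                                     [0.0, 2.0]]))
    normalized = normalize_vectors(mx, axis=1)
    row_norms = np.sqrt(np.asarray(
        normalized.multiply(normalized).sum(axis=1)).flatten())
    np.testing.assert_allclose(row_norms, [1.0, 1.0])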
def add_UCM_info(fm_matrix: csr_matrix, UCM: csr_matrix, user_offset):
    """
    Given a matrix in the format needed by FM, it appends information from the
    UCM (the last column holds no rating list).

    Note: no group-by on items should be applied in this case

    :param fm_matrix: matrix containing the dataset for FM models
    :param UCM: UCM with information about the users
    :param user_offset: starting column index of the users in fm_matrix (should be 0)
    :return: new matrix that also contains the UCM information
    """
    fm_matrix_copy = fm_matrix.copy()

    # Select the user one-hot block and project it onto the UCM features
    user_fm_matrix = fm_matrix[:, user_offset:user_offset + UCM.shape[0]].copy()
    UCM_fm_matrix = user_fm_matrix.dot(UCM)

    merged_fm = sps.hstack([fm_matrix_copy, UCM_fm_matrix], format="csr")
    return merged_fm
def add_ICM_info(fm_matrix: csr_matrix, ICM: csr_matrix, item_offset):
    """
    Given a matrix in the format needed by FM, it appends information from the
    ICM (the last column holds no rating list).

    Note: no group-by on users should be applied in this case

    :param fm_matrix: matrix containing the dataset for FM models
    :param ICM: ICM with information about the items
    :param item_offset: starting column index of the items in fm_matrix (it
    should be URM_train.shape[0] of the URM used to build the fm_matrix)
    :return: new matrix integrating the ICM data
    """
    fm_matrix_copy = fm_matrix.copy()

    # Select the item one-hot block and project it onto the ICM features
    item_fm_matrix = fm_matrix[:, item_offset:item_offset + ICM.shape[0]].copy()
    ICM_fm_matrix = item_fm_matrix.dot(ICM)

    merged_fm = sps.hstack([fm_matrix_copy, ICM_fm_matrix], format="csr")
    return merged_fm
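
# A small usage sketch (hypothetical demo): append ICM features to the FM
# design matrix built by format_URM_positive_non_compressed. The item one-hot
# block starts at column URM.shape[0].
def _demo_add_ICM_info():
    URM = csr_matrix(np.array([[1, 0],
                               [1, 1]]))
    ICM = csr_matrix(np.array([[1, 0, 1],    # features of item 0
                               [0, 1, 0]]))  # features of item 1
    fm = format_URM_positive_non_compressed(URM)
    merged = add_ICM_info(fm, ICM, item_offset=URM.shape[0])
    # Three feature columns are appended to the original five
    assert merged.shape == (fm.shape[0], fm.shape[1] + ICM.shape[1])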
def _reorder_empty(
        X: sparse.csr_matrix,
        rows: bool = True,
        cols: bool = True,
        copy: bool = True) -> Tuple[sparse.csr_matrix, np.ndarray, np.ndarray]:
    if copy:
        Y = X.copy()
    else:
        Y = X
    p_row = None
    p_col = None
    if rows:
        p_row = _perm_first_axis(Y)
    if cols:
        p_col = _perm_first_axis(Y.tocsc())
    Y = _apply_perm(X, p_row=p_row, p_col=p_col)
    return Y, p_row, p_col
def precompute_best_item_indices(self, URM: sps.csr_matrix):
    URM = URM.copy()
    if self.feature_weighting == "BM25":
        URM = URM.astype(np.float32)
        URM = okapi_BM_25(URM)
        URM = check_matrix(URM, 'csr')
    elif self.feature_weighting == "TF-IDF":
        URM = URM.astype(np.float32)
        URM = TF_IDF(URM)
        URM = check_matrix(URM, 'csr')

    similarity = Compute_Similarity(URM,
                                    shrink=self.shrink,
                                    topK=self.topK,
                                    normalize=self.normalize,
                                    similarity="cosine")
    similarity_matrix = similarity.compute_similarity()
    self.sorted_indices = np.array(
        np.argsort(-similarity_matrix.todense(), axis=1))
def _initialize_parameters(self, X: csr_matrix):
    n, d = X.shape
    if self.use_biases:
        self.mu = X.data.mean()
    else:
        self.mu = 0

    if self.init_method == "svd":
        # Initialize the factors with a truncated SVD of the (centered)
        # matrix; cast to float so the in-place centering cannot fail on
        # integer data
        R = X.copy().astype(np.float64)
        if self.use_biases:
            R.data -= R.data.mean()
        u, s, vt = svds(R, k=self.K)
        self.U = u
        self.V = vt.T
    else:
        self.U = np.random.normal(scale=0.1, size=(n, self.K))
        self.V = np.random.normal(scale=0.1, size=(d, self.K))

    self.user_bias = np.zeros(n)
    self.item_bias = np.zeros(d)
    self.is_initialized = True
def bandedLU(M: csr, ml, mu):
    """
    Computes the standard LU decomposition of a 'scipy.sparse.csr_matrix'
    banded square matrix M with lower and upper bandwidths ml and mu,
    respectively. Returns L and U as sparse CSR matrices.
    """
    m = M.shape[0]
    u = M.copy()  # can be removed to act directly on M

    # Allocate memory for the nnzl non-zero entries of L
    nnzl = int(m * (ml + 1) - ml * (ml + 1) / 2)
    l_row = np.zeros(nnzl).astype(np.int_)
    l_val = np.ones(nnzl).astype(M.dtype)

    # The unit diagonal of L
    l_row[:m] = np.arange(m)
    l_col = l_row.copy()
    count = m  # counter for the next entry of L

    for k in range(m - 1):
        # Positions in u.data of the column-k entries within the lower band
        column_entries_ind = u.indptr[k] + (
            u.indices[u.indptr[k]:u.indptr[min(k + ml + 1, m)]] == k).nonzero()[0]
        for i, ind in enumerate(column_entries_ind[1:]):
            # Multiplier that eliminates the sub-diagonal entry
            l = u.data[ind] / u.data[column_entries_ind[0]]
            l_val[count] = l
            l_col[count] = k
            l_row[count] = int(k + i + 1)
            count += 1

            # Update the remaining entries of the row within the band
            b = min(mu + 1, m - k)
            u.data[ind + 1:ind + b] -= l * u.data[column_entries_ind[0] + 1:
                                                  column_entries_ind[0] + b]
            u.data[ind] = 0.

    u.eliminate_zeros()
    l = csr((l_val, (l_row, l_col)))
    return l, u
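
# A small check (hypothetical demo) on a 4x4 tridiagonal matrix (ml = mu = 1):
# the product L @ U should reproduce M.
def _demo_bandedLU():
    M = csr(np.array([[2.0, 1.0, 0.0, 0.0],
                      [1.0, 3.0, 1.0, 0.0],
                      [0.0, 1.0, 3.0, 1.0],
                      [0.0, 0.0, 1.0, 2.0]]))
    L, U = bandedLU(M, ml=1, mu=1)
    np.testing.assert_allclose((L @ U).toarray(), M.toarray())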
def format_URM_negative_sampling_user_compressed(URM: csr_matrix, negative_rate=1,
                                                 check_replacement=False,
                                                 sampling_function=None):
    """
    Format negative interactions of a URM in the way required by the FM model.
    Here, however, users are compressed w.r.t. the items of the sampled
    negative interactions. In particular, you will have:

    - #different_items_sampled rows
    - #users + #items + 1 columns
    - #negative_samples + (#different_items_sampled * 2) non-zero entries

    :param URM: URM to be preprocessed and from which negative samples are taken
    :param negative_rate: how many negative samples you want, in proportion to
    the positive ones
    :param check_replacement: whether to check for replacement. Checking costs time
    :param sampling_function: sampling function that takes as input the negative
    sample size and the URM from which samples are taken. If None, uniform
    sampling is applied
    :return: csr_matrix containing the negative interactions
    """
    negative_sample_size = int(URM.data.size * negative_rate)
    item_offset = URM.shape[0]

    print("Start sampling...")
    if sampling_function is None:
        collected_samples = uniform_sampling_strategy(
            negative_sample_size=negative_sample_size,
            URM=URM,
            check_replacement=check_replacement)
    else:
        collected_samples = sampling_function(
            negative_sample_size=negative_sample_size,
            URM=URM,
            check_replacement=check_replacement)

    # Different items sampled
    different_items_sampled = np.unique(collected_samples[1])
    fm_matrix = coo_matrix(
        (different_items_sampled.size, URM.shape[0] + URM.shape[1] + 1),
        dtype=np.int8)

    # Buffers sized for the worst case; row/col must be integer-typed
    buffer_size = negative_sample_size + (different_items_sampled.size * 2)
    row_v = np.zeros(buffer_size, dtype=np.int32)
    col_v = np.zeros(buffer_size, dtype=np.int32)
    data_v = np.zeros(buffer_size, dtype=np.int8)

    print("Matrix building...", end="")

    # For each sampled item, fill in its row
    j = 0  # Write index into the vectors
    for i, item in enumerate(different_items_sampled):
        # Find all users sampled for that item
        item_mask = collected_samples[1] == item
        users_sampled_for_that_item = np.unique(collected_samples[0][item_mask])
        offset = users_sampled_for_that_item.size
        if offset > 0:
            # One-hot columns of the users sampled for the item
            col_v[j:j + offset] = users_sampled_for_that_item
            row_v[j:j + offset] = i
            data_v[j:j + offset] = 1

            # One-hot column of the item itself
            col_v[j + offset] = item + item_offset
            row_v[j + offset] = i
            data_v[j + offset] = 1

            # The last column holds the (implicit) rating
            col_v[j + offset + 1] = fm_matrix.shape[1] - 1
            row_v[j + offset + 1] = i
            data_v[j + offset + 1] = 1

            j = j + offset + 2
        else:
            raise RuntimeError("Illegal state")
    print("Done")

    # Set the new information, trimming the unused tail of the buffers
    # (duplicate user samples per item are collapsed by np.unique above)
    fm_matrix.row = row_v[:j]
    fm_matrix.col = col_v[:j]
    fm_matrix.data = data_v[:j]

    return fm_matrix.tocsr()
def __init__(self, URM: sp.csr_matrix, ICM, exclude_seen=True, k=3):
    super().__init__(URM.copy(), ICM, exclude_seen)
    self.k = k
def sparse_sub_with_clip(a: csr_matrix, c):
    # Subtract the scalar `c` from every stored value, then clip at zero
    out = a.copy()
    out.data -= c
    return sparse_pos_clip(out)
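
# A small usage sketch (hypothetical demo; assumes sparse_pos_clip clips the
# stored values that became negative at zero).
def _demo_sparse_sub_with_clip():
    a = csr_matrix(np.array([[3.0, 0.0],
                             [0.0, 1.0]]))
    out = sparse_sub_with_clip(a, 2.0)
    assert out[0, 0] == 1.0 and out[1, 1] == 0.0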
def __init__(self, P: csr_matrix):
    self.P = P.copy()
    self.exact_errors = []
    self.bh_errors = []
    self.fft_errors = []
def _eliminate(matrix: sp.csr_matrix, user_indices, item_indices):
    # Zero out the given (user, item) entries and drop them from the
    # sparse structure
    matrix = matrix.copy()
    matrix[user_indices, item_indices] = 0
    matrix.eliminate_zeros()
    return matrix
def _eliminate(matrix: sp.csr_matrix, user_indices, item_indices):
    matrix = matrix.copy()
    # Index with concrete lists and assign on the CSR directly; converting
    # to `lil_matrix` first is too slow
    matrix[list(user_indices), list(item_indices)] = 0
    matrix.eliminate_zeros()
    return matrix
def precompute_best_item_indices(self, URM: sps.csr_matrix):
    c_URM = URM.copy()
    c_URM.data **= 2

    # Column variance via Var[X] = E[X^2] - E[X]^2; rank items by
    # decreasing variance
    variances = np.array(c_URM.mean(axis=0)
                         - np.power(URM.mean(axis=0), 2)).flatten()
    self.sorted_indices = np.argsort(variances)[::-1]
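
# A small check (hypothetical demo): the E[X^2] - E[X]^2 computation above
# matches np.var on the dense columns.
def _demo_column_variances():
    URM = sps.csr_matrix(np.array([[1.0, 0.0],
                                   [3.0, 2.0]]))
    c_URM = URM.copy()
    c_URM.data **= 2
    variances = np.array(c_URM.mean(axis=0)
                         - np.power(URM.mean(axis=0), 2)).flatten()
    np.testing.assert_allclose(variances, np.var(URM.toarray(), axis=0))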