import numpy as np
from scipy import sparse


def similarity_from_sparse(matrix_a: sparse.csr_matrix, matrix_b: sparse.csr_matrix):
    # Row-wise Jaccard/Tanimoto similarity: A.B^T / (|A|^2 + |B|^2 - A.B^T)
    intersection = matrix_a.dot(matrix_b.transpose()).toarray()
    norm_1 = np.array(matrix_a.multiply(matrix_a).sum(axis=1))
    norm_2 = np.array(matrix_b.multiply(matrix_b).sum(axis=1))
    union = norm_1 + norm_2.T - intersection
    return intersection / union
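
# Hedged usage sketch of similarity_from_sparse on made-up toy data (the
# matrices and names below are illustrative only, not from the source project).
_toy_a = sparse.csr_matrix(np.array([[1., 0., 1.], [0., 1., 1.]]))
_toy_b = sparse.csr_matrix(np.array([[1., 1., 1.], [1., 0., 0.]]))
# Entry (i, j) is the Jaccard similarity between row i of _toy_a and row j of _toy_b
_toy_sim = similarity_from_sparse(_toy_a, _toy_b)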

import scipy.sparse as sp


def filter_dataset(X: sp.csr_matrix, min_ui_count=10, min_iu_count=10):
    # Drop users (rows) with fewer than min_ui_count interactions, then drop
    # items (columns) with fewer than min_iu_count interactions by filtering
    # the rows of the transposed matrix. Relies on a project-level filter_rows
    # helper that is not shown here.
    X = filter_rows(X, min_ui_count)
    T_X = X.transpose()
    T_X = filter_rows(T_X, min_iu_count)
    X = T_X.transpose()
    return X
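
# The filter_rows helper used above is not included in the source; this is a
# hedged sketch of what such a helper could look like (assumed behaviour: keep
# only the rows of a CSR matrix with at least min_count stored interactions).
import numpy as np


def _example_filter_rows(X: sp.csr_matrix, min_count: int) -> sp.csr_matrix:
    X = X.tocsr()
    row_counts = np.diff(X.indptr)  # number of stored entries per row
    return X[row_counts >= min_count]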

def augmentURM(cls, URM_train: csr_matrix, W_sparse: csr_matrix,
               threshold_interactions: int, threshold_similarity: float):
    """
    Augmentation of the URM train.

    :param URM_train: URM train that will be augmented
    :param W_sparse: similarity matrix
    :param threshold_interactions: minimum number of interactions two users must have
                                   in common for a new row to be inserted in the URM train
    :param threshold_similarity: minimum similarity (taken from W_sparse) two users must
                                 have for a new row to be inserted in the URM train
    :return: a csr_matrix with augmented interactions according to the thresholds
    """
    print("Augmenting URM")
    URM_train = URM_train.copy()

    # Count of common interactions between every pair of users
    count_W_sparse = URM_train.dot(URM_train.transpose())

    # Selecting new candidates
    print("Selecting new candidates")
    users = np.arange(URM_train.shape[0])
    new_rows_list = []
    for i in range(0, users.size):
        if i % 5000 == 0:
            print("{} done of {}".format(i, users.size))
        candidates = count_W_sparse[i].indices  # user candidates
        data = count_W_sparse[i].data  # data for the candidates
        for j, candidate in enumerate(candidates):
            if (candidate > i and data[j] > threshold_interactions
                    and W_sparse[i, candidate] > threshold_similarity):
                new_rows_list.append([i, candidate])

    print("Candidate list size: {}".format(len(new_rows_list)))

    # Creating the new matrix
    print("Creating new URM...", end="")
    new_URM = None
    for candidate in new_rows_list:
        # Merge the profiles of the two users and binarize the result
        new_row = URM_train[[candidate[0], candidate[1]]].sum(axis=0)
        new_row = csr_matrix(new_row)
        new_row.data[new_row.data > 1] = 1
        if new_URM is None:
            new_URM = new_row
        else:
            new_URM = vstack([new_URM, new_row], format="csr")

    if new_URM is None:
        new_URM = URM_train
    else:
        new_URM = vstack([URM_train, new_URM], format="csr")
    print("Done")
    return new_URM
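
# Hedged illustration (toy data, illustrative names only) of the row-merging
# step used above: the profiles of two candidate users are summed and
# binarized, which is how each augmented row of the URM is built.
import numpy as np
from scipy.sparse import csr_matrix

_toy_URM = csr_matrix(np.array([[1, 0, 1, 0],
                                [0, 1, 1, 0],
                                [0, 0, 0, 1]]))
_merged = csr_matrix(_toy_URM[[0, 1]].sum(axis=0))
_merged.data[_merged.data > 1] = 1  # -> interactions of user 0 OR user 1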

from scipy.sparse import csr_matrix, identity
from scipy.sparse.linalg import inv


def psinv2(matr: csr_matrix, dtype, reg=0.):
    # Regularized inverse of matr^T * matr, computed on the CPU: an inverse
    # operation for sparse matrices doesn't seem to exist in cupy.
    regsize = matr.shape[1]
    if reg == 0:
        regm = csr_matrix((regsize, regsize))
    else:
        # if direct cuda, call get_sparse_module()
        regm = identity(regsize, dtype=dtype) * (1. / reg)
    toinv = matr.transpose().dot(matr) + regm
    # get_sparse_module is assumed to be a project helper that moves the result
    # back to the GPU; if direct cuda, add .get() to the inv parameter
    return get_sparse_module(inv(toinv))
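
# Hedged CPU-only check of the quantity psinv2 inverts (get_sparse_module and
# the GPU transfer are left out; the matrix below is made-up toy data).
_toy_m = csr_matrix([[1., 0.], [2., 1.], [0., 3.]])
_toy_reg = 10.
_toy_toinv = _toy_m.transpose().dot(_toy_m) + identity(2) * (1. / _toy_reg)
_toy_inv = inv(_toy_toinv.tocsc())  # _toy_toinv @ _toy_inv ~= 2x2 identity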

def newAugmentUMR(cls, URM_train: csr_matrix, W_sparse: csr_matrix,
                  threshold_interactions: int, threshold_similarity: float):
    # Vectorized variant of augmentURM: candidate user pairs are selected with
    # sparse boolean masks instead of an explicit Python loop over users.
    print("New Augmenting URM")
    count_W_sparse = URM_train.dot(URM_train.transpose())

    # Keep only pairs with enough common interactions and high enough similarity
    count_mask: csr_matrix = count_W_sparse > threshold_interactions
    sim_mask: csr_matrix = W_sparse > threshold_similarity
    mask = count_mask.multiply(sim_mask)
    mask = triu(mask)  # consider each unordered pair only once
    mask = mask.tocoo()

    row_user = mask.row
    col_user = mask.col
    new_mask = row_user != col_user  # drop self-pairs on the diagonal
    row_user = row_user[new_mask]
    col_user = col_user[new_mask]

    new_users = np.array([row_user, col_user])
    new_users = np.transpose(new_users)
    new_rows_list: list = new_users.tolist()
    print("Candidate list size: {}".format(len(new_rows_list)))

    # Creating the new matrix
    print("Creating new URM...", end="")
    new_URM = None
    for candidate in new_rows_list:
        # Merge the profiles of the two users and binarize the result
        new_row = URM_train[[candidate[0], candidate[1]]].sum(axis=0)
        new_row = csr_matrix(new_row)
        new_row.data[new_row.data > 1] = 1
        if new_URM is None:
            new_URM = new_row
        else:
            new_URM = vstack([new_URM, new_row], format="csr")

    if new_URM is None:
        new_URM = URM_train
    else:
        new_URM = vstack([URM_train, new_URM], format="csr")
    print("Done")
    return new_URM
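
# Hedged illustration (toy data, illustrative names only) of the mask-based
# candidate selection above: upper-triangular filtering plus the row != col
# check yields each qualifying user pair exactly once.
import numpy as np
from scipy.sparse import csr_matrix, triu

_toy_counts = csr_matrix(np.array([[3, 2, 0],
                                   [2, 3, 1],
                                   [0, 1, 3]]))
_toy_w = csr_matrix(np.array([[1., .9, .0],
                              [.9, 1., .2],
                              [.0, .2, 1.]]))
_toy_mask = triu((_toy_counts > 1).multiply(_toy_w > 0.5)).tocoo()
_toy_pairs = [(r, c) for r, c in zip(_toy_mask.row, _toy_mask.col) if r != c]  # -> [(0, 1)]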

from typing import Dict, List, Tuple

import numpy as np
from scipy.sparse import csr_matrix, hstack, identity, vstack


def _merge_into_adjacency(
        tf_ids: csr_matrix,
        word_cooccurrences_list: List[Tuple[str, str, float]],
        token_to_int_vocab_map: Dict[str, int]) -> csr_matrix:
    """
    Merge the word co-occurrence information together with the tf-idf information to create an
    adjacency matrix where,
    (0, 0) to (|vocab|, |vocab|) - indices describe the word-word interactions and,
    (|vocab|, |vocab|) to (|vocab| + #Docs, |vocab|) - indices describe the word-document interactions.
    """
    word_co_row = np.array([
        token_to_int_vocab_map[word_cooccurrence[0]]
        for word_cooccurrence in word_cooccurrences_list
    ])
    word_co_col = np.array([
        token_to_int_vocab_map[word_cooccurrence[1]]
        for word_cooccurrence in word_cooccurrences_list
    ])
    word_co_data = np.array([
        word_cooccurrence[2] for word_cooccurrence in word_cooccurrences_list
    ])
    word_cooccurrences = csr_matrix(
        (word_co_data, (word_co_row, word_co_col)),
        shape=(len(token_to_int_vocab_map), len(token_to_int_vocab_map)))

    # Stack word co-occurrences on top of TF-IDF (left-hand side of the adjacency)
    adj_lhs = vstack([word_cooccurrences, tf_ids])

    # Empty (zeros) block for doc-doc interactions
    zero_csr = csr_matrix(([], ([], [])), shape=(tf_ids.shape[0], tf_ids.shape[0]))

    # Mirror the TF-IDFs and stack them on top of the doc-doc block (right-hand side of the adjacency)
    adj_rhs = vstack([tf_ids.transpose(), zero_csr])

    # Stack the two halves side by side and add self-loops on the diagonal
    adj = hstack([adj_lhs, adj_rhs]) + identity(adj_lhs.shape[0])

    assert adj.shape == (
        len(token_to_int_vocab_map) + tf_ids.shape[0],
        len(token_to_int_vocab_map) + tf_ids.shape[0],
    ), "Expected {} == {}".format(
        adj.shape,
        (len(token_to_int_vocab_map) + tf_ids.shape[0],
         len(token_to_int_vocab_map) + tf_ids.shape[0]))
    return adj
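
# Hedged usage sketch (toy data, illustrative names only): two documents over a
# three-word vocabulary and one co-occurring word pair, merged into a square
# adjacency of size |vocab| + #docs.
_toy_vocab = {"alpha": 0, "beta": 1, "gamma": 2}
_toy_tf_idf = csr_matrix(np.array([[0.5, 0.0, 0.2],
                                   [0.0, 0.3, 0.0]]))  # shape (#docs, |vocab|)
_toy_cooccurrences = [("alpha", "gamma", 1.7)]
_toy_adj = _merge_into_adjacency(_toy_tf_idf, _toy_cooccurrences, _toy_vocab)  # shape (5, 5)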

import numpy as np
from scipy import sparse


def calc_objective_per_iter(w_i, feature_mat: sparse.csr_matrix, empirical_counts,
                            num_h, true_tags, alpha):
    """
    Calculate the max entropy likelihood for an iterative optimization method.

    :param w_i: weights vector in iteration i
    :param feature_mat: sparse feature matrix with one row per (history, tag) pair
    :param empirical_counts: pre-computed empirical counts
    :param num_h: number of histories in the training data
    :param true_tags: index of the true tag for each history
    :param alpha: the regularization coefficient

    The function returns the Max Entropy likelihood (objective) and the objective gradient,
    both negated so they can be passed to a minimizer.
    """
    scores = feature_mat.dot(w_i)
    scores = scores.reshape((num_h, -1))  # row h holds the scores of all tags for history h
    exp_scores = np.exp(scores)
    sum_exp = np.sum(exp_scores, axis=1)
    probs = exp_scores / sum_exp.reshape((num_h, 1))
    expected_counts = feature_mat.transpose().dot(probs.reshape(-1)).reshape(-1)
    # The L2 penalty is summed once over the weight vector
    likelihood = (np.sum(scores[np.arange(num_h), true_tags] - np.log(sum_exp))
                  - (alpha / 2) * np.sum(w_i ** 2))
    grad = empirical_counts - expected_counts - alpha * w_i
    return (-1) * likelihood, (-1) * grad
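
# Hedged usage sketch on made-up toy data: 2 histories, 2 candidate tags and
# 3 features. Row (h * n_tags + t) of _toy_feat is assumed to hold the feature
# vector of history h paired with tag t; all names here are illustrative only.
_toy_feat = sparse.csr_matrix(np.array([[1., 0., 0.],
                                        [0., 1., 0.],
                                        [0., 0., 1.],
                                        [1., 1., 0.]]))
_toy_w = np.zeros(3)
_toy_true_tags = np.array([0, 1])
# Empirical counts: sum of the feature vectors of the true (history, tag) rows
_toy_empirical = np.asarray(_toy_feat[[0, 3]].sum(axis=0)).ravel()
_toy_loss, _toy_grad = calc_objective_per_iter(_toy_w, _toy_feat, _toy_empirical,
                                               2, _toy_true_tags, alpha=0.1)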

import numpy as np
from scipy.sparse import csr_matrix, spdiags


def snn_dissimilarity_func(graph: csr_matrix, n_neighbors: int, *args, **kwargs) -> csr_matrix:
    """Default SNN dissimilarity function

    Computes the dissimilarity between two points in terms of shared nearest neighbors.
    Note that the input graph is modified in place (its data is binarized).

    Args:
        graph (scipy.sparse.csr_matrix): sparse matrix with dimensions (n_samples, n_samples),
            where the element ij represents the distance between point i and point j
        n_neighbors (int): number of neighbors in the k-neighborhood search
    """
    # Binarize the k-neighbors graph: 1 if j is a neighbor of i, 0 otherwise
    graph.data[graph.data > 0] = 1
    n_samples = graph.shape[0]

    # Add each point as its own neighbor
    graph += spdiags(np.ones(n_samples), diags=0, m=n_samples, n=n_samples)

    # Entry ij now counts the neighbors shared by points i and j
    matrix = graph * graph.transpose()
    matrix.sort_indices()

    # Convert shared-neighbor counts to a dissimilarity: the lower, the "closer"
    matrix.data = n_neighbors - matrix.data

    return matrix
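
# Hedged usage sketch (toy data, illustrative names only): a 4-point k-neighbors
# distance graph with 2 neighbors per point, of the kind produced by something
# like sklearn.neighbors.kneighbors_graph(..., mode="distance").
_toy_knn = csr_matrix(np.array([[0., 1., 2., 0.],
                                [1., 0., 1., 0.],
                                [2., 1., 0., 0.],
                                [0., 9., 8., 0.]]))
_toy_snn = snn_dissimilarity_func(_toy_knn, n_neighbors=2)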

def calculate(self, X: sp.csr_matrix, q_X: sp.csr_matrix):
    # L1-normalize the query profile, then score every row of X by dot product;
    # normalize is presumably sklearn.preprocessing.normalize.
    q_X = normalize(q_X, norm="l1")
    sim = q_X * X.transpose()
    # sim = sim.transpose()
    return sim.toarray()[0]
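
# Hedged illustration (toy data, illustrative names only) of the scoring above,
# assuming normalize is sklearn.preprocessing.normalize: the query row is
# L1-normalized and matched against every row of X by dot product.
import numpy as np
import scipy.sparse as sp
from sklearn.preprocessing import normalize

_toy_X = sp.csr_matrix(np.array([[1., 0., 2.], [0., 1., 1.]]))
_toy_q = sp.csr_matrix(np.array([[2., 0., 2.]]))
_toy_scores = (normalize(_toy_q, norm="l1") * _toy_X.transpose()).toarray()[0]  # one score per row of X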