def fit(self, X: sp.csr_matrix, n_samples: int, multiplier: np.ndarray = None):
    """Learn the idf vector (global term weights).

    Arguments:
        X: A matrix of term/token counts.
        n_samples: Number of total documents.
        multiplier: A multiplier for increasing/decreasing certain IDF scores.
    """
    X = check_array(X, accept_sparse=('csr', 'csc'))
    if not sp.issparse(X):
        X = sp.csr_matrix(X)
    dtype = np.float64

    if self.use_idf:
        _, n_features = X.shape
        df = np.squeeze(np.asarray(X.sum(axis=0)))
        avg_nr_samples = int(X.sum(axis=1).mean())
        idf = np.log(avg_nr_samples / df)
        if multiplier is not None:
            idf = idf * multiplier
        self._idf_diag = sp.diags(idf, offsets=0,
                                  shape=(n_features, n_features),
                                  format='csr',
                                  dtype=dtype)
    return self
def fit(self, X: sp.csr_matrix, n_samples: int):
    """Learn the idf vector (global term weights).

    Parameters
    ----------
    X : sparse matrix of shape (n_samples, n_features)
        A matrix of term/token counts.
    """
    # Prepare input
    X = check_array(X, accept_sparse=('csr', 'csc'))
    if not sp.issparse(X):
        X = sp.csr_matrix(X)
    dtype = X.dtype if X.dtype in FLOAT_DTYPES else np.float64

    # Calculate IDF scores
    _, n_features = X.shape
    df = np.squeeze(np.asarray(X.sum(axis=0)))
    avg_nr_samples = int(X.sum(axis=1).mean())
    idf = np.log(avg_nr_samples / df)
    self._idf_diag = sp.diags(idf, offsets=0,
                              shape=(n_features, n_features),
                              format='csr',
                              dtype=dtype)
    return self
def normalize_adj(adj: sp.csr_matrix):
    """Normalize adjacency matrix and convert it to a sparse tensor."""
    if sp.isspmatrix(adj):
        adj = adj.tolil()
        adj.setdiag(1)
        adj = adj.tocsr()
        deg = np.ravel(adj.sum(1))
        deg_sqrt_inv = 1 / np.sqrt(deg)
        adj_norm = adj.multiply(deg_sqrt_inv[:, None]).multiply(deg_sqrt_inv[None, :])
    elif torch.is_tensor(adj):
        deg = adj.sum(1)
        deg_sqrt_inv = 1 / torch.sqrt(deg)
        adj_norm = adj * deg_sqrt_inv[:, None] * deg_sqrt_inv[None, :]
    return to_sparse_tensor(adj_norm)
def evaluate_item(train: ss.csr_matrix, test: ss.csr_matrix,
                  user: np.ndarray, item: np.ndarray,
                  topk: int = 200, cutoff: int = 200):
    train = train.tocsr()
    test = test.tocsr()
    idx = np.squeeze((test.sum(axis=1) > 0).A)
    train = train[idx, :]
    test = test[idx, :]
    user = user[idx, :]
    N = train.shape[1]
    cand_count = N - train.sum(axis=1)
    if topk < 0:
        mat_rank = Eval.predict(train, test, user, item)
    else:
        mat_rank = Eval.topk_search_(train, test, user, item, topk)
    return Eval.compute_item_metric(test, mat_rank, cand_count, cutoff)
def test_rp3(X: sps.csr_matrix, alpha: float, beta: float) -> None:
    rec = RP3betaRecommender(X, alpha=alpha, beta=beta, n_threads=4)
    rec.learn()
    W = rec.W.toarray()
    W_sum = W.sum(axis=1)
    W_sum = W_sum[W_sum >= 0]
    popularity = X.sum(axis=0).A1.ravel() ** beta

    def zero_or_1(X: np.ndarray) -> np.ndarray:
        X = X.copy()
        X[X == 0] = 1
        return X

    P_ui = np.power(X.toarray(), alpha)
    P_iu = np.power(X.T.toarray(), alpha)
    P_ui /= zero_or_1(P_ui.sum(axis=1))[:, None]
    P_iu /= zero_or_1(P_iu.sum(axis=1))[:, None]
    W_man = P_iu.dot(P_ui)
    # p_{ui}^{RP3} = p_{ui}^{P3} / popularity_i^beta
    W_man = W_man / zero_or_1(popularity)[None, :]
    np.testing.assert_allclose(W, W_man)

    rec_norm = RP3betaRecommender(
        X, alpha=alpha, n_threads=4, top_k=2, normalize_weight=True
    )
    rec_norm.learn()
    W_sum = rec_norm.W.sum(axis=1).A1
    for w in W_sum:
        assert w == pytest.approx(1.0)
def _normalise_adjacency(adjacency: csr_matrix) -> csr_matrix:
    """
    A_tilde = D^(-0.5) A D^(-0.5)
    """
    # Create D^(-0.5)
    degree_inv_sqrt = np.power(np.array(adjacency.sum(1)), -0.5).flatten()
    degree_inv_sqrt[np.isinf(degree_inv_sqrt)] = 0.0
    degree_inv_sqrt = diags(degree_inv_sqrt, format="coo")
    # Compute D^(-0.5) A D^(-0.5)
    return degree_inv_sqrt.dot(adjacency).dot(degree_inv_sqrt)
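# Hedged usage sketch (added for illustration, not from the original source):
# verifies on a made-up 3-node graph that _normalise_adjacency above produces
# D^(-0.5) A D^(-0.5). Assumes only numpy/scipy plus the function above.
import numpy as np
from scipy.sparse import csr_matrix

A = csr_matrix(np.array([[0.0, 1.0, 1.0],
                         [1.0, 0.0, 0.0],
                         [1.0, 0.0, 0.0]]))
deg = A.sum(axis=1).A1                                  # degrees: [2, 1, 1]
expected = A.toarray() / np.sqrt(np.outer(deg, deg))    # A[i, j] / sqrt(d_i * d_j)
np.testing.assert_allclose(_normalise_adjacency(A).toarray(), expected)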
def normalize_sparse_matrix(matrix: sparse.csr_matrix) -> sparse.csr_matrix:
    """Normalizes each row to sum to one.

    Row sums are not exactly one, because 0.1 is added to each row sum before
    the division to guarantee that the denominator is never zero.
    """
    sums_for_strings = matrix.sum(axis=1).A.flatten()
    normalization_matrix = create_sparce_from_diagonal(1 / (sums_for_strings + 0.1))
    return normalization_matrix.dot(matrix)
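# Hedged sketch (added, not from the original source): the same scaling written
# with scipy.sparse.diags, assuming create_sparce_from_diagonal simply builds a
# sparse diagonal matrix from a 1-D array. Row sums land just below 1 because
# of the +0.1 term in the denominator.
import numpy as np
from scipy import sparse

m = sparse.csr_matrix(np.array([[1.0, 2.0, 3.0],
                                [0.0, 4.0, 0.0]]))
row_sums = m.sum(axis=1).A.flatten()                    # [6.0, 4.0]
normalized = sparse.diags(1 / (row_sums + 0.1)).dot(m)
print(normalized.sum(axis=1))                           # ~[0.984, 0.976]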
def __call__(self, matrix: csr_matrix):
    SUM = matrix.sum()
    row_sum = matrix.sum(axis=1)
    col_sum = matrix.sum(axis=0)
    # Invert each marginal sum cell-wise
    row_sum = _safe_divide(1, row_sum)
    col_sum = _safe_divide(1, col_sum)
    row_sum *= SUM
    if self.smooth:
        row_sum = np.power(row_sum, SMOOTH_POWER)
    res = matrix.multiply(row_sum).multiply(col_sum).tocsr()
    res.data = np.log(res.data)
    res.eliminate_zeros()
    return res
def calcDInv(K: sparse.csr_matrix):
    """Calculates the inverse of the scaling diagonal matrix D used to rescale eigenvectors.

    Parameters
    ----------
    K
        the sparse matrix K of pairwise affinities

    Returns
    -------
    D^(-1) as a csr sparse matrix, and D's diagonal values (row sums plus epsilon)
    """
    dim = K.shape[0]
    # Add a small epsilon to each row sum to avoid division by zero
    D_diag_inv = 1 / (K.sum(axis=1) + 1e-9)
    D_sparse = sparse.dia_matrix((np.reshape(D_diag_inv, [1, -1]), [0]), (dim, dim))
    return sparse.csr_matrix(D_sparse), (K.sum(axis=1) + 1e-9)
def calculate_normalized_affinity(
    W: csr_matrix
) -> Tuple[csr_matrix, np.array, np.array]:
    diag = W.sum(axis=1).A1
    diag_half = np.sqrt(diag)
    W_norm = W.tocoo(copy=True)
    W_norm.data /= diag_half[W_norm.row]
    W_norm.data /= diag_half[W_norm.col]
    W_norm = W_norm.tocsr()
    return W_norm, diag, diag_half
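# Hedged sketch (added, not from the original source): checks on a small
# symmetric affinity matrix that W_norm equals D^(-1/2) W D^(-1/2), using the
# function defined above. The toy matrix is an assumption for illustration.
import numpy as np
from scipy.sparse import csr_matrix

W = csr_matrix(np.array([[0.0, 2.0, 1.0],
                         [2.0, 0.0, 3.0],
                         [1.0, 3.0, 0.0]]))
W_norm, diag, diag_half = calculate_normalized_affinity(W)
expected = W.toarray() / np.outer(diag_half, diag_half)  # W[i, j] / sqrt(d_i * d_j)
np.testing.assert_allclose(W_norm.toarray(), expected)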
def sparse_mat_decompose(user_preference: ss.csr_matrix,
                         latent_factor_num: int = 1) -> (ss.csr_matrix, ss.csr_matrix):
    """
    Decompose a sparse user-preference matrix into two initialised matrices U and V,
    where U has latent_factor_num columns and V has latent_factor_num rows.

    :param user_preference: the user-preference matrix to decompose
    :param latent_factor_num: number of columns of U and rows of V
    :return: U, V
    """
    rows_num, column_num = user_preference.shape
    not_nan_elements_num = user_preference.count_nonzero()
    avg = user_preference.sum() / not_nan_elements_num
    init_element = np.sqrt(avg / latent_factor_num)
    u_init, v_init = np.full((rows_num, latent_factor_num), init_element), \
        np.full((latent_factor_num, column_num), init_element)
    return ss.csr_matrix(u_init), ss.csr_matrix(v_init)
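# Hedged sketch (added, not from the original source): with a made-up 2x3
# preference matrix, every entry of U @ V equals the average observed value,
# because each factor is initialised to sqrt(avg / latent_factor_num).
import numpy as np
import scipy.sparse as ss

prefs = ss.csr_matrix(np.array([[4.0, 0.0, 2.0],
                                [0.0, 3.0, 3.0]]))
U, V = sparse_mat_decompose(prefs, latent_factor_num=2)
avg = prefs.sum() / prefs.count_nonzero()                # (4 + 2 + 3 + 3) / 4 = 3.0
np.testing.assert_allclose((U @ V).toarray(), np.full((2, 3), avg))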
def evaluate_item(train: ss.csr_matrix, test: ss.csr_matrix,
                  user: np.ndarray, item: np.ndarray,
                  topk: int = -1, cutoff: int = 100):
    train = train.tocsr()
    test = test.tocsr()
    N = train.shape[1]
    cand_count = N - train.sum(axis=1)
    if topk < 0:
        mat_rank = Eval.predict(train, test, user, item)
    else:
        mat_rank = Eval.topk_search_(train, test, user, item, topk)
    return Eval.compute_item_metric(test, mat_rank, cand_count, cutoff)
def fit(self, X: sp.csr_matrix, multiplier: np.ndarray = None):
    """Learn the idf vector (global term weights).

    Arguments:
        X: A matrix of term/token counts.
        multiplier: A multiplier for increasing/decreasing certain IDF scores.
    """
    X = check_array(X, accept_sparse=('csr', 'csc'))
    if not sp.issparse(X):
        X = sp.csr_matrix(X)
    dtype = np.float64

    if self.use_idf:
        _, n_features = X.shape

        # Calculate the frequency of words across all classes
        df = np.squeeze(np.asarray(X.sum(axis=0)))

        # Calculate the average number of samples as regularization
        avg_nr_samples = int(X.sum(axis=1).mean())

        # Divide the average number of samples by the word frequency
        # +1 is added to force values to be positive
        idf = np.log((avg_nr_samples / df) + 1)

        # Multiplier to increase/decrease certain idf scores
        if multiplier is not None:
            idf = idf * multiplier

        self._idf_diag = sp.diags(idf, offsets=0,
                                  shape=(n_features, n_features),
                                  format='csr',
                                  dtype=dtype)
    return self
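# Hedged sketch (added, not from the original source): the IDF formula above
# worked by hand on a made-up 3-document / 4-term count matrix, independent of
# the surrounding class.
import numpy as np
import scipy.sparse as sp

X = sp.csr_matrix(np.array([[2, 0, 1, 0],
                            [1, 1, 0, 0],
                            [3, 0, 0, 1]]))
df = np.squeeze(np.asarray(X.sum(axis=0)))    # collection-wide term counts: [6, 1, 1, 1]
avg_nr_samples = int(X.sum(axis=1).mean())    # average tokens per document: 3
idf = np.log((avg_nr_samples / df) + 1)
print(idf)                                    # frequent term 0 ~0.41, rare terms ~1.39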
def evaluate_topk(train: ss.csr_matrix, test: ss.csr_matrix,
                  topk_item: np.ndarray, cutoff: int = 200):
    train = train.tocsr()
    test = test.tocsr()
    result = topk_item
    N = train.shape[1]
    cand_count = N - train.sum(axis=1)
    M = test.shape[0]
    uir = []
    for i in range(M):
        R = set(test.indices[test.indptr[i]:test.indptr[i + 1]])
        for k in range(result.shape[1]):
            if result[i, k] in R:
                uir.append((i, result[i, k], k))
    user_id, item_id, rank = zip(*uir)
    mat_rank = ss.csr_matrix((rank, (user_id, item_id)), shape=test.shape)
    return Eval.compute_item_metric(test, mat_rank, cand_count, cutoff)
def row_normalize_csr_matrix(matrix: csr_matrix) -> csr_matrix:
    """
    Row normalize a csr matrix without mutating the input

    :param matrix: scipy.sparse.csr_matrix instance
    """
    if not isinstance(matrix, csr_matrix):
        raise TypeError('expected input to be a scipy csr_matrix')
    if any(matrix.data == 0):
        raise ValueError(
            'input must be scipy.sparse.csr_matrix and must not store zeros')
    # get row index for every nonzero element in matrix
    row_idx, col_idx = matrix.nonzero()
    # compute unraveled row sums
    row_sums = matrix.sum(axis=1).A1
    # divide data by (broadcasted) row sums
    normalized = matrix.data / row_sums[row_idx]
    return csr_matrix((normalized, (row_idx, col_idx)), shape=matrix.shape)
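# Hedged sketch (added, not from the original source): a quick check that the
# returned rows sum to one while the input matrix is left untouched. The toy
# matrix is an assumption for illustration.
import numpy as np
from scipy.sparse import csr_matrix

m = csr_matrix(np.array([[1.0, 3.0, 0.0],
                         [0.0, 2.0, 2.0]]))
normed = row_normalize_csr_matrix(m)
np.testing.assert_allclose(normed.sum(axis=1).A1, [1.0, 1.0])
np.testing.assert_allclose(m.sum(axis=1).A1, [4.0, 4.0])   # original unchanged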
def collect_basic_statistics(
    X: csr_matrix,
    cluster_labels: List[str],
    cond_labels: List[str],
    gene_names: List[str],
    n_jobs: int,
    temp_folder: str,
    verbose: bool,
) -> List[pd.DataFrame]:
    """ Collect basic statistics, triggering calc_basic_stat in parallel
    """
    start = time.perf_counter()

    sum_vec = cnt_vec = None
    if cond_labels is None:
        sum_vec = X.sum(axis=0).A1
        cnt_vec = X.getnnz(axis=0)

    result_list = Parallel(n_jobs=n_jobs, max_nbytes=1e7, temp_folder=temp_folder)(
        delayed(calc_basic_stat)(
            clust_id,
            X.data,
            X.indices,
            X.indptr,
            X.shape,
            cluster_labels,
            cond_labels,
            gene_names,
            sum_vec,
            cnt_vec,
            verbose,
        )
        for clust_id in cluster_labels.categories
    )

    end = time.perf_counter()
    if verbose:
        logger.info(
            "Collecting basic statistics is done. Time spent = {:.2f}s.".format(
                end - start
            )
        )
    return result_list
def t_test(
    X: csr_matrix,
    cluster_labels: List[str],
    cond_labels: List[str],
    gene_names: List[str],
    n_jobs: int,
    temp_folder: str,
    verbose: bool,
) -> List[pd.DataFrame]:
    """ Run Welch's t-test, triggering calc_t in parallel
    """
    start = time.time()

    sum_vec = sum2_vec = None
    if cond_labels is None:
        sum_vec = X.sum(axis=0).A1
        sum2_vec = X.power(2).sum(axis=0).A1

    result_list = Parallel(n_jobs=n_jobs, max_nbytes=1e7, temp_folder=temp_folder)(
        delayed(calc_t)(
            clust_id,
            X.data,
            X.indices,
            X.indptr,
            X.shape,
            cluster_labels,
            cond_labels,
            gene_names,
            sum_vec,
            sum2_vec,
            verbose,
        )
        for clust_id in cluster_labels.categories
    )

    end = time.time()
    if verbose:
        logger.info(
            "Welch's t-test is done. Time spent = {:.2f}s.".format(end - start))
    return result_list
def calculate_min_violations(A: csr_matrix) -> (float, float):
    """
    Calculate the minimum number of violations in a graph over all possible rankings.
    A violation is an edge going from a lower ranked node to a higher ranked one.
    The minimum is obtained by summing the smaller direction of each bidirectional
    interaction.

    Input:
        A: graph adjacency matrix where A[i,j] is the weight of an edge from node i to j
    Output:
        minimum number of violations
        proportion of all edge weight accounted for by the minimum violations
    """
    # ii, ji, v contain the row indices, column indices, and values of the nonzero entries
    ii, ji, v = scipy.sparse.find(A)
    min_viol = 0.0
    for e in range(len(v)):  # for all node interactions
        i, j = ii[e], ji[e]
        # count each bidirectional pair once (find() reports both (i, j) and (j, i))
        if i < j and A[i, j] > 0 and A[j, i] > 0:
            min_viol = min_viol + min(A[i, j], A[j, i])
    m = A.sum()
    return (min_viol, min_viol / m)
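# Hedged sketch (added, not from the original source): nodes 0 and 1 beat each
# other once each, so at least weight 1 must be violated whatever the ranking,
# while the one-way 1 -> 2 edge never has to be. Toy matrix assumed.
import numpy as np
from scipy.sparse import csr_matrix

A = csr_matrix(np.array([[0.0, 1.0, 0.0],
                         [1.0, 0.0, 2.0],
                         [0.0, 0.0, 0.0]]))
min_viol, frac = calculate_min_violations(A)
print(min_viol, frac)   # 1.0 and 1.0 / 4.0 = 0.25 (total edge weight is 4)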
def vertex_degree(graph: sp.csr_matrix, vertex: int):
    # Column sums give the degree of each vertex; .A1 flattens the 1 x N result
    return graph.sum(axis=0).A1[vertex]
def axis_norms(X: sparse.csr_matrix, norm: str = "l1", axis: int = 1) -> np.ndarray:
    if norm == "l1":
        return np.asarray(X.sum(axis=axis)).reshape(-1)
    return np.sqrt(np.asarray(X.power(2).sum(axis=axis)).reshape(-1))
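# Hedged sketch (added, not from the original source): axis_norms agrees with
# numpy norms on a small dense equivalent. Note the "l1" branch is a plain sum,
# so it only matches the L1 norm for non-negative matrices like this one.
import numpy as np
from scipy import sparse

X = sparse.csr_matrix(np.array([[1.0, 2.0, 0.0],
                                [0.0, 3.0, 4.0]]))
np.testing.assert_allclose(axis_norms(X, "l1", axis=1),
                           np.linalg.norm(X.toarray(), ord=1, axis=1))  # [3, 7]
np.testing.assert_allclose(axis_norms(X, "l2", axis=1),
                           np.linalg.norm(X.toarray(), ord=2, axis=1))  # [sqrt(5), 5]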
def __init__(self, adj: sparse.csr_matrix, nodes, tp: ConfigType):
    self._adj = adj
    self._nodes: List[Node] = nodes
    self._size = len(nodes)
    self._type = tp
    # undirected graph: each edge contributes twice to the adjacency sum
    self._edges = int(adj.sum() / 2)
def get_inv_propensity(train_y: csr_matrix, a=0.55, b=1.5):
    # Inverse propensity per label: 1 + c * (n_l + b)^(-a),
    # where n_l is the label count and c = (log n - 1) * (b + 1)^a
    n, number = train_y.shape[0], np.asarray(train_y.sum(axis=0)).squeeze()
    c = (np.log(n) - 1) * ((b + 1) ** a)
    return 1.0 + c * (number + b) ** (-a)
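# Hedged numeric sketch (added, not from the original source): inverse
# propensities for a made-up 4-sample / 3-label matrix; rarer labels receive
# larger weights.
import numpy as np
from scipy.sparse import csr_matrix

train_y = csr_matrix(np.array([[1, 0, 1],
                               [1, 1, 0],
                               [1, 0, 0],
                               [0, 0, 1]]))
print(get_inv_propensity(train_y))   # label counts [3, 1, 2] -> roughly [1.28, 1.39, 1.32]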
def normalize_relative_frequency(matrix: sparse.csr_matrix, axis: AxisType):
    if axis == AxisType.REPERTOIRES:
        return sparse.diags(1 / matrix.sum(axis=1).A.ravel()) @ matrix
    if axis == AxisType.FEATURES:
        return matrix @ sparse.diags(1 / matrix.sum(axis=0).A.ravel())
def preprocess_graph(
    adj: sp.csr_matrix, same_type_nodes: bool = True
) -> Tuple[np.array, np.array, Tuple[int, int]]:
    """
    Parameters
    ----------
    adj : sp.csr_matrix
        Adjacency matrix.
    same_type_nodes : bool
        Is the adjacency matrix for nodes of the same type or not?
        E.g. drug-drug, protein-protein adj vs. drug-protein, protein-drug.

    Returns
    -------
    np.array
        Pairs of edges in the normalized adjacency matrix with nonzero values.
    np.array
        Nonzero values of the normalized adjacency matrix.
    Tuple[int, int]
        Shape of the normalized adjacency matrix.

    Notes
    -----
    Updating embeddings on a new layer can be written as
        H(l+1) = σ(SUM_r A_r_normalize @ H(l) @ W_r(l))
    where A_r_normalize is the normalized adj matrix for edge type r.
    So we have two variants of normalization for A_r (further just A).

    1. Adj matrix for nodes of the same type. It is symmetric.
       A_ = A + I, to add information from the current node when collecting
       information from neighbors of the same type,
       e.g. collecting info from drug nodes when updating the current drug embedding.
       D: degree matrix (diagonal matrix with the number of neighbours on the diagonal).
       A_normalize = D^(-1/2) @ A_ @ D^(-1/2), for symmetric normalization
       (division by sqrt(N_r^i) * sqrt(N_r^j) in the formula from the original paper).

    2. Adj matrix for nodes of different types. Here we don't need to add
       information from the current node.
       D_row: output degree matrix --- diagonal matrix with the number of output
       neighbours (i.e. i -> neighbours) on the diagonal.
       D_col: input degree matrix --- diagonal matrix with the number of input
       neighbours (i.e. neighbours -> i) on the diagonal.
       A_normalize = D_row^(-1/2) @ A @ D_col^(-1/2), for symmetric normalization
       (division by sqrt(N_r^i) * sqrt(N_r^j) in the formula from the original paper).
    """
    adj = sp.coo_matrix(adj)
    if same_type_nodes:
        adj_ = adj + sp.eye(adj.shape[0])
        rowsum = np.array(adj_.sum(1))
        degree_mat_inv_sqrt = sp.diags(np.power(rowsum, -0.5).flatten())
        adj_normalized = adj_.dot(degree_mat_inv_sqrt).transpose().dot(
            degree_mat_inv_sqrt).tocoo()
    else:
        rowsum = np.array(adj.sum(1))
        colsum = np.array(adj.sum(0))
        rowdegree_mat_inv = sp.diags(
            np.nan_to_num(np.power(rowsum, -0.5)).flatten())
        coldegree_mat_inv = sp.diags(
            np.nan_to_num(np.power(colsum, -0.5)).flatten())
        adj_normalized = rowdegree_mat_inv.dot(adj).dot(
            coldegree_mat_inv).tocoo()
    return preprocessing.sparse_to_tuple(adj_normalized)
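# Hedged sketch (added, not from the original source): the same-type-nodes
# normalization from the Notes, D^(-1/2) @ (A + I) @ D^(-1/2), checked directly
# on a toy 3-node graph. preprocess_graph additionally converts the result with
# preprocessing.sparse_to_tuple, which is omitted here.
import numpy as np
import scipy.sparse as sp

adj = sp.csr_matrix(np.array([[0.0, 1.0, 0.0],
                              [1.0, 0.0, 1.0],
                              [0.0, 1.0, 0.0]]))
adj_ = adj + sp.eye(adj.shape[0])
deg = np.asarray(adj_.sum(1)).ravel()                   # [2, 3, 2]
d_inv_sqrt = sp.diags(np.power(deg, -0.5))
adj_norm = d_inv_sqrt @ adj_ @ d_inv_sqrt
np.testing.assert_allclose(adj_norm.toarray(),
                           adj_.toarray() / np.sqrt(np.outer(deg, deg)))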