def similarity_from_sparse(matrix_a: sparse.csr_matrix, matrix_b: sparse.csr_matrix):
    # generalized Jaccard (Tanimoto) similarity between every row of matrix_a and every row of matrix_b
    intersection = matrix_a.dot(matrix_b.transpose()).toarray()
    norm_1 = np.array(matrix_a.multiply(matrix_a).sum(axis=1))
    norm_2 = np.array(matrix_b.multiply(matrix_b).sum(axis=1))
    union = norm_1 + norm_2.T - intersection
    return intersection / union
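# --- Usage sketch (not from the original snippet): a minimal, hedged example of calling
# similarity_from_sparse on two small CSR matrices; the toy data is illustrative only and
# assumes the function above is in scope.
import numpy as np
from scipy import sparse

a = sparse.csr_matrix(np.array([[1.0, 0.0, 1.0],
                                [0.0, 2.0, 0.0]]))
b = sparse.csr_matrix(np.array([[1.0, 0.0, 1.0],
                                [1.0, 1.0, 0.0]]))
print(similarity_from_sparse(a, b))  # shape (2, 2); identical rows score 1.0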
def _compute_sparse_minimizer(hat_vect_matrix: sparse.csr_matrix, X: sparse.csr_matrix) -> np.ndarray:
    # compute numerator part of minimizer; this will yield a dense vector (size of u or v)
    minimizer = (hat_vect_matrix.multiply(X)).sum(axis=1)
    # divide by norm squared of masked vector
    vector_of_norms = (hat_vect_matrix.multiply(hat_vect_matrix)).sum(axis=1)
    # if some norm is 0 the numerator will be 0 too (avoid 0/0)
    vector_of_norms[vector_of_norms < 1e-8] = 1
    minimizer = minimizer / vector_of_norms
    # return np.ndarray representation
    return minimizer.A
def fit(self, X: np.ndarray, knn: sparse.csr_matrix, k: int) -> np.ndarray:
    """
    Args:
        X    Input vector of expression values, shape=(n_cells)
        knn  KNN connectivity matrix, shape=(n_cells, n_cells)

    Remarks:
        knn is assumed to have a single k for all cells (if not, the
        enrichment will be only approximate).
    """
    n_cells = X.shape[0]
    nonzeros = (X > 0).astype('int')
    nz_bycell = knn.multiply(nonzeros).sum(axis=1)
    sum_bycell = knn.multiply(X).sum(axis=1)
    total_nz = nonzeros.sum()
    total_sum = X.sum()
    nz_enrichment = (nz_bycell / k + self.epsilon) / ((total_nz - nz_bycell) / (n_cells - k) + self.epsilon)
    mean_enrichment = (sum_bycell / k + self.epsilon) / ((total_sum - sum_bycell) / (n_cells - k) + self.epsilon)
    return (nz_enrichment.A * mean_enrichment.A).T[0]
def normalize_adj(adj: sp.csr_matrix):
    """Normalize adjacency matrix and convert it to a sparse tensor."""
    if sp.isspmatrix(adj):
        adj = adj.tolil()
        adj.setdiag(1)
        adj = adj.tocsr()
        deg = np.ravel(adj.sum(1))
        deg_sqrt_inv = 1 / np.sqrt(deg)
        adj_norm = adj.multiply(deg_sqrt_inv[:, None]).multiply(deg_sqrt_inv[None, :])
    elif torch.is_tensor(adj):
        deg = adj.sum(1)
        deg_sqrt_inv = 1 / torch.sqrt(deg)
        adj_norm = adj * deg_sqrt_inv[:, None] * deg_sqrt_inv[None, :]
    else:
        raise TypeError(f'Unsupported adjacency type: {type(adj)}')
    return to_sparse_tensor(adj_norm)
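# --- Usage sketch (not from the original snippet): the scipy branch above is the symmetric
# normalization D^(-1/2) (A + I) D^(-1/2). A minimal standalone check of that step on a toy
# graph (the torch branch and the `to_sparse_tensor` helper are assumed to live elsewhere):
import numpy as np
import scipy.sparse as sp

adj = sp.csr_matrix(np.array([[0, 1, 0],
                              [1, 0, 1],
                              [0, 1, 0]], dtype=np.float64))
adj = adj.tolil()
adj.setdiag(1)                                  # add self-loops, as normalize_adj does
adj = adj.tocsr()
deg = np.ravel(adj.sum(1))                      # node degrees after adding self-loops
deg_sqrt_inv = 1 / np.sqrt(deg)
adj_norm = adj.multiply(deg_sqrt_inv[:, None]).multiply(deg_sqrt_inv[None, :])
print(adj_norm.toarray())                       # each entry is A_ij / sqrt(deg_i * deg_j)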
def _compute_sparse_gradient(hat_vect_matrix: sparse.csr_matrix, X: sparse.csr_matrix,
                             z: np.ndarray, y: np.ndarray) -> np.ndarray:
    # grad_z = (hat_vect_matrix.multiply(z @ y.T - X)).sum(axis=1); return grad_z.A
    #   <-- compact but memory-inefficient implementation (`z @ y.T` is dense)
    # sparse matrices are represented by (data, rows, cols) index arrays
    sparse_X_tuple = (X.data, *X.nonzero())
    # build the difference matrix directly in sparse form, so the dense `z @ y.T - X`
    # intermediate is never materialised
    diff_matrix = sparse.csr_matrix(
        _compute_sparse_difference_matrix(sparse_X_tuple, z, y),
        shape=X.shape, dtype=z.dtype)
    # print("Norm-check", np.linalg.norm(hat_vect_matrix.multiply(diff_matrix).toarray()
    #                                    - hat_vect_matrix.multiply(z @ y.T - X).toarray()))
    # sum over rows (axis=1)
    return hat_vect_matrix.multiply(diff_matrix).sum(axis=1)
def Lloyd_iteration2(A, P, w, Q):
    # one weighted Lloyd (k-means) update: assign each row of A to its nearest centroid in Q,
    # then recompute every centroid as the weighted mean of its assigned rows
    dists, Tags, _ = squaredis(P, Q)
    print('finish squaredis')
    Qjl = SM((Q.shape[0], A.shape[1]))
    wq = np.zeros((Q.shape[0], 1))
    w = np.reshape(w, (len(w), 1))
    wmin = 0
    for i in range(Qjl.shape[0]):
        # print(i)
        inds = np.where(Tags == i)[0]
        wi = w[inds, :] - wmin
        # weighted sum of the rows assigned to centroid i
        Qjl[i, :] = (A[inds, :].multiply(wi)).sum(0)
        wq[i, :] = np.sum(wi, 0)
    # avoid division by zero for empty clusters
    wq[wq == 0] = 1
    wqi = 1 / wq
    Qjl = Qjl.multiply(wqi + wmin)
    return SM(Qjl)
def _assign_train_indices(self, tree: HuffmanTree, y: csr_matrix) -> HuffmanTree:
    """
    Assigns indices of train data to the nodes of the label tree.

    :param tree: Label Tree
    :param y: Label Matrix
    :return: tree with data assigned
    """
    if self.verbose:
        print('Assigning train data to tree nodes')

    for node in tree.bfs_traverse():
        compare_row = np.zeros(y[0].shape).ravel()
        compare_row[node.label_idx] = 1
        compare_row = csr_matrix(compare_row)
        # element-wise multiplication keeps only the columns that belong to this node's labels
        node.train_idx = y.multiply(compare_row)
        node.y = node.train_idx.max(axis=1).astype('int8').toarray().ravel()
    return tree
def __call__(self, matrix: csr_matrix):
    SUM = matrix.sum()
    row_sum = matrix.sum(axis=1)
    col_sum = matrix.sum(axis=0)

    # do: 1 / each cell
    row_sum = _safe_divide(1, row_sum)
    col_sum = _safe_divide(1, col_sum)
    row_sum *= SUM

    if self.smooth:
        row_sum = np.power(row_sum, SMOOTH_POWER)

    res = matrix.multiply(row_sum).multiply(col_sum).tocsr()
    res.data = np.log(res.data)
    res.eliminate_zeros()
    return res
def sparse_average_precision_at_k(y_true: csr_matrix, y_scores: csr_matrix, k: int = 5) -> float:
    """
    Computes the average precision at k for sparse binary matrices.

    :param y_true: ground truth in binary format (n_samples, n_labels)
    :param y_scores: predictions in a representation that can be ranked (e.g. probabilities)
    :param k: top k labels to check
    :return: precision at k score
    """
    if y_true.shape != y_scores.shape:
        raise Exception('y_true and y_scores must have the same shape')
    if y_true.shape[1] < k:
        raise Exception('Less labels than k')

    # get the indices of the k top values of y_scores
    top_idx = top_n_idx_sparse(y_scores, k)

    # create a new matrix with shape == y_true.shape containing only the top ranked labels
    y_pred_binary_only_top = lil_matrix(y_true.shape, dtype='int8')
    for index, idx_row in enumerate(top_idx):
        y_pred_binary_only_top[index, idx_row] = 1
    y_pred_binary_only_top = y_pred_binary_only_top.tocsr()

    # compute precision: element-wise multiplication keeps only the correctly predicted labels
    correct_labelled = y_true.multiply(y_pred_binary_only_top)
    summed_precision = []
    for index, (row, score_row) in enumerate(zip(correct_labelled, y_scores)):
        # special case: the corresponding y_true row is empty => unlabelled instance
        if y_true[index].count_nonzero() == 0:
            # if no labels were predicted either, count it as a perfect prediction
            if score_row.count_nonzero() == 0:
                summed_precision.append(1.0)
            else:
                summed_precision.append(0)
        else:
            summed_precision.append(row.count_nonzero() / k)

    return sum(summed_precision) / len(summed_precision)
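# --- Usage sketch (not from the original snippet): `top_n_idx_sparse` is defined elsewhere in
# the original code base; the stand-in below is a hypothetical minimal version (per-row column
# indices of the n largest stored values) so the precision@k function can be exercised end to end.
import numpy as np
from scipy.sparse import csr_matrix

def top_n_idx_sparse(matrix: csr_matrix, n: int):
    # hypothetical helper: for each row, return the columns of the n largest non-zero scores
    top_idx = []
    for start, end in zip(matrix.indptr[:-1], matrix.indptr[1:]):
        row_cols = matrix.indices[start:end]
        row_data = matrix.data[start:end]
        top_idx.append(row_cols[np.argsort(row_data)[::-1][:n]])
    return top_idx

y_true = csr_matrix(np.array([[1, 0, 1, 0],
                              [0, 1, 0, 0]]))
y_scores = csr_matrix(np.array([[0.9, 0.1, 0.8, 0.0],
                                [0.2, 0.7, 0.0, 0.1]]))
print(sparse_average_precision_at_k(y_true, y_scores, k=2))  # (2/2 + 1/2) / 2 = 0.75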
def compute_tf_idf(doc_matrix: sparse.csr_matrix) -> sparse.csr_matrix:
    # assume doc_matrix already has rows summing to 1, so only the idf needs to be computed
    # first, find the coordinates of all non-zero entries
    i, nonzero_cols, v = sparse.find(doc_matrix)
    # then count the number of non-zero entries in each column
    nonzero_cols, nonzero_col_appearences = np.unique(nonzero_cols, return_counts=True)
    # some columns may be entirely zero, so build an explicit per-column indicator
    indicator = np.zeros(shape=(1, doc_matrix.shape[1]), dtype=np.float32)
    indicator[0, nonzero_cols] = nonzero_col_appearences[:]
    n_articles = doc_matrix.shape[0]
    indicator = np.log(n_articles / indicator)
    # element-wise multiplication of tf by idf
    tf_idf = sparse.csr_matrix(doc_matrix.multiply(indicator))
    return tf_idf
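# --- Usage sketch (not from the original snippet): a small document-term matrix whose rows
# already sum to 1 (the tf part), passed through the idf weighting above; toy data only.
import numpy as np
from scipy import sparse

tf = sparse.csr_matrix(np.array([[0.5, 0.5, 0.0],
                                 [1.0, 0.0, 0.0],
                                 [0.0, 0.5, 0.5]], dtype=np.float32))
print(compute_tf_idf(tf).toarray())  # idf = log(n_docs / doc_frequency); the rarest
                                     # term (last column) gets the largest weight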
def compute_norms(matrix: sparse.csr_matrix) -> np.ndarray:
    """Computes the L2 norm of each row."""
    return np.sqrt(matrix.multiply(matrix).sum(axis=1).A).flatten()
def sparse_pos_clip(a: csr_matrix):
    # keep only the positive entries of `a`, staying sparse throughout
    # TODO: optimize
    return a.multiply(a > 0)
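# --- Usage sketch (not from the original snippet): clipping negative entries to zero while
# keeping the matrix sparse; toy data only, assuming sparse_pos_clip above is in scope.
import numpy as np
from scipy.sparse import csr_matrix

m = csr_matrix(np.array([[1.0, -2.0, 0.0],
                         [-0.5, 0.0, 3.0]]))
print(sparse_pos_clip(m).toarray())  # [[1. 0. 0.]
                                     #  [0. 0. 3.]]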
def sparse_l2_norm(*, matrix: csr_matrix) -> np.ndarray:
    """
    Return the l2 norm of each row of an input csr sparse matrix.

    This is significantly faster and less memory intensive than densifying
    the matrix and passing it to numpy.
    """
    return np.sqrt(np.sum(matrix.multiply(matrix), axis=1))
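# --- Usage sketch (not from the original snippet): row-wise L2 norms computed on sparse data,
# checked against the dense NumPy equivalent; note the sparse version returns shape (n_rows, 1).
import numpy as np
from scipy.sparse import csr_matrix

m = csr_matrix(np.array([[3.0, 4.0, 0.0],
                         [0.0, 0.0, 2.0]]))
print(sparse_l2_norm(matrix=m))              # [[5.] [2.]]
print(np.linalg.norm(m.toarray(), axis=1))   # [5. 2.]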