def weight_matrix_by_user_feature_counts(dataMatrix: sps.csr_matrix, UCM: sps.csr_matrix,
                                         strategy="linear"):
    """
    Assumes that rows of dataMatrix are users and returns the weighted dataMatrix
    based on the user feature counts of UCM

    :param dataMatrix: matrix to be weighted, with users as rows
    :param UCM: User Content Matrix (users x features)
    :param strategy: strategy to use in order to put weights
    :return: weighted dataMatrix
    """
    if UCM.shape[0] != dataMatrix.shape[0]:
        raise ValueError("UCM does not contain all users in dataMatrix")

    UCM_popularity = (UCM > 0).sum(axis=0)
    UCM_popularity = np.array(UCM_popularity).squeeze()
    user_list = UCM.tocoo().row
    feature_list = UCM.tocoo().col
    user_feature_list = np.full(shape=dataMatrix.shape[0], fill_value=1)
    user_feature_list[user_list] = UCM_popularity[feature_list]

    users = dataMatrix.tocoo().row
    feature_list_for_user = np.array(user_feature_list[users], dtype=np.float32)

    return _weight_matrix(dataMatrix, feature_list_for_user, strategy)

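# Hedged sketch (not from the source): traces the per-user weight computation
# above on a tiny UCM, without calling `_weight_matrix`, which is not defined
# in this listing. Note that a user with several features ends up with the
# popularity of whichever of its features comes last in COO order.
def _demo_user_feature_count_weights():
    import numpy as np
    import scipy.sparse as sps

    # 3 users x 2 features: feature 0 is shared by users 0 and 1,
    # feature 1 belongs to user 1 only, user 2 has no features
    UCM = sps.csr_matrix(np.array([[1, 0], [1, 1], [0, 0]], dtype=np.float32))

    UCM_popularity = np.array((UCM > 0).sum(axis=0)).squeeze()  # [2 1]
    coo = UCM.tocoo()
    user_feature_list = np.full(shape=UCM.shape[0], fill_value=1)
    user_feature_list[coo.row] = UCM_popularity[coo.col]
    print(user_feature_list)  # [2 1 1]: user 1 keeps its last feature's count,
                              # user 2 falls back to the fill value of 1
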
def sparse_adj_to_edge(adj_matrix: sp.csr_matrix):
    """Convert a Scipy sparse matrix to (edge_index, edge_weight) representation."""
    adj_matrix = adj_matrix.tocoo(copy=False)
    edge_index = np.asarray((adj_matrix.row, adj_matrix.col))
    edge_weight = adj_matrix.data.copy()
    return edge_index, edge_weight

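# Minimal usage sketch (illustrative, not from the source): a single
# undirected edge yields one COO entry per direction.
def _demo_sparse_adj_to_edge():
    import numpy as np
    import scipy.sparse as sp

    adj = sp.csr_matrix(np.array([[0, 1], [1, 0]], dtype=np.float32))
    edge_index, edge_weight = sparse_adj_to_edge(adj)
    print(edge_index)   # [[0 1]   <- source nodes
                        #  [1 0]]  <- target nodes
    print(edge_weight)  # [1. 1.]
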
def _save(
    filepath: Union[str, Path],
    matrix: csr_matrix,
    cell_ids: pd.DataFrame,
    features: pd.DataFrame,
    save_pickle: bool = False,
    save_rds: bool = False,
    save_h5ad: bool = False,
    save_loom: bool = False,
    meta: Optional[pd.DataFrame] = None,
):
    """Save the count matrix in any combination of pickle, RDS, h5ad and loom formats."""
    filepath = Path(filepath)
    if save_pickle:
        build_counts_store(matrix.tocoo(), cell_ids, features, save_path=filepath)
    if save_rds:
        Convert.pickle_to_rds_dir(filepath.parent)
    if save_h5ad or save_loom:
        cell_ids = meta if meta is not None else pd.DataFrame(cell_ids).set_index(0)
        cell_ids.index.name = "index"
        features = features.rename(columns={
            "ensgs": "gene_ids",
            "genes": "index"
        }).set_index("index")
        adata = AnnData(matrix.tocsr(), cell_ids, features)
        if save_h5ad:
            adata.write_h5ad(filepath.parent / "rna.h5ad")
        if save_loom:
            adata.write_loom(filepath.parent / "rna.loom")

def convert_URM_to_FM(URM: csr_matrix):
    """
    Convert positive interactions of a URM in the way that is needed for the FM model.
    - Each row encodes one interaction with two ones: one for the user, one for the item
    - Only positive samples are encoded here

    Note: this method works only for implicit datasets

    :param URM: URM to be preprocessed
    :return: csr_matrix containing the URM preprocessed in the described way
    """
    n_users = URM.shape[0]
    n_items = URM.shape[1]
    n_sample = len(URM.data)

    FM_matrix = sps.coo_matrix((n_sample, n_users + n_items))

    # Setting rows
    FM_matrix.row = np.repeat(np.arange(n_sample), 2)  # one row has two ones

    # Setting cols: interleave user indices with item indices offset by n_users
    row = np.reshape(URM.tocoo().row, newshape=(n_sample, 1))
    col = np.reshape(URM.tocoo().col + n_users, newshape=(n_sample, 1))
    row_col = np.concatenate([row, col], axis=1)
    unrolled_row_col = np.reshape(row_col, newshape=len(FM_matrix.row))
    FM_matrix.col = unrolled_row_col

    # Setting data
    FM_matrix.data = np.ones(len(FM_matrix.row), dtype=np.float32)

    return FM_matrix.tocsr()

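# Hedged usage sketch (not from the source): a 2-users x 2-items URM with
# three positive interactions becomes a 3 x 4 design matrix, each row holding
# exactly two ones (the user column, then the item column offset by n_users).
def _demo_convert_URM_to_FM():
    import numpy as np
    import scipy.sparse as sps

    URM = sps.csr_matrix(np.array([[1, 0], [1, 1]], dtype=np.float32))
    FM = convert_URM_to_FM(URM)
    print(FM.toarray())
    # [[1. 0. 1. 0.]    interaction (user 0, item 0)
    #  [0. 1. 1. 0.]    interaction (user 1, item 0)
    #  [0. 1. 0. 1.]]   interaction (user 1, item 1)
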
def sparse2dict(matrix: csr_matrix):
    """Convert a csr_matrix to a dictionary format."""
    coo = matrix.tocoo()
    return dict(data=coo.data.tolist(),
                row=coo.row.tolist(),
                col=coo.col.tolist(),
                shape=coo.shape)

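# Illustrative round-trip (not from the source): the dict produced by
# sparse2dict can be fed straight back into scipy.sparse.coo_matrix.
def _demo_sparse2dict():
    import numpy as np
    from scipy.sparse import coo_matrix, csr_matrix

    m = csr_matrix(np.array([[0, 2], [3, 0]], dtype=np.float32))
    d = sparse2dict(m)
    print(d)  # {'data': [2.0, 3.0], 'row': [0, 1], 'col': [1, 0], 'shape': (2, 2)}
    restored = coo_matrix((d["data"], (d["row"], d["col"])), shape=d["shape"]).tocsr()
    assert (restored != m).nnz == 0  # exact round-trip
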
def calculate_normalized_affinity(
    W: csr_matrix
) -> Tuple[csr_matrix, np.ndarray, np.ndarray]:
    """Symmetrically normalize an affinity matrix: W_norm = D^{-1/2} W D^{-1/2}."""
    diag = W.sum(axis=1).A1
    diag_half = np.sqrt(diag)
    W_norm = W.tocoo(copy=True)
    W_norm.data /= diag_half[W_norm.row]
    W_norm.data /= diag_half[W_norm.col]
    W_norm = W_norm.tocsr()

    return W_norm, diag, diag_half

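# Hedged sketch (not from the source): checks that the result equals the
# dense symmetric normalization D^{-1/2} W D^{-1/2} on a tiny affinity matrix.
def _demo_calculate_normalized_affinity():
    import numpy as np
    from scipy.sparse import csr_matrix

    W = csr_matrix(np.array([[0.0, 1.0], [1.0, 0.0]]))
    W_norm, diag, diag_half = calculate_normalized_affinity(W)
    D_inv_sqrt = np.diag(1.0 / diag_half)
    expected = D_inv_sqrt @ W.toarray() @ D_inv_sqrt
    assert np.allclose(W_norm.toarray(), expected)
    print(diag)  # [1. 1.] -- row sums of W
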
def fit(self, graph: sp.csr_matrix):
    """
    Fitting a NodeSketch model.
    """
    self._graph = graph
    self._num_nodes = graph.shape[0]
    self._hash_values = self._generate_hash_values()
    self._sla = graph.tocoo()
    # binarize edge weights before sketching
    self._sla.data = np.array([1 for _ in range(len(self._sla.data))])
    self._sla_original = self._sla.copy()
    self._do_single_sketch()
    for _ in range(self.iterations - 1):
        self._augment_sla()
        self._do_single_sketch()

def _get_log_distances(self, y_distances: csr_matrix, base=0.5) -> csr_matrix:
    """
    Returns the logarithmic version (default base: 0.5) of the distance matrix
    returned by TagEmbeddingClassifier. This must be used in order to compute
    valid precision@k scores, since small distances should be ranked better
    than large ones.

    :param y_distances: sparse distance matrix (multilabel matrix with distances
        instead of binary indicators)
    :param base: base of the log function (must be smaller than one)
    :return: sparse matrix with the log values
    """
    log_y_distances = y_distances.tocoo()
    log_y_distances.data = np.log(log_y_distances.data) / np.log(base)
    return log_y_distances.tocsr()

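# Standalone illustration of the same transform (not from the source; the
# method above needs a class instance): with base 0.5, a distance of 0.25
# maps to log_0.5(0.25) = 2, so smaller distances get larger scores.
def _demo_log_distances():
    import numpy as np
    from scipy.sparse import csr_matrix

    y_distances = csr_matrix(np.array([[0.25, 0.5], [0.0, 1.0]]))
    log_y = y_distances.tocoo()
    log_y.data = np.log(log_y.data) / np.log(0.5)  # same transform as above
    print(log_y.tocsr().toarray())
    # [[2. 1.]
    #  [0. 0.]]  -- log_0.5(1.0) == 0; the unstored (1, 0) entry is untouched
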
def csr2tensor(self, matrix: sp.csr_matrix):
    r"""Convert csr_matrix to tensor.

    Args:
        matrix (scipy.csr_matrix): Sparse matrix to be converted.

    Returns:
        torch.sparse.FloatTensor: Transformed sparse matrix.
    """
    matrix = matrix.tocoo()
    x = torch.sparse.FloatTensor(
        torch.LongTensor(np.array([matrix.row, matrix.col])),
        torch.FloatTensor(matrix.data.astype(np.float32)),
        matrix.shape,
    ).to(self.device)
    return x

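# Hedged standalone version (not from the source; the method above needs a
# class instance with `self.device`): same conversion with an explicit CPU
# device. It mirrors the legacy torch.sparse.FloatTensor constructor used
# above; newer PyTorch also offers torch.sparse_coo_tensor for this.
def _demo_csr2tensor():
    import numpy as np
    import scipy.sparse as sp
    import torch

    matrix = sp.csr_matrix(np.array([[0, 1], [2, 0]])).tocoo()
    x = torch.sparse.FloatTensor(
        torch.LongTensor(np.array([matrix.row, matrix.col])),
        torch.FloatTensor(matrix.data.astype(np.float32)),
        matrix.shape,
    ).to(torch.device("cpu"))
    print(x.to_dense())
    # tensor([[0., 1.],
    #         [2., 0.]])
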
def dropcols_coo(csr_mat: sp.csr_matrix, idx_to_drop):
    """
    Drop columns of a sparse matrix.
    http://stackoverflow.com/questions/23966923/delete-columns-of-matrix-of-csr-format-in-python
    """
    idx_to_drop = np.unique(idx_to_drop)
    coo_mat = csr_mat.tocoo()
    keep = ~np.in1d(coo_mat.col, idx_to_drop)
    coo_mat.data = coo_mat.data[keep]
    coo_mat.row = coo_mat.row[keep]
    coo_mat.col = coo_mat.col[keep]
    # decrement column indices to close the gaps left by the dropped columns
    coo_mat.col -= idx_to_drop.searchsorted(coo_mat.col)
    coo_mat._shape = (coo_mat.shape[0], coo_mat.shape[1] - len(idx_to_drop))
    return coo_mat.tocsr()

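# Minimal usage sketch (illustrative, not from the source): dropping the
# middle column of a 2x3 matrix shifts the remaining column indices left.
def _demo_dropcols_coo():
    import numpy as np
    import scipy.sparse as sp

    m = sp.csr_matrix(np.array([[1, 2, 3], [4, 5, 6]]))
    print(dropcols_coo(m, [1]).toarray())
    # [[1 3]
    #  [4 6]]
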
def weight_matrix_by_user_profile(dataMatrix: sps.csr_matrix, URM, strategy="linear"):
    """
    Assumes that rows of dataMatrix are users and returns the weighted dataMatrix
    based on the user profile lengths of URM

    :param dataMatrix: matrix to be weighted, with users as rows
    :param URM: User Rating Matrix (users x items)
    :param strategy: strategy to use in order to put weights
    :return: weighted dataMatrix
    """
    if URM.shape[0] != dataMatrix.shape[0]:
        raise ValueError("URM does not contain all users in dataMatrix")

    user_activity = (URM > 0).sum(axis=1)
    user_activity = np.array(user_activity).squeeze()

    users = dataMatrix.tocoo().row
    user_profile_for_user = np.array(user_activity[users], dtype=np.float32)

    return _weight_matrix(dataMatrix, user_profile_for_user, strategy)

def weight_matrix_by_item_feature_value(dataMatrix: sps.csr_matrix, ICM: sps.csr_matrix,
                                        strategy="linear"):
    """
    Assumes that rows of dataMatrix are items and returns the weighted dataMatrix
    based on the item feature values of ICM

    :param dataMatrix: matrix to be weighted, with items as rows
    :param ICM: ICM with only one column
    :param strategy: strategy to use in order to put weights
    :return: weighted dataMatrix
    """
    if ICM.shape[0] != dataMatrix.shape[0]:
        raise ValueError("ICM does not contain all items in dataMatrix")

    item_list = dataMatrix.tocoo().row
    item_feature_weights = np.array(ICM[item_list].todense()).squeeze()
    # replace missing (zero) feature values with the mean of the weight vector
    mean_value = item_feature_weights.mean()
    item_feature_weights[item_feature_weights == 0] = mean_value

    return _weight_matrix(dataMatrix, item_feature_weights, strategy)

def weight_matrix_by_item_popularity(dataMatrix: sps.csr_matrix, R_iu: sps.csr_matrix,
                                     strategy="linear"):
    """
    Assumes that dataMatrix has items as rows

    :param dataMatrix: csr matrix with items as rows
    :param R_iu: Rating item x user matrix
    :param strategy: strategy to use in order to put weights
    :return: weighted dataMatrix
    """
    if R_iu.shape[0] != dataMatrix.shape[0]:
        raise ValueError("R_iu does not contain all items in dataMatrix")

    item_popularity = (R_iu > 0).sum(axis=1)
    item_popularity = np.array(item_popularity).squeeze()

    items = dataMatrix.tocoo().row
    item_popularity_for_item = np.array(item_popularity[items], dtype=np.float32)

    return _weight_matrix(dataMatrix, item_popularity_for_item, strategy)

def _reweight_values(self, doc_term_matrix: sp.csr_matrix) -> sp.csr_matrix:
    """
    Re-weight values in a doc-term matrix according to parameters specified
    in :class:`Vectorizer` initialization: binary or tf-idf weighting,
    sublinear term-frequency, document-normalized weights.

    Args:
        doc_term_matrix

    Returns:
        Reweighted doc-term matrix.
    """
    # re-weight the local components (term freqs)
    if self.tf_type == "binary":
        doc_term_matrix.data.fill(1)
    elif self.tf_type == "bm25":
        if not self.dl_type:
            doc_term_matrix.data = (
                doc_term_matrix.data
                * (BM25_K1 + 1.0)
                / (BM25_K1 + doc_term_matrix.data)
            )
        else:
            dls = get_doc_lengths(doc_term_matrix, type_=self.dl_type)
            length_norm = (1 - BM25_B) + (BM25_B * (dls / self._avg_doc_length))
            doc_term_matrix = doc_term_matrix.tocoo(copy=False)
            doc_term_matrix.data = (
                doc_term_matrix.data
                * (BM25_K1 + 1.0)
                / (doc_term_matrix.data + (BM25_K1 * length_norm[doc_term_matrix.row]))
            )
            doc_term_matrix = doc_term_matrix.tocsr(copy=False)
    elif self.tf_type == "sqrt":
        _ = np.sqrt(doc_term_matrix.data, doc_term_matrix.data, casting="unsafe")
    elif self.tf_type == "log":
        _ = np.log(doc_term_matrix.data, doc_term_matrix.data, casting="unsafe")
        doc_term_matrix.data += 1.0
    elif self.tf_type == "linear":
        pass  # tfs are already linear
    else:
        # this should never raise, i'm just being a worrywart
        raise ValueError(
            errors.value_invalid_msg(
                "tf_type", self.tf_type, {"binary", "bm25", "sqrt", "log", "linear"}
            )
        )

    # apply the global component (idfs), column-wise
    if self.idf_type:
        doc_term_matrix = doc_term_matrix * self._idf_diag

    # apply normalizations, row-wise,
    # unless we've already handled it for bm25-style tf
    if self.dl_type and self.tf_type != "bm25":
        n_docs, _ = doc_term_matrix.shape
        dls = get_doc_lengths(doc_term_matrix, type_=self.dl_type)
        dl_diag = sp.spdiags(1.0 / dls, diags=0, m=n_docs, n=n_docs, format="csr")
        doc_term_matrix = dl_diag * doc_term_matrix
    if self.norm is not None:
        doc_term_matrix = normalize_mat(doc_term_matrix, norm=self.norm, axis=1, copy=False)

    return doc_term_matrix

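# Standalone illustration of the bm25 local weighting used above (not from
# the source; k1 and b are illustrative values, not necessarily the module's
# BM25_K1/BM25_B constants): raw term frequencies saturate sublinearly, and
# documents longer than average are discounted.
def _demo_bm25_reweight():
    import numpy as np

    k1, b = 1.6, 0.75
    tf = np.array([1.0, 2.0, 10.0])   # raw term freqs in one doc
    dl, avg_dl = 120.0, 100.0         # doc length vs corpus average
    length_norm = (1 - b) + b * (dl / avg_dl)
    weighted = tf * (k1 + 1.0) / (tf + k1 * length_norm)
    print(weighted.round(3))  # [0.915 1.354 2.196] -- grows sublinearly in tf
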
def augment_adj(adj_matrix: sp.csr_matrix,
                nodes: Union[list, int, np.ndarray],
                edge_weight: np.ndarray = None,
                *,
                nbrs_to_link: Union[list, np.ndarray, None] = None,
                common_nbrs: Union[list, np.ndarray, None] = None,
                fill_weight: float = 1.0) -> sp.csr_matrix:
    """Augment a specified adjacency matrix by linking nodes to each element
    in `nbrs_to_link`.

    Examples
    ----------
    # add 2 nodes adjacent to [2,3] and 3, respectively.
    >>> augmented_adj = augment_adj(adj_matrix, nodes=2,
                                    nbrs_to_link=[[2,3],3],
                                    fill_weight=1.0)

    # add 2 nodes all adjacent to [1,2,3].
    >>> augmented_adj = augment_adj(adj_matrix, nodes=2,
                                    common_nbrs=[1,2,3],
                                    fill_weight=1.0)

    # add 3 edges, [3,1], [4,2], [5,3].
    >>> augmented_adj = augment_adj(adj_matrix, nodes=[3,4,5],
                                    common_nbrs=[1,2,3],
                                    fill_weight=1.0)

    Parameters
    ----------
    adj_matrix: shape [num_nodes, num_nodes].
        A Scipy sparse adjacency matrix.
    nodes: the nodes that will be linked to the graph.
        list or np.array: the nodes connected to `nbrs_to_link`;
        int: newly added nodes connected to `nbrs_to_link`,
        with node ids [num_nodes, ..., num_nodes+nodes-1].
    nbrs_to_link: a list of N elements, where N is the length of `nodes`.
        The specified neighbor(s) for each added node.
        If `None`, it will be set to `[0, ..., N-1]`.
    common_nbrs: shape [None,].
        Specified common neighbors for each added node.
    fill_weight: edge weight for the augmented edges.

    NOTE
    ----------
    `nbrs_to_link` and `common_nbrs` must not be specified together.

    See Also
    ----------
    graphgallery.functional.augment_edge

    """
    adj_matrix = adj_matrix.tocoo(copy=False)
    edge_index = adj_matrix.row, adj_matrix.col
    augmented_edge_index, augmented_edge_weight = augment_edge(
        edge_index, nodes,
        edge_weight=adj_matrix.data,
        nbrs_to_link=nbrs_to_link,
        common_nbrs=common_nbrs,
        fill_weight=fill_weight)

    N = augmented_edge_index.max() + 1
    augmented_adj = sp.csr_matrix((augmented_edge_weight, augmented_edge_index),
                                  shape=(N, N))
    augmented_adj.eliminate_zeros()
    return augmented_adj