def evaluate_item_with_code(train: ss.csr_matrix, test: ss.csr_matrix, user: np.ndarray, item_code: np.ndarray, item_center: List[np.ndarray], topk=200, cutoff=200):
    train = train.tocsr()
    test = test.tocsr()
    #result1 = Eval.topk_search_with_code(train, user, item_code, item_center, topk)
    result = Eval.topk_search_with_code_fast(train, user, item_code, item_center, topk)
    return Eval.evaluate_topk(train, test, result, cutoff)
def evaluate_item(train: ss.csr_matrix, test: ss.csr_matrix, user: np.ndarray, item: np.ndarray, topk: int = 200, cutoff: int = 200):
    train = train.tocsr()
    test = test.tocsr()
    idx = np.squeeze((test.sum(axis=1) > 0).A)
    train = train[idx, :]
    test = test[idx, :]
    user = user[idx, :]
    N = train.shape[1]
    cand_count = N - train.sum(axis=1)
    if topk < 0:
        mat_rank = Eval.predict(train, test, user, item)
    else:
        mat_rank = Eval.topk_search_(train, test, user, item, topk)
    return Eval.compute_item_metric(test, mat_rank, cand_count, cutoff)
def evaluate_item(train: ss.csr_matrix, test: ss.csr_matrix, user: np.ndarray, item: np.ndarray, topk: int = -1, cutoff: int = 100):
    train = train.tocsr()
    test = test.tocsr()
    N = train.shape[1]
    cand_count = N - train.sum(axis=1)
    if topk < 0:
        mat_rank = Eval.predict(train, test, user, item)
    else:
        mat_rank = Eval.topk_search_(train, test, user, item, topk)
    return Eval.compute_item_metric(test, mat_rank, cand_count, cutoff)
def _save(
    filepath: Union[str, Path],
    matrix: csr_matrix,
    cell_ids: pd.DataFrame,
    features: pd.DataFrame,
    save_pickle: bool = False,
    save_rds: bool = False,
    save_h5ad: bool = False,
    save_loom: bool = False,
    meta: Optional[pd.DataFrame] = None,
):
    filepath = Path(filepath)
    if save_pickle:
        build_counts_store(matrix.tocoo(), cell_ids, features, save_path=filepath)
    if save_rds:
        Convert.pickle_to_rds_dir(filepath.parent)
    if save_h5ad or save_loom:
        cell_ids = meta if meta is not None else pd.DataFrame(cell_ids).set_index(0)
        cell_ids.index.name = "index"
        features = features.rename(columns={"ensgs": "gene_ids", "genes": "index"}).set_index("index")
        adata = AnnData(matrix.tocsr(), cell_ids, features)
        if save_h5ad:
            adata.write_h5ad(filepath.parent / "rna.h5ad")
        if save_loom:
            adata.write_loom(filepath.parent / "rna.loom")
def evaluate_topk(train: ss.csr_matrix, test: ss.csr_matrix, topk_item: np.ndarray, cutoff: int = 200):
    train = train.tocsr()
    test = test.tocsr()
    result = topk_item
    N = train.shape[1]
    cand_count = N - train.sum(axis=1)
    M = test.shape[0]
    uir = []
    for i in range(M):
        R = set(test.indices[test.indptr[i]:test.indptr[i + 1]])
        for k in range(result.shape[1]):
            if result[i, k] in R:
                uir.append((i, result[i, k], k))
    user_id, item_id, rank = zip(*uir)
    mat_rank = ss.csr_matrix((rank, (user_id, item_id)), shape=test.shape)
    return Eval.compute_item_metric(test, mat_rank, cand_count, cutoff)
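# Hedged usage sketch (illustrative only, not from the original source): the core of
# evaluate_topk records at which position k each relevant test item appears in the
# per-user top-k list; Eval.compute_item_metric (assumed elsewhere) then turns that
# rank matrix into the final metrics.
import numpy as np
import scipy.sparse as ss

test = ss.csr_matrix(np.array([[0, 1, 0, 1],
                               [1, 0, 0, 0]]))
topk_item = np.array([[3, 0, 1],   # ranked item ids for user 0
                      [2, 1, 0]])  # ranked item ids for user 1
uir = []
for i in range(test.shape[0]):
    relevant = set(test.indices[test.indptr[i]:test.indptr[i + 1]])
    for k in range(topk_item.shape[1]):
        if topk_item[i, k] in relevant:
            uir.append((i, topk_item[i, k], k))
print(uir)  # [(0, 3, 0), (0, 1, 2), (1, 0, 2)]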
def neighbor_sampler(adj_matrix: sp.csr_matrix, max_degree: int = 25, selfloop: bool = False):
    adj_matrix = adj_matrix.tocsr(copy=False)
    N = adj_matrix.shape[0]
    neighbors_matrix = N * np.ones((N + 1, max_degree), dtype=intx())
    for nodeid in range(N):
        neighbors = adj_matrix[nodeid].indices
        # if not selfloop:
        #     neighbors = np.setdiff1d(neighbors, [nodeid])
        # else:
        #     neighbors = np.intersect1d(neighbors, [nodeid])
        size = neighbors.size
        if size == 0:
            continue
        if size > max_degree:
            neighbors = np.random.choice(neighbors, max_degree, replace=False)
        elif size < max_degree:
            neighbors = np.random.choice(neighbors, max_degree, replace=True)
        neighbors_matrix[nodeid] = neighbors
    np.random.shuffle(neighbors_matrix.T)
    return neighbors_matrix
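# Hedged usage sketch (illustrative only): sample a fixed-size, padded neighbor table
# from a tiny 4-node graph. Assumes intx() above resolves to an integer dtype such as
# np.int32; the extra last row filled with N acts as a sentinel for padding.
import numpy as np
import scipy.sparse as sp

adj = sp.csr_matrix(np.array([[0, 1, 1, 0],
                              [1, 0, 0, 0],
                              [1, 0, 0, 1],
                              [0, 0, 1, 0]], dtype=np.float32))
neigh = neighbor_sampler(adj, max_degree=3)
print(neigh.shape)  # (5, 3): one row per node plus the sentinel row of value N = 4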
def _k_neighbors_precomputed_sparse(self, X: csr_matrix, n_samples: int = None):
    """Find nearest neighbors in a sparse distance matrix.

    Parameters
    ----------
    X : sparse matrix, shape = [n_test, n_indexed]
        Sparse distance matrix. Only non-zero elements may be considered neighbors.
    n_samples : int
        Number of sampled indexed objects, e.g. in approximate hubness reduction.
        If None, this is inferred from the first row of X.

    Returns
    -------
    k_neighbors : ndarray
        Flattened array of neighbor indices.
    """
    if not issparse(X):
        raise TypeError('Matrix X is not sparse')
    X = X.tocsr()
    if n_samples is None:
        n_samples = X.indptr[1] - X.indptr[0]
    n_test, _ = X.shape
    if np.all(X.indptr[1:] - X.indptr[:-1] == n_samples) and not self.shuffle_equal:
        # All rows have the same number of explicit entries: vectorized partition.
        min_ind = np.argpartition(X.data.reshape(n_test, n_samples), kth=np.arange(self.k), axis=1)[:, :self.k]
        k_neighbors = X.indices[min_ind.ravel() + np.repeat(X.indptr[:-1], repeats=self.k)]
    else:
        # To allow a different number of explicit entries per row,
        # we need to process the matrix row-by-row.
        k_neighbors = np.empty((n_test, ), dtype=object)
        if self.verbose:
            range_n_test = tqdm(range(n_test))
        else:
            range_n_test = range(n_test)
        if self.shuffle_equal:
            for i in range_n_test:
                x = X.getrow(i)
                rp = self._random_state.permutation(x.nnz)
                d2 = x.data[rp]
                d2idx = np.argpartition(d2, kth=np.arange(self.k))
                k_neighbors[i] = x.indices[rp[d2idx[:self.k]]]
        else:
            for i in range_n_test:
                x = X.getrow(i)
                min_ind = np.argpartition(x.data, kth=np.arange(self.k))[:self.k]
                k_neighbors[i] = x.indices[min_ind]
        k_neighbors = np.concatenate(k_neighbors)
    return k_neighbors
def compute_reg_scale(X: sps.csr_matrix, alpha0: float, nu: float) -> float:
    X_csr: sps.csr_matrix = X.tocsr()
    X_csr.sort_indices()
    X_csc = X_csr.tocsc()
    X_csc.sort_indices()
    U, I = X.shape
    nnz_row: np.ndarray = X_csr.indptr[1:] - X_csr.indptr[:-1]
    nnz_col: np.ndarray = X_csc.indptr[1:] - X_csc.indptr[:-1]
    return float(((nnz_row + alpha0 * I) ** nu).sum()) + float(((nnz_col + alpha0 * U) ** nu).sum())
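# Hedged usage sketch (illustrative only): the regularization scale sums
# (nnz_row + alpha0 * n_items) ** nu over rows plus the analogous term over columns.
import numpy as np
import scipy.sparse as sps

X = sps.csr_matrix(np.array([[1, 0, 1],
                             [0, 1, 0]]))
scale = compute_reg_scale(X, alpha0=0.1, nu=1.0)
# With nu = 1: (2.3 + 1.3) over rows + (1.2 + 1.2 + 1.2) over columns = 7.2
print(scale)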
def topk_search(train: ss.csr_matrix, user: np.ndarray, item: np.ndarray, topk: int = 200) -> np.ndarray:
    train = train.tocsr()
    M, _ = train.shape
    item_t = item.T
    result = np.zeros((M, topk), dtype=np.int64)
    for i in range(M):
        E = train.indices[train.indptr[i]:train.indptr[i + 1]]
        pred = np.matmul(user[i, :], item_t)
        #pred = np.tensordot(user[i,:], item, [0,-1])
        pred[E] = -np.inf
        idx = np.argpartition(pred, -topk)[-topk:]
        result[i, :] = idx[np.argsort(-pred[idx])]
    return result
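# Hedged usage sketch (illustrative only): rank items for each user by the dot product
# of user and item embeddings, masking out items already seen in train. The embeddings
# here are random placeholders, not from the original source.
import numpy as np
import scipy.sparse as ss

train = ss.csr_matrix(np.array([[1, 0, 0, 1],
                                [0, 1, 0, 0]], dtype=np.float32))
rng = np.random.default_rng(0)
user = rng.normal(size=(2, 8))  # one embedding per user
item = rng.normal(size=(4, 8))  # one embedding per item
top = topk_search(train, user, item, topk=2)
print(top)  # shape (2, 2): the two highest-scoring unseen items per user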
def _store_sparse(self, group: tables.Group, name: str, arr: csr_matrix) -> None:
    if not sparse.isspmatrix_csr(arr):
        arr = arr.tocsr()
    csr_group = self.h5f.create_group(group, name)
    csr_group.was_sparse = True
    if arr is not None and arr.nnz > 0:
        self.h5f.create_array(csr_group, 'data', arr.data)
        self.h5f.create_array(csr_group, 'indptr', arr.indptr)
        self.h5f.create_array(csr_group, 'indices', arr.indices)
        self.h5f.create_array(csr_group, 'shape', arr.shape)
    self.h5f.flush()
def normalize_adj(adj: sp.csr_matrix):
    """Normalize adjacency matrix and convert it to a sparse tensor."""
    if sp.isspmatrix(adj):
        adj = adj.tolil()
        adj.setdiag(1)
        adj = adj.tocsr()
        deg = np.ravel(adj.sum(1))
        deg_sqrt_inv = 1 / np.sqrt(deg)
        adj_norm = adj.multiply(deg_sqrt_inv[:, None]).multiply(deg_sqrt_inv[None, :])
    elif torch.is_tensor(adj):
        deg = adj.sum(1)
        deg_sqrt_inv = 1 / torch.sqrt(deg)
        adj_norm = adj * deg_sqrt_inv[:, None] * deg_sqrt_inv[None, :]
    return to_sparse_tensor(adj_norm)
def _weight_matrix(dataMatrix: sps.csr_matrix, weights: np.ndarray, strategy="linear"):
    """
    Assuming that rows are the objects to weight, returns a weighted matrix based on the weights and alpha

    :param dataMatrix: dataMatrix with rows being the objects to weight
    :param weights: an array with the same length as the number of nnz inside the dataMatrix
    :param strategy: strategy to use in order to apply the weights
    :return: new data matrix with weighted data
    """
    if len(weights) != len(dataMatrix.data):
        raise ValueError("Demographic list does not contain all users in dataMatrix")
    matrix = dataMatrix.tocsr(copy=True)
    strategy_funct = STRATEGY_MAPPER[strategy]
    matrix = strategy_funct.weight_matrix(matrix, feature_data=weights)
    return sps.csr_matrix(matrix)
def _preprocess_URM_all(self, URM_all: sps.csr_matrix):
    warm_items_mask = np.ediff1d(URM_all.tocsc().indptr) > self.threshold_items
    self.warm_items = np.arange(URM_all.shape[1])[warm_items_mask]
    URM_all = URM_all[:, self.warm_items]

    warm_users_mask = np.ediff1d(URM_all.tocsr().indptr) > self.threshold_users
    self.warm_users = np.arange(URM_all.shape[0])[warm_users_mask]
    URM_all = URM_all[self.warm_users, :]

    self._LOADED_GLOBAL_MAPPER_DICT["user_original_ID_to_index"] = reconcile_mapper_with_removed_tokens(
        self._LOADED_GLOBAL_MAPPER_DICT["user_original_ID_to_index"],
        np.arange(0, len(warm_users_mask), dtype=int)[np.logical_not(warm_users_mask)])

    self._LOADED_GLOBAL_MAPPER_DICT["item_original_ID_to_index"] = reconcile_mapper_with_removed_tokens(
        self._LOADED_GLOBAL_MAPPER_DICT["item_original_ID_to_index"],
        np.arange(0, len(warm_items_mask), dtype=int)[np.logical_not(warm_items_mask)])

    return URM_all
def with_cliques(adjacency: sp.csr_matrix, clique_size: int, num_cliques: int = 1) -> Tuple[sp.csr_matrix, np.ndarray]:
    """
    Get adjacency matrix of dataset with cliques.

    A clique is defined as a set of nodes where each node is a neighbor of every other node.

    Args:
        adjacency: adjacency matrix to start with.
        clique_size: size of each clique.
        num_cliques: number of cliques to add.

    Returns:
        augmented_adjacency: adjacency with cliques added.
        cliques: [num_cliques, clique_size] int32 array of indices of clique nodes.
    """
    num_nodes = adjacency.shape[0]
    adjacency = adjacency.tolil()
    dtype = adjacency.dtype
    rows = adjacency.rows
    data = adjacency.data
    cliques = np.empty((num_cliques, clique_size), dtype=np.int32)
    for i in range(num_cliques):
        clique = np.random.choice(num_nodes, clique_size, replace=False)
        clique.sort()
        cliques[i] = clique
        for c in clique:
            row = set(rows[c])
            contains_c = c in row
            row.update(clique)
            if not contains_c:
                row.remove(c)
            rows[c] = sorted(row)
            data[c] = np.ones((len(row), ), dtype=dtype)
    return adjacency.tocsr(), cliques
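# Hedged usage sketch (illustrative only): add one clique of three nodes to a small
# graph and check that every pair of distinct clique members is now connected.
import numpy as np
import scipy.sparse as sp

adj = sp.csr_matrix(np.array([[0, 1, 0, 0],
                              [1, 0, 0, 0],
                              [0, 0, 0, 1],
                              [0, 0, 1, 0]], dtype=np.float32))
np.random.seed(0)
augmented, cliques = with_cliques(adj, clique_size=3, num_cliques=1)
dense = augmented.toarray()
for a in cliques[0]:
    for b in cliques[0]:
        if a != b:
            assert dense[a, b] == 1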
def split(urm: sps.csr_matrix, range_intervals_list: list, k_out_list: list,
          probability_list: list) -> (sps.csr_matrix, sps.csr_matrix):
    # Initialize timer
    start_time = time.time()

    # Checking the size of the lists
    assert len(range_intervals_list) == len(k_out_list) == len(probability_list), \
        "The input lists must have the same length"

    # Generating a copy of the matrix
    matrix = urm.tocsr(copy=True)

    # Initializing the matrices
    train_matrix = sps.lil_matrix(matrix.shape, dtype=np.float64)
    test_matrix = sps.lil_matrix(matrix.shape, dtype=np.float64)

    # Users
    users = range(matrix.shape[0])

    # Creating a list of triples (range_interval, k_out, probability)
    triple_list = [(r, k, p) for r, k, p in zip(range_intervals_list, k_out_list, probability_list)]

    # Start splitting the input matrix
    for user in users:
        # Finding the items with which the user has interacted
        items = matrix[user].indices
        n_items = len(items)

        # Finding the splitting category for the user
        k, p = -1, -1
        for range_interval, k_out, probability in triple_list:
            if range_interval[0] <= n_items <= range_interval[1]:
                k = k_out
                p = probability
                break

        # Extracting k unique items (if possible)
        n_items_to_be_extracted = min(k, n_items)
        rnd.shuffle(items)
        k_extracted_items = items[:n_items_to_be_extracted]
        remaining_items = items[n_items_to_be_extracted:]

        # Extracting elements with probability p
        p_extracted_items = [item for item in remaining_items if rnd.random() <= p]
        remaining_items = [item for item in remaining_items if item not in p_extracted_items]

        # Incrementally fill the train and test matrices
        for item in remaining_items:
            train_matrix[user, item] = matrix[user, item]
        for item in k_extracted_items:
            test_matrix[user, item] = matrix[user, item]
        for item in p_extracted_items:
            test_matrix[user, item] = matrix[user, item]

    # Converting train matrix and test matrix to csr format
    train_matrix = train_matrix.tocsr()
    test_matrix = test_matrix.tocsr()

    # Checking output matrices
    assert train_matrix.shape == test_matrix.shape == matrix.shape, \
        "Train and test matrices should have the same shape as the input one"
    assert (train_matrix.nnz + test_matrix.nnz) == matrix.nnz, \
        f"The sum of train and test matrices nnz elements should be exactly the nnz elements in the input matrix:\n" \
        f"train matrix nnz elements : {train_matrix.nnz}\n" \
        f"test matrix nnz elements : {test_matrix.nnz}\n" \
        f"matrix nnz elements : {matrix.nnz}\n"

    # Ending time
    end_time = time.time()
    print(f"Time taken to split the input matrix : {end_time - start_time} seconds")

    return train_matrix, test_matrix
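# Hedged usage sketch (illustrative only): hold out 1 interaction for users with 1-2
# interactions and 2 for users with 3 or more, with no probabilistic extraction.
# Assumes rnd above refers to Python's random module and time is imported.
import numpy as np
import scipy.sparse as sps

urm = sps.csr_matrix(np.array([[1, 1, 0, 1],
                               [0, 1, 1, 0],
                               [1, 1, 1, 1]], dtype=np.float64))
train, test = split(urm,
                    range_intervals_list=[(1, 2), (3, np.inf)],
                    k_out_list=[1, 2],
                    probability_list=[0.0, 0.0])
print(train.nnz, test.nnz)  # the two counts sum to urm.nnz (here 9)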
def __init__(self, adj_matrix: sp.csr_matrix, attr_matrix=None, labels: np.ndarray = None,
             node_names: np.ndarray = None, attr_names: np.ndarray = None,
             class_names: np.ndarray = None, metadata: object = None):
    """Create an attributed graph.

    Parameters
    ----------
    adj_matrix : sp.csr_matrix, shape [num_nodes, num_nodes]
        Adjacency matrix in CSR format.
    attr_matrix : sp.csr_matrix or np.ndarray, shape [num_nodes, num_attr], optional
        Attribute matrix in CSR or numpy format.
    labels : np.ndarray, shape [num_nodes], optional
        Array, where each entry represents the respective node's label(s).
    node_names : np.ndarray, shape [num_nodes], optional
        Names of nodes (as strings).
    attr_names : np.ndarray, shape [num_attr], optional
        Names of the attributes (as strings).
    class_names : np.ndarray, shape [num_classes], optional
        Names of the class labels (as strings).
    metadata : object, optional
        Additional metadata such as text.
    """
    if sp.isspmatrix(adj_matrix):
        adj_matrix = adj_matrix.tocsr().astype(np.float32)
    else:
        raise ValueError(
            "Adjacency matrix must be in sparse format (got {0} instead)".format(type(adj_matrix)))

    if adj_matrix.shape[0] != adj_matrix.shape[1]:
        raise ValueError("Dimensions of the adjacency matrix don't agree")

    if attr_matrix is not None:
        if sp.isspmatrix(attr_matrix):
            attr_matrix = attr_matrix.tocsr().astype(np.float32)
        elif isinstance(attr_matrix, np.ndarray):
            attr_matrix = attr_matrix.astype(np.float32)
        else:
            raise ValueError(
                "Attribute matrix must be a sp.spmatrix or a np.ndarray (got {0} instead)".format(type(attr_matrix)))

        if attr_matrix.shape[0] != adj_matrix.shape[0]:
            raise ValueError("Dimensions of the adjacency and attribute matrices don't agree")

    if labels is not None:
        if labels.shape[0] != adj_matrix.shape[0]:
            raise ValueError("Dimensions of the adjacency matrix and the label vector don't agree")

    if node_names is not None:
        if len(node_names) != adj_matrix.shape[0]:
            raise ValueError("Dimensions of the adjacency matrix and the node names don't agree")

    if attr_names is not None:
        if len(attr_names) != attr_matrix.shape[1]:
            raise ValueError("Dimensions of the attribute matrix and the attribute names don't agree")

    self.adj_matrix = adj_matrix
    self.attr_matrix = attr_matrix
    self.labels = labels
    self.node_names = node_names
    self.attr_names = attr_names
    self.class_names = class_names
    self.metadata = metadata
def removeEye(adj: sp.csr_matrix):
    adj = adj.tolil(copy=True)
    adj.setdiag(0)
    return adj.tocsr()
def addEye(adj: sp.csr_matrix):
    adj = adj.tolil(copy=True)
    adj.setdiag(1)
    return adj.tocsr()
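# Hedged usage sketch (illustrative only): removeEye strips self-loops, addEye puts
# ones back on the diagonal; both work on a copy and leave the input untouched.
import numpy as np
import scipy.sparse as sp

adj = sp.csr_matrix(np.array([[1, 1, 0],
                              [1, 0, 1],
                              [0, 1, 1]], dtype=np.float32))
no_loops = removeEye(adj)
with_loops = addEye(no_loops)
print(no_loops.diagonal())    # [0. 0. 0.]
print(with_loops.diagonal())  # [1. 1. 1.]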
def _reweight_values(self, doc_term_matrix: sp.csr_matrix) -> sp.csr_matrix:
    """
    Re-weight values in a doc-term matrix according to parameters specified
    in :class:`Vectorizer` initialization: binary or tf-idf weighting,
    sublinear term-frequency, document-normalized weights.

    Args:
        doc_term_matrix

    Returns:
        Reweighted doc-term matrix.
    """
    # re-weight the local components (term freqs)
    if self.tf_type == "binary":
        doc_term_matrix.data.fill(1)
    elif self.tf_type == "bm25":
        if not self.dl_type:
            doc_term_matrix.data = (
                doc_term_matrix.data * (BM25_K1 + 1.0) / (BM25_K1 + doc_term_matrix.data))
        else:
            dls = get_doc_lengths(doc_term_matrix, type_=self.dl_type)
            length_norm = (1 - BM25_B) + (BM25_B * (dls / self._avg_doc_length))
            doc_term_matrix = doc_term_matrix.tocoo(copy=False)
            doc_term_matrix.data = (
                doc_term_matrix.data * (BM25_K1 + 1.0)
                / (doc_term_matrix.data + (BM25_K1 * length_norm[doc_term_matrix.row])))
            doc_term_matrix = doc_term_matrix.tocsr(copy=False)
    elif self.tf_type == "sqrt":
        _ = np.sqrt(doc_term_matrix.data, doc_term_matrix.data, casting="unsafe")
    elif self.tf_type == "log":
        _ = np.log(doc_term_matrix.data, doc_term_matrix.data, casting="unsafe")
        doc_term_matrix.data += 1.0
    elif self.tf_type == "linear":
        pass  # tfs are already linear
    else:
        # this should never raise, i'm just being a worrywart
        raise ValueError(
            errors.value_invalid_msg(
                "tf_type", self.tf_type, {"binary", "bm25", "sqrt", "log", "linear"}))

    # apply the global component (idfs), column-wise
    if self.idf_type:
        doc_term_matrix = doc_term_matrix * self._idf_diag

    # apply normalizations, row-wise
    # unless we've already handled it for bm25-style tf
    if self.dl_type and self.tf_type != "bm25":
        n_docs, _ = doc_term_matrix.shape
        dls = get_doc_lengths(doc_term_matrix, type_=self.dl_type)
        dl_diag = sp.spdiags(1.0 / dls, diags=0, m=n_docs, n=n_docs, format="csr")
        doc_term_matrix = dl_diag * doc_term_matrix
    if self.norm is not None:
        doc_term_matrix = normalize_mat(doc_term_matrix, norm=self.norm, axis=1, copy=False)

    return doc_term_matrix