Example #1
File: utils.py Project: HERECJ/two_pass
def evaluate_item_with_code(train: ss.csr_matrix,
                            test: ss.csr_matrix,
                            user: np.ndarray,
                            item_code: np.ndarray,
                            item_center: List[np.ndarray],
                            topk=200,
                            cutoff=200):
    train = train.tocsr()
    test = test.tocsr()
    #result1 = Eval.topk_search_with_code(train, user, item_code, item_center, topk)
    result = Eval.topk_search_with_code_fast(train, user, item_code,
                                             item_center, topk)
    return Eval.evaluate_topk(train, test, result, cutoff)
Example #2
File: utils.py Project: mindis/PRIS
def evaluate_item(train: ss.csr_matrix, test: ss.csr_matrix, user: np.ndarray, item: np.ndarray, topk: int = 200, cutoff: int = 200):
    train = train.tocsr()
    test = test.tocsr()
    # Keep only users with at least one test interaction.
    idx = np.squeeze((test.sum(axis=1) > 0).A)
    train = train[idx, :]
    test = test[idx, :]
    user = user[idx, :]
    N = train.shape[1]
    # Candidate count per user: all items minus those seen in training.
    cand_count = N - train.sum(axis=1)
    if topk < 0:
        mat_rank = Eval.predict(train, test, user, item)
    else:
        mat_rank = Eval.topk_search_(train, test, user, item, topk)
    return Eval.compute_item_metric(test, mat_rank, cand_count, cutoff)
Example #3
def evaluate_item(train: ss.csr_matrix,
                  test: ss.csr_matrix,
                  user: np.ndarray,
                  item: np.ndarray,
                  topk: int = -1,
                  cutoff: int = 100):
    train = train.tocsr()
    test = test.tocsr()
    N = train.shape[1]
    cand_count = N - train.sum(axis=1)
    if topk < 0:
        mat_rank = Eval.predict(train, test, user, item)
    else:
        mat_rank = Eval.topk_search_(train, test, user, item, topk)
    return Eval.compute_item_metric(test, mat_rank, cand_count, cutoff)
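A minimal usage sketch on random data (the Eval module and its predict/topk_search_/compute_item_metric helpers are assumed importable from the surrounding project; the data here is made up):

import numpy as np
import scipy.sparse as ss

rng = np.random.default_rng(0)
n_users, n_items, dim = 100, 500, 32
train = ss.random(n_users, n_items, density=0.05, format='csr')
test = ss.random(n_users, n_items, density=0.01, format='csr')
user = rng.standard_normal((n_users, dim))   # user embeddings
item = rng.standard_normal((n_items, dim))   # item embeddings
metrics = evaluate_item(train, test, user, item, topk=-1, cutoff=100)  # topk < 0: rank all items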
Example #4
def _save(
    filepath: Union[str, Path],
    matrix: csr_matrix,
    cell_ids: pd.DataFrame,
    features: pd.DataFrame,
    save_pickle: bool = False,
    save_rds: bool = False,
    save_h5ad: bool = False,
    save_loom: bool = False,
    meta: Optional[pd.DataFrame] = None,
):
    filepath = Path(filepath)
    if save_pickle:
        build_counts_store(matrix.tocoo(),
                           cell_ids,
                           features,
                           save_path=filepath)
    if save_rds:
        Convert.pickle_to_rds_dir(filepath.parent)
    if save_h5ad or save_loom:
        cell_ids = meta if meta is not None else pd.DataFrame(
            cell_ids).set_index(0)
        cell_ids.index.name = "index"
        features = features.rename(columns={
            "ensgs": "gene_ids",
            "genes": "index"
        }).set_index("index")
        adata = AnnData(matrix.tocsr(), cell_ids, features)
        if save_h5ad:
            adata.write_h5ad(filepath.parent / "rna.h5ad")
        if save_loom:
            adata.write_loom(filepath.parent / "rna.loom")
Example #5
File: utils.py Project: mindis/PRIS
def evaluate_topk(train: ss.csr_matrix, test: ss.csr_matrix, topk_item: np.ndarray, cutoff: int = 200):
    train = train.tocsr()
    test = test.tocsr()
    result = topk_item
    N = train.shape[1]
    cand_count = N - train.sum(axis=1)
    M = test.shape[0]
    # Collect (user, item, rank) triples for recommended items that hit the test set.
    uir = []
    for i in range(M):
        R = set(test.indices[test.indptr[i]:test.indptr[i+1]])
        for k in range(result.shape[1]):
            if result[i, k] in R:
                uir.append((i, result[i, k], k))
    user_id, item_id, rank = zip(*uir)
    mat_rank = ss.csr_matrix((rank, (user_id, item_id)), shape=test.shape)
    return Eval.compute_item_metric(test, mat_rank, cand_count, cutoff)
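A brief sketch of how this pairs with the top-k search from the same project (cf. topk_search in Example #9; train/test/user/item as in the earlier sketches):

recs = topk_search(train, user, item, topk=200)        # (n_users, 200) ranked item indices
metrics = evaluate_topk(train, test, recs, cutoff=200)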
Example #6
def neighbor_sampler(adj_matrix: sp.csr_matrix, max_degree: int = 25,
                     selfloop: bool = False):
    adj_matrix = adj_matrix.tocsr(copy=False)
    N = adj_matrix.shape[0]
    neighbors_matrix = N * np.ones((N + 1, max_degree), dtype=intx())
    for nodeid in range(N):
        neighbors = adj_matrix[nodeid].indices

#         if not selfloop:
#             neighbors = np.setdiff1d(neighbors, [nodeid])
#         else:
#             neighbors = np.intersect1d(neighbors, [nodeid])

        size = neighbors.size
        if size == 0:
            continue

        if size > max_degree:
            neighbors = np.random.choice(neighbors, max_degree, replace=False)
        elif size < max_degree:
            neighbors = np.random.choice(neighbors, max_degree, replace=True)

        neighbors_matrix[nodeid] = neighbors

    np.random.shuffle(neighbors_matrix.T)
    return neighbors_matrix
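The sampler pads every node's neighbor list to exactly max_degree entries, sampling without replacement when a node has too many neighbors and with replacement when it has too few; row N of the returned table stays all N, so N acts as a "no neighbor" sentinel index. A toy sketch (assuming intx() resolves to an integer dtype such as np.int64):

import numpy as np
import scipy.sparse as sp

adj = sp.csr_matrix(np.array([[0, 1, 1],
                              [1, 0, 0],
                              [1, 0, 0]]))
table = neighbor_sampler(adj, max_degree=4)
print(table.shape)  # (4, 4): one row per node plus the padding row
print(table[3])     # [3 3 3 3], the sentinel padding row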
Example #7
    def _k_neighbors_precomputed_sparse(self,
                                        X: csr_matrix,
                                        n_samples: int = None):
        """ Find nearest neighbors in sparse distance matrix.

        Parameters
        ----------
        X: sparse, shape = [n_test, n_indexed]
            Sparse distance matrix. Only non-zero elements
            may be considered neighbors.

        n_samples: int
            Number of sampled indexed objects, e.g.
            in approximate hubness reduction.
            If None, this is inferred from the first row of X.

        Returns
        -------
        k_neighbors : ndarray
            Flattened array of neighbor indices.
        """
        if not issparse(X):
            raise TypeError('Matrix X is not sparse')
        X = X.tocsr()
        if n_samples is None:
            n_samples = X.indptr[1] - X.indptr[0]
        n_test, _ = X.shape
        # To allow different number of explicit entries per row,
        # we need to process the matrix row-by-row.
        if np.all(X.indptr[1:] -
                  X.indptr[:-1] == n_samples) and not self.shuffle_equal:
            min_ind = np.argpartition(X.data.reshape(n_test, n_samples),
                                      kth=np.arange(self.k),
                                      axis=1)[:, :self.k]
            k_neighbors = X.indices[min_ind.ravel() +
                                    np.repeat(X.indptr[:-1], repeats=self.k)]
        else:
            k_neighbors = np.empty((n_test, ), dtype=object)
            if self.verbose:
                range_n_test = tqdm(range(n_test))
            else:
                range_n_test = range(n_test)
            if self.shuffle_equal:
                for i in range_n_test:
                    x = X.getrow(i)
                    rp = self._random_state.permutation(x.nnz)
                    d2 = x.data[rp]
                    d2idx = np.argpartition(d2, kth=np.arange(self.k))
                    k_neighbors[i] = x.indices[rp[d2idx[:self.k]]]
            else:
                for i in range_n_test:
                    x = X.getrow(i)
                    min_ind = np.argpartition(x.data,
                                              kth=np.arange(self.k))[:self.k]
                    k_neighbors[i] = x.indices[min_ind]
            k_neighbors = np.concatenate(k_neighbors)
        return k_neighbors
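The fast branch above relies on every row storing the same number of explicit entries, so X.data can be reshaped into a dense (n_test, n_samples) block. A minimal standalone illustration of that argpartition trick (toy numbers, k=2):

import numpy as np
from scipy.sparse import csr_matrix

k, n_samples = 2, 4
X = csr_matrix(np.array([[9., 1., 5., 3.],
                         [2., 8., 4., 6.]]))
n_test = X.shape[0]
min_ind = np.argpartition(X.data.reshape(n_test, n_samples),
                          kth=np.arange(k), axis=1)[:, :k]
k_neighbors = X.indices[min_ind.ravel() + np.repeat(X.indptr[:-1], repeats=k)]
print(k_neighbors)  # [1 3 0 2]: the 2 smallest distances per row, in order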
Example #8
def compute_reg_scale(X: sps.csr_matrix, alpha0: float, nu: float) -> float:
    X_csr: sps.csr_matrix = X.tocsr()
    X_csr.sort_indices()
    X_csc = X_csr.tocsc()
    X_csc.sort_indices()
    U, I = X.shape
    nnz_row: np.ndarray = X_csr.indptr[1:] - X_csr.indptr[:-1]
    nnz_col: np.ndarray = X_csc.indptr[1:] - X_csc.indptr[:-1]
    return float(((nnz_row + alpha0 * I)**nu).sum()) + float(
        ((nnz_col + alpha0 * U)**nu).sum())
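This is a frequency-dependent regularization scale: each row and each column contributes (nnz + alpha0 * dim) ** nu. A quick sanity check on a toy matrix:

import numpy as np
import scipy.sparse as sps

X = sps.csr_matrix(np.array([[1, 0], [1, 1]], dtype=np.float64))
# nnz per row = [1, 2], nnz per col = [2, 1]; with alpha0=0, nu=1
# the scale is (1 + 2) + (2 + 1) = 6.0
print(compute_reg_scale(X, alpha0=0.0, nu=1.0))  # 6.0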
Example #9
File: utils.py Project: mindis/PRIS
def topk_search(train: ss.csr_matrix, user: np.ndarray, item: np.ndarray, topk: int = 200) -> np.ndarray:
    train = train.tocsr()
    M, _ = train.shape
    item_t = item.T
    # np.int was removed in NumPy 1.24; use a concrete integer dtype.
    result = np.zeros((M, topk), dtype=np.int64)
    for i in range(M):
        E = train.indices[train.indptr[i]:train.indptr[i+1]]
        pred = np.matmul(user[i, :], item_t)
        #pred = np.tensordot(user[i,:], item, [0,-1])
        # Mask items already seen in training so they cannot be recommended.
        pred[E] = -np.inf
        idx = np.argpartition(pred, -topk)[-topk:]
        result[i, :] = idx[np.argsort(-pred[idx])]
    return result
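Usage sketch on random factors (made-up sizes):

import numpy as np
import scipy.sparse as ss

rng = np.random.default_rng(0)
train = ss.random(10, 50, density=0.1, format='csr')
user = rng.standard_normal((10, 8))
item = rng.standard_normal((50, 8))
recs = topk_search(train, user, item, topk=5)  # shape (10, 5), best item first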
Example #10
    def _store_sparse(self, group: tables.Group, name: str,
                      arr: csr_matrix) -> None:
        if not sparse.isspmatrix_csr(arr):
            arr = arr.tocsr()

        csr_group = self.h5f.create_group(group, name)
        csr_group.was_sparse = True

        if arr is not None and arr.nnz > 0:
            self.h5f.create_array(csr_group, 'data', arr.data)
            self.h5f.create_array(csr_group, 'indptr', arr.indptr)
            self.h5f.create_array(csr_group, 'indices', arr.indices)
            self.h5f.create_array(csr_group, 'shape', arr.shape)
        self.h5f.flush()
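A plausible counterpart loader for the layout written above (names match what _store_sparse creates; note that empty matrices are stored without the component arrays, which this sketch does not handle):

import tables
from scipy.sparse import csr_matrix

def _load_sparse(h5f: tables.File, where: str) -> csr_matrix:
    g = h5f.get_node(where)
    return csr_matrix((g.data.read(), g.indices.read(), g.indptr.read()),
                      shape=tuple(g.shape.read()))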
Example #11
def normalize_adj(adj: sp.csr_matrix):
    """Normalize adjacency matrix and convert it to a sparse tensor."""
    if sp.isspmatrix(adj):
        adj = adj.tolil()
        adj.setdiag(1)
        adj = adj.tocsr()
        deg = np.ravel(adj.sum(1))
        deg_sqrt_inv = 1 / np.sqrt(deg)
        adj_norm = adj.multiply(deg_sqrt_inv[:, None]).multiply(deg_sqrt_inv[None, :])
    elif torch.is_tensor(adj):
        deg = adj.sum(1)
        deg_sqrt_inv = 1 / torch.sqrt(deg)
        adj_norm = adj * deg_sqrt_inv[:, None] * deg_sqrt_inv[None, :]
    return to_sparse_tensor(adj_norm)
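The sparse branch is the usual symmetric GCN normalization D^(-1/2) (A + I) D^(-1/2), with self-loops added via setdiag(1) so no degree is zero. Inspecting the intermediate result on a toy graph (to_sparse_tensor is project-specific, so the sketch stops before the conversion):

import numpy as np
import scipy.sparse as sp

adj = sp.csr_matrix(np.array([[0., 1.], [1., 0.]]))
adj = adj.tolil()
adj.setdiag(1)
adj = adj.tocsr()               # now A + I
deg = np.ravel(adj.sum(1))      # [2., 2.]
d_inv = 1 / np.sqrt(deg)
print(adj.multiply(d_inv[:, None]).multiply(d_inv[None, :]).toarray())
# [[0.5 0.5]
#  [0.5 0.5]]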
Example #12
def _weight_matrix(dataMatrix: sps.csr_matrix,
                   weights: np.ndarray,
                   strategy="linear"):
    """
    Assuming that rows are the objects to weight, it returns a weighted matrix based on the weights and the chosen strategy

    :param dataMatrix: dataMatrix whose rows are the objects to weight
    :param weights: an array with the same length as the number of nnz inside the dataMatrix
    :param strategy: strategy to use in order to apply the weights
    :return: new data matrix with weighted data
    """
    if len(weights) != len(dataMatrix.data):
        raise ValueError(
            "weights must contain one value per nonzero entry of dataMatrix")

    matrix = dataMatrix.tocsr(copy=True)
    strategy_funct = STRATEGY_MAPPER[strategy]
    matrix = strategy_funct.weight_matrix(matrix, feature_data=weights)
    return sps.csr_matrix(matrix)
Example #13
    def _preprocess_URM_all(self, URM_all: sps.csr_matrix):
        warm_items_mask = np.ediff1d(URM_all.tocsc().indptr) > self.threshold_items
        self.warm_items = np.arange(URM_all.shape[1])[warm_items_mask]

        URM_all = URM_all[:, self.warm_items]

        warm_users_mask = np.ediff1d(URM_all.tocsr().indptr) > self.threshold_users
        self.warm_users = np.arange(URM_all.shape[0])[warm_users_mask]

        URM_all = URM_all[self.warm_users, :]

        self._LOADED_GLOBAL_MAPPER_DICT["user_original_ID_to_index"] = reconcile_mapper_with_removed_tokens(
            self._LOADED_GLOBAL_MAPPER_DICT["user_original_ID_to_index"],
            np.arange(0, len(warm_users_mask), dtype=np.int64)[np.logical_not(warm_users_mask)])

        self._LOADED_GLOBAL_MAPPER_DICT["item_original_ID_to_index"] = reconcile_mapper_with_removed_tokens(
            self._LOADED_GLOBAL_MAPPER_DICT["item_original_ID_to_index"],
            np.arange(0, len(warm_items_mask), dtype=np.int64)[np.logical_not(warm_items_mask)])

        return URM_all
Example #14
def with_cliques(adjacency: sp.csr_matrix,
                 clique_size: int,
                 num_cliques: int = 1) -> Tuple[sp.csr_matrix, np.ndarray]:
    """
    Get adjacency matrix of dataset with cliques.

    A clique is defined as a set of nodes where each node is a neighbor of every other
    node.

    Args:
        adjacency: adjacency matrix to start with.
        clique_size: size of each clique.
        num_cliques: number of cliques to add.

    Returns:
        augmented_adjacency: adjacency with cliques added.
        cliques: [num_cliques, clique_size] int32 array of indices of clique nodes.
    """
    num_nodes = adjacency.shape[0]
    adjacency = adjacency.tolil()
    dtype = adjacency.dtype
    rows = adjacency.rows
    data = adjacency.data
    cliques = np.empty((num_cliques, clique_size), dtype=np.int32)
    for i in range(num_cliques):
        clique = np.random.choice(num_nodes, clique_size, replace=False)
        clique.sort()
        cliques[i] = clique
        for c in clique:
            row = set(rows[c])
            contains_c = c in row
            row.update(clique)
            if not contains_c:
                row.remove(c)
            rows[c] = sorted(row)
            data[c] = np.ones((len(row), ), dtype=dtype)
    return adjacency.tocsr(), cliques
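A toy run (hypothetical sizes; a fixed seed makes the clique choice reproducible):

import numpy as np
import scipy.sparse as sp

np.random.seed(0)
adj = sp.csr_matrix((5, 5), dtype=np.float32)      # empty 5-node graph
aug, cliques = with_cliques(adj, clique_size=3, num_cliques=1)
print(cliques)        # the 3 chosen node indices, sorted
print(aug.toarray())  # 1s between every pair of distinct clique nodes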
Example #15
def split(urm: sps.csr_matrix, range_intervals_list: list, k_out_list: list,
          probability_list: list) -> (sps.csr_matrix, sps.csr_matrix):
    # Initialize timer
    start_time = time.time()

    # Checking the size of the list
    assert len(range_intervals_list) == len(k_out_list) == len(probability_list), \
        "The input lists must have the same length"

    # Generating a copy of the matrix
    matrix = urm.tocsr(copy=True)

    # Initializing the matrices
    train_matrix = sps.lil_matrix(matrix.shape, dtype=np.float64)
    test_matrix = sps.lil_matrix(matrix.shape, dtype=np.float64)

    # Users
    users = range(matrix.shape[0])

    # Creating a list of triple (range_interval, k_out, probability)
    triple_list = [
        (r, k, p)
        for r, k, p in zip(range_intervals_list, k_out_list, probability_list)
    ]

    # Start splitting the input matrix
    for user in users:

        # Finding the items with which the user has interacted
        items = matrix[user].indices
        n_items = len(items)

        # Finding the splitting category for the user
        # (assumes the intervals cover every interaction count; otherwise k and p stay -1)
        k, p = -1, -1
        for range_interval, k_out, probability in triple_list:
            if range_interval[0] <= n_items <= range_interval[1]:
                k = k_out
                p = probability
                break

        # Extracting k unique items (if possible)
        n_items_to_be_extracted = min(k, n_items)
        rnd.shuffle(items)
        k_extracted_items = items[:n_items_to_be_extracted]
        remaining_items = items[n_items_to_be_extracted:]

        # Extracting elements with probability p
        p_extracted_items = [
            item for item in remaining_items if rnd.random() <= p
        ]
        remaining_items = [
            item for item in remaining_items if item not in p_extracted_items
        ]

        # Incrementally generate fill the train and test matrix
        for item in remaining_items:
            train_matrix[user, item] = matrix[user, item]
        for item in k_extracted_items:
            test_matrix[user, item] = matrix[user, item]
        for item in p_extracted_items:
            test_matrix[user, item] = matrix[user, item]

    # Converting train matrix and test matrix to csr format
    train_matrix = train_matrix.tocsr()
    test_matrix = test_matrix.tocsr()

    # Checking output matrices
    assert train_matrix.shape == test_matrix.shape == matrix.shape, \
        "Train and test matrices should have the same shape as the input one"
    assert (train_matrix.nnz + test_matrix.nnz) == matrix.nnz, \
        f"The sum of train and test matrices nnz elements should be exactly the nnz elements in the input matrix:\n" \
        f"train matrix nnz elements : {train_matrix.nnz}\n" \
        f"test matrix nnz elements : {test_matrix.nnz}\n" \
        f"matrix nnz elements : {matrix.nnz}\n"

    # Ending time
    end_time = time.time()
    print(
        f"Time taken to split the input matrix : {end_time - start_time} seconds"
    )

    return train_matrix, test_matrix
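A hypothetical call, assuming urm is an existing user-item csr_matrix: leave-one-out for users with up to 10 interactions, leave-two-out plus a 10% random holdout for everyone else (the intervals should cover every possible interaction count):

train, test = split(
    urm,
    range_intervals_list=[(0, 10), (11, 10**9)],
    k_out_list=[1, 2],
    probability_list=[0.0, 0.1],
)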
Example #16
    def __init__(self,
                 adj_matrix: sp.csr_matrix,
                 attr_matrix=None,
                 labels: np.ndarray = None,
                 node_names: np.ndarray = None,
                 attr_names: np.ndarray = None,
                 class_names: np.ndarray = None,
                 metadata: object = None):
        """Create attrribute graph

        Parameters
        ----------
        adj_matrix : sp.csr_matrix, shape [num_nodes, num_nodes]
            Adjacency matrix in CSR format.
        attr_matrix : sp.csr_matrix or np.ndarray, shape [num_nodes, num_attr], optional
            Attribute matrix in CSR or numpy format.
        labels : np.ndarray, shape [num_nodes], optional
            Array, where each entry represents the respective node's label(s).
        node_names : np.ndarray, shape [num_nodes], optional
            Names of nodes (as strings).
        attr_names : np.ndarray, shape [num_attr]
            Names of the attributes (as strings).
        class_names : np.ndarray, shape [num_classes], optional
            Names of the class labels (as strings).
        metadata : object
            Additional metadata such as text.
        """
        if sp.isspmatrix(adj_matrix):
            adj_matrix = adj_matrix.tocsr().astype(np.float32)
        else:
            raise ValueError(
                "Adjacency matrix must be in sparse format (got {0} instead)".
                format(type(adj_matrix)))

        if adj_matrix.shape[0] != adj_matrix.shape[1]:
            raise ValueError("Dimensions of the adjacency matrix don't agree")

        if attr_matrix is not None:
            if sp.isspmatrix(attr_matrix):
                attr_matrix = attr_matrix.tocsr().astype(np.float32)
            elif isinstance(attr_matrix, np.ndarray):
                attr_matrix = attr_matrix.astype(np.float32)
            else:
                raise ValueError(
                    "Attribute matrix must be a sp.spmatrix or a np.ndarray (got {0} instead)"
                    .format(type(attr_matrix)))

            if attr_matrix.shape[0] != adj_matrix.shape[0]:
                raise ValueError(
                    "Dimensions of the adjacency and attribute matrices don't agree"
                )

        if labels is not None:
            if labels.shape[0] != adj_matrix.shape[0]:
                raise ValueError(
                    "Dimensions of the adjacency matrix and the label vector don't agree"
                )

        if node_names is not None:
            if len(node_names) != adj_matrix.shape[0]:
                raise ValueError(
                    "Dimensions of the adjacency matrix and the node names don't agree"
                )

        if attr_names is not None:
            if len(attr_names) != attr_matrix.shape[1]:
                raise ValueError(
                    "Dimensions of the attribute matrix and the attribute names don't agree"
                )

        self.adj_matrix = adj_matrix
        self.attr_matrix = attr_matrix
        self.labels = labels
        self.node_names = node_names
        self.attr_names = attr_names
        self.class_names = class_names
        self.metadata = metadata
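A minimal construction sketch (the owning class name is not shown above; AttributedGraph is a guess used purely for illustration):

import numpy as np
import scipy.sparse as sp

adj = sp.random(10, 10, density=0.2, format='csr')
labels = np.random.randint(0, 3, size=10)
graph = AttributedGraph(adj_matrix=adj, labels=labels)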
Example #17
def removeEye(adj: sp.csr_matrix):
    adj = adj.tolil(copy=True)
    adj.setdiag(0)
    return adj.tocsr()
Example #18
def addEye(adj: sp.csr_matrix):
    adj = adj.tolil(copy=True)
    adj.setdiag(1)
    return adj.tocsr()
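Round-trip sketch: addEye sets the diagonal to 1 (self-loops), removeEye zeroes it again.

import numpy as np
import scipy.sparse as sp

adj = sp.csr_matrix(np.array([[0., 1.], [1., 0.]]))
with_loops = addEye(adj)
print(with_loops.toarray())             # [[1. 1.] [1. 1.]]
print(removeEye(with_loops).toarray())  # [[0. 1.] [1. 0.]]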
Example #19
    def _reweight_values(self,
                         doc_term_matrix: sp.csr_matrix) -> sp.csr_matrix:
        """
        Re-weight values in a doc-term matrix according to parameters specified
        in :class:`Vectorizer` initialization: binary or tf-idf weighting,
        sublinear term-frequency, document-normalized weights.

        Args:
            doc_term_matrix

        Returns:
            Reweighted doc-term matrix.
        """
        # re-weight the local components (term freqs)
        if self.tf_type == "binary":
            doc_term_matrix.data.fill(1)
        elif self.tf_type == "bm25":
            if not self.dl_type:
                doc_term_matrix.data = (doc_term_matrix.data *
                                        (BM25_K1 + 1.0) /
                                        (BM25_K1 + doc_term_matrix.data))
            else:
                dls = get_doc_lengths(doc_term_matrix, type_=self.dl_type)
                length_norm = (1 - BM25_B) + (BM25_B *
                                              (dls / self._avg_doc_length))
                doc_term_matrix = doc_term_matrix.tocoo(copy=False)
                doc_term_matrix.data = (
                    doc_term_matrix.data * (BM25_K1 + 1.0) /
                    (doc_term_matrix.data +
                     (BM25_K1 * length_norm[doc_term_matrix.row])))
                doc_term_matrix = doc_term_matrix.tocsr(copy=False)
        elif self.tf_type == "sqrt":
            _ = np.sqrt(doc_term_matrix.data,
                        doc_term_matrix.data,
                        casting="unsafe")
        elif self.tf_type == "log":
            _ = np.log(doc_term_matrix.data,
                       doc_term_matrix.data,
                       casting="unsafe")
            doc_term_matrix.data += 1.0
        elif self.tf_type == "linear":
            pass  # tfs are already linear
        else:
            # this should never raise; I'm just being a worrywart
            raise ValueError(
                errors.value_invalid_msg(
                    "tf_type", self.tf_type,
                    {"binary", "bm25", "sqrt", "log", "linear"}))

        # apply the global component (idfs), column-wise
        if self.idf_type:
            doc_term_matrix = doc_term_matrix * self._idf_diag

        # apply normalizations, row-wise
        # unless we've already handled it for bm25-style tf
        if self.dl_type and self.tf_type != "bm25":
            n_docs, _ = doc_term_matrix.shape
            dls = get_doc_lengths(doc_term_matrix, type_=self.dl_type)
            dl_diag = sp.spdiags(1.0 / dls,
                                 diags=0,
                                 m=n_docs,
                                 n=n_docs,
                                 format="csr")
            doc_term_matrix = dl_diag * doc_term_matrix
        if self.norm is not None:
            doc_term_matrix = normalize_mat(doc_term_matrix,
                                            norm=self.norm,
                                            axis=1,
                                            copy=False)

        return doc_term_matrix
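For reference, the bm25 branch applies tf * (k1 + 1) / (tf + k1 * ((1 - b) + b * dl / avgdl)) to each stored count. A standalone sketch of just that transform, with illustrative constants (the real class takes get_doc_lengths, BM25_K1 and BM25_B from its module; here document length is approximated by row sums):

import numpy as np
import scipy.sparse as sp

BM25_K1, BM25_B = 1.6, 0.75  # common defaults, assumed here

def bm25_tf(dtm: sp.csr_matrix) -> sp.csr_matrix:
    dls = np.asarray(dtm.sum(axis=1)).ravel()                # doc lengths
    length_norm = (1 - BM25_B) + BM25_B * (dls / dls.mean())
    dtm = dtm.tocoo(copy=True)
    dtm.data = dtm.data * (BM25_K1 + 1.0) / (dtm.data + BM25_K1 * length_norm[dtm.row])
    return dtm.tocsr()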