예제 #1
0
    def fit(self,
            X: sp.csr_matrix,
            n_samples: int,
            multiplier: np.ndarray = None):
        """Learn the idf vector (global term weights).

        Arguments:
            X: A matrix of term/token counts.
            n_samples: Number of total documents. NOTE(review): this value
                is accepted but never read by the body below.
            multiplier: Optional factor multiplied element-wise into the
                idf scores before they are stored.

        Returns:
            The estimator itself.
        """
        X = check_array(X, accept_sparse=('csr', 'csc'))
        if not sp.issparse(X):
            X = sp.csr_matrix(X)

        # Nothing is learned when idf weighting is switched off.
        if not self.use_idf:
            return self

        n_features = X.shape[1]

        # Total count of every term (column sums) as a flat array.
        term_totals = np.asarray(X.sum(axis=0)).squeeze()

        # Mean of the row totals, truncated to an integer.
        mean_row_total = int(X.sum(axis=1).mean())

        weights = np.log(mean_row_total / term_totals)
        if multiplier is not None:
            weights = weights * multiplier

        # Stored as a sparse diagonal matrix with one weight per feature.
        self._idf_diag = sp.diags(weights,
                                  offsets=0,
                                  shape=(n_features, n_features),
                                  format='csr',
                                  dtype=np.float64)
        return self
예제 #2
0
파일: ctfidf.py 프로젝트: fagan2888/cTFIDF
    def fit(self, X: sp.csr_matrix, n_samples: int):
        """Learn the idf vector (global term weights).

        Parameters
        ----------
        X : sparse matrix of shape (n_samples, n_features)
            A matrix of term/token counts.
        n_samples : int
            NOTE(review): accepted but never read by the body below.

        Returns
        -------
        self
        """
        # Validate the input and make sure it is sparse.
        X = check_array(X, accept_sparse=('csr', 'csc'))
        X = X if sp.issparse(X) else sp.csr_matrix(X)

        # Keep the input's float dtype when it has one, else use float64.
        dtype = X.dtype if X.dtype in FLOAT_DTYPES else np.float64

        n_features = X.shape[1]

        # Column sums: total count of every term.
        term_totals = np.asarray(X.sum(axis=0)).squeeze()

        # Mean of the row totals, truncated to an integer.
        mean_row_total = int(X.sum(axis=1).mean())

        weights = np.log(mean_row_total / term_totals)
        self._idf_diag = sp.diags(weights,
                                  offsets=0,
                                  shape=(n_features, n_features),
                                  format='csr',
                                  dtype=dtype)
        return self
예제 #3
0
 def normalize_adj(adj : sp.csr_matrix):
     """Normalize adjacency matrix and convert it to a sparse tensor.

     Every entry ``adj[i, j]`` is scaled by
     ``1 / (sqrt(deg[i]) * sqrt(deg[j]))``, where ``deg`` holds the row sums.

     For a scipy sparse matrix the diagonal is first set to 1 (self-loops)
     on a copy, so the caller's matrix is never modified. For a torch tensor
     no self-loops are added and the tensor is used as given.

     Raises:
         TypeError: if ``adj`` is neither a scipy sparse matrix nor a torch
             tensor.
     """
     if sp.isspmatrix(adj):
         # copy=True: ``tolil()`` on a LIL matrix returns the same object,
         # and ``setdiag`` below would then mutate the caller's input.
         adj = adj.tolil(copy=True)
         adj.setdiag(1)
         adj = adj.tocsr()
         deg = np.ravel(adj.sum(1))
         deg_sqrt_inv = 1 / np.sqrt(deg)
         adj_norm = adj.multiply(deg_sqrt_inv[:, None]).multiply(deg_sqrt_inv[None, :])
     elif torch.is_tensor(adj):
         deg = adj.sum(1)
         deg_sqrt_inv = 1 / torch.sqrt(deg)
         adj_norm = adj * deg_sqrt_inv[:, None] * deg_sqrt_inv[None, :]
     else:
         # Without this branch ``adj_norm`` would be unbound below.
         raise TypeError(
             'adj must be a scipy sparse matrix or a torch tensor, got '
             + type(adj).__name__)
     return to_sparse_tensor(adj_norm)
예제 #4
0
파일: utils.py 프로젝트: mindis/PRIS
 def evaluate_item(train:ss.csr_matrix, test:ss.csr_matrix, user:np.ndarray, item:np.ndarray, topk:int=200, cutoff:int=200):
     """Compute item metrics for the rows that have at least one test entry.

     Rows of ``train``, ``test`` and ``user`` whose test-row sum is not
     positive are dropped before ranking. A negative ``topk`` ranks with
     ``Eval.predict``; otherwise ``Eval.topk_search_`` is used with ``topk``.
     The result of ``Eval.compute_item_metric`` is returned.
     """
     train_csr = train.tocsr()
     test_csr = test.tocsr()

     # Boolean mask of rows whose test entries sum to more than zero.
     has_test = np.squeeze((test_csr.sum(axis=1) > 0).A)
     train_csr = train_csr[has_test, :]
     test_csr = test_csr[has_test, :]
     user = user[has_test, :]

     # Per row: number of columns minus the row's training sum.
     n_items = train_csr.shape[1]
     cand_count = n_items - train_csr.sum(axis=1)

     mat_rank = (Eval.predict(train_csr, test_csr, user, item)
                 if topk < 0
                 else Eval.topk_search_(train_csr, test_csr, user, item, topk))
     return Eval.compute_item_metric(test_csr, mat_rank, cand_count, cutoff)
예제 #5
0
def test_rp3(X: sps.csr_matrix, alpha: float, beta: float) -> None:
    """Check RP3betaRecommender weights against a dense reference.

    The learned ``W`` must match a manually computed matrix: both
    ``X ** alpha`` and its transpose are row-normalised, multiplied, and
    each column is divided by that column's popularity raised to ``beta``.
    A second recommender built with ``normalize_weight=True`` must produce
    rows of ``W`` that sum to 1.
    """
    rec = RP3betaRecommender(X, alpha=alpha, beta=beta, n_threads=4)
    rec.learn()
    learned = rec.W.toarray()

    def one_where_zero(values: np.ndarray) -> np.ndarray:
        # Swap zeros for ones so they are safe to divide by.
        return np.where(values == 0, 1, values)

    item_popularity = X.sum(axis=0).A1 ** beta

    # Row-normalised transition matrices in both directions.
    P_ui = np.power(X.toarray(), alpha)
    P_iu = np.power(X.T.toarray(), alpha)
    P_ui /= one_where_zero(P_ui.sum(axis=1))[:, None]
    P_iu /= one_where_zero(P_iu.sum(axis=1))[:, None]

    # p_{ui} ^{RP3} = p_{ui} ^{P3} / popularity_i ^ beta
    expected = P_iu.dot(P_ui) / one_where_zero(item_popularity)[None, :]
    np.testing.assert_allclose(learned, expected)

    rec_norm = RP3betaRecommender(
        X, alpha=alpha, n_threads=4, top_k=2, normalize_weight=True
    )
    rec_norm.learn()
    for row_total in rec_norm.W.sum(axis=1).A1:
        assert row_total == pytest.approx(1.0)
예제 #6
0
def _normalise_adjacency(adjacency: csr_matrix) -> csr_matrix:
    """ A_tidle = D^(-0.5) A D^(-0.5) """
    # Create D^(-0.5)
    degree_inv_sqrt = np.power(np.array(adjacency.sum(1)), -0.5).flatten()
    degree_inv_sqrt[np.isinf(degree_inv_sqrt)] = 0.0
    degree_inv_sqrt = diags(degree_inv_sqrt, format="coo")
    # Compute D^(-0.5) A D^(-0.5)
    return degree_inv_sqrt.dot(adjacency).dot(degree_inv_sqrt)
예제 #7
0
def normalize_sparse_matrix(matrix: sparse.csr_matrix) -> sparse.csr_matrix:
    """Scale each row so that it sums to approximately one.

    The row sums of the result are not exactly one: 0.1 is added to every
    row sum before dividing, so that the denominator can never be zero.
    """
    row_totals = matrix.sum(axis=1).A1
    inverse_totals = 1 / (row_totals + 0.1)
    # Left-multiplying by the diagonal matrix scales row i by inverse_totals[i].
    return create_sparce_from_diagonal(inverse_totals).dot(matrix)
예제 #8
0
    def __call__(self, matrix: csr_matrix):
        """Return the log of ``matrix`` rescaled by its row and column sums.

        Each stored entry is multiplied by (grand total / row sum), with
        that row factor raised to ``SMOOTH_POWER`` when ``self.smooth`` is
        set, and by (1 / column sum). The natural log of the stored values
        is then taken and entries that became zero are dropped.
        NOTE(review): this resembles a pointwise-mutual-information
        weighting; confirm against the enclosing class.
        """
        grand_total = matrix.sum()

        # Reciprocals of the row and column sums, via the safe helper.
        row_factor = _safe_divide(1, matrix.sum(axis=1))
        col_factor = _safe_divide(1, matrix.sum(axis=0))

        row_factor *= grand_total
        if self.smooth:
            row_factor = np.power(row_factor, SMOOTH_POWER)

        weighted = matrix.multiply(row_factor).multiply(col_factor).tocsr()

        # Only stored entries are transformed; log(1) == 0 entries are pruned.
        weighted.data = np.log(weighted.data)
        weighted.eliminate_zeros()
        return weighted
예제 #9
0
def calcDInv(K: sparse.csr_matrix):
    """Build the inverse row-sum scaling matrix for K.

    Parameters
    ----------
    K the sparse matrix K for the pairwise affinity

    Returns
    -------
    a csr sparse matrix whose diagonal holds 1 / (row sum of K + epsilon),
    and the row sums of K plus epsilon (not inverted)
    """
    n_rows = K.shape[0]

    # A small epsilon is added to every row sum before inverting.
    row_sums = K.sum(axis=1) + .000000001
    inverse_row_sums = 1 / row_sums

    # dia_matrix takes its diagonals as rows of a 2-D array.
    diagonal = np.reshape(inverse_row_sums, [1, -1])
    D_sparse = sparse.dia_matrix((diagonal, [0]), (n_rows, n_rows))
    return sparse.csr_matrix(D_sparse), row_sums
예제 #10
0
def calculate_normalized_affinity(
    W: csr_matrix
) -> Tuple[csr_matrix, np.array, np.array]:
    """Scale W symmetrically by the square roots of its row sums.

    Every stored entry ``W[i, j]`` is divided by ``sqrt(d[i])`` and then by
    ``sqrt(d[j])``, where ``d`` holds the row sums of ``W``. The input is
    not modified.

    Returns the normalised matrix (CSR), the row sums, and their square
    roots.
    """
    degrees = W.sum(axis=1).A1
    sqrt_degrees = np.sqrt(degrees)

    # Work on a COO copy so the row/column index of each value is available.
    scaled = W.tocoo(copy=True)
    scaled.data /= sqrt_degrees[scaled.row]
    scaled.data /= sqrt_degrees[scaled.col]

    return scaled.tocsr(), degrees, sqrt_degrees
예제 #11
0
def sparse_mat_decompose(user_preference: ss.csr_matrix, latent_factor_num: int = 1) -> (ss.csr_matrix, ss.csr_matrix):
    """
    Build constant initial factor matrices U and V for a sparse user
    preference matrix; U has latent_factor_num columns and V has
    latent_factor_num rows.

    Every element of both matrices is sqrt(avg / latent_factor_num), where
    avg is the sum of the input divided by its number of non-zero entries.

    :param user_preference: user preference matrix to decompose
    :param latent_factor_num: number of columns of U and rows of V
    :return: U, V
    """
    n_users, n_items = user_preference.shape

    # Mean over the non-zero entries only.
    observed_count = user_preference.count_nonzero()
    observed_mean = user_preference.sum() / observed_count

    fill_value = np.sqrt(observed_mean / latent_factor_num)
    u_init = np.full((n_users, latent_factor_num), fill_value)
    v_init = np.full((latent_factor_num, n_items), fill_value)
    return ss.csr_matrix(u_init), ss.csr_matrix(v_init)
예제 #12
0
 def evaluate_item(train: ss.csr_matrix,
                   test: ss.csr_matrix,
                   user: np.ndarray,
                   item: np.ndarray,
                   topk: int = -1,
                   cutoff: int = 100):
     """Rank items for every row and compute item metrics.

     A negative ``topk`` ranks with ``Eval.predict``; otherwise
     ``Eval.topk_search_`` is used with ``topk``. The result of
     ``Eval.compute_item_metric`` is returned.
     """
     train_csr = train.tocsr()
     test_csr = test.tocsr()

     # Per row: number of columns minus the row's training sum.
     n_items = train_csr.shape[1]
     cand_count = n_items - train_csr.sum(axis=1)

     mat_rank = (Eval.predict(train_csr, test_csr, user, item)
                 if topk < 0
                 else Eval.topk_search_(train_csr, test_csr, user, item, topk))
     return Eval.compute_item_metric(test_csr, mat_rank, cand_count, cutoff)
예제 #13
0
파일: _ctfidf.py 프로젝트: cavvia/BERTopic
    def fit(self, X: sp.csr_matrix, multiplier: np.ndarray = None):
        """Learn the idf vector (global term weights).

        Arguments:
            X: A matrix of term/token counts.
            multiplier: A multiplier for increasing/decreasing certain IDF scores

        Returns:
            The estimator itself.
        """
        X = check_array(X, accept_sparse=('csr', 'csc'))
        if not sp.issparse(X):
            X = sp.csr_matrix(X)

        # Nothing is learned when idf weighting is switched off.
        if not self.use_idf:
            return self

        n_features = X.shape[1]

        # Frequency of each word across all rows (column sums).
        word_totals = np.asarray(X.sum(axis=0)).squeeze()

        # Mean of the row totals, truncated to an integer.
        mean_row_total = int(X.sum(axis=1).mean())

        # The +1 inside the log keeps the resulting weights positive.
        weights = np.log((mean_row_total / word_totals) + 1)

        # Optional per-term scaling of the idf scores.
        if multiplier is not None:
            weights = weights * multiplier

        self._idf_diag = sp.diags(weights,
                                  offsets=0,
                                  shape=(n_features, n_features),
                                  format='csr',
                                  dtype=np.float64)
        return self
예제 #14
0
파일: utils.py 프로젝트: mindis/PRIS
 def evaluate_topk(train:ss.csr_matrix, test:ss.csr_matrix, topk_item:np.ndarray, cutoff:int=200):
     """Compute item metrics from a precomputed top-k recommendation table.

     ``topk_item[i, k]`` is the item recommended to row ``i`` at position
     ``k``. For every recommended item that has a stored entry in row ``i``
     of ``test``, the 0-based position ``k`` is recorded in a sparse rank
     matrix of the same shape as ``test``. That matrix is passed to
     ``Eval.compute_item_metric`` and its result is returned.

     When no recommendation hits a test item, an empty rank matrix is
     passed on instead of failing while unpacking an empty hit list.
     """
     train = train.tocsr()
     test = test.tocsr()

     # Per row: number of columns minus the row's training sum.
     N = train.shape[1]
     cand_count = N - train.sum(axis=1)

     # Three parallel lists, so an empty result needs no special unpacking.
     user_id, item_id, rank = [], [], []
     n_positions = topk_item.shape[1]
     for i in range(test.shape[0]):
         # Column indices stored in row i of the CSR test matrix.
         relevant = set(test.indices[test.indptr[i]:test.indptr[i+1]])
         for k in range(n_positions):
             candidate = topk_item[i, k]
             if candidate in relevant:
                 user_id.append(i)
                 item_id.append(candidate)
                 rank.append(k)

     mat_rank = ss.csr_matrix((rank, (user_id, item_id)), shape=test.shape)
     return Eval.compute_item_metric(test, mat_rank, cand_count, cutoff)
def row_normalize_csr_matrix(matrix: csr_matrix) -> csr_matrix:
    """
    Row normalize a csr matrix without mutating the input

    Every stored value is divided by the sum of its row; a new matrix is
    returned.

    :param matrix: scipy.sparse.csr_matrix instance
    :raises TypeError: if matrix is not a csr_matrix
    :raises ValueError: if matrix stores explicit zeros
    """
    if not isinstance(matrix, csr_matrix):
        raise TypeError('expected input to be a scipy csr_matrix')
    # Vectorised check; the builtin any() would iterate element by element.
    # Stored zeros are rejected because nonzero() below skips them, which
    # would misalign the indices with matrix.data.
    if (matrix.data == 0).any():
        raise ValueError(
            'input must be scipy.sparse.csr_matrix and must not store zeros')
    # get row index for every nonzero element in matrix
    row_idx, col_idx = matrix.nonzero()
    # compute unraveled row sums
    row_sums = matrix.sum(axis=1).A1
    # divide data by (broadcasted) row sums
    normalized = matrix.data / row_sums[row_idx]
    return csr_matrix((normalized, (row_idx, col_idx)), shape=matrix.shape)
예제 #16
0
def collect_basic_statistics(
    X: csr_matrix,
    cluster_labels: List[str],
    cond_labels: List[str],
    gene_names: List[str],
    n_jobs: int,
    temp_folder: str,
    verbose: bool,
) -> List[pd.DataFrame]:
    """ Collect basic statistics, triggering calc_basic_stat in parallel

    One calc_basic_stat task is run per entry of ``cluster_labels.categories``
    (so ``cluster_labels`` presumably is a pandas Categorical; verify against
    the caller). The per-column sums and non-zero counts of ``X`` are
    precomputed only when ``cond_labels`` is None; otherwise None is passed
    for both. Returns the list of task results.
    """
    start = time.perf_counter()

    if cond_labels is None:
        sum_vec = X.sum(axis=0).A1
        cnt_vec = X.getnnz(axis=0)
    else:
        sum_vec = cnt_vec = None

    pool = Parallel(n_jobs=n_jobs, max_nbytes=1e7, temp_folder=temp_folder)
    task = delayed(calc_basic_stat)
    # The CSR components are passed separately rather than the matrix object.
    result_list = pool(
        task(
            clust_id,
            X.data,
            X.indices,
            X.indptr,
            X.shape,
            cluster_labels,
            cond_labels,
            gene_names,
            sum_vec,
            cnt_vec,
            verbose,
        )
        for clust_id in cluster_labels.categories
    )

    elapsed = time.perf_counter() - start
    if verbose:
        logger.info(
            "Collecting basic statistics is done. Time spent = {:.2f}s.".format(
                elapsed
            )
        )

    return result_list
예제 #17
0
def t_test(
    X: csr_matrix,
    cluster_labels: List[str],
    cond_labels: List[str],
    gene_names: List[str],
    n_jobs: int,
    temp_folder: str,
    verbose: bool,
) -> List[pd.DataFrame]:
    """ Run Welch's t-test, triggering calc_t in parallel

    One calc_t task is run per entry of ``cluster_labels.categories`` (so
    ``cluster_labels`` presumably is a pandas Categorical; verify against
    the caller). The per-column sums and sums of squares of ``X`` are
    precomputed only when ``cond_labels`` is None; otherwise None is passed
    for both. Returns the list of task results.
    """
    start = time.time()

    if cond_labels is None:
        sum_vec = X.sum(axis=0).A1
        sum2_vec = X.power(2).sum(axis=0).A1
    else:
        sum_vec = sum2_vec = None

    pool = Parallel(n_jobs=n_jobs, max_nbytes=1e7, temp_folder=temp_folder)
    task = delayed(calc_t)
    # The CSR components are passed separately rather than the matrix object.
    result_list = pool(
        task(
            clust_id,
            X.data,
            X.indices,
            X.indptr,
            X.shape,
            cluster_labels,
            cond_labels,
            gene_names,
            sum_vec,
            sum2_vec,
            verbose,
        )
        for clust_id in cluster_labels.categories
    )

    elapsed = time.time() - start
    if verbose:
        logger.info(
            "Welch's t-test is done. Time spent = {:.2f}s.".format(elapsed))

    return result_list
예제 #18
0
def calculate_min_violations(A: csr_matrix) -> (float, float):
    """
    Calculate the minimum number of violations in a graph for all possible rankings
    A violation is an edge going from a lower ranked node to a higher ranked one
    Minimum number is calculated by summing bidirectional interactions.
    NOTE: every non-zero entry (i, j) is visited, so a reciprocated pair is
    visited once per direction and min(A[i, j], A[j, i]) is added each time.
    Input:
        A: graph adjacency matrix where A[i,j] is the weight of an edge from node i to j
    Output:
        minimum number of violations
        proportion of all edges against minimum violations
    """
    # Row and column indices of the non-zero entries; values are re-read from A.
    sources, targets, _ = scipy.sparse.find(A)

    min_viol = 0.0
    for i, j in zip(sources, targets):
        # Only edges that also have a positive reverse edge contribute.
        if A[i, j] > 0 and A[j, i] > 0:
            min_viol += min(A[i, j], A[j, i])

    total_weight = A.sum()
    return (min_viol, min_viol / total_weight)
예제 #19
0
def vertex_degree(graph: sp.csr_matrix, vertex: int):
    """Return the sum of column ``vertex`` of the adjacency matrix.

    The column is sliced and summed directly. Indexing the result of
    ``graph.sum(axis=0)`` with ``[vertex]`` selects a *row* of the 1 x N
    matrix that scipy returns, which yields the whole row for vertex 0 and
    raises IndexError for any other vertex.
    """
    return graph[:, vertex].sum()
예제 #20
0
파일: norm.py 프로젝트: ckingdev/sparseutil
def axis_norms(X: sparse.csr_matrix,
               norm: str = "l1",
               axis: int = 1) -> np.ndarray:
    """Return a flat array of per-axis norms of a sparse matrix.

    With ``norm == "l1"`` the result is the plain sum along ``axis`` (no
    absolute value is taken). Any other ``norm`` value gives the square
    root of the sum of squares along ``axis``.
    """
    use_l1 = norm == "l1"
    values = X if use_l1 else X.power(2)
    totals = np.asarray(values.sum(axis=axis)).reshape(-1)
    return totals if use_l1 else np.sqrt(totals)
예제 #21
0
 def __init__(self, adj: sparse.csr_matrix, nodes, tp: ConfigType):
     """Store the adjacency matrix, node list and type, and derive counts.

     ``_size`` is the number of nodes. ``_edges`` is half the sum of all
     adjacency entries, truncated to an int (presumably ``adj`` is
     symmetric so every edge is counted twice; verify against callers).
     """
     self._adj = adj
     self._type = tp
     self._nodes: List[Node] = nodes
     self._size = len(nodes)
     self._edges = int(adj.sum() / 2)
예제 #22
0
def get_inv_propensity(train_y: csr_matrix, a=0.55, b=1.5):
    """Return ``1 + c * (count + b) ** (-a)`` for every column of train_y.

    ``count`` is the column sum of ``train_y`` and
    ``c = (log(n) - 1) * (b + 1) ** a``, with ``n`` the number of rows.
    """
    n_rows = train_y.shape[0]
    column_counts = np.asarray(train_y.sum(axis=0)).squeeze()
    scale = (np.log(n_rows) - 1) * ((b + 1)**a)
    return 1.0 + scale * (column_counts + b)**(-a)
예제 #23
0
 def normalize_relative_frequency(matrix: sparse.csr_matrix, axis: AxisType):
     """Divide the matrix by its row sums or its column sums.

     ``AxisType.REPERTOIRES`` divides every row by that row's sum;
     ``AxisType.FEATURES`` divides every column by that column's sum.
     Any other ``axis`` value falls through and returns None.
     """
     if axis == AxisType.REPERTOIRES:
         row_totals = matrix.sum(axis=1).A.ravel()
         # Left-multiplying by a diagonal matrix scales the rows.
         return sparse.diags(1 / row_totals) @ matrix
     if axis == AxisType.FEATURES:
         column_totals = matrix.sum(axis=0).A.ravel()
         # Right-multiplying by a diagonal matrix scales the columns.
         return matrix @ sparse.diags(1 / column_totals)
예제 #24
0
    def preprocess_graph(
        adj: sp.csr_matrix,
        same_type_nodes: bool = True
    ) -> Tuple[np.array, np.array, Tuple[int, int]]:
        """
        Normalize an adjacency matrix and return it in tuple form.

        Parameters
        ----------
        adj : sp.csr_matrix
            Adjacency matrix.
        same_type_nodes : bool
            Is adjacency matrix for nodes of same type or not?
            E.g. drug-drug, protein-protein adj or drug-protein, protein-drug.

        Returns
        -------
        np.array
            Pairs of edges in normalized adjacency matrix with nonzero values.
        np.array
            Nonzero values of normalized adjacency matrix.
        Tuple[int, int]
            Shape of normalized adjacency matrix.

        Notes
        -----
        Updating embeddings on a new layer can be written as
        H(l+1) = σ(SUM_r A_r_normalize @ H(l) @ W_r(l)),
        where A_r_normalize is the normalized adj matrix for edge type r.

        There are two variants of normalization for A_r (below just A).

        1. Adj matrix for nodes of the same type. It is symmetric.
            A_ = A + I,
            which adds the current node's own information when collecting
            information from neighbours of the same type.
            E.g. collecting info from drug nodes when updating a drug embedding.

            D: degree matrix (diagonal matrix with the row sums of A_).
            A_normalize = D^(-1/2) @ A_ @ D^(-1/2),
            i.e. symmetric normalization (division by sqrt(N_r^i) * sqrt(N_r^j)
            in the formula from the original paper). The code computes
            (A_ @ D^(-1/2))^T @ D^(-1/2), which equals this for symmetric A_.

        2. Adj matrix for nodes of different types.
            Here the current node's own information is not added.

            D_row: output degree matrix, a diagonal matrix with the row sums
            of A (i -> neighbours).
            D_col: input degree matrix, a diagonal matrix with the column sums
            of A (neighbours -> i).
            A_normalize = D_row^(-1/2) @ A @ D_col^(-1/2).
            The inverse square roots are passed through np.nan_to_num before
            use, so non-finite values are replaced by finite ones.
        """
        adj = sp.coo_matrix(adj)

        if not same_type_nodes:
            # Different node types: scale rows and columns separately.
            row_totals = np.array(adj.sum(1))
            col_totals = np.array(adj.sum(0))
            row_scale = sp.diags(
                np.nan_to_num(np.power(row_totals, -0.5)).flatten())
            col_scale = sp.diags(
                np.nan_to_num(np.power(col_totals, -0.5)).flatten())
            adj_normalized = row_scale.dot(adj).dot(col_scale).tocoo()
        else:
            # Same node type: add self-loops, then normalize symmetrically.
            adj_with_loops = adj + sp.eye(adj.shape[0])
            row_totals = np.array(adj_with_loops.sum(1))
            scale = sp.diags(np.power(row_totals, -0.5).flatten())
            adj_normalized = adj_with_loops.dot(scale).transpose().dot(
                scale).tocoo()

        return preprocessing.sparse_to_tuple(adj_normalized)