Example #1
    def __init__(
        self,
        profile: sps.csr_matrix,
        embedding: np.ndarray,
        search_config: Optional[MLPSearchConfig] = None,
    ):

        (
            profile_train,
            profile_test,
            embedding_train,
            embedding_test,
        ) = train_test_split(
            profile.astype(np.float32),
            embedding.astype(np.float32),
            random_state=42,
        )
        self.profile_train = profile_train
        self.profile_test = profile_test
        self.embedding_train = jnp.asarray(embedding_train, dtype=jnp.float32)
        self.embedding_test = jnp.asarray(embedding_test, dtype=jnp.float32)
        if search_config is None:
            self.search_config = MLPSearchConfig()
        else:
            self.search_config = search_config
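A minimal usage sketch for the constructor above, assuming a hypothetical owning class `MLPTrainer` (the snippet does not name its class) and that `MLPSearchConfig` takes no required arguments:

import numpy as np
import scipy.sparse as sps

n_users, n_items, dim = 200, 50, 16
profile = sps.random(n_users, n_items, density=0.05, format="csr")
embedding = np.random.randn(n_users, dim)

# search_config is optional; the constructor falls back to MLPSearchConfig().
trainer = MLPTrainer(profile, embedding)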
Example #2
def saveSparseMatrix(file: str,
                     matrix: csr_matrix,
                     colnames: Union[List[str], None] = None) -> None:
    """ Save sparse matrix to json file

        Args:
            file:                               file name to store results, full or relative path
            matrix (M,N):                       sparse matrix of size (M,N)
            colnames, optional (1,N):           column names

        Returns:
            None
    """

    json_content = dict()

    if colnames is not None:
        json_content["features"] = colnames

    json_content["size"] = matrix.shape
    matrix = matrix.astype(float)

    # Convert to DOK format once so positions and values come from the same object.
    dok = matrix.todok()
    json_content["positions"] = [tuple(map(int, key)) for key in dok.keys()]
    json_content["counts"] = [float(val) for val in dok.values()]

    with open(file, "w") as f:
        json.dump(json_content, f, indent=4)
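A quick sketch of the call and the resulting JSON, using a toy 2x2 matrix (imports assumed as in the function above):

from scipy.sparse import csr_matrix

m = csr_matrix([[1.0, 0.0], [0.0, 2.0]])
saveSparseMatrix("matrix.json", m, colnames=["a", "b"])
# matrix.json (pretty-printed by indent=4) encodes:
# {"features": ["a", "b"], "size": [2, 2],
#  "positions": [[0, 0], [1, 1]], "counts": [1.0, 2.0]}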
Example #3
    def precompute_best_item_indices(self, URM: sps.csr_matrix):
        URM = URM.copy()
        if self.feature_weighting == "BM25":
            URM = URM.astype(np.float32)
            URM = okapi_BM_25(URM)
            URM = check_matrix(URM, 'csr')

        elif self.feature_weighting == "TF-IDF":
            URM = URM.astype(np.float32)
            URM = TF_IDF(URM)
            URM = check_matrix(URM, 'csr')

        similarity = Compute_Similarity(URM,
                                        shrink=self.shrink,
                                        topK=self.topK,
                                        normalize=self.normalize,
                                        similarity="cosine")
        similarity_matrix = similarity.compute_similarity()
        self.sorted_indices = np.array(
            np.argsort(-similarity_matrix.todense(), axis=1))
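Note that `todense()` above materializes the full item-item similarity matrix before sorting, which costs O(n_items^2) memory. A hedged usage sketch, with `rec` standing in for an instance of whatever recommender owns this method (hypothetical name):

rec.precompute_best_item_indices(URM_train)
# sorted_indices[i] ranks all items by decreasing cosine similarity to item i
top_similar = rec.sorted_indices[0]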
Example #4
def calculate_diffusion_map(
    W: csr_matrix, n_components: int, solver: str, max_t: int, n_jobs: int, random_state: int,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    assert issparse(W)

    nc, labels = connected_components(W, directed=True, connection="strong")
    logger.info("Calculating connected components is done.")

    assert nc == 1

    W_norm, diag, diag_half = calculate_normalized_affinity(W.astype(np.float64)) # use double precision to guarantee reproducibility
    logger.info("Calculating normalized affinity matrix is done.")

    n_jobs = eff_n_jobs(n_jobs)
    with threadpool_limits(limits = n_jobs):
        if solver == "eigsh":
            np.random.seed(random_state)
            v0 = np.random.uniform(-1.0, 1.0, W_norm.shape[0])
            Lambda, U = eigsh(W_norm, k=n_components, v0=v0)
            Lambda = Lambda[::-1]
            U = U[:, ::-1]
        else:
            assert solver == "randomized"
            U, S, VT = randomized_svd(
                W_norm, n_components=n_components, random_state=random_state
            )
            signs = np.sign((U * VT.transpose()).sum(axis=0))  # get eigenvalue signs
            Lambda = signs * S  # get eigenvalues

    # remove the first eigen value and vector
    Lambda = Lambda[1:]
    U = U[:, 1:]
    Phi = U / diag_half[:, np.newaxis]

    if max_t == -1:
        Lambda_new = Lambda / (1.0 - Lambda)
    else:
        # Find the knee point
        x = np.array(range(1, max_t + 1), dtype = float)
        y = np.array([calc_von_neumann_entropy(Lambda, t) for t in x])
        t = x[find_knee_point(x, y)]
        logger.info("Detected knee point at t = {:.0f}.".format(t))

        # U_df = U * Lambda #symmetric diffusion component
        Lambda_new = Lambda * ((1.0 - Lambda ** t) / (1.0 - Lambda))
    Phi_pt = Phi * Lambda_new  # asym pseudo component

    return Phi_pt, Lambda, Phi  # , U_df, W_norm
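A minimal call sketch for the function above, assuming W is a sparse affinity matrix whose graph forms a single strongly connected component (the assert requires this) and that the helper functions are imported as in the original module:

Phi_pt, Lambda, Phi = calculate_diffusion_map(
    W, n_components=15, solver="eigsh", max_t=-1, n_jobs=-1, random_state=0,
)
# Phi_pt: pseudo diffusion components (Phi scaled by Lambda / (1 - Lambda) here,
# since max_t == -1); Lambda: eigenvalues with the trivial first pair removed.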
Example #5
    def _secondary_outputs(self, input_matrix: sparse.csr_matrix):
        """Compute different variables from labels_."""
        if self.return_membership or self.return_aggregate:
            if np.issubdtype(input_matrix.data.dtype, np.bool_):
                input_matrix = input_matrix.astype(float)
            if not self.bipartite:
                membership = membership_matrix(self.labels_)
                if self.return_membership:
                    self.membership_ = normalize(input_matrix.dot(membership))
                if self.return_aggregate:
                    self.aggregate_ = sparse.csr_matrix(
                        membership.T.dot(input_matrix.dot(membership)))
            else:
                if self.labels_col_ is None:
                    n_labels = max(self.labels_) + 1
                    membership_row = membership_matrix(self.labels_,
                                                       n_labels=n_labels)
                    membership_col = normalize(
                        input_matrix.T.dot(membership_row))
                else:
                    n_labels = max(max(self.labels_row_), max(
                        self.labels_col_)) + 1
                    membership_row = membership_matrix(self.labels_row_,
                                                       n_labels=n_labels)
                    membership_col = membership_matrix(self.labels_col_,
                                                       n_labels=n_labels)
                if self.return_membership:
                    self.membership_row_ = normalize(
                        input_matrix.dot(membership_col))
                    self.membership_col_ = normalize(
                        input_matrix.T.dot(membership_row))
                    self.membership_ = self.membership_row_
                if self.return_aggregate:
                    aggregate_ = sparse.csr_matrix(
                        membership_row.T.dot(input_matrix))
                    aggregate_ = aggregate_.dot(membership_col)
                    self.aggregate_ = aggregate_

        return self
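For context, `membership_matrix(labels)` used above builds the one-hot node-to-cluster indicator. A minimal sketch of an equivalent construction (an assumption about its semantics, not the library's own code):

import numpy as np
from scipy import sparse

def one_hot_membership(labels: np.ndarray) -> sparse.csr_matrix:
    # Row i has a single 1 in the column of node i's cluster label.
    n = len(labels)
    return sparse.csr_matrix(
        (np.ones(n), (np.arange(n), labels)),
        shape=(n, int(labels.max()) + 1))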
Example #6
def tree_sampling_divergence(adjacency: sparse.csr_matrix,
                             dendrogram: np.ndarray,
                             weights: str = 'degree',
                             normalized: bool = True) -> float:
    """Tree sampling divergence of a hierarchy (quality metric).

    * Graphs
    * Digraphs

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes.
        ``'degree'`` (default) or ``'uniform'``.
    normalized :
        If ``True``, normalized score (between 0 and 1).

    Returns
    -------
    score : float
        Score.

    Example
    -------
    >>> from sknetwork.hierarchy import tree_sampling_divergence, Paris
    >>> from sknetwork.data import house
    >>> paris = Paris()
    >>> adjacency = house()
    >>> dendrogram = paris.fit_transform(adjacency)
    >>> score = tree_sampling_divergence(adjacency, dendrogram)
    >>> np.round(score, 2)
    0.52

    References
    ----------
    Charpentier, B. & Bonald, T. (2019).
    `Tree Sampling Divergence: An Information-Theoretic Metric for
    Hierarchical Graph Clustering.
    <https://hal.telecom-paristech.fr/hal-02144394/document>`_
    Proceedings of IJCAI.
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)
    check_min_nnz(adjacency.nnz, 1)
    adjacency = adjacency.astype(float)
    n = adjacency.shape[0]
    check_min_size(n, 2)

    adjacency.data /= adjacency.data.sum()

    aggregate_graph, height, cluster_weight, edge_sampling, weights_row, weights_col = _instanciate_vars(
        adjacency, weights)
    node_sampling = np.zeros(n - 1)

    for t in range(n - 1):
        i = int(dendrogram[t][0])
        j = int(dendrogram[t][1])
        if i >= n and height[i - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[i - n]
            edge_sampling[i - n] = 0
            node_sampling[t] = node_sampling[i - n]
        elif j >= n and height[j - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[j - n]
            edge_sampling[j - n] = 0
            node_sampling[t] = node_sampling[j - n]
        if j in aggregate_graph.neighbors[i]:
            edge_sampling[t] += aggregate_graph.neighbors[i][j]
        node_sampling[t] += aggregate_graph.cluster_out_weights[i] * aggregate_graph.cluster_in_weights[j] + \
            aggregate_graph.cluster_out_weights[j] * aggregate_graph.cluster_in_weights[i]
        height[t] = dendrogram[t][2]
        aggregate_graph.merge(i, j)

    index = np.where(edge_sampling)[0]
    score = edge_sampling[index].dot(
        np.log(edge_sampling[index] / node_sampling[index]))
    if normalized:
        inv_out_weights = sparse.diags(weights_row, shape=(n, n), format='csr')
        inv_out_weights.data = 1 / inv_out_weights.data
        inv_in_weights = sparse.diags(weights_col, shape=(n, n), format='csr')
        inv_in_weights.data = 1 / inv_in_weights.data
        sampling_ratio = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        inv_out_weights.data = np.ones(len(inv_out_weights.data))
        inv_in_weights.data = np.ones(len(inv_in_weights.data))
        edge_sampling = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        mutual_information = edge_sampling.data.dot(np.log(
            sampling_ratio.data))
        score /= mutual_information
    return score
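In effect, the loop above accumulates, for each internal node t of the dendrogram, the probability p(t) of sampling an edge whose endpoints first merge at t (edge_sampling) and the probability q(t) of the same event under independent node sampling (node_sampling); the score is then the Kullback-Leibler divergence sum_t p(t) log(p(t) / q(t)), optionally normalized by the graph's mutual information so that it lies between 0 and 1.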
Example #7
    def get_score(self, profile: sps.csr_matrix) -> DenseScoreArray:
        user_embedding: DenseMatrix = self.mlp.predict(
            profile.astype(np.float32).toarray())
        return self.cf_rec.get_score_from_user_embedding(
            user_embedding).astype(np.float64)
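A hedged usage sketch, with `recommender` standing in for an instance that owns this method, `self.mlp` a trained model mapping user profiles to embeddings, and `self.cf_rec` a collaborative-filtering backend (all names as in the snippet):

import numpy as np

scores = recommender.get_score(profile_row).ravel()  # profile_row: 1 x n_items csr_matrix
top_items = np.argsort(-scores)[:10]  # indices of the ten highest-scoring items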
Example #8
def tree_sampling_divergence(adjacency: sparse.csr_matrix,
                             dendrogram: np.ndarray,
                             weights: str = 'degree',
                             normalized: bool = True) -> float:
    """Tree sampling divergence of a hierarchy (quality metric).

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes.
        ``'degree'`` (default) or ``'uniform'``.
    normalized :
        If ``True``, normalized score (between 0 and 1).

    Returns
    -------
    score : float
        Score.

    Example
    -------
    >>> from sknetwork.hierarchy import tree_sampling_divergence, Paris
    >>> from sknetwork.data import house
    >>> paris = Paris()
    >>> adjacency = house()
    >>> dendrogram = paris.fit_transform(adjacency)
    >>> score = tree_sampling_divergence(adjacency, dendrogram)
    >>> np.round(score, 2)
    0.05

    References
    ----------
    Charpentier, B. & Bonald, T. (2019).
    `Tree Sampling Divergence: An Information-Theoretic Metric for
    Hierarchical Graph Clustering.
    <https://hal.telecom-paristech.fr/hal-02144394/document>`_
    Proceedings of IJCAI.
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)
    check_min_nnz(adjacency.nnz, 1)
    adjacency = adjacency.astype(float)
    n = adjacency.shape[0]
    check_min_size(n, 2)

    adjacency.data /= adjacency.data.sum()
    edge_sampling, node_sampling, _ = get_sampling_distributions(
        adjacency, dendrogram, weights)

    index = np.where(edge_sampling)[0]
    score = edge_sampling[index].dot(
        np.log(edge_sampling[index] / node_sampling[index]))
    if normalized:
        weights_row = get_probs(weights, adjacency)
        weights_col = get_probs(weights, adjacency.T)
        inv_out_weights = sparse.diags(weights_row, shape=(n, n), format='csr')
        inv_out_weights.data = 1 / inv_out_weights.data
        inv_in_weights = sparse.diags(weights_col, shape=(n, n), format='csr')
        inv_in_weights.data = 1 / inv_in_weights.data
        sampling_ratio = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        inv_out_weights.data = np.ones(len(inv_out_weights.data))
        inv_in_weights.data = np.ones(len(inv_in_weights.data))
        edge_sampling = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        mutual_information = edge_sampling.data.dot(np.log(
            sampling_ratio.data))
        if mutual_information > 0:
            score /= mutual_information
    return score