Пример #1
0
    def fit(self,
            adjacency: Union[sparse.csr_matrix, np.ndarray],
            personalization: Optional[Union[dict, np.ndarray]] = None
            ) -> 'PageRank':
        """
        Compute PageRank scores (with restart) and store them in ``scores_``.

        Parameters
        ----------
        adjacency :
            Adjacency matrix of the graph; it must be square.
        personalization :
            If ``None``, the uniform distribution is used.
            Otherwise, a non-negative, non-zero vector or a dictionary must be provided.

        Returns
        -------
        self: :class:`PageRank`

        Raises
        ------
        ValueError
            If the adjacency matrix is not square.
        """
        adjacency = check_format(adjacency)
        if is_square(adjacency):
            # NOTE(review): the meaning of the trailing ``False`` flag is defined by
            # RandomSurferOperator, which is not visible from this method.
            surfer = RandomSurferOperator(adjacency, self.damping_factor,
                                          personalization, False)
            self.scores_ = surfer.solve(self.solver, self.n_iter)
            return self

        raise ValueError("The adjacency is not square. See BiPageRank.")
Пример #2
0
    def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'Harmonic':
        """
        Harmonic centrality for connected graphs.

        The score of a node is the sum, over all other nodes, of the inverse of
        the shortest-path distance computed from it. The result is stored in
        ``scores_``.

        Parameters
        ----------
        adjacency :
            Adjacency matrix of the graph; it must be square.

        Returns
        -------
        self: :class:`Harmonic`

        Raises
        ------
        ValueError
            If the adjacency matrix is not square.
        """
        adjacency = check_format(adjacency)
        n_nodes = adjacency.shape[0]
        if not is_square(adjacency):
            raise ValueError("The adjacency is not square. Please use 'bipartite2undirected' or "
                             "'bipartite2directed'.")

        # All-pairs distances: every node is used as a source.
        distances = shortest_path(adjacency,
                                  n_jobs=self.n_jobs,
                                  indices=np.arange(n_nodes))

        # Overwrite the diagonal with a dummy 1 before inverting (presumably the
        # self-distances are 0 and would otherwise be divided by), then drop the
        # self-contribution from the inverse.
        np.fill_diagonal(distances, 1)
        inverse_distances = 1 / distances
        np.fill_diagonal(inverse_distances, 0)

        # Row sums of the inverse distances.
        self.scores_ = inverse_distances.dot(np.ones(n_nodes))
        return self
Пример #3
0
def largest_connected_component(adjacency: Union[sparse.csr_matrix,
                                                 np.ndarray],
                                return_labels: bool = False):
    """
    Extract the largest connected component of a graph.

    A non-square input is treated as the biadjacency matrix of a bipartite
    graph, which is handled as an undirected graph on rows and columns.

    Parameters
    ----------
    adjacency
        Adjacency or biadjacency matrix of the graph.
    return_labels: bool
        Whether to return the indices of the new nodes in the original graph.

    Returns
    -------
    new_adjacency: sparse.csr_matrix
        Adjacency or biadjacency matrix of the largest connected component.
    indices: array or tuple of array
        Only returned if ``return_labels`` is true.
        Indices of the nodes in the original graph. For biadjacency matrices,
        ``indices[0]`` corresponds to the rows and ``indices[1]`` to the columns.

    """
    adjacency = check_format(adjacency)
    n_rows = adjacency.shape[0]

    bipartite: bool = not is_square(adjacency)
    if bipartite:
        # Square block matrix [[0, B], [B^T, 0]]: rows come first, columns are
        # appended after them with an offset of n_rows.
        graph = sparse.bmat([[None, adjacency], [adjacency.T, None]],
                            format='csr')
    else:
        graph = adjacency

    _, labels = connected_components(graph)
    label_values, label_counts = np.unique(labels, return_counts=True)
    largest_label = label_values[np.argmax(label_counts)]
    members = np.where(labels == largest_label)[0]

    if bipartite:
        # `members` is sorted, so the first position holding an index >= n_rows
        # separates row nodes from (offset) column nodes.
        cut = np.searchsorted(members, n_rows)
        rows_ix = members[:cut]
        cols_ix = members[cut:] - n_rows
    else:
        rows_ix = cols_ix = members

    # Row selection on CSR, column selection on CSC, then back to CSR.
    new_adjacency = adjacency[rows_ix, :].tocsc()[:, cols_ix].tocsr()

    if not return_labels:
        return new_adjacency
    if bipartite:
        return new_adjacency, (rows_ix, cols_ix)
    return new_adjacency, rows_ix
Пример #4
0
    def fit(self, adjacency: Union[sparse.csr_matrix,
                                   np.ndarray]) -> 'Closeness':
        """
        Closeness centrality for connected graphs.

        With ``method == 'exact'`` every node is used as a source of the
        shortest-path computation. With ``method == 'approximate'`` only a
        random subset of ``min(log(n) / tol**2, n)`` sources (at least one) is
        used. The result is stored in ``scores_``.

        Parameters
        ----------
        adjacency :
            Adjacency matrix of the graph; it must be square and connected.

        Returns
        -------
        self: :class:`Closeness`

        Raises
        ------
        ValueError
            If the adjacency is not square, if the graph is not connected, or
            if ``self.method`` is neither ``'exact'`` nor ``'approximate'``.
        """
        adjacency = check_format(adjacency)
        n = adjacency.shape[0]
        if not is_square(adjacency):
            raise ValueError(
                "The adjacency is not square. Please use 'bipartite2undirected' or "
                "'bipartite2directed'.")

        if not is_connected(adjacency):
            raise ValueError("The graph must be connected.")

        if self.method == 'exact':
            nb_samples = n
            indices = np.arange(n)
        elif self.method == 'approximate':
            # int(log(n) / tol**2) truncates to 0 for small graphs or a loose
            # tolerance; an empty sample would make the final division 0 / 0
            # (NaN scores), so always keep at least one source.
            nb_samples = min(max(int(log(n) / self.tol**2), 1), n)
            indices = np.random.choice(np.arange(n), nb_samples, replace=False)
        else:
            raise ValueError(
                "Method should be either 'exact' or 'approximate'.")

        paths = shortest_path(adjacency, n_jobs=self.n_jobs, indices=indices)

        # paths.T.dot(ones) sums, for every node, its distances to the sampled
        # sources; (n - 1) * nb_samples / n rescales for the sample size.
        self.scores_ = (
            (n - 1) * nb_samples / n) / paths.T.dot(np.ones(nb_samples))

        return self
Пример #5
0
def cosine_modularity(adjacency,
                      embedding: np.ndarray,
                      col_embedding=None,
                      resolution=1.,
                      weights='degree',
                      return_all: bool = False):
    """
    Quality metric of an embedding :math:`x` defined by:

    :math:`Q = \\sum_{ij}\\left(\\dfrac{A_{ij}}{w} - \\gamma \\dfrac{w_iw'_j}{w^2}\\right)
    \\left(\\dfrac{1 + \\pi(x_i)^T\\pi(x_j)}{2}\\right)`

    where :math:`\\pi(x_i)` is the projection of :math:`x_i` onto the unit-sphere.

    For bipartite graphs with column embedding :math:`y`, the metric is

    :math:`Q = \\sum_{ij}\\left(\\dfrac{B_{ij}}{w} - \\gamma \\dfrac{w_iw'_j}{w^2}\\right)
    \\left(\\dfrac{1 + \\pi(x_i)^T\\pi(y_j)}{2}\\right)`

    This metric is normalized to lie between -1 and 1 (for :math:`\\gamma = 1`).

    The input embeddings are never modified: the projection onto the unit
    sphere is computed on copies. Rows with zero norm are left at zero.

    Parameters
    ----------
    adjacency: sparse.csr_matrix or np.ndarray
        Adjacency (or biadjacency) matrix of the graph.
    embedding: np.ndarray
        Embedding of the nodes (one row per node).
    col_embedding: None or np.ndarray
        For biadjacency matrices, embedding of the columns. If ``None``,
        ``embedding`` is used for the columns as well (square matrices only).
    resolution: float
        Resolution parameter.
    weights: ``'degree'`` or ``'uniform'``
        Weights of the nodes.
    return_all: bool, default = ``False``
        whether to return (fit, div, :math:`Q`) or :math:`Q`

    Returns
    -------
    modularity : float
        ``fit - resolution * diversity``. Returned alone if ``return_all`` is
        ``False``, otherwise as the last element of ``(fit, diversity, modularity)``.
    fit: float, optional
    diversity: float, optional

    Raises
    ------
    ValueError
        If ``col_embedding`` is ``None`` and the adjacency matrix is not square.
    """

    def _project_on_sphere(vectors):
        """Return a new array whose non-zero rows are scaled to unit norm."""
        vectors = np.asarray(vectors)
        if not np.issubdtype(vectors.dtype, np.floating):
            # Integer (or boolean) embeddings cannot hold normalized values.
            vectors = vectors.astype(float)
        norms = np.linalg.norm(vectors, axis=1)
        # Divide zero-norm rows by 1 so that they stay at zero instead of
        # producing NaN; the division allocates a new array.
        scale = np.ones_like(norms)
        positive = norms > 0
        scale[positive] = norms[positive]
        return vectors / scale[:, np.newaxis]

    adjacency = check_format(adjacency)
    total_weight: float = adjacency.data.sum()

    if col_embedding is None:
        if not is_square(adjacency):
            raise ValueError(
                'col_embedding cannot be None for non-square adjacency matrices.'
            )
        # No copy needed: the projection below never writes into its input.
        col_embedding = embedding

    norm_row_emb = _project_on_sphere(embedding)
    norm_col_emb = _project_on_sphere(col_embedding)

    row_probs = check_probs(weights, adjacency)
    col_probs = check_probs(weights, adjacency.T)

    # Fit term: weighted average cosine similarity over the edges.
    fit: float = 0.5 * (1 + (np.multiply(
        norm_row_emb, adjacency.dot(norm_col_emb))).sum() / total_weight)
    # Diversity term: cosine similarity expected under the node weights,
    # computed on the projected embeddings as in the formula above.
    div: float = 0.5 * (
        1 + (norm_row_emb.T.dot(row_probs)).dot(norm_col_emb.T.dot(col_probs)))

    if return_all:
        return fit, div, fit - resolution * div
    else:
        return fit - resolution * div
Пример #6
0
    def fit(self, adjacency: Union[sparse.csr_matrix,
                                   np.ndarray]) -> 'Spectral':
        """Fits the model from data in adjacency.

        The embedding is built from eigenvectors of the Laplacian (regular or
        normalized, depending on ``self.normalized_laplacian``); the first
        eigenvector is discarded. Results are stored in ``embedding_``,
        ``eigenvalues_`` and ``regularization_``.

        Parameters
        ----------
        adjacency :
              Adjacency matrix of the graph (symmetric matrix).

        Returns
        -------
        self: :class:`Spectral`

        Raises
        ------
        ValueError
            If the adjacency matrix is not square or not symmetric.
        NotImplementedError
            If the Halko solver is used with the regular Laplacian.
        """

        adjacency = check_format(adjacency).asfptype()

        if not is_square(adjacency):
            raise ValueError(
                'The adjacency matrix is not square. See BiSpectral.')

        if not is_symmetric(adjacency):
            raise ValueError(
                'The adjacency matrix is not symmetric. '
                'Either convert it to a symmetric matrix or use BiSpectral.')

        n = adjacency.shape[0]

        # Resolve the 'auto' placeholder into a concrete solver object, chosen
        # from the number of stored entries of the adjacency matrix.
        if self.solver == 'auto':
            solver = auto_solver(adjacency.nnz)
            if solver == 'lanczos':
                self.solver: EigSolver = LanczosEig()
            else:
                self.solver: EigSolver = HalkoEig()

        # One more component than the embedding dimension is requested because
        # the first eigenvector is dropped below.
        if self.embedding_dimension > n - 2:
            warnings.warn(
                Warning(
                    "The dimension of the embedding must be less than the number of nodes - 1."
                ))
            n_components = n - 2
        else:
            n_components = self.embedding_dimension + 1

        if (self.regularization is None
                or self.regularization == 0.) and not is_connected(adjacency):
            warnings.warn(
                Warning(
                    "The graph is not connected and low-rank regularization is not active. "
                    "This can cause errors in the computation of the embedding."
                ))

        if isinstance(self.solver, HalkoEig) and not self.normalized_laplacian:
            raise NotImplementedError(
                "Halko solver is not yet compatible with regular Laplacian. "
                "Call 'fit' with 'normalized_laplacian' = True or force lanczos solver."
            )

        # Row sums of the adjacency matrix (node weights).
        weights = adjacency.dot(np.ones(n))
        regularization = self.regularization
        if regularization:
            if self.relative_regularization:
                # Scale the regularization by the average entry of the adjacency.
                regularization = regularization * weights.sum() / n**2
            # Each node weight is increased by regularization * n.
            weights += regularization * n

        if self.normalized_laplacian:
            # Finding the largest eigenvalues of the normalized adjacency is easier for the solver than finding the
            # smallest eigenvalues of the normalized laplacian.
            normalizing_matrix = diag_pinv(np.sqrt(weights))

            if regularization:
                norm_adjacency = NormalizedAdjacencyOperator(
                    adjacency, regularization)
            else:
                norm_adjacency = normalizing_matrix.dot(
                    adjacency.dot(normalizing_matrix))

            self.solver.which = 'LA'
            self.solver.fit(matrix=norm_adjacency, n_components=n_components)
            eigenvalues = 1 - self.solver.eigenvalues_
            # eigenvalues of the Laplacian in increasing order
            index = np.argsort(eigenvalues)
            # skip first eigenvalue
            eigenvalues = eigenvalues[index][1:]
            # keep only positive eigenvectors of the normalized adjacency matrix:
            # columns whose Laplacian eigenvalue is >= 1 - tol are zeroed out
            eigenvectors = self.solver.eigenvectors_[:, index][:, 1:] * (
                eigenvalues < 1 - self.tol)
            embedding = np.array(normalizing_matrix.dot(eigenvectors))

        else:
            if regularization:
                laplacian = LaplacianOperator(adjacency, regularization)
            else:
                weight_matrix = sparse.diags(weights, format='csr')
                laplacian = weight_matrix - adjacency

            self.solver.which = 'SM'
            self.solver.fit(matrix=laplacian, n_components=n_components)
            # skip first eigenvalue / eigenvector
            eigenvalues = self.solver.eigenvalues_[1:]
            embedding = self.solver.eigenvectors_[:, 1:]

        if self.scaling:
            if self.scaling == 'multiply':
                # Clip at 1 so that the square root below stays real.
                eigenvalues = np.minimum(eigenvalues, 1)
                embedding *= np.sqrt(1 - eigenvalues)
            elif self.scaling == 'divide':
                # Pseudo-inverse: non-positive eigenvalues are mapped to 0.
                inv_eigenvalues = np.zeros_like(eigenvalues)
                index = np.where(eigenvalues > 0)[0]
                inv_eigenvalues[index] = 1 / eigenvalues[index]
                embedding *= np.sqrt(inv_eigenvalues)
            else:
                warnings.warn(
                    Warning(
                        "The scaling must be 'multiply' or 'divide'. No scaling done."
                    ))

        self.embedding_ = embedding
        self.eigenvalues_ = eigenvalues
        self.regularization_ = regularization

        return self
Пример #7
0
    def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'Louvain':
        """
        Clustering using chosen Optimizer.

        The optimizer (``self.algorithm``) is run repeatedly on an aggregate
        graph; after each pass the clusters found become the nodes of the next
        graph. The loop stops when the score increase is at most
        ``self.agg_tol``, when a single node remains, or after
        ``self.max_agg_iter`` passes. Results are stored in ``labels_``,
        ``iteration_count_`` and ``aggregate_graph_``.

        Parameters
        ----------
        adjacency :
            Adjacency matrix of the graph; it must be square.

        Returns
        -------
        self: :class:`Louvain`

        Raises
        ------
        ValueError
            If the adjacency matrix is not square.
        """
        adjacency = check_format(adjacency)
        if not is_square(adjacency):
            raise ValueError('The adjacency matrix is not square. Use BiLouvain() instead.')
        n = adjacency.shape[0]

        nodes = np.arange(n)
        if self.shuffle_nodes:
            nodes = self.random_state.permutation(nodes)
            adjacency = adjacency[nodes, :].tocsc()[:, nodes].tocsr()

        # The node weights must be computed AFTER the optional permutation so
        # that entry i of each weight vector describes row/column i of the
        # adjacency actually handed to the aggregate graph.
        out_weights = check_probs('degree', adjacency)
        in_weights = check_probs('degree', adjacency.T)

        graph = AggregateGraph(adjacency, out_weights, in_weights)

        # Node-to-cluster assignment, refined after every aggregation pass.
        membership = sparse.identity(n, format='csr')
        increase = True
        iteration_count = 0
        self.log.print("Starting with", graph.n_nodes, "nodes.")
        while increase:
            iteration_count += 1

            self.algorithm.fit(graph)

            if self.algorithm.score_ <= self.agg_tol:
                increase = False
            else:
                agg_membership = membership_matrix(self.algorithm.labels_)
                # Compose the previous assignment with the new one.
                membership = membership.dot(agg_membership)
                graph.aggregate(agg_membership)

                if graph.n_nodes == 1:
                    break
            self.log.print("Iteration", iteration_count, "completed with", graph.n_nodes, "clusters and ",
                           self.algorithm.score_, "increment.")
            if iteration_count == self.max_agg_iter:
                break

        # NOTE(review): presumably each row of `membership` stores exactly one
        # entry, so that `indices` lists the cluster of every node in order.
        if self.sorted_cluster:
            labels = reindex_clusters(membership.indices)
        else:
            labels = membership.indices
        if self.shuffle_nodes:
            # Invert the permutation: reverse[j] is the position that original
            # node j occupies in the shuffled graph.
            reverse = np.empty(nodes.size, nodes.dtype)
            reverse[nodes] = np.arange(nodes.size)
            labels = labels[reverse]

        self.labels_ = labels
        self.iteration_count_ = iteration_count
        self.aggregate_graph_ = graph.norm_adjacency * adjacency.data.sum()

        return self
Пример #8
0
    def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'Paris':
        """
        Agglomerative clustering using the nearest neighbor chain.

        The resulting dendrogram is stored in ``dendrogram_``. Each row built by
        the ``'python'`` engine is ``[cluster_a, cluster_b, height, size]`` where
        ``height`` is the inverse of the similarity of the merged pair (or
        infinity when joining disconnected components) and ``size`` is the
        number of nodes of the merged cluster.

        Parameters
        ----------
        adjacency :
            Adjacency matrix of the graph; it must be square.

        Returns
        -------
        self: :class:`Paris`

        Raises
        ------
        ValueError
            If the adjacency matrix is not square, if the graph has fewer than
            two nodes, or if ``self.engine`` is neither ``'python'`` nor ``'numba'``.
        """
        adjacency = check_format(adjacency)
        if not is_square(adjacency):
            raise ValueError(
                'The adjacency matrix is not square. Use BiParis() instead.')
        n = adjacency.shape[0]
        # The merging works on the symmetrized graph.
        sym_adjacency = adjacency + adjacency.T

        weights = self.weights
        out_weights = check_probs(weights, adjacency)
        in_weights = check_probs(weights, adjacency.T)

        # NOTE(review): this size check runs after the weights are computed.
        if n <= 1:
            raise ValueError('The graph must contain at least two nodes.')

        if self.engine == 'python':
            aggregate_graph = AggregateGraph(sym_adjacency, out_weights,
                                             in_weights)

            # (root cluster, size) of every connected component found.
            connected_components = []
            dendrogram = []

            while len(aggregate_graph.cluster_sizes) > 0:
                # Start a new chain from the first remaining cluster.
                node = None
                for node in aggregate_graph.cluster_sizes:
                    break
                chain = [node]
                while chain:
                    node = chain.pop()
                    if aggregate_graph.neighbors[node]:
                        # Most similar neighbor; ties go to the smallest index.
                        max_sim = -float("inf")
                        nearest_neighbor = None
                        for neighbor in aggregate_graph.neighbors[node]:
                            sim = aggregate_graph.similarity(node, neighbor)
                            if sim > max_sim:
                                nearest_neighbor = neighbor
                                max_sim = sim
                            elif sim == max_sim:
                                nearest_neighbor = min(neighbor,
                                                       nearest_neighbor)
                        if chain:
                            nearest_neighbor_last = chain.pop()
                            if nearest_neighbor_last == nearest_neighbor:
                                # The previous element of the chain is also the
                                # nearest neighbor of `node`: merge the pair.
                                dendrogram.append([
                                    node, nearest_neighbor, 1. / max_sim,
                                    aggregate_graph.cluster_sizes[node] +
                                    aggregate_graph.
                                    cluster_sizes[nearest_neighbor]
                                ])
                                aggregate_graph.merge(node, nearest_neighbor)
                            else:
                                # Not reciprocal: restore the chain and extend
                                # it with the nearest neighbor.
                                chain.append(nearest_neighbor_last)
                                chain.append(node)
                                chain.append(nearest_neighbor)
                        else:
                            chain.append(node)
                            chain.append(nearest_neighbor)
                    else:
                        # No neighbor left: record this cluster as a connected
                        # component and remove it from the remaining clusters.
                        connected_components.append(
                            (node, aggregate_graph.cluster_sizes[node]))
                        del aggregate_graph.cluster_sizes[node]

            # Join the connected components one after the other at infinite height.
            node, cluster_size = connected_components.pop()
            for next_node, next_cluster_size in connected_components:
                cluster_size += next_cluster_size
                dendrogram.append(
                    [node, next_node,
                     float("inf"), cluster_size])
                # presumably `next_cluster` is the index given to the cluster
                # just created -- TODO confirm against AggregateGraph
                node = aggregate_graph.next_cluster
                aggregate_graph.next_cluster += 1

            dendrogram = np.array(dendrogram)
            if self.reorder:
                dendrogram = reorder_dendrogram(dendrogram)

            self.dendrogram_ = dendrogram
            return self

        elif self.engine == 'numba':

            n = np.int32(adjacency.shape[0])
            # Raw CSR arrays of the symmetrized graph are passed to the core routine.
            indices, indptr, data = sym_adjacency.indices, sym_adjacency.indptr, sym_adjacency.data

            dendrogram = fit_core(n, out_weights, in_weights, data, indices,
                                  indptr)
            dendrogram = np.array(dendrogram)
            if self.reorder:
                dendrogram = reorder_dendrogram(dendrogram)

            self.dendrogram_ = dendrogram
            return self

        else:
            raise ValueError('Unknown engine.')
Пример #9
0
def tree_sampling_divergence(adjacency: sparse.csr_matrix, dendrogram: np.ndarray, weights: str = 'degree',
                             normalized: bool = True) -> float:
    """
    Tree sampling divergence of a hierarchy (quality metric).

    The higher the score, the better.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph. It is not modified.
    dendrogram :
        Dendrogram. Row ``t`` is read as ``(cluster 1, cluster 2, height, ...)``
        and the first ``n - 1`` rows are used, ``n`` being the number of nodes.
    weights :
        Weights of nodes.
        ``'degree'`` (default) or ``'uniform'``.
    normalized:
        If ``True``, normalized by the mutual information of the graph.
        When the mutual information is not positive, the score is returned
        without normalization.

    Returns
    -------
    score : float
        The tree sampling divergence of the hierarchy.
        If normalized, returns a value between 0 and 1.

    Raises
    ------
    ValueError
        If the adjacency matrix is not square, if the graph has fewer than two
        nodes, or if its total weight is not positive.

    References
    ----------
    Charpentier, B. & Bonald, T. (2019).
    `Tree Sampling Divergence: An Information-Theoretic Metric for
    Hierarchical Graph Clustering.
    <https://hal.telecom-paristech.fr/hal-02144394/document>`_
    Proceedings of IJCAI.

    """
    adjacency = check_format(adjacency)
    if not is_square(adjacency):
        raise ValueError('The adjacency matrix is not square.')

    n = adjacency.shape[0]
    if n <= 1:
        raise ValueError('The graph must contain at least two nodes.')

    total_weight = adjacency.data.sum()
    if total_weight <= 0:
        raise ValueError('The graph must contain at least one edge.')

    # Rescale on a private copy so that the entries sum to 1.
    # NOTE(review): check_format may hand back the caller's own matrix object
    # (not visible here); copying guarantees the rescaling never reaches it.
    adjacency = adjacency.copy()
    adjacency.data = adjacency.data / total_weight

    out_weights = check_probs(weights, adjacency)
    in_weights = check_probs(weights, adjacency.T)

    aggregate_graph = AggregateGraph(adjacency + adjacency.T, out_weights, in_weights)

    height = np.zeros(n - 1)
    edge_sampling = np.zeros(n - 1)
    node_sampling = np.zeros(n - 1)
    for t in range(n - 1):
        node1 = int(dendrogram[t][0])
        node2 = int(dendrogram[t][1])
        # Indices >= n refer to clusters created by earlier merges (merge number
        # = index - n). If such a child was merged at the same height, its
        # accumulated values are moved to the current merge.
        if node1 >= n and height[node1 - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[node1 - n]
            edge_sampling[node1 - n] = 0
            node_sampling[t] = node_sampling[node1 - n]
        elif node2 >= n and height[node2 - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[node2 - n]
            edge_sampling[node2 - n] = 0
            node_sampling[t] = node_sampling[node2 - n]
        if node2 in aggregate_graph.neighbors[node1]:
            edge_sampling[t] += aggregate_graph.neighbors[node1][node2]
        node_sampling[t] += aggregate_graph.cluster_out_weights[node1] * aggregate_graph.cluster_in_weights[node2] + \
            aggregate_graph.cluster_out_weights[node2] * aggregate_graph.cluster_in_weights[node1]
        height[t] = dendrogram[t][2]
        aggregate_graph.merge(node1, node2)

    # Only merges with a non-zero edge term contribute (this avoids log(0)).
    index = np.where(edge_sampling)[0]
    score = edge_sampling[index].dot(np.log(edge_sampling[index] / node_sampling[index]))
    if normalized:
        # Diagonal matrices holding the inverse of the stored node weights.
        inv_out_weights = sparse.diags(out_weights, shape=(n, n), format='csr')
        inv_out_weights.data = 1 / inv_out_weights.data
        inv_in_weights = sparse.diags(in_weights, shape=(n, n), format='csr')
        inv_in_weights.data = 1 / inv_in_weights.data
        # Ratio between the edge distribution and the product of node weights.
        sampling_ratio = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        # Same products with unit diagonal entries, so that both results are
        # built from matching stored entries.
        inv_out_weights.data = np.ones(len(inv_out_weights.data))
        inv_in_weights.data = np.ones(len(inv_in_weights.data))
        edge_sampling = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        mutual_information = edge_sampling.data.dot(np.log(sampling_ratio.data))
        # A zero mutual information would turn the score into NaN / inf.
        if mutual_information > 0:
            score /= mutual_information
    return score
Пример #10
0
def dasgupta_score(adjacency: sparse.csr_matrix, dendrogram: np.ndarray, weights: str = 'uniform') -> float:
    """
    Dasgupta's score of a hierarchy, defined as 1 - Dasgupta's cost.

    The higher the score, the better.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram. Row ``t`` is read as ``(cluster 1, cluster 2, height, ...)``
        and the first ``n - 1`` rows are used, ``n`` being the number of nodes.
    weights :
        Weights of nodes.
        ``'degree'`` or ``'uniform'`` (default).

    Returns
    -------
    score : float
        Dasgupta's score of the hierarchy, normalized to get a value between 0 and 1.

    Raises
    ------
    ValueError
        If the adjacency matrix is not square or has fewer than two nodes.

    References
    ----------
    Dasgupta, S. (2016). A cost function for similarity-based hierarchical clustering.
    Proceedings of ACM symposium on Theory of Computing.

    """
    adjacency = check_format(adjacency)
    if not is_square(adjacency):
        raise ValueError('The adjacency matrix is not square.')

    n_nodes = adjacency.shape[0]
    if n_nodes <= 1:
        raise ValueError('The graph must contain at least two nodes.')

    out_probs = check_probs(weights, adjacency)
    in_probs = check_probs(weights, adjacency.T)
    graph = AggregateGraph(adjacency + adjacency.T, out_probs, in_probs)

    n_merges = n_nodes - 1
    merge_heights = np.zeros(n_merges)
    merged_edges = np.zeros(n_merges)
    merged_weights = np.zeros(n_merges)

    for step in range(n_merges):
        first = int(dendrogram[step][0])
        second = int(dendrogram[step][1])
        level = dendrogram[step][2]

        # Indices >= n_nodes refer to clusters created by earlier merges (merge
        # number = index - n_nodes). The first child that was itself merged at
        # the same height hands its accumulated edge term over to this merge.
        for child in (first, second):
            if child >= n_nodes and merge_heights[child - n_nodes] == level:
                merged_edges[step] = merged_edges[child - n_nodes]
                merged_edges[child - n_nodes] = 0
                break
        merge_heights[step] = level

        first_neighbors = graph.neighbors[first]
        if second in first_neighbors:
            merged_edges[step] += first_neighbors[second]

        merged_weights[step] = graph.cluster_out_weights[first] + graph.cluster_out_weights[second] \
            + graph.cluster_in_weights[first] + graph.cluster_in_weights[second]
        graph.merge(first, second)

    return 1 - merged_edges.dot(merged_weights) / 2