Exemplo n.º 1
0
    def fit(self, adjacency: Union[sparse.csr_matrix,
                                   np.ndarray]) -> 'Harmonic':
        """Compute the harmonic centrality of every node.

        The score of a node is the sum, over the corresponding row of the
        matrix returned by ``shortest_path``, of the inverse path lengths
        (the node itself being excluded).

        Parameters
        ----------
        adjacency :
            Adjacency matrix of the graph.

        Returns
        -------
        self: :class:`Harmonic`

        Raises
        ------
        ValueError
            If the adjacency matrix is not square.
        """
        adjacency = check_format(adjacency)
        if not is_square(adjacency):
            raise ValueError(
                "The adjacency is not square. Please use 'bipartite2undirected' or "
                "'bipartite2directed'.")
        n_nodes = adjacency.shape[0]

        distances = shortest_path(adjacency,
                                  n_jobs=self.n_jobs,
                                  indices=np.arange(n_nodes))

        # Put a placeholder 1 on the diagonal before inverting (presumably
        # the self-distances are 0 and would otherwise be divided by).
        np.fill_diagonal(distances, 1)
        inverse_distances = 1 / distances
        # A node does not contribute to its own score.
        np.fill_diagonal(inverse_distances, 0)

        self.scores_ = inverse_distances.dot(np.ones(n_nodes))
        return self
Exemplo n.º 2
0
def co_neighbors_graph(adjacency: Union[sparse.csr_matrix, np.ndarray],
                       normalized: bool = True,
                       method='knn',
                       n_neighbors: int = 5,
                       embedding_dimension: int = 8) -> sparse.csr_matrix:
    """Build the co-neighborhood graph of a graph.

    The co-neighborhood adjacency is defined as

    :math:`\\tilde{A} = AF^{-1}A^T`,

    where F is a weight matrix.

    Parameters
    ----------
    adjacency:
        Adjacency of the input graph.
    normalized:
        If ``True``, F is the diagonal in-degree matrix :math:`F = \\text{diag}(A^T1)`.
        Otherwise, F is the identity matrix.
    method:
        Either ``'exact'`` or ``'knn'``.
        With ``'exact'``, the result is obtained by a matrix product; it can be much denser than the input graph,
        which may trigger memory errors.
        With ``'knn'``, the co-neighborhood is approximated by a KNN search in a spectral embedding space.
    n_neighbors:
        Number of neighbors for the KNN search. Only used if ``method='knn'``.
    embedding_dimension:
        Dimension of the embedding space. Only used if ``method='knn'``.

    Returns
    -------
    adjacency_: sparse.csr_matrix
        Adjacency of the co-neighborhood.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'exact'`` nor ``'knn'``.
    """
    adjacency = check_format(adjacency)

    if method == 'exact':
        forward = transition_matrix(adjacency.T) if normalized else adjacency.T
        return adjacency.dot(forward)

    if method != 'knn':
        raise ValueError('method must be "exact" or "knn".')

    # The two settings differ only in the column weights and the scaling
    # of the spectral embedding.
    if normalized:
        col_weights, scaling = 'degree', 'divide'
    else:
        col_weights, scaling = 'uniform', None
    bispectral = BiSpectral(embedding_dimension,
                            weights='degree',
                            col_weights=col_weights,
                            scaling=scaling)
    bispectral.fit(adjacency)

    knn = KNeighborsTransformer(n_neighbors, undirected=True)
    knn.fit(bispectral.row_embedding_)
    return knn.adjacency_
Exemplo n.º 3
0
    def fit(self, biadjacency: Union[sparse.csr_matrix,
                                     np.ndarray]) -> 'BiParis':
        """Run the Paris algorithm on the undirected graph

        :math:`A  = \\begin{bmatrix} 0 & B \\\\ B^T & 0 \\end{bmatrix}`

        where :math:`B` is the input treated as a biadjacency matrix.

        Parameters
        ----------
        biadjacency:
            Biadjacency matrix of the graph.

        Returns
        -------
        self: :class:`BiParis`
        """
        # The underlying estimator inherits this object's settings.
        paris = Paris(engine=self.engine,
                      weights=self.weights,
                      reorder=self.reorder)

        adjacency = bipartite2undirected(check_format(biadjacency))
        paris.fit(adjacency)

        self.dendrogram_ = paris.dendrogram_
        return self
Exemplo n.º 4
0
    def fit(
        self,
        adjacency: Union[sparse.csr_matrix, np.ndarray],
        personalization: Optional[Union[dict,
                                        np.ndarray]] = None) -> 'PageRank':
        """Compute the standard PageRank with restart.

        Parameters
        ----------
        adjacency :
            Adjacency matrix.
        personalization :
            If ``None``, the uniform distribution is used.
            Otherwise, a non-negative, non-zero vector or a dictionary must be provided.

        Returns
        -------
        self: :class:`PageRank`

        Raises
        ------
        ValueError
            If the adjacency matrix is not square.
        """
        adjacency = check_format(adjacency)
        if not is_square(adjacency):
            raise ValueError("The adjacency is not square. See BiPageRank.")

        surfer = RandomSurferOperator(adjacency, self.damping_factor,
                                      personalization, False)
        scores = surfer.solve(self.solver, self.n_iter)
        self.scores_ = scores

        return self
Exemplo n.º 5
0
    def fit(self, biadjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'BiLouvain':
        """Run the directed version of the Louvain algorithm on

        :math:`A  = \\begin{bmatrix} 0 & B \\\\ 0 & 0 \\end{bmatrix}`

        where :math:`B` is the input treated as a biadjacency matrix.

        Parameters
        ----------
        biadjacency:
            Biadjacency matrix of the graph.

        Returns
        -------
        self: :class:`BiLouvain`
        """
        # The underlying estimator inherits this object's settings.
        louvain = Louvain(
            algorithm=self.algorithm,
            agg_tol=self.agg_tol,
            max_agg_iter=self.max_agg_iter,
            shuffle_nodes=self.shuffle_nodes,
            sorted_cluster=self.sorted_cluster,
            random_state=self.random_state,
            verbose=self.log.verbose,
        )
        biadjacency = check_format(biadjacency)
        n_rows = biadjacency.shape[0]

        louvain.fit(bipartite2directed(biadjacency))

        # The first n_rows labels are those of the rows, the remaining ones
        # those of the columns.
        labels = louvain.labels_
        self.row_labels_ = labels[:n_rows]
        self.col_labels_ = labels[n_rows:]
        self.labels_ = labels
        self.iteration_count_ = louvain.iteration_count_
        self.aggregate_graph_ = louvain.aggregate_graph_
        return self
    def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'SpectralClustering':
        """Embed the graph with :class:`Spectral`, then cluster the embedding with k-means.

        Parameters
        ----------
        adjacency:
            Adjacency matrix of the graph.

        Returns
        -------
        self: :class:`SpectralClustering`

        Raises
        ------
        ValueError
            If the adjacency matrix is not symmetric.
        """
        adjacency = check_format(adjacency)
        if not is_symmetric(adjacency):
            raise ValueError('The adjacency is not symmetric.')

        embedding = Spectral(self.embedding_dimension).fit(adjacency).embedding_

        if self.l2normalization:
            row_norms = np.linalg.norm(embedding, axis=1)
            # Null vectors are left untouched (divided by 1).
            row_norms[row_norms == 0.] = 1
            embedding /= row_norms[:, np.newaxis]

        kmeans = KMeans(self.n_clusters)
        kmeans.fit(embedding)
        self.labels_ = kmeans.labels_

        return self
Exemplo n.º 7
0
    def fit(self,
            adjacency: Union[sparse.csr_matrix, np.ndarray],
            node_weights=None,
            randomized_decomposition: bool = True) -> 'SpectralEmbedding':
        """Fit the spectral embedding to the adjacency matrix.

        Parameters
        ----------
        adjacency : array-like, shape = (n, n)
              Adjacency matrix of the graph
        node_weights : {``'uniform'``, ``'degree'``, array of length n_nodes with positive entries}
              Node weights. If ``None``, ``self.node_weights`` is used.
        randomized_decomposition: bool (default=True)
            whether to use a randomized (and faster) decomposition method or the standard scipy one.

        Returns
        -------
        self: :class:`SpectralEmbedding`

        Raises
        ------
        ValueError
            If the adjacency matrix is not square, if the graph is not
            connected, or if the adjacency matrix is not symmetric.
        """
        adjacency = check_format(adjacency)
        n_nodes = adjacency.shape[0]

        if not check_square(adjacency):
            raise ValueError("The adjacency matrix must be a square matrix.")
        n_parts = connected_components(adjacency, directed=False)[0]
        if n_parts > 1:
            raise ValueError("The graph must be connected.")
        if not check_symmetry(adjacency):
            raise ValueError("The adjacency matrix is not symmetric.")

        # standard Laplacian: D - A
        degrees = adjacency.dot(np.ones(n_nodes))
        laplacian = sparse.diags(degrees, format='csr') - adjacency

        # normalization by the node weights: W^{-1/2} L W^{-1/2}
        weights = check_weights(
            self.node_weights if node_weights is None else node_weights,
            adjacency)
        inv_sqrt_weights = sparse.diags(np.sqrt(weights), format='csr')
        inv_sqrt_weights.data = 1 / inv_sqrt_weights.data
        laplacian = inv_sqrt_weights.dot(laplacian.dot(inv_sqrt_weights))

        # spectral decomposition (one extra component: the first is dropped)
        n_components = min(self.embedding_dimension + 1, n_nodes - 1)
        decompose = randomized_eig if randomized_decomposition else eigsh
        eigenvalues, eigenvectors = decompose(laplacian,
                                              n_components,
                                              which='SM')

        self.eigenvalues_ = eigenvalues[1:]
        self.embedding_ = np.array(inv_sqrt_weights.dot(eigenvectors[:, 1:]))
        return self
Exemplo n.º 8
0
def largest_connected_component(adjacency: Union[sparse.csr_matrix,
                                                 np.ndarray],
                                return_labels: bool = False):
    """
    Extract the largest connected component of a graph. Bipartite graphs are treated as undirected ones.

    A non-square input is interpreted as a biadjacency matrix.

    Parameters
    ----------
    adjacency
        Adjacency or biadjacency matrix of the graph.
    return_labels: bool
        Whether to return the indices of the new nodes in the original graph.

    Returns
    -------
    new_adjacency: sparse.csr_matrix
        Adjacency or biadjacency matrix of the largest connected component.
    indices: array or tuple of array
        Indices of the nodes in the original graph. For biadjacency matrices,
        ``indices[0]`` corresponds to the rows and ``indices[1]`` to the columns.
        Only returned if ``return_labels`` is ``True``.

    """
    adjacency = check_format(adjacency)
    n_rows = adjacency.shape[0]

    bipartite = not is_square(adjacency)
    if bipartite:
        # Rows and columns become the nodes of one undirected graph,
        # rows first.
        full_adjacency = sparse.bmat([[None, adjacency], [adjacency.T, None]],
                                     format='csr')
    else:
        full_adjacency = adjacency

    _, labels = connected_components(full_adjacency)
    unique_labels, counts = np.unique(labels, return_counts=True)
    largest = unique_labels[np.argmax(counts)]
    members = np.flatnonzero(labels == largest)

    if bipartite:
        # members is sorted: entries below n_rows are rows, the others are
        # columns shifted by n_rows.
        split = np.searchsorted(members, n_rows)
        row_indices = members[:split]
        col_indices = members[split:] - n_rows
    else:
        row_indices = col_indices = members

    component = adjacency[row_indices, :]
    component = component.tocsc()[:, col_indices].tocsr()

    if not return_labels:
        return component
    if bipartite:
        return component, (row_indices, col_indices)
    return component, row_indices
Exemplo n.º 9
0
    def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'HITS':
        """
        Compute the HITS scores from the first singular vectors of the matrix.

        Parameters
        ----------
        adjacency :
            Adjacency or biadjacency matrix of the graph.

        Returns
        -------
        self: :class:`HITS`

        Raises
        ------
        ValueError
            If ``self.mode`` is neither ``'hubs'`` nor ``'authorities'``.
        """

        def _orient(vector: np.ndarray) -> np.ndarray:
            # Flip the vector unless strictly more entries are positive than
            # negative, then zero out the remaining negative entries.
            n_positive, n_negative = (vector > 0).sum(), (vector < 0).sum()
            if n_positive > n_negative:
                return np.clip(vector, a_min=0., a_max=None)
            return np.clip(-vector, a_min=0., a_max=None)

        adjacency = check_format(adjacency)

        # Resolve the 'auto' solver; the choice is stored on the instance.
        if self.solver == 'auto':
            if auto_solver(adjacency.nnz) == 'lanczos':
                self.solver: SVDSolver = LanczosSVD()
            else:
                self.solver: SVDSolver = HalkoSVD()

        self.solver.fit(adjacency, 1)
        hubs: np.ndarray = _orient(
            self.solver.left_singular_vectors_.reshape(-1))
        authorities: np.ndarray = _orient(
            self.solver.right_singular_vectors_.reshape(-1))

        mode = self.mode
        if mode == 'hubs':
            self.scores_, self.col_scores_ = hubs, authorities
        elif mode == 'authorities':
            self.scores_, self.col_scores_ = authorities, hubs
        else:
            raise ValueError('Mode should be "hubs" or "authorities".')

        return self
    def fit(self, biadjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'BiSpectralClustering':
        """Embed the graph with :class:`BiSpectral`, then cluster the embedding with k-means.

        With ``self.co_clustering``, rows and columns are clustered together;
        otherwise only the rows are clustered and ``col_labels_`` is not set.

        Parameters
        ----------
        biadjacency:
            Biadjacency matrix of the graph.

        Returns
        -------
        self: :class:`BiSpectralClustering`

        """
        biadjacency = check_format(biadjacency)
        n_rows = biadjacency.shape[0]

        bispectral = BiSpectral(self.embedding_dimension).fit(biadjacency)
        embedding = (bispectral.embedding_
                     if self.co_clustering else bispectral.row_embedding_)

        if self.l2normalization:
            row_norms = np.linalg.norm(embedding, axis=1)
            # Null vectors are left untouched (divided by 1).
            row_norms[row_norms == 0.] = 1
            embedding /= row_norms[:, np.newaxis]

        kmeans = KMeans(self.n_clusters)
        kmeans.fit(embedding)

        labels = kmeans.labels_
        self.labels_ = labels
        if self.co_clustering:
            # Row embeddings come first in the stacked embedding.
            self.row_labels_ = labels[:n_rows]
            self.col_labels_ = labels[n_rows:]
        else:
            self.row_labels_ = labels

        return self
Exemplo n.º 11
0
    def fit(self, adjacency: Union[sparse.csr_matrix,
                                   np.ndarray]) -> 'Closeness':
        """
        Compute the closeness centrality of the nodes of a connected graph.

        With ``self.method == 'exact'``, shortest paths are computed from all
        nodes. With ``'approximate'``, they are computed from a random sample
        of ``min(int(log(n) / tol**2), n)`` nodes only.

        Parameters
        ----------
        adjacency :
            Adjacency matrix of the graph.

        Returns
        -------
        self: :class:`Closeness`

        Raises
        ------
        ValueError
            If the adjacency matrix is not square, if the graph is not
            connected, or if ``self.method`` is unknown.
        """
        adjacency = check_format(adjacency)
        if not is_square(adjacency):
            raise ValueError(
                "The adjacency is not square. Please use 'bipartite2undirected' or "
                "'bipartite2directed'.")
        if not is_connected(adjacency):
            raise ValueError("The graph must be connected.")

        n_nodes = adjacency.shape[0]
        method = self.method
        if method == 'exact':
            n_sources = n_nodes
            sources = np.arange(n_nodes)
        elif method == 'approximate':
            n_sources = min(int(log(n_nodes) / self.tol**2), n_nodes)
            sources = np.random.choice(np.arange(n_nodes),
                                       n_sources,
                                       replace=False)
        else:
            raise ValueError(
                "Method should be either 'exact' or 'approximate'.")

        distances = shortest_path(adjacency,
                                  n_jobs=self.n_jobs,
                                  indices=sources)

        # Sum, for each node, of the distances over the selected sources;
        # the numerator rescales for the sampling ratio.
        total_distances = distances.T.dot(np.ones(n_sources))
        self.scores_ = ((n_nodes - 1) * n_sources / n_nodes) / total_distances

        return self
Exemplo n.º 12
0
def dasgupta_score(adjacency: sparse.csr_matrix, dendrogram: np.ndarray, weights: str = 'uniform') -> float:
    """
    Dasgupta's score of a hierarchy, defined as 1 - Dasgupta's cost.

    The higher the score, the better.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram. Each row is read as (first cluster, second cluster, height, ...).
    weights :
        Weights of nodes.
        ``'degree'`` or ``'uniform'`` (default).

    Returns
    -------
    score : float
        Dasgupta's score of the hierarchy, normalized to get a value between 0 and 1.

    Raises
    ------
    ValueError
        If the adjacency matrix is not square or has fewer than two nodes.

    References
    ----------
    Dasgupta, S. (2016). A cost function for similarity-based hierarchical clustering.
    Proceedings of ACM symposium on Theory of Computing.

    """
    adjacency = check_format(adjacency)
    if not is_square(adjacency):
        raise ValueError('The adjacency matrix is not square.')

    n = adjacency.shape[0]
    if n <= 1:
        raise ValueError('The graph must contain at least two nodes.')

    out_weights = check_probs(weights, adjacency)
    in_weights = check_probs(weights, adjacency.T)
    graph = AggregateGraph(adjacency + adjacency.T, out_weights, in_weights)

    n_merges = n - 1
    height = np.zeros(n_merges)
    edge_sampling = np.zeros(n_merges)
    cluster_weight = np.zeros(n_merges)

    for t in range(n_merges):
        merge = dendrogram[t]
        left, right = int(merge[0]), int(merge[1])
        level = merge[2]

        # A child that is itself a merge at the same height is absorbed:
        # its accumulated edge weight is moved to the current merge.
        if left >= n and height[left - n] == level:
            edge_sampling[t] = edge_sampling[left - n]
            edge_sampling[left - n] = 0
        elif right >= n and height[right - n] == level:
            edge_sampling[t] = edge_sampling[right - n]
            edge_sampling[right - n] = 0
        height[t] = level

        left_neighbors = graph.neighbors[left]
        if right in left_neighbors:
            edge_sampling[t] += left_neighbors[right]

        cluster_weight[t] = graph.cluster_out_weights[left] + graph.cluster_out_weights[right] \
            + graph.cluster_in_weights[left] + graph.cluster_in_weights[right]
        graph.merge(left, right)

    cost: float = edge_sampling.dot(cluster_weight) / 2
    return 1 - cost
Exemplo n.º 13
0
    def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'Paris':
        """
        Agglomerative clustering using the nearest neighbor chain.

        The graph is symmetrized (``adjacency + adjacency.T``) before
        clustering. The result is stored in ``self.dendrogram_``; with the
        ``'python'`` engine each row is
        ``[cluster, cluster, height, size of the merged cluster]``.

        Parameters
        ----------
        adjacency :
            Adjacency matrix of the graph.

        Returns
        -------
        self: :class:`Paris`

        Raises
        ------
        ValueError
            If the adjacency matrix is not square, if the graph has fewer
            than two nodes, or if ``self.engine`` is neither ``'python'``
            nor ``'numba'``.
        """
        adjacency = check_format(adjacency)
        if not is_square(adjacency):
            raise ValueError(
                'The adjacency matrix is not square. Use BiParis() instead.')
        n = adjacency.shape[0]
        sym_adjacency = adjacency + adjacency.T

        weights = self.weights
        out_weights = check_probs(weights, adjacency)
        in_weights = check_probs(weights, adjacency.T)

        if n <= 1:
            raise ValueError('The graph must contain at least two nodes.')

        if self.engine == 'python':
            aggregate_graph = AggregateGraph(sym_adjacency, out_weights,
                                             in_weights)

            # (cluster, size) of every cluster left without any neighbor
            connected_components = []
            dendrogram = []

            while len(aggregate_graph.cluster_sizes) > 0:
                # start a new chain from the first remaining cluster
                node = None
                for node in aggregate_graph.cluster_sizes:
                    break
                chain = [node]
                while chain:
                    node = chain.pop()
                    if aggregate_graph.neighbors[node]:
                        # nearest neighbor = most similar neighbor;
                        # ties are broken in favor of the smallest index
                        max_sim = -float("inf")
                        nearest_neighbor = None
                        for neighbor in aggregate_graph.neighbors[node]:
                            sim = aggregate_graph.similarity(node, neighbor)
                            if sim > max_sim:
                                nearest_neighbor = neighbor
                                max_sim = sim
                            elif sim == max_sim:
                                nearest_neighbor = min(neighbor,
                                                       nearest_neighbor)
                        if chain:
                            nearest_neighbor_last = chain.pop()
                            if nearest_neighbor_last == nearest_neighbor:
                                # the nearest neighbor is the previous element
                                # of the chain: merge the two clusters, at
                                # height 1 / similarity
                                dendrogram.append([
                                    node, nearest_neighbor, 1. / max_sim,
                                    aggregate_graph.cluster_sizes[node] +
                                    aggregate_graph.
                                    cluster_sizes[nearest_neighbor]
                                ])
                                aggregate_graph.merge(node, nearest_neighbor)
                            else:
                                # restore the chain and extend it with the
                                # nearest neighbor
                                chain.append(nearest_neighbor_last)
                                chain.append(node)
                                chain.append(nearest_neighbor)
                        else:
                            chain.append(node)
                            chain.append(nearest_neighbor)
                    else:
                        # no neighbor left: set the cluster aside and remove
                        # it from the clusters still to be processed
                        connected_components.append(
                            (node, aggregate_graph.cluster_sizes[node]))
                        del aggregate_graph.cluster_sizes[node]

            # join the clusters set aside one after the other, at infinite height
            node, cluster_size = connected_components.pop()
            for next_node, next_cluster_size in connected_components:
                cluster_size += next_cluster_size
                dendrogram.append(
                    [node, next_node,
                     float("inf"), cluster_size])
                # presumably next_cluster is the index given to the cluster
                # just created -- TODO confirm against AggregateGraph
                node = aggregate_graph.next_cluster
                aggregate_graph.next_cluster += 1

            dendrogram = np.array(dendrogram)
            if self.reorder:
                dendrogram = reorder_dendrogram(dendrogram)

            self.dendrogram_ = dendrogram
            return self

        elif self.engine == 'numba':

            n = np.int32(adjacency.shape[0])
            indices, indptr, data = sym_adjacency.indices, sym_adjacency.indptr, sym_adjacency.data

            # the whole clustering is delegated to fit_core, which works on
            # the raw CSR arrays of the symmetrized adjacency
            dendrogram = fit_core(n, out_weights, in_weights, data, indices,
                                  indptr)
            dendrogram = np.array(dendrogram)
            if self.reorder:
                dendrogram = reorder_dendrogram(dendrogram)

            self.dendrogram_ = dendrogram
            return self

        else:
            raise ValueError('Unknown engine.')
Exemplo n.º 14
0
    def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'BiSpectral':
        """
        Compute the embedding from a generalized SVD of the adjacency matrix.

        Parameters
        ----------
        adjacency: array-like, shape = (n1, n2)
            Adjacency matrix, where n1 = n2 is the number of nodes for a standard graph,
            n1, n2 are the number of nodes in each part for a bipartite graph.

        Returns
        -------
        self: :class:`BiSpectral`
        """
        adjacency = check_format(adjacency).asfptype()
        n_rows, n_cols = adjacency.shape

        # Resolve the 'auto' solver; the choice is stored on the instance.
        if self.solver == 'auto':
            if auto_solver(adjacency.nnz) == 'lanczos':
                self.solver: SVDSolver = LanczosSVD()
            else:
                self.solver: SVDSolver = HalkoSVD()

        total_weight = adjacency.dot(np.ones(n_cols)).sum()
        regularization = self.regularization
        if regularization:
            if self.relative_regularization:
                regularization = regularization * total_weight / (n_rows * n_cols)
            # the regularization is added as a rank-one term
            low_rank_term = (regularization * np.ones(n_rows), np.ones(n_cols))
            adjacency = SparseLR(adjacency, [low_rank_term])

        row_weights = check_weights(self.weights, adjacency)
        col_weights = check_weights(self.col_weights, adjacency.T)
        diag_row = diag_pinv(np.sqrt(row_weights))
        diag_col = diag_pinv(np.sqrt(col_weights))

        normalized_adj = safe_sparse_dot(diag_row, safe_sparse_dot(adjacency, diag_col))

        # svd (one extra component: the first singular vector is dropped)
        max_components = min(n_rows, n_cols) - 1
        if self.embedding_dimension >= max_components:
            n_components = max_components
            warnings.warn(Warning("The dimension of the embedding must be less than the number of rows "
                                  "and the number of columns. Changed accordingly."))
        else:
            n_components = self.embedding_dimension + 1
        self.solver.fit(normalized_adj, n_components)

        # singular values in decreasing order, without the largest one
        kept = np.argsort(-self.solver.singular_values_)[1:]
        self.singular_values_ = self.solver.singular_values_[kept]
        self.row_embedding_ = diag_row.dot(self.solver.left_singular_vectors_[:, kept])
        self.col_embedding_ = diag_col.dot(self.solver.right_singular_vectors_[:, kept])

        scaling = self.scaling
        if scaling == 'multiply':
            self.row_embedding_ *= np.sqrt(self.singular_values_)
            self.col_embedding_ *= np.sqrt(self.singular_values_)
        elif scaling == 'divide':
            energy_levels: np.ndarray = np.sqrt(1 - np.clip(self.singular_values_, 0, 1) ** 2)
            # invert the positive levels only; null levels stay at 0
            positive = energy_levels > 0
            energy_levels[positive] = 1 / energy_levels[positive]
            self.row_embedding_ *= energy_levels
            self.col_embedding_ *= energy_levels
        elif scaling == 'barycenter':
            # only the row embedding is rescaled in this mode
            self.row_embedding_ *= self.singular_values_
        elif scaling:
            warnings.warn(Warning("The scaling must be 'multiply' or 'divide' or 'barycenter'. No scaling done."))

        self.embedding_ = np.vstack((self.row_embedding_, self.col_embedding_))
        return self
Exemplo n.º 15
0
def cosine_modularity(adjacency,
                      embedding: np.ndarray,
                      col_embedding=None,
                      resolution=1.,
                      weights='degree',
                      return_all: bool = False):
    """
    Quality metric of an embedding :math:`x` defined by:

    :math:`Q = \\sum_{ij}\\left(\\dfrac{A_{ij}}{w} - \\gamma \\dfrac{w_iw'_j}{w^2}\\right)
    \\left(\\dfrac{1 + \\pi(x_i)^T\\pi(x_j)}{2}\\right)`

    where :math:`\\pi(x_i)` is the projection of :math:`x_i` onto the unit-sphere.

    For bipartite graphs with column embedding :math:`y`, the metric is

    :math:`Q = \\sum_{ij}\\left(\\dfrac{B_{ij}}{w} - \\gamma \\dfrac{w_iw'_j}{w^2}\\right)
    \\left(\\dfrac{1 + \\pi(x_i)^T\\pi(y_j)}{2}\\right)`

    This metric is normalized to lie between -1 and 1 (for :math:`\\gamma = 1`).

    The input embeddings are not modified: the projection onto the unit
    sphere is done on copies, and null vectors are left at zero.

    Parameters
    ----------
    adjacency: sparse.csr_matrix or np.ndarray
        Adjacency matrix of the graph.
    embedding: np.ndarray
        Embedding of the nodes.
    col_embedding: None or np.ndarray
        For biadjacency matrices, embedding of the columns.
        If ``None``, the adjacency must be square and ``embedding`` is used.
    resolution: float
        Resolution parameter.
    weights: ``'degree'`` or ``'uniform'``
        Weights of the nodes.
    return_all: bool, default = ``False``
        whether to return (fit, div, :math:`Q`) or :math:`Q`

    Returns
    -------
    modularity : float
    fit: float, optional
    diversity: float, optional

    Raises
    ------
    ValueError
        If ``col_embedding`` is ``None`` and the adjacency is not square.
    """

    def _project_on_sphere(vectors) -> np.ndarray:
        # Work on a float copy so that the caller's array is never modified
        # and integer inputs can be divided.
        unit = np.array(vectors, dtype=float)
        norms = np.linalg.norm(unit, axis=1)
        nonzero = norms > 0
        # The same mask is applied on both sides so that the shapes agree
        # even when some vectors are null.
        unit[nonzero] /= norms[nonzero, np.newaxis]
        return unit

    adjacency = check_format(adjacency)
    total_weight: float = adjacency.data.sum()

    if col_embedding is None:
        if not is_square(adjacency):
            raise ValueError(
                'col_embedding cannot be None for non-square adjacency matrices.'
            )
        col_embedding = embedding

    norm_row_emb = _project_on_sphere(embedding)
    norm_col_emb = _project_on_sphere(col_embedding)

    row_probs = check_probs(weights, adjacency)
    col_probs = check_probs(weights, adjacency.T)

    fit: float = 0.5 * (1 + (np.multiply(
        norm_row_emb, adjacency.dot(norm_col_emb))).sum() / total_weight)
    # Both terms of the metric are defined on the projected vectors.
    div: float = 0.5 * (
        1 + (norm_row_emb.T.dot(row_probs)).dot(norm_col_emb.T.dot(col_probs)))

    if return_all:
        return fit, div, fit - resolution * div
    else:
        return fit - resolution * div
Exemplo n.º 16
0
    def fit(self,
            adjacency: Union[sparse.csr_matrix, np.ndarray],
            randomized_decomposition: bool = True,
            n_iter='auto',
            power_iteration_normalizer: Union[str, None] = 'auto',
            random_state=None) -> 'GSVDEmbedding':
        """Fit the embedding from an SVD of the degree-normalized adjacency matrix.

        Parameters
        ----------
        adjacency: array-like, shape = (n, m)
            Adjacency matrix, where n = m is the number of nodes for a standard directed or undirected graph,
            n is the cardinal of V1 and m is the cardinal of V2 for a bipartite graph.
        randomized_decomposition:
            whether to use a randomized (and faster) svd method or the standard scipy one.
        n_iter: int or ``'auto'`` (default is ``'auto'``)
            See :meth:`sknetwork.embedding.randomized_range_finder`
        power_iteration_normalizer: ``'auto'`` (default), ``'QR'``, ``'LU'``, ``None``
            See :meth:`sknetwork.embedding.randomized_range_finder`
        random_state: int, RandomState instance or ``None``, optional (default= ``None``)
            See :meth:`sknetwork.embedding.randomized_range_finder`

        Returns
        -------
        self: :class:`GSVDEmbedding`
        """
        adjacency = check_format(adjacency)
        n_rows, n_cols = adjacency.shape
        total_weight = adjacency.data.sum()

        out_degrees = adjacency.dot(np.ones(n_cols))
        in_degrees = adjacency.T.dot(np.ones(n_rows))

        # pseudo inverse square-root out-degree matrix
        inv_sqrt_out = sparse.diags(np.sqrt(out_degrees),
                                    shape=(n_rows, n_rows),
                                    format='csr')
        inv_sqrt_out.data = 1 / inv_sqrt_out.data
        # pseudo inverse square-root in-degree matrix
        inv_sqrt_in = sparse.diags(np.sqrt(in_degrees),
                                   shape=(n_cols, n_cols),
                                   format='csr')
        inv_sqrt_in.data = 1 / inv_sqrt_in.data

        normalized_adjacency = inv_sqrt_out.dot(adjacency.dot(inv_sqrt_in))

        if randomized_decomposition:
            u, sigma, vt = randomized_svd(
                normalized_adjacency,
                self.embedding_dimension,
                n_iter=n_iter,
                power_iteration_normalizer=power_iteration_normalizer,
                random_state=random_state)
        else:
            u, sigma, vt = linalg.svds(normalized_adjacency,
                                       self.embedding_dimension)

        scale = np.sqrt(total_weight)
        embedding = scale * inv_sqrt_out.dot(u) * sigma
        features = scale * inv_sqrt_in.dot(vt.T)

        # shift the center of mass: subtract the degree-weighted mean of
        # each coordinate (broadcast over the rows)
        embedding -= embedding.T.dot(out_degrees) / total_weight
        features -= features.T.dot(in_degrees) / total_weight

        self.singular_values_ = sigma
        self.embedding_ = embedding
        self.features_ = features

        return self
Exemplo n.º 17
0
    def fit(self, adjacency: Union[sparse.csr_matrix,
                                   np.ndarray]) -> 'Spectral':
        """Fits the model from data in adjacency.

        The embedding is built from the eigenvectors of the Laplacian
        (normalized or not, depending on ``self.normalized_laplacian``),
        the first one being skipped. The results are stored in
        ``self.embedding_``, ``self.eigenvalues_`` and
        ``self.regularization_``. If ``self.solver`` is ``'auto'``, it is
        replaced by the selected solver instance.

        Parameters
        ----------
        adjacency :
              Adjacency matrix of the graph (symmetric matrix).

        Returns
        -------
        self: :class:`Spectral`

        Raises
        ------
        ValueError
            If the adjacency matrix is not square or not symmetric.
        NotImplementedError
            If the Halko solver is used with the regular (non-normalized)
            Laplacian.
        """

        adjacency = check_format(adjacency).asfptype()

        if not is_square(adjacency):
            raise ValueError(
                'The adjacency matrix is not square. See BiSpectral.')

        if not is_symmetric(adjacency):
            raise ValueError(
                'The adjacency matrix is not symmetric.'
                'Either convert it to a symmetric matrix or use BiSpectral.')

        n = adjacency.shape[0]

        # resolve the 'auto' solver from the number of non-zero entries
        if self.solver == 'auto':
            solver = auto_solver(adjacency.nnz)
            if solver == 'lanczos':
                self.solver: EigSolver = LanczosEig()
            else:
                self.solver: EigSolver = HalkoEig()

        # one extra component is computed since the first one is skipped;
        # the number of components is capped at n - 2
        if self.embedding_dimension > n - 2:
            warnings.warn(
                Warning(
                    "The dimension of the embedding must be less than the number of nodes - 1."
                ))
            n_components = n - 2
        else:
            n_components = self.embedding_dimension + 1

        if (self.regularization is None
                or self.regularization == 0.) and not is_connected(adjacency):
            warnings.warn(
                Warning(
                    "The graph is not connected and low-rank regularization is not active."
                    "This can cause errors in the computation of the embedding."
                ))

        if isinstance(self.solver, HalkoEig) and not self.normalized_laplacian:
            raise NotImplementedError(
                "Halko solver is not yet compatible with regular Laplacian."
                "Call 'fit' with 'normalized_laplacian' = True or force lanczos solver."
            )

        # node weights = row sums of the adjacency, shifted by the
        # regularization (if any)
        weights = adjacency.dot(np.ones(n))
        regularization = self.regularization
        if regularization:
            if self.relative_regularization:
                regularization = regularization * weights.sum() / n**2
            weights += regularization * n

        if self.normalized_laplacian:
            # Finding the largest eigenvalues of the normalized adjacency is easier for the solver than finding the
            # smallest eigenvalues of the normalized laplacian.
            normalizing_matrix = diag_pinv(np.sqrt(weights))

            if regularization:
                norm_adjacency = NormalizedAdjacencyOperator(
                    adjacency, regularization)
            else:
                norm_adjacency = normalizing_matrix.dot(
                    adjacency.dot(normalizing_matrix))

            self.solver.which = 'LA'
            self.solver.fit(matrix=norm_adjacency, n_components=n_components)
            # eigenvalue of the normalized Laplacian = 1 - eigenvalue of the
            # normalized adjacency
            eigenvalues = 1 - self.solver.eigenvalues_
            # eigenvalues of the Laplacian in increasing order
            index = np.argsort(eigenvalues)
            # skip first eigenvalue
            eigenvalues = eigenvalues[index][1:]
            # keep only positive eigenvectors of the normalized adjacency matrix
            # (the other columns are multiplied by False, i.e. set to zero)
            eigenvectors = self.solver.eigenvectors_[:, index][:, 1:] * (
                eigenvalues < 1 - self.tol)
            embedding = np.array(normalizing_matrix.dot(eigenvectors))

        else:
            if regularization:
                laplacian = LaplacianOperator(adjacency, regularization)
            else:
                weight_matrix = sparse.diags(weights, format='csr')
                laplacian = weight_matrix - adjacency

            self.solver.which = 'SM'
            self.solver.fit(matrix=laplacian, n_components=n_components)
            # skip the first eigenvalue and eigenvector
            eigenvalues = self.solver.eigenvalues_[1:]
            embedding = self.solver.eigenvectors_[:, 1:]

        if self.scaling:
            if self.scaling == 'multiply':
                # eigenvalues are capped at 1 so that the square root is defined
                eigenvalues = np.minimum(eigenvalues, 1)
                embedding *= np.sqrt(1 - eigenvalues)
            elif self.scaling == 'divide':
                # invert the positive eigenvalues only; the others give 0
                inv_eigenvalues = np.zeros_like(eigenvalues)
                index = np.where(eigenvalues > 0)[0]
                inv_eigenvalues[index] = 1 / eigenvalues[index]
                embedding *= np.sqrt(inv_eigenvalues)
            else:
                warnings.warn(
                    Warning(
                        "The scaling must be 'multiply' or 'divide'. No scaling done."
                    ))

        self.embedding_ = embedding
        self.eigenvalues_ = eigenvalues
        self.regularization_ = regularization

        return self
Exemplo n.º 18
0
def tree_sampling_divergence(adjacency: sparse.csr_matrix, dendrogram: np.ndarray, weights: str = 'degree',
                             normalized: bool = True) -> float:
    """
    Tree sampling divergence of a hierarchy (quality metric).

    The higher the score, the better.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph. It is not modified.
    dendrogram :
        Dendrogram. Row ``t`` is read as ``(node1, node2, height, ...)``; the first ``n - 1`` rows are used,
        where ``n`` is the number of nodes.
    weights :
        Weights of nodes.
        ``'degree'`` (default) or ``'uniform'``.
    normalized:
        If ``True``, normalized by the mutual information of the graph.
        The normalization is skipped when the mutual information is not positive.

    Returns
    -------
    score : float
        The tree sampling divergence of the hierarchy.
        If normalized, returns a value between 0 and 1.

    Raises
    ------
    ValueError
        If the adjacency matrix is not square, has fewer than two nodes,
        or has a non-positive total weight.

    References
    ----------
    Charpentier, B. & Bonald, T. (2019).
    `Tree Sampling Divergence: An Information-Theoretic Metric for
    Hierarchical Graph Clustering.
    <https://hal.telecom-paristech.fr/hal-02144394/document>`_
    Proceedings of IJCAI.

    """
    adjacency = check_format(adjacency)
    if not is_square(adjacency):
        raise ValueError('The adjacency matrix is not square.')

    n = adjacency.shape[0]
    if n <= 1:
        raise ValueError('The graph must contain at least two nodes.')

    total_weight = adjacency.data.sum()
    if total_weight <= 0:
        raise ValueError('The graph must contain at least one edge.')

    # Normalize on a private copy: check_format may hand back the caller's own
    # matrix object, and rebinding its ``data`` would rescale the caller's graph.
    adjacency = adjacency.copy()
    adjacency.data = adjacency.data / total_weight

    out_weights = check_probs(weights, adjacency)
    in_weights = check_probs(weights, adjacency.T)

    aggregate_graph = AggregateGraph(adjacency + adjacency.T, out_weights, in_weights)

    height = np.zeros(n - 1)
    edge_sampling = np.zeros(n - 1)
    node_sampling = np.zeros(n - 1)
    for t in range(n - 1):
        node1 = int(dendrogram[t][0])
        node2 = int(dendrogram[t][1])
        merge_height = dendrogram[t][2]
        # Indices >= n refer to earlier merges (merge ``k`` has index ``n + k``).
        # When a child was merged at exactly the same height, its accumulated
        # samplings are carried over to the current merge. Only its edge sampling
        # is reset: entries with zero edge sampling are dropped from the score below.
        if node1 >= n and height[node1 - n] == merge_height:
            edge_sampling[t] = edge_sampling[node1 - n]
            edge_sampling[node1 - n] = 0
            node_sampling[t] = node_sampling[node1 - n]
        elif node2 >= n and height[node2 - n] == merge_height:
            edge_sampling[t] = edge_sampling[node2 - n]
            edge_sampling[node2 - n] = 0
            node_sampling[t] = node_sampling[node2 - n]
        if node2 in aggregate_graph.neighbors[node1]:
            edge_sampling[t] += aggregate_graph.neighbors[node1][node2]
        node_sampling[t] += aggregate_graph.cluster_out_weights[node1] * aggregate_graph.cluster_in_weights[node2] + \
            aggregate_graph.cluster_out_weights[node2] * aggregate_graph.cluster_in_weights[node1]
        height[t] = merge_height
        aggregate_graph.merge(node1, node2)

    # Restrict to merges with non-zero edge sampling so that the logarithm is finite.
    index = np.where(edge_sampling)[0]
    score = edge_sampling[index].dot(np.log(edge_sampling[index] / node_sampling[index]))
    if normalized:
        inv_out_weights = sparse.diags(out_weights, shape=(n, n), format='csr')
        inv_out_weights.data = 1 / inv_out_weights.data
        inv_in_weights = sparse.diags(in_weights, shape=(n, n), format='csr')
        inv_in_weights.data = 1 / inv_in_weights.data
        sampling_ratio = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        # Reuse the same diagonal structure with unit entries, so that the second
        # product is built exactly like ``sampling_ratio`` and the two ``data``
        # arrays can be paired entry by entry.
        inv_out_weights.data = np.ones(len(inv_out_weights.data))
        inv_in_weights.data = np.ones(len(inv_in_weights.data))
        edge_sampling = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        mutual_information = edge_sampling.data.dot(np.log(sampling_ratio.data))
        # A zero mutual information would turn the score into nan/inf; keep it unnormalized instead.
        if mutual_information > 0:
            score /= mutual_information
    return score
Exemplo n.º 19
0
    def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'Louvain':
        """
        Clustering using chosen Optimizer.

        The optimizer ``self.algorithm`` is fitted repeatedly on an aggregate graph;
        after each pass whose score exceeds ``self.agg_tol`` the graph is aggregated
        according to the labels found. The loop stops when the score no longer exceeds
        the tolerance, when a single node remains, or after ``self.max_agg_iter`` passes.

        Sets the attributes ``labels_``, ``iteration_count_`` and ``aggregate_graph_``.

        Parameters
        ----------
        adjacency :
            Adjacency matrix of the graph.

        Returns
        -------
        self: :class:`Louvain`

        Raises
        ------
        ValueError
            If the adjacency matrix is not square.
        """
        adjacency = check_format(adjacency)
        if not is_square(adjacency):
            raise ValueError('The adjacency matrix is not square. Use BiLouvain() instead.')
        n = adjacency.shape[0]

        nodes = np.arange(n)
        if self.shuffle_nodes:
            nodes = self.random_state.permutation(nodes)
            adjacency = adjacency[nodes, :].tocsc()[:, nodes].tocsr()

        # The node weights are derived from the (possibly shuffled) adjacency so that
        # entry i of each weight vector describes row/column i of the matrix given to
        # AggregateGraph. Computing them before the shuffle would misalign them.
        out_weights = check_probs('degree', adjacency)
        in_weights = check_probs('degree', adjacency.T)

        graph = AggregateGraph(adjacency, out_weights, in_weights)

        # membership[i, c] is non-zero when node i currently belongs to cluster c.
        membership = sparse.identity(n, format='csr')
        increase = True
        iteration_count = 0
        self.log.print("Starting with", graph.n_nodes, "nodes.")
        while increase:
            iteration_count += 1

            self.algorithm.fit(graph)

            if self.algorithm.score_ <= self.agg_tol:
                increase = False
            else:
                # Compose the new assignment with the previous ones, then coarsen the graph.
                agg_membership = membership_matrix(self.algorithm.labels_)
                membership = membership.dot(agg_membership)
                graph.aggregate(agg_membership)

                if graph.n_nodes == 1:
                    break
            self.log.print("Iteration", iteration_count, "completed with", graph.n_nodes, "clusters and ",
                           self.algorithm.score_, "increment.")
            if iteration_count == self.max_agg_iter:
                break

        if self.sorted_cluster:
            labels = reindex_clusters(membership.indices)
        else:
            labels = membership.indices
        if self.shuffle_nodes:
            # Undo the permutation: position i of the shuffled graph is original node nodes[i].
            reverse = np.empty(nodes.size, nodes.dtype)
            reverse[nodes] = np.arange(nodes.size)
            labels = labels[reverse]

        self.labels_ = labels
        self.iteration_count_ = iteration_count
        self.aggregate_graph_ = graph.norm_adjacency * adjacency.data.sum()

        return self