示例#1
0
def _instanciate_vars(adjacency: sparse.csr_matrix, weights: str = 'uniform'):
    """Initialize standard variables for metrics.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    weights :
        Weights specification forwarded to ``check_probs`` (default ``'uniform'``).

    Returns
    -------
    aggregate_graph :
        ``AggregateGraph`` built from the row/column weights and the CSR arrays
        (``data``, ``indices``, ``indptr``) of the matrix returned by ``directed2undirected``.
    height, cluster_weight, edge_sampling : np.ndarray
        Zero-filled vectors of length ``n - 1``, where ``n`` is the number of rows of ``adjacency``.
    weights_row, weights_col :
        Outputs of ``check_probs`` for ``adjacency`` and for its transpose.
    """
    n = adjacency.shape[0]
    weights_row = check_probs(weights, adjacency)
    weights_col = check_probs(weights, adjacency.T)
    sym_adjacency = directed2undirected(adjacency)

    # The ``np.float`` alias was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin ``float`` selects the same dtype (float64).
    aggregate_graph = AggregateGraph(weights_row, weights_col,
                                     sym_adjacency.data.astype(float),
                                     sym_adjacency.indices,
                                     sym_adjacency.indptr)

    height = np.zeros(n - 1)
    cluster_weight = np.zeros(n - 1)
    edge_sampling = np.zeros(n - 1)

    return aggregate_graph, height, cluster_weight, edge_sampling, weights_row, weights_col
示例#2
0
    def fit(self, adjacency: Union[sparse.csr_matrix,
                                   np.ndarray]) -> 'Louvain':
        """Fit algorithm to the data.

        The resulting labels are stored in ``self.labels_``, indexed in the node
        order of the input matrix whether or not ``self.shuffle_nodes`` is set.

        Parameters
        ----------
        adjacency :
            Adjacency matrix of the graph.

        Returns
        -------
        self: :class:`Louvain`
        """
        adjacency = check_format(adjacency)
        check_square(adjacency)
        n_nodes = adjacency.shape[0]

        # ``adjacency`` stays in the input order; ``shuffled`` is the (possibly
        # permuted) matrix the optimization actually runs on.
        nodes = np.arange(n_nodes)
        shuffled = adjacency
        if self.shuffle_nodes:
            nodes = self.random_state.permutation(nodes)
            shuffled = adjacency[nodes, :].tocsc()[:, nodes].tocsr()

        # The node weights must be computed on the permuted matrix: computing them
        # before the shuffle leaves them indexed in the input order, i.e. misaligned
        # with the rows/columns handed to ``_optimize``.
        probs_out = check_probs('degree', shuffled)
        probs_in = check_probs('degree', shuffled.T)

        adjacency_norm = shuffled / shuffled.data.sum()

        membership = sparse.identity(n_nodes, format='csr')
        increase = True
        count_aggregations = 0
        self.log.print("Starting with", n_nodes, "nodes.")
        while increase:
            count_aggregations += 1

            current_labels, pass_increase = self._optimize(
                n_nodes, adjacency_norm, probs_out, probs_in)
            # Relabel clusters with consecutive integers starting at 0.
            _, current_labels = np.unique(current_labels, return_inverse=True)

            if pass_increase <= self.tol_aggregation:
                increase = False
            else:
                membership_agg = membership_matrix(current_labels)
                # Compose with previous passes: original node -> current cluster.
                membership = membership.dot(membership_agg)
                n_nodes, adjacency_norm, probs_out, probs_in = self._aggregate(
                    adjacency_norm, probs_out, probs_in, membership_agg)

                if n_nodes == 1:
                    break
            self.log.print("Aggregation", count_aggregations, "completed with",
                           n_nodes, "clusters and ", pass_increase,
                           "increment.")
            if count_aggregations == self.n_aggregations:
                break

        if self.sort_clusters:
            labels = reindex_labels(membership.indices)
        else:
            labels = membership.indices
        if self.shuffle_nodes:
            # Invert the permutation so that labels are indexed like the input.
            reverse = np.empty(nodes.size, nodes.dtype)
            reverse[nodes] = np.arange(nodes.size)
            labels = labels[reverse]

        self.labels_ = labels
        # ``labels_`` is in the input order, so pass the matrix in that same order
        # (not the permuted one).
        self._secondary_outputs(adjacency)

        return self
示例#3
0
    def fit(self, adjacency: Union[sparse.csr_matrix,
                                   np.ndarray]) -> 'Louvain':
        """Fit algorithm to the data.

        The resulting labels are stored in ``self.labels_``, indexed in the node
        order of the input matrix whether or not ``self.shuffle_nodes`` is set.

        Parameters
        ----------
        adjacency :
            Adjacency matrix of the graph.

        Returns
        -------
        self: :class:`Louvain`

        Raises
        ------
        ValueError
            If ``self.modularity`` is not one of ``'potts'``, ``'newman'``, ``'dugue'``.
        """
        adjacency = check_format(adjacency)
        check_square(adjacency)
        n = adjacency.shape[0]

        # Validate up front so that an invalid setting fails before any shuffling.
        if self.modularity not in ('potts', 'newman', 'dugue'):
            raise ValueError('Unknown modularity function.')

        # ``adjacency`` stays in the input order; ``shuffled`` is the (possibly
        # permuted) matrix the optimization actually runs on.
        nodes = np.arange(n, dtype=np.int32)
        shuffled = adjacency
        if self.shuffle_nodes:
            nodes = self.random_state.permutation(nodes)
            shuffled = adjacency[nodes, :].tocsc()[:, nodes].tocsr()

        # The node weights must be computed on the permuted matrix: computing them
        # before the shuffle leaves them indexed in the input order, i.e. misaligned
        # with the rows/columns handed to ``_optimize``.
        if self.modularity == 'potts':
            probs_ou = check_probs('uniform', shuffled)
            probs_in = probs_ou.copy()
        elif self.modularity == 'newman':
            probs_ou = check_probs('degree', shuffled)
            probs_in = probs_ou.copy()
        else:  # 'dugue': separate out- and in-weights
            probs_ou = check_probs('degree', shuffled)
            probs_in = check_probs('degree', shuffled.T)

        adjacency_clust = shuffled / shuffled.data.sum()

        membership = sparse.identity(n, format='csr')
        increase = True
        count_aggregations = 0
        self.log.print("Starting with", n, "nodes.")
        while increase:
            count_aggregations += 1

            labels_clust, pass_increase = self._optimize(
                adjacency_clust, probs_ou, probs_in)
            # Relabel clusters with consecutive integers starting at 0.
            _, labels_clust = np.unique(labels_clust, return_inverse=True)

            if pass_increase <= self.tol_aggregation:
                increase = False
            else:
                membership_clust = membership_matrix(labels_clust)
                # Compose with previous passes: original node -> current cluster.
                membership = membership.dot(membership_clust)
                adjacency_clust, probs_ou, probs_in = self._aggregate(
                    adjacency_clust, probs_ou, probs_in, membership_clust)

                n = adjacency_clust.shape[0]
                if n == 1:
                    break
            self.log.print("Aggregation", count_aggregations, "completed with",
                           n, "clusters and ", pass_increase, "increment.")
            if count_aggregations == self.n_aggregations:
                break

        if self.sort_clusters:
            labels = reindex_labels(membership.indices)
        else:
            labels = membership.indices
        if self.shuffle_nodes:
            # Invert the permutation so that labels are indexed like the input.
            reverse = np.empty(nodes.size, nodes.dtype)
            reverse[nodes] = np.arange(nodes.size)
            labels = labels[reverse]

        self.labels_ = labels
        # ``labels_`` is in the input order, so pass the matrix in that same order
        # (not the permuted one).
        self._secondary_outputs(adjacency)

        return self
示例#4
0
def cosine_modularity(adjacency, embedding: np.ndarray, embedding_col=None, resolution=1., weights='degree',
                      return_all: bool = False):
    """Quality metric of an embedding :math:`x` defined by:

    :math:`Q = \\sum_{ij}\\left(\\dfrac{A_{ij}}{w} - \\gamma \\dfrac{w^+_iw^-_j}{w^2}\\right)
    \\left(\\dfrac{1 + \\cos(x_i, x_j)}{2}\\right)`

    where

    * :math:`w^+_i, w^-_i` are the out-weight, in-weight of node :math:`i` (for digraphs),\n
    * :math:`w = 1^TA1` is the total weight of the graph.

    For bipartite graphs with column embedding :math:`y`, the metric is

    :math:`Q = \\sum_{ij}\\left(\\dfrac{B_{ij}}{w} - \\gamma \\dfrac{w_{1,i}w_{2,j}}{w^2}\\right)
    \\left(\\dfrac{1 + \\cos(x_i, y_j)}{2}\\right)`

    where

    * :math:`w_{1,i}, w_{2,j}` are the weights of nodes :math:`i` (row) and :math:`j` (column),\n
    * :math:`w = 1^TB1` is the total weight of the graph.

    The metric is computed as ``fit - resolution * diversity``, where fit is the first
    (edge) term of the sum and diversity is the second (weight-product) term without
    the factor :math:`\\gamma`.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    embedding :
        Embedding of the nodes.
    embedding_col :
        Embedding of the columns (for bipartite graphs).
        If ``None``, the adjacency matrix must be square and ``embedding`` is used for columns as well.
    resolution :
        Resolution parameter.
    weights : ``'degree'`` or ``'uniform'``
        Weights of the nodes.
    return_all :
        If ``True``, return fit and diversity together with the modularity.

    Returns
    -------
    modularity : float
        Returned alone if ``return_all`` is ``False``.
    (fit, diversity, modularity) : tuple of float
        Returned, in this order, if ``return_all`` is ``True``.

    Example
    -------
    >>> import numpy as np
    >>> from sknetwork.embedding import cosine_modularity
    >>> adjacency = np.array([[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]])
    >>> embedding = np.array([[2., 0.], [2., 0.], [0., 3.], [0., 3.]])
    >>> np.round(cosine_modularity(adjacency, embedding), 2)
    0.25
    """
    adjacency = check_format(adjacency)
    total_weight: float = adjacency.data.sum()

    if embedding_col is None:
        check_square(adjacency)
        embedding_col = embedding.copy()

    # Cosines are inner products of unit-norm rows, so both terms of the metric
    # are computed from the normalized embeddings.
    embedding_row_norm = normalize(embedding, p=2)
    embedding_col_norm = normalize(embedding_col, p=2)

    probs_row = check_probs(weights, adjacency)
    probs_col = check_probs(weights, adjacency.T)

    # Sum over edges of A_ij * cos(x_i, y_j), with an element-wise product suited
    # to the storage format of the embeddings.
    neighbor_sum = adjacency.dot(embedding_col_norm)
    if isinstance(embedding_row_norm, sparse.csr_matrix) and isinstance(embedding_col_norm, sparse.csr_matrix):
        edge_cosines = embedding_row_norm.multiply(neighbor_sum).sum()
    else:
        edge_cosines = np.multiply(embedding_row_norm, neighbor_sum).sum()
    fit: float = 0.5 * (1 + edge_cosines / total_weight)

    # Second term of the sum: the weighted mean cosine between rows and columns.
    # It factorizes into a dot product of the weighted means of the normalized
    # embeddings. The raw embeddings were used here previously, which contradicts
    # the formula whenever rows are not already unit-norm.
    mean_row = embedding_row_norm.T.dot(probs_row)
    mean_col = embedding_col_norm.T.dot(probs_col)
    div: float = 0.5 * (1 + mean_row.dot(mean_col))

    if return_all:
        return fit, div, fit - resolution * div
    else:
        return fit - resolution * div
示例#5
0
def bimodularity(biadjacency: Union[sparse.csr_matrix, np.ndarray], labels: np.ndarray, labels_col: np.ndarray,
                 weights: Union[str, np.ndarray] = 'degree', weights_col: Union[str, np.ndarray] = 'degree',
                 resolution: float = 1, return_all: bool = False) -> Union[float, Tuple[float, float, float]]:
    """Bimodularity of a clustering (node partition).

    * Bigraphs

    The bimodularity of a clustering is

    :math:`Q = \\sum_{i}\\sum_{j}\\left(\\dfrac{B_{ij}}{w} - \\gamma \\dfrac{w_{1,i}w_{2,j}}{w^2}\\right)
    \\delta_{c_{1,i},c_{2,j}}`

    where

    * :math:`c_{1,i}, c_{2,j}` are the clusters of nodes :math:`i` (row) and :math:`j` (column),\n
    * :math:`w_{1,i}, w_{2,j}` are the weights of nodes :math:`i` (row) and :math:`j` (column),\n
    * :math:`w = 1^TB1` is the total weight,\n
    * :math:`\\delta` is the Kronecker symbol,\n
    * :math:`\\gamma \\ge 0` is the resolution parameter.

    The computation is delegated to ``modularity``, applied to the matrix returned by
    ``bipartite2directed`` with the row and column labels stacked into one vector.

    Parameters
    ----------
    biadjacency :
        Biadjacency matrix of the graph (shape :math:`n_1 \\times n_2`).
    labels :
        Labels of rows, vector of size :math:`n_1`.
    labels_col:
        Labels of columns, vector of size :math:`n_2`.
    weights :
        Weights of nodes.
        ``'degree'`` (default), ``'uniform'`` or custom weights.
    weights_col :
        Weights of columns.
        ``'degree'`` (default), ``'uniform'`` or custom weights.
    resolution:
        Resolution parameter (default = 1).
    return_all:
        If ``True``, return modularity, fit, diversity.

    Returns
    -------
    modularity : float
    fit: float, optional
    diversity: float, optional

    Raises
    ------
    ValueError
        If the length of ``labels`` or ``labels_col`` does not match the corresponding
        dimension of the biadjacency matrix.

    Example
    -------
    >>> from sknetwork.clustering import bimodularity
    >>> from sknetwork.data import star_wars
    >>> biadjacency = star_wars()
    >>> labels = np.array([1, 1, 0, 0])
    >>> labels_col = np.array([1, 0, 0])
    >>> np.round(bimodularity(biadjacency, labels, labels_col), 2)
    0.22
    """
    biadjacency = check_format(biadjacency).astype(float)
    n_row, n_col = biadjacency.shape

    if len(labels) != n_row:
        raise ValueError('Dimension mismatch between labels and biadjacency matrix.')
    if len(labels_col) != n_col:
        raise ValueError('Dimension mismatch between labels_col and biadjacency matrix.')

    # Square matrix over the n_row + n_col nodes (rows first, then columns).
    full_adjacency = bipartite2directed(biadjacency)

    # Row weights only feed the first weight vector: pad the column part with zeros.
    row_part = check_probs(weights, biadjacency)
    out_weights = np.hstack((row_part, np.zeros(n_col)))

    # Column weights only feed the second weight vector: pad the row part with zeros.
    col_part = check_probs(weights_col, biadjacency.T)
    in_weights = np.hstack((np.zeros(n_row), col_part))

    # Labels in the same node order: rows first, then columns.
    stacked_labels = np.hstack((labels, labels_col))

    return modularity(full_adjacency, stacked_labels, out_weights, in_weights, resolution, return_all)
示例#6
0
def modularity(adjacency: Union[sparse.csr_matrix, np.ndarray], labels: np.ndarray,
               weights: Union[str, np.ndarray] = 'degree', weights_in: Union[str, np.ndarray, None] = 'degree',
               resolution: float = 1, return_all: bool = False) -> Union[float, Tuple[float, float, float]]:
    """Modularity of a clustering (node partition).

    * Graphs
    * Digraphs

    The modularity of a clustering is

    :math:`Q = \\sum_{i,j}\\left(\\dfrac{A_{ij}}{w} - \\gamma \\dfrac{w_iw_j}{w^2}\\right)\\delta_{c_i,c_j}`
    for graphs,

    :math:`Q = \\sum_{i,j}\\left(\\dfrac{A_{ij}}{w} - \\gamma \\dfrac{w^+_iw^-_j}{w^2}\\right)\\delta_{c_i,c_j}`
    for digraphs,

    where

    * :math:`c_i` is the cluster of node :math:`i`,\n
    * :math:`w_i` is the weight of node :math:`i`,\n
    * :math:`w^+_i, w^-_i` are the out-weight, in-weight of node :math:`i` (for digraphs),\n
    * :math:`w = 1^TA1` is the total weight,\n
    * :math:`\\delta` is the Kronecker symbol,\n
    * :math:`\\gamma \\ge 0` is the resolution parameter.

    Parameters
    ----------
    adjacency:
        Adjacency matrix of the graph.
    labels:
        Labels of nodes, vector of size :math:`n` .
    weights :
        Weights of nodes.
        ``'degree'`` (default), ``'uniform'`` or custom weights.
    weights_in :
        In-weights of nodes.
        ``'degree'`` (default), ``'uniform'``, custom weights or ``None``.
        If ``None``, taken equal to weights.
    resolution:
        Resolution parameter (default = 1).
    return_all:
        If ``True``, return modularity, fit, diversity.

    Returns
    -------
    modularity : float
    fit: float, optional
    diversity: float, optional

    Raises
    ------
    ValueError
        If the length of ``labels`` does not match the size of the adjacency matrix.

    Example
    -------
    >>> from sknetwork.clustering import modularity
    >>> from sknetwork.data import house
    >>> adjacency = house()
    >>> labels = np.array([0, 0, 1, 1, 0])
    >>> np.round(modularity(adjacency, labels), 2)
    0.11
    """
    adjacency = check_format(adjacency).astype(float)
    check_square(adjacency)

    if len(labels) != adjacency.shape[0]:
        raise ValueError('Dimension mismatch between labels and adjacency matrix.')

    # Documented contract: ``None`` means "use the same specification as ``weights``".
    if weights_in is None:
        weights_in = weights

    probs_row = check_probs(weights, adjacency)
    probs_col = check_probs(weights_in, adjacency.T)
    membership = membership_matrix(labels)

    # Fit: share of the total edge weight that falls inside clusters.
    fit: float = membership.multiply(adjacency.dot(membership)).data.sum() / adjacency.data.sum()
    # Diversity: sum over clusters of (cluster in-weight) * (cluster out-weight).
    div: float = membership.T.dot(probs_col).dot(membership.T.dot(probs_row))
    mod: float = fit - resolution * div
    if return_all:
        return mod, fit, div
    else:
        return mod