Example #1
def dasgupta_cost(adjacency: sparse.csr_matrix,
                  dendrogram: np.ndarray,
                  weights: str = 'uniform',
                  normalized: bool = False) -> float:
    """Dasgupta's cost of a hierarchy.

    The cost is the expected size (weights = ``'uniform'``) or expected volume (weights = ``'degree'``) of the
    cluster induced by random edge sampling, i.e., the cluster rooted at the closest common ancestor of the two
    sampled nodes in the hierarchy.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes.
        ``'degree'`` or ``'uniform'`` (default).
    normalized :
        If ``True``, normalized cost (between 0 and 1).

    Returns
    -------
    cost : float
        Cost.

    Example
    -------
    >>> import numpy as np
    >>> from sknetwork.hierarchy import dasgupta_cost, Paris
    >>> from sknetwork.data import house
    >>> paris = Paris()
    >>> adjacency = house()
    >>> dendrogram = paris.fit_transform(adjacency)
    >>> cost = dasgupta_cost(adjacency, dendrogram)
    >>> np.round(cost, 2)
    3.33

    References
    ----------
    Dasgupta, S. (2016). A cost function for similarity-based hierarchical clustering.
    Proceedings of ACM symposium on Theory of Computing.
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)

    n = adjacency.shape[0]
    check_min_size(n, 2)

    edge_sampling, _, cluster_weight = get_sampling_distributions(
        adjacency, dendrogram, weights)
    cost = edge_sampling.dot(cluster_weight)

    if not normalized:
        if weights == 'degree':
            cost *= adjacency.data.sum()
        else:
            cost *= n

    return cost
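
A minimal usage sketch, not part of the library source; it assumes scikit-network is installed and exposes
``dasgupta_cost`` and ``Paris`` as in the docstring above:

import numpy as np
from sknetwork.data import house
from sknetwork.hierarchy import Paris, dasgupta_cost

adjacency = house()                            # small undirected toy graph
dendrogram = Paris().fit_transform(adjacency)  # scipy-style linkage matrix

raw = dasgupta_cost(adjacency, dendrogram)                    # ~3.33
norm = dasgupta_cost(adjacency, dendrogram, normalized=True)
# With weights='uniform' (the default), the unnormalized cost is the
# normalized one scaled by the number of nodes n, per the final branch.
assert np.isclose(raw, norm * adjacency.shape[0])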
Example #2
def dasgupta_cost(adjacency: sparse.csr_matrix,
                  dendrogram: np.ndarray,
                  weights: str = 'uniform',
                  normalized: bool = False) -> float:
    """Dasgupta's cost of a hierarchy.

    * Graphs
    * Digraphs

    The cost is the expected size (weights = ``'uniform'``) or expected volume (weights = ``'degree'``) of the
    cluster induced by random edge sampling, i.e., the cluster rooted at the closest common ancestor of the two
    sampled nodes in the hierarchy.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes.
        ``'degree'`` or ``'uniform'`` (default).
    normalized :
        If ``True``, normalized cost (between 0 and 1).

    Returns
    -------
    cost : float
        Cost.

    Example
    -------
    >>> import numpy as np
    >>> from sknetwork.hierarchy import dasgupta_cost, Paris
    >>> from sknetwork.data import house
    >>> paris = Paris()
    >>> adjacency = house()
    >>> dendrogram = paris.fit_transform(adjacency)
    >>> cost = dasgupta_cost(adjacency, dendrogram)
    >>> np.round(cost, 2)
    3.33

    References
    ----------
    Dasgupta, S. (2016). A cost function for similarity-based hierarchical clustering.
    Proceedings of ACM symposium on Theory of Computing.
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)

    n = adjacency.shape[0]
    check_min_size(n, 2)

    aggregate_graph, height, edge_sampling, cluster_weight, _, _ = _instanciate_vars(
        adjacency, weights)

    for t in range(n - 1):
        i = int(dendrogram[t][0])
        j = int(dendrogram[t][1])
        if i >= n and height[i - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[i - n]
            edge_sampling[i - n] = 0
        elif j >= n and height[j - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[j - n]
            edge_sampling[j - n] = 0
        height[t] = dendrogram[t][2]
        if j in aggregate_graph.neighbors[i]:
            edge_sampling[t] += aggregate_graph.neighbors[i][j]
        cluster_weight[t] = aggregate_graph.cluster_out_weights[i] + aggregate_graph.cluster_out_weights[j] \
            + aggregate_graph.cluster_in_weights[i] + aggregate_graph.cluster_in_weights[j]
        aggregate_graph.merge(i, j)

    cost: float = edge_sampling.dot(cluster_weight) / 2

    if not normalized:
        if weights == 'degree':
            cost *= adjacency.data.sum()
        else:
            cost *= n

    return cost
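
Read as a formula, the loop above is a sketch of the sampling form of Dasgupta's cost from the cited paper
(identifiers in parentheses refer to the arrays in the code): with p(x) the probability that a randomly
sampled edge is merged at internal node x of the dendrogram (``edge_sampling``), and w(C_x) the relative size
or volume of the cluster created at x (``cluster_weight``, which adds both the out- and in-weights of the two
children, hence the factor 1/2), the loop computes

    \mathrm{cost} = \frac{1}{2} \sum_{x} p(x)\, w(C_x)

When ``normalized`` is ``False``, this is rescaled by n (uniform weights) or by the total edge weight
(degree weights).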
Example #3
def tree_sampling_divergence(adjacency: sparse.csr_matrix,
                             dendrogram: np.ndarray,
                             weights: str = 'degree',
                             normalized: bool = True) -> float:
    """Tree sampling divergence of a hierarchy (quality metric).

    * Graphs
    * Digraphs

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes.
        ``'degree'`` (default) or ``'uniform'``.
    normalized :
        If ``True``, normalized score (between 0 and 1).

    Returns
    -------
    score : float
        Score.

    Example
    -------
    >>> import numpy as np
    >>> from sknetwork.hierarchy import tree_sampling_divergence, Paris
    >>> from sknetwork.data import house
    >>> paris = Paris()
    >>> adjacency = house()
    >>> dendrogram = paris.fit_transform(adjacency)
    >>> score = tree_sampling_divergence(adjacency, dendrogram)
    >>> np.round(score, 2)
    0.52

    References
    ----------
    Charpentier, B. & Bonald, T. (2019).
    `Tree Sampling Divergence: An Information-Theoretic Metric for
    Hierarchical Graph Clustering.
    <https://hal.telecom-paristech.fr/hal-02144394/document>`_
    Proceedings of IJCAI.
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)
    check_min_nnz(adjacency.nnz, 1)
    adjacency = adjacency.astype(float)
    n = adjacency.shape[0]
    check_min_size(n, 2)

    adjacency.data /= adjacency.data.sum()

    aggregate_graph, height, cluster_weight, edge_sampling, weights_row, weights_col = _instanciate_vars(
        adjacency, weights)
    node_sampling = np.zeros(n - 1)

    for t in range(n - 1):
        i = int(dendrogram[t][0])
        j = int(dendrogram[t][1])
        if i >= n and height[i - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[i - n]
            edge_sampling[i - n] = 0
            node_sampling[t] = node_sampling[i - n]
        elif j >= n and height[j - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[j - n]
            edge_sampling[j - n] = 0
            node_sampling[t] = node_sampling[j - n]
        if j in aggregate_graph.neighbors[i]:
            edge_sampling[t] += aggregate_graph.neighbors[i][j]
        node_sampling[t] += aggregate_graph.cluster_out_weights[i] * aggregate_graph.cluster_in_weights[j] + \
            aggregate_graph.cluster_out_weights[j] * aggregate_graph.cluster_in_weights[i]
        height[t] = dendrogram[t][2]
        aggregate_graph.merge(i, j)

    index = np.where(edge_sampling)[0]
    score = edge_sampling[index].dot(
        np.log(edge_sampling[index] / node_sampling[index]))
    if normalized:
        inv_out_weights = sparse.diags(weights_row, shape=(n, n), format='csr')
        inv_out_weights.data = 1 / inv_out_weights.data
        inv_in_weights = sparse.diags(weights_col, shape=(n, n), format='csr')
        inv_in_weights.data = 1 / inv_in_weights.data
        sampling_ratio = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        inv_out_weights.data = np.ones(len(inv_out_weights.data))
        inv_in_weights.data = np.ones(len(inv_in_weights.data))
        edge_sampling = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        mutual_information = edge_sampling.data.dot(np.log(
            sampling_ratio.data))
        score /= mutual_information
    return score
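
In equation form (a sketch following the cited Charpentier & Bonald paper, with identifiers referring to the
arrays above): let p(x) be the probability that a randomly sampled edge has its two endpoints merged at
internal node x (``edge_sampling``), and q(x) the same probability when the endpoints are drawn independently
from the node weights (``node_sampling``). The score is the Kullback-Leibler divergence

    \mathrm{TSD}(T) = \sum_{x} p(x) \log \frac{p(x)}{q(x)}

and the ``normalized`` branch divides it by the mutual information of the edge sampling distribution,

    I = \sum_{i,j} p(i,j) \log \frac{p(i,j)}{p_{\mathrm{out}}(i)\, p_{\mathrm{in}}(j)}

so that the normalized score lies between 0 and 1.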
Example #4
def tree_sampling_divergence(adjacency: sparse.csr_matrix,
                             dendrogram: np.ndarray,
                             weights: str = 'degree',
                             normalized: bool = True) -> float:
    """Tree sampling divergence of a hierarchy (quality metric).

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes.
        ``'degree'`` (default) or ``'uniform'``.
    normalized :
        If ``True``, normalized score (between 0 and 1).

    Returns
    -------
    score : float
        Score.

    Example
    -------
    >>> import numpy as np
    >>> from sknetwork.hierarchy import tree_sampling_divergence, Paris
    >>> from sknetwork.data import house
    >>> paris = Paris()
    >>> adjacency = house()
    >>> dendrogram = paris.fit_transform(adjacency)
    >>> score = tree_sampling_divergence(adjacency, dendrogram)
    >>> np.round(score, 2)
    0.05

    References
    ----------
    Charpentier, B. & Bonald, T. (2019).
    `Tree Sampling Divergence: An Information-Theoretic Metric for
    Hierarchical Graph Clustering.
    <https://hal.telecom-paristech.fr/hal-02144394/document>`_
    Proceedings of IJCAI.
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)
    check_min_nnz(adjacency.nnz, 1)
    adjacency = adjacency.astype(float)
    n = adjacency.shape[0]
    check_min_size(n, 2)

    adjacency.data /= adjacency.data.sum()
    edge_sampling, node_sampling, _ = get_sampling_distributions(
        adjacency, dendrogram, weights)

    index = np.where(edge_sampling)[0]
    score = edge_sampling[index].dot(
        np.log(edge_sampling[index] / node_sampling[index]))
    if normalized:
        weights_row = get_probs(weights, adjacency)
        weights_col = get_probs(weights, adjacency.T)
        inv_out_weights = sparse.diags(weights_row, shape=(n, n), format='csr')
        inv_out_weights.data = 1 / inv_out_weights.data
        inv_in_weights = sparse.diags(weights_col, shape=(n, n), format='csr')
        inv_in_weights.data = 1 / inv_in_weights.data
        sampling_ratio = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        inv_out_weights.data = np.ones(len(inv_out_weights.data))
        inv_in_weights.data = np.ones(len(inv_in_weights.data))
        edge_sampling = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        mutual_information = edge_sampling.data.dot(np.log(
            sampling_ratio.data))
        if mutual_information > 0:
            score /= mutual_information
    return score
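
A minimal usage sketch, not part of the library source; it assumes scikit-network exposes ``Paris`` and the
``house`` toy graph as in the docstring above:

import numpy as np
from sknetwork.data import house
from sknetwork.hierarchy import Paris, tree_sampling_divergence

adjacency = house()
dendrogram = Paris().fit_transform(adjacency)

raw = tree_sampling_divergence(adjacency, dendrogram, normalized=False)
norm = tree_sampling_divergence(adjacency, dendrogram)  # normalized=True by default
# The normalized score divides the KL divergence by the graph's mutual
# information (guarded above against a zero denominator), keeping it in [0, 1].
print(np.round(raw, 3), np.round(norm, 3))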