Example #1
def _instantiate_vars(adjacency: sparse.csr_matrix, weights: str = 'uniform'):
    """Initialize standard variables for metrics."""
    weights_row = get_probs(weights, adjacency)
    weights_col = get_probs(weights, adjacency.T)
    sym_adjacency = directed2undirected(adjacency)
    aggregate_graph = AggregateGraph(weights_row, weights_col,
                                     sym_adjacency.data.astype(float),
                                     sym_adjacency.indices,
                                     sym_adjacency.indptr)
    return aggregate_graph, weights_row, weights_col
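
A minimal usage sketch for the helper above, assuming _instantiate_vars and its scikit-network dependencies (get_probs, directed2undirected, AggregateGraph) are already imported in the same module; the small graph below is only an illustration.

import numpy as np
from scipy import sparse

# Undirected triangle graph.
adjacency = sparse.csr_matrix(np.array([[0, 1, 1],
                                        [1, 0, 1],
                                        [1, 1, 0]]))
aggregate_graph, weights_row, weights_col = _instantiate_vars(adjacency, weights='uniform')
# weights_row and weights_col are probability distributions over the nodes;
# aggregate_graph tracks cluster weights and neighborhoods as clusters get merged.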
Example #2
def _instanciate_vars(adjacency: sparse.csr_matrix, weights: str = 'uniform'):
    """Initialize standard variables for metrics."""
    n = adjacency.shape[0]
    weights_row = check_probs(weights, adjacency)
    weights_col = check_probs(weights, adjacency.T)
    sym_adjacency = directed2undirected(adjacency)

    aggregate_graph = AggregateGraph(weights_row, weights_col,
                                     sym_adjacency.data.astype(float),
                                     sym_adjacency.indices,
                                     sym_adjacency.indptr)

    height = np.zeros(n - 1)
    cluster_weight = np.zeros(n - 1)
    edge_sampling = np.zeros(n - 1)

    return aggregate_graph, height, cluster_weight, edge_sampling, weights_row, weights_col
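
The buffers returned here have one slot per merge: a binary dendrogram over n leaves contains exactly n - 1 internal nodes, each row giving (node1, node2, height, size), which is also how the metrics below index dendrogram[t]. A quick illustration of that shape using scipy's linkage, shown only for illustration; any dendrogram in this format works.

import numpy as np
from scipy.cluster.hierarchy import linkage

points = np.random.rand(5, 2)
dendrogram = linkage(points, method='average')
print(dendrogram.shape)  # (4, 4): n - 1 = 4 merges for n = 5 leaves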
Example #3
def tree_sampling_divergence(adjacency: sparse.csr_matrix, dendrogram: np.ndarray, weights: str = 'degree',
                             normalized: bool = True) -> float:
    """
    Tree sampling divergence of a hierarchy (quality metric).

    The higher the score, the better.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes.
        ``'degree'`` (default) or ``'uniform'``.
    normalized :
        If ``True``, normalized by the mutual information of the graph.

    Returns
    -------
    score : float
        The tree sampling divergence of the hierarchy.
        If normalized, returns a value between 0 and 1.

    References
    ----------
    Charpentier, B. & Bonald, T. (2019).
    `Tree Sampling Divergence: An Information-Theoretic Metric for
    Hierarchical Graph Clustering.
    <https://hal.telecom-paristech.fr/hal-02144394/document>`_
    Proceedings of IJCAI.

    """
    adjacency = check_format(adjacency)
    if not is_square(adjacency):
        raise ValueError('The adjacency matrix is not square.')

    n = adjacency.shape[0]
    if n <= 1:
        raise ValueError('The graph must contain at least two nodes.')

    total_weight = adjacency.data.sum()
    if total_weight <= 0:
        raise ValueError('The graph must contain at least one edge.')

    adjacency.data = adjacency.data / total_weight

    out_weights = check_probs(weights, adjacency)
    in_weights = check_probs(weights, adjacency.T)

    aggregate_graph = AggregateGraph(adjacency + adjacency.T, out_weights, in_weights)

    height = np.zeros(n - 1)
    edge_sampling = np.zeros(n - 1)
    node_sampling = np.zeros(n - 1)
    for t in range(n - 1):
        node1 = int(dendrogram[t][0])
        node2 = int(dendrogram[t][1])
        # If a child is an internal node created at the same height, this merge belongs to the
        # same multi-way split: carry its sampling probabilities up to this merge (its edge
        # sampling slot is cleared so it is not counted twice).
        if node1 >= n and height[node1 - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[node1 - n]
            edge_sampling[node1 - n] = 0
            node_sampling[t] = node_sampling[node1 - n]
        elif node2 >= n and height[node2 - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[node2 - n]
            edge_sampling[node2 - n] = 0
            node_sampling[t] = node_sampling[node2 - n]
        # Probability that a random edge has one end in each of the two merged clusters.
        if node2 in aggregate_graph.neighbors[node1]:
            edge_sampling[t] += aggregate_graph.neighbors[node1][node2]
        # Probability that two independently sampled nodes fall in the two clusters.
        node_sampling[t] += aggregate_graph.cluster_out_weights[node1] * aggregate_graph.cluster_in_weights[node2] + \
            aggregate_graph.cluster_out_weights[node2] * aggregate_graph.cluster_in_weights[node1]
        height[t] = dendrogram[t][2]
        aggregate_graph.merge(node1, node2)

    # KL divergence between the edge sampling and node sampling distributions
    # over the internal nodes of the dendrogram.
    index = np.where(edge_sampling)[0]
    score = edge_sampling[index].dot(np.log(edge_sampling[index] / node_sampling[index]))
    if normalized:
        # Normalize by the mutual information of the graph: the sum over edges of
        # p(i, j) * log(p(i, j) / (p_out(i) * p_in(j))).
        inv_out_weights = sparse.diags(out_weights, shape=(n, n), format='csr')
        inv_out_weights.data = 1 / inv_out_weights.data
        inv_in_weights = sparse.diags(in_weights, shape=(n, n), format='csr')
        inv_in_weights.data = 1 / inv_in_weights.data
        sampling_ratio = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        inv_out_weights.data = np.ones(len(inv_out_weights.data))
        inv_in_weights.data = np.ones(len(inv_in_weights.data))
        edge_sampling = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        mutual_information = edge_sampling.data.dot(np.log(sampling_ratio.data))
        score /= mutual_information
    return score
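
A short end-to-end sketch of how tree_sampling_divergence is typically called; it assumes the function defined above is in scope together with scikit-network's public API (sknetwork.data.karate_club, sknetwork.hierarchy.Paris) and is meant as an illustration rather than a definitive recipe.

from sknetwork.data import karate_club
from sknetwork.hierarchy import Paris

adjacency = karate_club()
paris = Paris()
paris.fit(adjacency)
score = tree_sampling_divergence(adjacency, paris.dendrogram_)
print(0 <= score <= 1)  # True with normalized=True (the default)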
Example #4
def dasgupta_score(adjacency: sparse.csr_matrix, dendrogram: np.ndarray, weights: str = 'uniform') -> float:
    """
    Dasgupta's score of a hierarchy, defined as 1 - Dasgupta's cost.

    The higher the score, the better.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes.
        ``'degree'`` or ``'uniform'`` (default).

    Returns
    -------
    score : float
        Dasgupta's score of the hierarchy, normalized to get a value between 0 and 1.

    References
    ----------
    Dasgupta, S. (2016). A cost function for similarity-based hierarchical clustering.
    Proceedings of ACM symposium on Theory of Computing.

    """
    adjacency = check_format(adjacency)
    if not is_square(adjacency):
        raise ValueError('The adjacency matrix is not square.')

    n = adjacency.shape[0]
    if n <= 1:
        raise ValueError('The graph must contain at least two nodes.')

    out_weights = check_probs(weights, adjacency)
    in_weights = check_probs(weights, adjacency.T)

    aggregate_graph = AggregateGraph(adjacency + adjacency.T, out_weights, in_weights)

    height = np.zeros(n - 1)
    edge_sampling = np.zeros(n - 1)
    cluster_weight = np.zeros(n - 1)
    for t in range(n - 1):
        node1 = int(dendrogram[t][0])
        node2 = int(dendrogram[t][1])
        # Carry edge sampling up through merges at equal height (multi-way splits
        # flattened into successive binary merges).
        if node1 >= n and height[node1 - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[node1 - n]
            edge_sampling[node1 - n] = 0
        elif node2 >= n and height[node2 - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[node2 - n]
            edge_sampling[node2 - n] = 0
        height[t] = dendrogram[t][2]
        # Probability that a random edge has one end in each of the two merged clusters.
        if node2 in aggregate_graph.neighbors[node1]:
            edge_sampling[t] += aggregate_graph.neighbors[node1][node2]
        # Total (out + in) weight of the two clusters being merged.
        cluster_weight[t] = aggregate_graph.cluster_out_weights[node1] + aggregate_graph.cluster_out_weights[node2] \
            + aggregate_graph.cluster_in_weights[node1] + aggregate_graph.cluster_in_weights[node2]
        aggregate_graph.merge(node1, node2)

    # Dasgupta's cost: expected weight of the cluster created by the merge joining the two ends
    # of a randomly sampled edge (the factor 1/2 compensates for summing both out- and in-weights).
    cost: float = edge_sampling.dot(cluster_weight) / 2
    return 1 - cost
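
A companion sketch for dasgupta_score under the same assumptions as the previous example (the function above in scope, scikit-network public API, illustrative only); node weights can be switched between 'uniform' (default) and 'degree' as documented above.

from sknetwork.data import karate_club
from sknetwork.hierarchy import Paris

adjacency = karate_club()
paris = Paris()
paris.fit(adjacency)
score = dasgupta_score(adjacency, paris.dendrogram_, weights='uniform')
print(0 <= score <= 1)  # True: the score is 1 minus the normalized Dasgupta cost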