Exemplo n.º 1
0
    def test_mass_grid(self):
        """
        Verify the density levels returned by `define_density_mass_grid`.

        Scenarios covered: the default call, a request for more levels than
        there are density values, a request for fewer levels, input that
        includes negative values, and uniform input.
        """
        ## Default call: the grid must equal the sorted density values.
        grid = utl.define_density_mass_grid(self.unique_density)
        expected = np.sort(self.unique_density)
        assert_array_equal(expected, grid)

        ## Requesting more levels than there are density values must yield
        #  the same grid as the default call.
        grid = utl.define_density_mass_grid(
            self.unique_density, num_levels=self.n * 2)
        assert_array_equal(expected, grid)

        ## Requesting fewer levels than there are density values.
        grid = utl.define_density_mass_grid(
            self.unique_density, num_levels=2)
        assert_array_equal(np.array([1, 10]), grid)

        ## Input that includes negative values.
        grid = utl.define_density_mass_grid(self.generic_array)
        assert_array_equal(np.sort(self.generic_array), grid)

        ## Uniform input collapses to a single level.
        grid = utl.define_density_mass_grid(self.uniform_density)
        self.assertItemsEqual(grid, [1.])
Exemplo n.º 2
0
    def test_mass_grid(self):
        """
        Exercise `define_density_mass_grid` on several kinds of input.

        The cases are: default arguments, `num_levels` larger than the number
        of density values, `num_levels` smaller than the number of density
        values, an array with negative entries, and a uniform density.
        """
        ## Default arguments: result should match the sorted densities.
        computed = utl.define_density_mass_grid(self.unique_density)
        reference = np.sort(self.unique_density)
        assert_array_equal(reference, computed)

        ## `num_levels` larger than the number of density values: same
        #  result as with default arguments.
        computed = utl.define_density_mass_grid(
            self.unique_density, num_levels=self.n * 2)
        assert_array_equal(reference, computed)

        ## `num_levels` smaller than the number of density values.
        computed = utl.define_density_mass_grid(
            self.unique_density, num_levels=2)
        assert_array_equal(np.array([1, 10]), computed)

        ## Array with negative entries.
        computed = utl.define_density_mass_grid(self.generic_array)
        assert_array_equal(np.sort(self.generic_array), computed)

        ## Uniform density: only one level is expected.
        computed = utl.define_density_mass_grid(self.uniform_density)
        self.assertItemsEqual(computed, [1.])
Exemplo n.º 3
0
def construct_tree_from_graph(adjacency_list, density, prune_threshold=None,
                              num_levels=None, verbose=False):
    """
    Construct a level set tree from a similarity graph and a density estimate.

    The graph is split into its connected components, which become the root
    nodes of the tree. The density levels are then visited in increasing
    order; at each level the vertices whose density falls in the interval
    (previous level, current level] are removed from every active subgraph.
    A subgraph that vanishes, or that falls apart into several connected
    components, ends its tree node at that level; in the latter case each
    component starts a new child node.

    Parameters
    ----------
    adjacency_list : list [list]
        Adjacency list of the k-nearest neighbors graph on the data. Each entry
        contains the indices of the `k` closest neighbors to the data point at
        the same row index.

    density : list [float] or numpy array
        Estimate of the density function, evaluated at the data points
        represented by the row indices of `adjacency_list`.

    prune_threshold : int, optional
        Leaf nodes with fewer than this number of members are recursively
        merged into larger nodes. If 'None' (the default), then no pruning
        is performed.

    num_levels : int, optional
        Number of density levels in the constructed tree. The value is passed
        unchanged to `define_density_mass_grid`; if None (default) that
        function chooses the grid (per the original note, one level per data
        point).

    verbose : bool, optional
        If True, a progress message is logged at every 100th level of tree
        construction.

    Returns
    -------
    T : levelSetTree
        See the LevelSetTree class for attributes and method definitions.

    See Also
    --------
    construct_tree, LevelSetTree

    Examples
    --------
    >>> X = numpy.random.rand(100, 2)
    >>> knn_graph, radii = debacl.utils.knn_graph(X, k=8)
    >>> density = debacl.utils.knn_density(radii, n=100, p=2, k=8)
    >>> tree = debacl.construct_tree_from_graph(knn_graph, density,
    ...                                         prune_threshold=5)
    >>> print(tree)
    +----+-------------+-----------+------------+----------+------+--------+----------+
    | id | start_level | end_level | start_mass | end_mass | size | parent | children |
    +----+-------------+-----------+------------+----------+------+--------+----------+
    | 0  |    0.000    |   0.768   |   0.000    |  0.390   | 100  |  None  |  [1, 2]  |
    | 1  |    0.768    |   1.494   |   0.390    |  0.790   |  30  |   0    |  [7, 8]  |
    | 2  |    0.768    |   4.812   |   0.390    |  1.000   |  31  |   0    |    []    |
    | 7  |    1.494    |   2.375   |   0.790    |  0.950   |  6   |   1    |    []    |
    | 8  |    1.494    |   2.308   |   0.790    |  0.940   |  5   |   1    |    []    |
    +----+-------------+-----------+------------+----------+------+--------+----------+
    """

    ## Initialize the graph and cluster tree
    levels = _utl.define_density_mass_grid(density, num_levels=num_levels)

    G = _nx.from_dict_of_lists(dict(enumerate(adjacency_list)))

    T = LevelSetTree(density, levels)

    # The level comparisons below must be elementwise. `density` is documented
    # as a plain list, for which `density > level` is not elementwise, so use
    # an array view for the comparisons only.
    density_values = _np.asarray(density)

    ## Figure out roots of the tree
    for i, c in enumerate(_nx.connected_components(G)):
        # `c` holds only the vertices of the component, not the subgraph.
        # Newer networkx returns a read-only view from `subgraph`; the copy
        # makes the subgraph mutable so vertices can be removed from it below.
        T._subgraphs[i] = G.subgraph(c).copy()
        T.nodes[i] = ConnectedComponent(
            i, parent=None, children=[], start_level=0., end_level=None,
            start_mass=0., end_mass=None, members=list(c))

    # Loop through the removal grid
    # NOTE(review): the first interval is (0, levels[0]], so points whose
    # density is <= 0 are never placed in a background set -- confirm that
    # this is intended.
    previous_level = 0.
    n = float(len(adjacency_list))

    for i, level in enumerate(levels):
        if verbose and i % 100 == 0:
            _logging.info("iteration {}".format(i))

        ## figure out which points to remove, i.e. the background set.
        bg = _np.where((density_values > previous_level) &
                       (density_values <= level))[0]
        previous_level = level

        ## compute the mass after the current bg set is removed
        old_vcount = sum(x.number_of_nodes() for x in T._subgraphs.values())
        current_mass = 1. - ((old_vcount - len(bg)) / n)

        # loop through active components, i.e. subgraphs. Changes to the set
        # of active subgraphs are collected and applied after the loop so
        # `T._subgraphs` is not modified while it is being iterated.
        deactivate_keys = []     # subgraphs to deactivate at the iter end
        activate_subgraphs = {}  # new subgraphs to add at the end of the iter

        for (k, H) in T._subgraphs.items():

            ## remove nodes at the current level
            H.remove_nodes_from(bg)

            ## check if subgraph has vanished
            if H.number_of_nodes() == 0:
                T.nodes[k].end_level = level
                T.nodes[k].end_mass = current_mass
                deactivate_keys.append(k)

            else:  # subgraph hasn't vanished

                ## check if subgraph now has multiple connected components
                # NOTE: this is *the* bottleneck
                if not _nx.is_connected(H):

                    ## deactivate the parent subgraph
                    T.nodes[k].end_level = level
                    T.nodes[k].end_mass = current_mass
                    deactivate_keys.append(k)

                    ## start a new subgraph & node for each child component
                    for c in _nx.connected_components(H):
                        new_key = max(T.nodes.keys()) + 1
                        T.nodes[k].children.append(new_key)
                        # Copy for the same reason as for the root subgraphs.
                        activate_subgraphs[new_key] = H.subgraph(c).copy()

                        T.nodes[new_key] = ConnectedComponent(
                            new_key, parent=k, children=[], start_level=level,
                            end_level=None, start_mass=current_mass,
                            end_mass=None, members=list(c))

        # update active components
        for k in deactivate_keys:
            del T._subgraphs[k]

        T._subgraphs.update(activate_subgraphs)

    ## Prune the tree
    if prune_threshold is not None:
        T = T.prune(threshold=prune_threshold)

    return T