Пример #1
0
def test_linkage_misc():
    """Miscellaneous sanity checks for the linkage helpers and estimators."""
    data = np.ones((5, 5))

    # Unknown linkage names and a connectivity of the wrong shape must be
    # rejected with a ValueError.
    bad_estimator_fit = AgglomerativeClustering(linkage='foobar').fit
    assert_raises(ValueError, bad_estimator_fit, data)
    assert_raises(ValueError, linkage_tree, data, linkage='foobar')
    small_connectivity = np.ones((4, 4))
    assert_raises(ValueError, linkage_tree, data,
                  connectivity=small_connectivity)

    # FeatureAgglomeration only has to run without raising.
    FeatureAgglomeration().fit(data)

    # Two warnings expected: one for using the deprecated Ward class, one
    # for using the copy argument.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always", UserWarning)
        Ward(copy=True).fit(data)
    assert_equal(len(caught), 2)

    # ward_tree itself should only warn about the copy argument.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always", UserWarning)
        ward_tree(data, copy=True)
    assert_equal(len(caught), 1)

    # Hierarchical clustering on a precomputed cosine distance matrix must
    # give the same first output as asking for the cosine affinity directly.
    distances = cosine_distances(data)
    from_precomputed = linkage_tree(distances, affinity="precomputed")
    from_cosine = linkage_tree(data, affinity="cosine")
    assert_array_equal(from_precomputed[0], from_cosine[0])
Пример #2
0
def test_ward_tree_distance():
    """
    Check that structured and unstructured ward_tree return the same
    children and the same merge distances, and that both match a known
    reference linkage.
    """
    n_rows, n_cols = 10, 5
    rng = np.random.RandomState(0)
    full_connectivity = np.ones((n_rows, n_rows))

    # five random datasets
    for _ in range(5):
        X = .1 * rng.normal(size=(n_rows, n_cols))
        X -= 4. * np.arange(n_rows)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]

        unstructured = ward_tree(X, return_distance=True)
        structured = ward_tree(X,
                               connectivity=full_connectivity,
                               return_distance=True)

        # first output: children; last output: distances
        assert_array_equal(unstructured[0], structured[0])
        assert_array_almost_equal(unstructured[-1], structured[-1])

    # dataset with a known answer,
    # taken from scipy/cluster/tests/hierarchy_test_data.py
    X = np.array([[1.43054825, -7.5693489], [6.95887839, 6.82293382],
                  [2.87137846, -9.68248579], [7.87974764, -6.05485803],
                  [8.24018364, -6.09495602], [7.39020262, 8.54004355]])
    # expected linkage matrix
    linkage_X_ward = np.array([[3., 4., 0.36265956, 2.],
                               [1., 5., 1.77045373, 2.],
                               [0., 2., 2.55760419, 2.],
                               [6., 8., 9.10208346, 4.],
                               [7., 9., 24.7784379, 6.]])
    expected_children = linkage_X_ward[:, :2]
    expected_distances = linkage_X_ward[:, 2]

    n_samples, n_features = np.shape(X)
    connectivity_X = np.ones((n_samples, n_samples))

    outputs = (ward_tree(X, return_distance=True),
               ward_tree(X,
                         connectivity=connectivity_X,
                         return_distance=True))

    # the children must match the reference ...
    for out in outputs:
        assert_array_equal(expected_children, out[0])

    # ... and so must the distances
    for out in outputs:
        assert_array_almost_equal(expected_distances, out[4])
Пример #3
0
def _seg_by_hc_single_frame(obs_len, connectivity, data, width=9, hier=False, **kwargs):
    """Segment one frame by cutting a Ward tree built on ``data``.

    Parameters
    ----------
    obs_len : int
        Used to size the linkage-like matrix as ``(obs_len - 1, 4)``.
    connectivity : object
        Forwarded to ``sklhc.ward_tree``.
    data : array-like
        Forwarded to ``sklhc.ward_tree`` and to ``_agg_segment``.
    width : int, optional
        Forwarded to ``_agg_segment``.
    hier : bool, optional
        When true, cut the tree with the 'maxclust' criterion for every
        ``t`` in 2..10 and return two ordered dicts.
    **kwargs
        Only ``criterion`` is read (default 'distance'); it is used in the
        non-hierarchical branch only.

    Returns
    -------
    ``(boundary_dict, label_dict)`` keyed by ``np.max(labels) + 1`` when
    ``hier`` is true; otherwise whatever ``_agg_segment`` returns for a
    threshold of 0.7 times the largest merge distance.
    """
    children, _, _, _, merge_distances = \
        sklhc.ward_tree(data, connectivity=connectivity, return_distance=True)

    # Columns 0-1 hold the children, column 2 the distances; column 3 is
    # left at zero.
    linkage = np.zeros((obs_len - 1, 4))
    linkage[:, :2] = children
    linkage[:, 2] = merge_distances

    if not hier:
        criterion = kwargs.get('criterion', 'distance')
        threshold = 0.7 * np.max(linkage[:, 2])
        return _agg_segment(linkage, threshold, criterion, width, data)

    label_dict = OrderedDict()
    boundary_dict = OrderedDict()
    for t in range(2, 11):
        boundaries, labels = _agg_segment(linkage, t, 'maxclust', width, data)
        key = np.max(labels) + 1
        label_dict[key] = labels
        boundary_dict[key] = boundaries
    return boundary_dict, label_dict
Пример #4
0
    def __build_tree(self, weights):
        """Build a binary tree of ``TreeNode`` objects from Ward merges.

        ``ward_tree`` is run on ``weights.T``; each row of ``weights.T`` is a
        leaf whose node stores the L2-normalised row, and each merge creates
        an inner node whose weights are the mean of its two members' weights.
        The node created for the last merge is returned.
        """
        samples = weights.T
        # get clusters with ward_tree function
        merges = ward_tree(samples)[0]
        n_samples = samples.shape[0]
        nodes = {}

        for step, merge in enumerate(merges):
            member_weights = []
            member_ids = []
            for node_id in merge:
                if node_id < n_samples:
                    # Leaf: create its node on first use.
                    unit = samples[node_id] / np.linalg.norm(samples[node_id],
                                                             ord=2)
                    nodes[node_id] = TreeNode(
                        weights=unit,
                        right_child=None,
                        left_child=None,
                        class_idx=node_id,
                    )
                    member_weights.append(unit)
                else:
                    # NOTE(review): reads ``.weight`` although nodes are
                    # built with the ``weights=`` keyword -- confirm the
                    # attribute name on TreeNode.
                    member_weights.append(nodes[node_id].weight)
                member_ids.append(node_id)

            # Inner nodes are numbered after the leaves.
            nodes[step + n_samples] = TreeNode(
                weights=(member_weights[0] + member_weights[1]) / 2.0,
                right_child=nodes[member_ids[1]],
                left_child=nodes[member_ids[0]],
                class_idx=None,
            )
        return nodes[len(merges) + n_samples - 1]
Пример #5
0
def _seg_by_hc_single_frame(obs_len,
                            connectivity,
                            data,
                            width=9,
                            hier=False,
                            **kwargs):
    """Segment one frame by cutting a Ward tree built on ``data``.

    The tree comes from ``sklhc.ward_tree(data, connectivity=...)``. Its
    children and distances are copied into an ``(obs_len - 1, 4)`` matrix
    (fourth column left at zero) which is handed to ``_agg_segment``.

    With ``hier`` true, the tree is cut with the 'maxclust' criterion for
    each ``t`` in 2..10 and ``(boundary_dict, label_dict)`` is returned,
    both keyed by ``np.max(labels) + 1``. Otherwise the cut uses
    ``kwargs['criterion']`` (default 'distance') at 0.7 times the largest
    merge distance and the result of ``_agg_segment`` is returned as is.
    """
    tree = sklhc.ward_tree(data, connectivity=connectivity,
                           return_distance=True)
    children, _, _, _, merge_distances = tree

    linkage = np.zeros((obs_len - 1, 4))
    linkage[:, :2] = children
    linkage[:, 2] = merge_distances

    if not hier:
        criterion = kwargs.get('criterion', 'distance')
        threshold = 0.7 * np.max(linkage[:, 2])
        return _agg_segment(linkage, threshold, criterion, width, data)

    label_dict = OrderedDict()
    boundary_dict = OrderedDict()
    for t in range(2, 11):
        boundaries, labels = _agg_segment(linkage, t, 'maxclust',
                                          width, data)
        n_found = np.max(labels) + 1
        label_dict[n_found] = labels
        boundary_dict[n_found] = boundaries
    return boundary_dict, label_dict
def test_scikit_vs_scipy():
    """Test scikit ward with full connectivity (i.e. unstructured) vs scipy

    For five seeded random datasets, the tree from ``ward_tree`` with an
    all-ones connectivity and the tree from ``hierarchy.ward`` are both cut
    into ``k`` clusters and the two labellings are compared.
    """
    from scipy.sparse import lil_matrix

    n, p, k = 10, 5, 3
    rnd = np.random.RandomState(0)

    connectivity = lil_matrix(np.ones((n, n)))
    for i in range(5):
        X = 0.1 * rnd.normal(size=(n, p))
        X -= 4 * np.arange(n)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]

        out = hierarchy.ward(X)

        # The first two columns of the scipy linkage are the children; use
        # the builtin ``int`` because the ``np.int`` alias no longer exists
        # in current NumPy.
        children_ = out[:, :2].astype(int)
        children, _, n_leaves, _ = ward_tree(X, connectivity)

        cut = _hc_cut(k, children, n_leaves)
        cut_ = _hc_cut(k, children_, n_leaves)
        assess_same_labelling(cut, cut_)

    # Test error management in _hc_cut: asking for more clusters than
    # leaves must raise.
    assert_raises(ValueError, _hc_cut, n_leaves + 1, children, n_leaves)
Пример #7
0
def test_unstructured_ward_tree():
    """
    Check the node count of the tree built without connectivity: the number
    of merges plus the number of leaves must be ``2 * n_columns - 1``.
    """
    rng = np.random.RandomState(0)
    data = rng.randn(50, 100)
    # The middle return value is not used by this check.
    children, _, n_leaves = ward_tree(data.T)
    expected_nodes = 2 * data.shape[1] - 1
    assert_true(len(children) + n_leaves == expected_nodes)
Пример #8
0
def test_unstructured_ward_tree():
    """
    Check that we obtain the correct solution for unstructured ward tree.

    The number of merges plus the number of leaves must equal
    ``2 * n_columns - 1``.
    """
    # A local RandomState yields the same draws as ``np.random.seed(0)``
    # followed by ``np.random.randn`` but leaves the global RNG untouched,
    # so other tests are not affected by this one.
    rnd = np.random.RandomState(0)
    X = rnd.randn(50, 100)
    # The second return value is ignored; the expected count is recomputed.
    children, _, n_leaves = ward_tree(X.T)
    n_nodes = 2 * X.shape[1] - 1
    assert_true(len(children) + n_leaves == n_nodes)
def test_ward_tree_children_order():
    """Children must come out in the same order with and without an
    all-ones connectivity matrix, on five seeded random datasets."""
    n_rows, n_cols = 10, 5
    rng = np.random.RandomState(0)
    full_connectivity = np.ones((n_rows, n_rows))

    for _ in range(5):
        X = .1 * rng.normal(size=(n_rows, n_cols))
        X -= 4. * np.arange(n_rows)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]

        children_plain = ward_tree(X)[0]
        children_connected = ward_tree(X, connectivity=full_connectivity)[0]

        assert_array_equal(children_plain, children_connected)
Пример #10
0
def test_ward_tree_children_order():
    """Compare the children returned by ward_tree with and without an
    all-ones connectivity matrix on five seeded random datasets."""
    n_rows, n_cols = 10, 5
    rng = np.random.RandomState(0)
    offsets = 4. * np.arange(n_rows)[:, np.newaxis]
    full_connectivity = np.ones((n_rows, n_rows))

    for _ in range(5):
        X = .1 * rng.normal(size=(n_rows, n_cols))
        X -= offsets
        X -= X.mean(axis=1)[:, np.newaxis]

        plain = ward_tree(X)
        connected = ward_tree(X, connectivity=full_connectivity)

        # The first return value holds the children.
        assert_array_equal(plain[0], connected[0])
Пример #11
0
def test_structured_ward_tree():
    """
    Check that we obtain the correct solution for structured ward tree.

    The connectivity is built from the shape of a 10x10 mask; the number of
    merges plus the number of leaves must equal ``2 * n_columns - 1``.
    """
    rnd = np.random.RandomState(0)
    # Builtin ``bool``: the ``np.bool`` alias no longer exists in current
    # NumPy.
    mask = np.ones([10, 10], dtype=bool)
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    children, n_components, n_leaves = ward_tree(X.T, connectivity)
    n_nodes = 2 * X.shape[1] - 1
    assert_true(len(children) + n_leaves == n_nodes)
def test_height_ward_tree():
    """
    Check the node count of a structured ward tree.

    NOTE(review): the name suggests a check that tree heights are sorted,
    but the body only verifies that merges plus leaves equals
    ``2 * n_columns - 1`` -- confirm whether a height check is missing.
    """
    rnd = np.random.RandomState(0)
    # Builtin ``bool``: the ``np.bool`` alias no longer exists in current
    # NumPy.
    mask = np.ones([10, 10], dtype=bool)
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    children, n_nodes, n_leaves, parent = ward_tree(X.T, connectivity)
    # The returned count is discarded in favour of the expected value.
    n_nodes = 2 * X.shape[1] - 1
    assert_true(len(children) + n_leaves == n_nodes)
Пример #13
0
def test_structured_ward_tree():
    """
    Check that we obtain the correct solution for structured ward tree.

    The connectivity is built from the shape of a 10x10 mask; the number of
    merges plus the number of leaves must equal ``2 * n_columns - 1``.
    """
    # A local RandomState yields the same draws as seeding the global
    # generator with 0, without leaking state into other tests.
    rnd = np.random.RandomState(0)
    # Builtin ``bool``: the ``np.bool`` alias no longer exists in current
    # NumPy.
    mask = np.ones([10, 10], dtype=bool)
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    children, n_components, n_leaves = ward_tree(X.T, connectivity)
    n_nodes = 2 * X.shape[1] - 1
    assert_true(len(children) + n_leaves == n_nodes)
Пример #14
0
    def test_ward_tree(self):
        """``ModelFrame.cluster.ward_tree`` must return the same values as
        ``cluster.ward_tree`` on the iris data, with and without an
        all-ones connectivity matrix."""
        iris = datasets.load_iris()
        df = pdml.ModelFrame(iris)

        # Without connectivity.
        actual = df.cluster.ward_tree()
        reference = cluster.ward_tree(iris.data)

        self.assertEqual(len(actual), 4)
        self.assert_numpy_array_almost_equal(actual[0], reference[0])
        for position in (1, 2, 3):
            self.assertEqual(actual[position], reference[position])

        # With connectivity: the last element is compared as an array.
        n_rows = len(df)
        connectivity = np.ones((n_rows, n_rows))
        actual = df.cluster.ward_tree(connectivity)
        reference = cluster.ward_tree(iris.data, connectivity)

        self.assert_numpy_array_almost_equal(actual[0], reference[0])
        for position in (1, 2):
            self.assertEqual(actual[position], reference[position])
        self.assert_numpy_array_almost_equal(actual[3], reference[3])
Пример #15
0
    def test_ward_tree(self):
        """Compare the ModelFrame accessor's ``ward_tree`` with
        ``cluster.ward_tree`` on iris, first without and then with an
        all-ones connectivity matrix."""
        iris = datasets.load_iris()
        df = pdml.ModelFrame(iris)

        got = df.cluster.ward_tree()
        want = cluster.ward_tree(iris.data)

        self.assertEqual(len(got), 4)
        self.assert_numpy_array_almost_equal(got[0], want[0])
        for idx in range(1, 4):
            self.assertEqual(got[idx], want[idx])

        size = len(df)
        connectivity = np.ones((size, size))
        got = df.cluster.ward_tree(connectivity)
        want = cluster.ward_tree(iris.data, connectivity)

        self.assert_numpy_array_almost_equal(got[0], want[0])
        for idx in range(1, 3):
            self.assertEqual(got[idx], want[idx])
        # In the structured case the fourth element is compared as an array.
        self.assert_numpy_array_almost_equal(got[3], want[3])
Пример #16
0
def test_structured_linkage_tree():
    """
    Check that we obtain the correct solution for structured linkage trees.

    The number of merges plus the number of leaves must equal
    ``2 * n_columns - 1``.
    """
    rnd = np.random.RandomState(0)
    # Builtin ``bool``: the ``np.bool`` alias no longer exists in current
    # NumPy.
    mask = np.ones([10, 10], dtype=bool)
    # Avoiding a mask with only 'True' entries.
    # NOTE(review): only ``mask.shape`` is passed to grid_to_graph below, so
    # the zeroed region does not influence the connectivity -- confirm.
    mask[4:7, 4:7] = 0
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    children, n_components, n_leaves, parent = ward_tree(X.T, connectivity)
    # These two statements were over-indented, which made the function fail
    # to compile; they belong at function level.
    n_nodes = 2 * X.shape[1] - 1
    assert_true(len(children) + n_leaves == n_nodes)
Пример #17
0
def test_unstructured_ward_tree():
    """
    Check the node count of the unstructured ward tree, and that passing
    ``n_clusters`` produces exactly one warning, for both a 2-D input and
    a single row of it.
    """
    rng = np.random.RandomState(0)
    X = rng.randn(50, 100)
    expected_nodes = 2 * X.shape[1] - 1
    for sample in (X, X[0]):
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always", UserWarning)
            # n_clusters is given only to trigger the warning code path.
            children, _, n_leaves, _ = ward_tree(sample.T, n_clusters=10)
        assert_equal(len(caught), 1)
        assert_equal(len(children) + n_leaves, expected_nodes)
Пример #18
0
def test_unstructured_ward_tree():
    """
    Verify the size of the unstructured ward tree and that exactly one
    warning is recorded when ``n_clusters`` is supplied.
    """
    rng = np.random.RandomState(0)
    X = rng.randn(50, 100)
    inputs = (X, X[0])
    for current in inputs:
        with warnings.catch_warnings(record=True) as recorded:
            warnings.simplefilter("always", UserWarning)
            # Supplying n_clusters is only meant to exercise the warning.
            result = ward_tree(current.T, n_clusters=10)
            children, _, n_leaves, _ = result
        assert_equal(len(recorded), 1)
        total_nodes = 2 * X.shape[1] - 1
        assert_equal(len(children) + n_leaves, total_nodes)
def test_structured_ward_tree():
    """
    Check that we obtain the correct solution for structured ward tree.

    Also checks that a connectivity matrix of the wrong shape is rejected
    with a ValueError.
    """
    rnd = np.random.RandomState(0)
    # Builtin ``bool``: the ``np.bool`` alias no longer exists in current
    # NumPy.
    mask = np.ones([10, 10], dtype=bool)
    # Avoiding a mask with only 'True' entries
    mask[4:7, 4:7] = 0
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    children, n_components, n_leaves, parent = ward_tree(X.T, connectivity)
    n_nodes = 2 * X.shape[1] - 1
    assert_true(len(children) + n_leaves == n_nodes)
    # Check that ward_tree raises a ValueError with a connectivity matrix
    # of the wrong shape
    assert_raises(ValueError, ward_tree, X.T, np.ones((4, 4)))
Пример #20
0
def test_scikit_vs_scipy():
    """Test scikit ward with full connectivity (i.e. unstructured) vs scipy

    For five random datasets, the tree from ``ward_tree`` with an all-ones
    connectivity and the tree from ``hierarchy.ward`` are both cut into
    ``k`` clusters and the two labellings are compared.
    """
    from scipy.sparse import lil_matrix
    n, p, k = 10, 5, 3
    # Seeded local generator: drawing from the global ``np.random`` made
    # this test non-reproducible from run to run.
    rnd = np.random.RandomState(0)

    connectivity = lil_matrix(np.ones((n, n)))
    for i in range(5):
        X = .1 * rnd.normal(size=(n, p))
        X -= 4 * np.arange(n)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]

        out = hierarchy.ward(X)

        # Builtin ``int``: the ``np.int`` alias no longer exists in current
        # NumPy.
        children_ = out[:, :2].astype(int)
        children, _, n_leaves = ward_tree(X, connectivity)

        cut = _hc_cut(k, children, n_leaves)
        cut_ = _hc_cut(k, children_, n_leaves)
        assess_same_labelling(cut, cut_)
Пример #21
0
def test_unstructured_linkage_tree():
    """Check the node count of unstructured trees for ``ward_tree`` and
    then for every builder in ``_TREE_BUILDERS``."""
    rng = np.random.RandomState(0)
    X = rng.randn(50, 100)
    expected_nodes = 2 * X.shape[1] - 1

    def check_builder(builder):
        # n_clusters is passed only to raise a warning and exercise the
        # warning code.
        for this_X in (X, X[0]):
            with ignore_warnings(), pytest.warns(UserWarning):
                children, _, n_leaves, _ = builder(this_X.T, n_clusters=10)
            assert len(children) + n_leaves == expected_nodes

    check_builder(ward_tree)

    for tree_builder in _TREE_BUILDERS.values():
        check_builder(tree_builder)
Пример #22
0
def test_ward_linkage_tree_return_distance():
    # Test return_distance option on linkage and ward trees

    # test that return_distance when set true, gives same
    # output on both structured and unstructured clustering.
    n, p = 10, 5
    rng = np.random.RandomState(0)

    # All-ones connectivity, used as the "structured" counterpart of the
    # unstructured calls below.
    connectivity = np.ones((n, n))
    for i in range(5):
        X = .1 * rng.normal(size=(n, p))
        X -= 4. * np.arange(n)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]

        out_unstructured = ward_tree(X, return_distance=True)
        out_structured = ward_tree(X,
                                   connectivity=connectivity,
                                   return_distance=True)

        # get children
        children_unstructured = out_unstructured[0]
        children_structured = out_structured[0]

        # check if we got the same clusters
        assert_array_equal(children_unstructured, children_structured)

        # check if the distances are the same
        dist_unstructured = out_unstructured[-1]
        dist_structured = out_structured[-1]

        assert_array_almost_equal(dist_unstructured, dist_structured)

        for linkage in ['average', 'complete', 'single']:
            # NOTE(review): the trailing ``[-1]`` already selects the last
            # element of the returned tuple, so the ``*_items`` names hold
            # that single element rather than the whole result.
            structured_items = linkage_tree(X,
                                            connectivity=connectivity,
                                            linkage=linkage,
                                            return_distance=True)[-1]
            unstructured_items = linkage_tree(X,
                                              linkage=linkage,
                                              return_distance=True)[-1]
            # NOTE(review): these index into that single element, so
            # ``*_children`` is its first entry, not the children array of
            # the tree -- confirm this is what the test intends to compare.
            structured_dist = structured_items[-1]
            unstructured_dist = unstructured_items[-1]
            structured_children = structured_items[0]
            unstructured_children = unstructured_items[0]
            assert_array_almost_equal(structured_dist, unstructured_dist)
            assert_array_almost_equal(structured_children,
                                      unstructured_children)

    # test on the following dataset where we know the truth
    # taken from scipy/cluster/tests/hierarchy_test_data.py
    X = np.array([[1.43054825, -7.5693489], [6.95887839, 6.82293382],
                  [2.87137846, -9.68248579], [7.87974764, -6.05485803],
                  [8.24018364, -6.09495602], [7.39020262, 8.54004355]])
    # truth
    # Columns 0-1 are compared with the children and column 2 with the
    # distances further down; column 3 is not used by this test.
    linkage_X_ward = np.array([[3., 4., 0.36265956, 2.],
                               [1., 5., 1.77045373, 2.],
                               [0., 2., 2.55760419, 2.],
                               [6., 8., 9.10208346, 4.],
                               [7., 9., 24.7784379, 6.]])

    linkage_X_complete = np.array([[3., 4., 0.36265956, 2.],
                                   [1., 5., 1.77045373, 2.],
                                   [0., 2., 2.55760419, 2.],
                                   [6., 8., 6.96742194, 4.],
                                   [7., 9., 18.77445997, 6.]])

    linkage_X_average = np.array([[3., 4., 0.36265956, 2.],
                                  [1., 5., 1.77045373, 2.],
                                  [0., 2., 2.55760419, 2.],
                                  [6., 8., 6.55832839, 4.],
                                  [7., 9., 15.44089605, 6.]])

    n_samples, n_features = np.shape(X)
    connectivity_X = np.ones((n_samples, n_samples))

    out_X_unstructured = ward_tree(X, return_distance=True)
    out_X_structured = ward_tree(X,
                                 connectivity=connectivity_X,
                                 return_distance=True)

    # check that the labels are the same
    assert_array_equal(linkage_X_ward[:, :2], out_X_unstructured[0])
    assert_array_equal(linkage_X_ward[:, :2], out_X_structured[0])

    # check that the distances are correct
    assert_array_almost_equal(linkage_X_ward[:, 2], out_X_unstructured[4])
    assert_array_almost_equal(linkage_X_ward[:, 2], out_X_structured[4])

    # NOTE(review): three linkage names are zipped with only two reference
    # matrices; zip stops at the shorter sequence, so 'single' is never
    # checked against a reference here.
    linkage_options = ['complete', 'average', 'single']
    X_linkage_truth = [linkage_X_complete, linkage_X_average]
    for (linkage, X_truth) in zip(linkage_options, X_linkage_truth):
        out_X_unstructured = linkage_tree(X,
                                          return_distance=True,
                                          linkage=linkage)
        out_X_structured = linkage_tree(X,
                                        connectivity=connectivity_X,
                                        linkage=linkage,
                                        return_distance=True)

        # check that the labels are the same
        assert_array_equal(X_truth[:, :2], out_X_unstructured[0])
        assert_array_equal(X_truth[:, :2], out_X_structured[0])

        # check that the distances are correct
        assert_array_almost_equal(X_truth[:, 2], out_X_unstructured[4])
        assert_array_almost_equal(X_truth[:, 2], out_X_structured[4])
Пример #23
0
import numpy as np


# Load the training table; columns 0-25 are used as float features and
# column 26 as the label.
# Raw string: same path as before, without relying on ``\E`` being an
# unrecognised (and deprecated) escape sequence.
dataframe_train = pandas.read_csv(r'Data\EASY_TRAIN.csv', header=None)
dataset_train = dataframe_train.values
data_train = dataset_train[:, 0:26].astype(float)
labels_train = dataset_train[:, 26]

# 1-based row numbers 1..4120 (not used further down in this script).
arr = list(range(1, 4121))
titles = np.array(labels_train[1:1000])

# Function-call form works on both Python 2 and Python 3.
print(titles[0])

w = cluster.ward_tree(data_train[1:1000, :], return_distance=True)

# Build the linkage matrix from the observations themselves. The previous
# code passed ``w[0]`` (the children array of the tree) to ``ward``, which
# clustered the child indices instead of the data and produced a linkage
# whose size cannot match ``titles``.
# NOTE(review): assumes ``ward`` is scipy.cluster.hierarchy.ward -- confirm.
linkage_matrix = ward(data_train[1:1000, :])

fig, ax = plt.subplots(figsize=(64, 100))  # set size
ax = dendrogram(linkage_matrix, orientation="left", labels=titles)

# Booleans instead of the strings 'off'/'on': any non-empty string is
# truthy, so 'off' does not reliably turn ticks off.
plt.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=True)

plt.tight_layout()  # show plot with tight layout
Пример #24
0
    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", UserWarning)
        # Use the copy argument, to raise a warning
        Ward(copy=True).fit(X)
    # We should be getting 2 warnings: one for using Ward that is
    # deprecated, one for using the copy argument
    assert_equal(len(warning_list), 2)
=======
from sklearn.utils.testing import assert_warns
>>>>>>> remote

    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", UserWarning)
        # Use the copy argument, to raise a warning
        ward_tree(X, copy=True)
    # We should be getting 1 warnings: for using the copy argument
    assert_equal(len(warning_list), 1)

    # Let's test a hiearchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)
    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])


def test_structured_linkage_tree():
    """
    Check that we obtain the correct solution for structured linkage trees.
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
Пример #25
0
def test_ward_linkage_tree_return_distance():
    """Test return_distance option on linkage and ward trees"""

    # test that return_distance when set true, gives same
    # output on both structured and unstructured clustering.
    n, p = 10, 5
    rng = np.random.RandomState(0)

    # All-ones connectivity, used as the "structured" counterpart of the
    # unstructured calls below.
    connectivity = np.ones((n, n))
    for i in range(5):
        X = .1 * rng.normal(size=(n, p))
        X -= 4. * np.arange(n)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]

        out_unstructured = ward_tree(X, return_distance=True)
        out_structured = ward_tree(X, connectivity=connectivity,
                                   return_distance=True)

        # get children
        children_unstructured = out_unstructured[0]
        children_structured = out_structured[0]

        # check if we got the same clusters
        assert_array_equal(children_unstructured, children_structured)

        # check if the distances are the same
        dist_unstructured = out_unstructured[-1]
        dist_structured = out_structured[-1]

        assert_array_almost_equal(dist_unstructured, dist_structured)

        for linkage in ['average', 'complete']:
            # NOTE(review): the trailing ``[-1]`` already selects the last
            # element of the returned tuple, so the ``*_items`` names hold
            # that single element rather than the whole result.
            structured_items = linkage_tree(
                X, connectivity=connectivity, linkage=linkage,
                return_distance=True)[-1]
            unstructured_items = linkage_tree(
                X, linkage=linkage, return_distance=True)[-1]
            # NOTE(review): these index into that single element, so
            # ``*_children`` is its first entry, not the children array of
            # the tree -- confirm this is what the test intends to compare.
            structured_dist = structured_items[-1]
            unstructured_dist = unstructured_items[-1]
            structured_children = structured_items[0]
            unstructured_children = unstructured_items[0]
            assert_array_almost_equal(structured_dist, unstructured_dist)
            assert_array_almost_equal(
                structured_children, unstructured_children)

    # test on the following dataset where we know the truth
    # taken from scipy/cluster/tests/hierarchy_test_data.py
    X = np.array([[1.43054825, -7.5693489],
                  [6.95887839, 6.82293382],
                  [2.87137846, -9.68248579],
                  [7.87974764, -6.05485803],
                  [8.24018364, -6.09495602],
                  [7.39020262, 8.54004355]])
    # truth
    # Columns 0-1 are compared with the children and column 2 with the
    # distances further down; column 3 is not used by this test.
    linkage_X_ward = np.array([[3., 4., 0.36265956, 2.],
                               [1., 5., 1.77045373, 2.],
                               [0., 2., 2.55760419, 2.],
                               [6., 8., 9.10208346, 4.],
                               [7., 9., 24.7784379, 6.]])

    linkage_X_complete = np.array(
        [[3., 4., 0.36265956, 2.],
         [1., 5., 1.77045373, 2.],
         [0., 2., 2.55760419, 2.],
         [6., 8., 6.96742194, 4.],
         [7., 9., 18.77445997, 6.]])

    linkage_X_average = np.array(
        [[3., 4., 0.36265956, 2.],
         [1., 5., 1.77045373, 2.],
         [0., 2., 2.55760419, 2.],
         [6., 8., 6.55832839, 4.],
         [7., 9., 15.44089605, 6.]])

    n_samples, n_features = np.shape(X)
    connectivity_X = np.ones((n_samples, n_samples))

    out_X_unstructured = ward_tree(X, return_distance=True)
    out_X_structured = ward_tree(X, connectivity=connectivity_X,
                                 return_distance=True)

    # check that the labels are the same
    assert_array_equal(linkage_X_ward[:, :2], out_X_unstructured[0])
    assert_array_equal(linkage_X_ward[:, :2], out_X_structured[0])

    # check that the distances are correct
    assert_array_almost_equal(linkage_X_ward[:, 2], out_X_unstructured[4])
    assert_array_almost_equal(linkage_X_ward[:, 2], out_X_structured[4])

    # Each linkage name is paired with its reference matrix by position.
    linkage_options = ['complete', 'average']
    X_linkage_truth = [linkage_X_complete, linkage_X_average]
    for (linkage, X_truth) in zip(linkage_options, X_linkage_truth):
        out_X_unstructured = linkage_tree(
            X, return_distance=True, linkage=linkage)
        out_X_structured = linkage_tree(
            X, connectivity=connectivity_X, linkage=linkage,
            return_distance=True)

        # check that the labels are the same
        assert_array_equal(X_truth[:, :2], out_X_unstructured[0])
        assert_array_equal(X_truth[:, :2], out_X_structured[0])

        # check that the distances are correct
        assert_array_almost_equal(X_truth[:, 2], out_X_unstructured[4])
        assert_array_almost_equal(X_truth[:, 2], out_X_structured[4])
Пример #26
0
def test_ward_tree_distance():
    """
    Check that ward_tree gives the same children and the same distances
    with and without an all-ones connectivity matrix, and that both agree
    with a known reference linkage.
    """
    n_rows, n_cols = 10, 5
    rng = np.random.RandomState(0)
    full_connectivity = np.ones((n_rows, n_rows))

    # five random datasets
    for _ in range(5):
        X = .1 * rng.normal(size=(n_rows, n_cols))
        X -= 4. * np.arange(n_rows)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]

        plain = ward_tree(X, return_distance=True)
        connected = ward_tree(X, connectivity=full_connectivity,
                              return_distance=True)

        # same clusters (first output) ...
        assert_array_equal(plain[0], connected[0])
        # ... and same distances (last output)
        assert_array_almost_equal(plain[-1], connected[-1])

    # dataset with a known answer,
    # taken from scipy/cluster/tests/hierarchy_test_data.py
    X = np.array([[1.43054825, -7.5693489],
                  [6.95887839, 6.82293382],
                  [2.87137846, -9.68248579],
                  [7.87974764, -6.05485803],
                  [8.24018364, -6.09495602],
                  [7.39020262, 8.54004355]])
    # expected linkage matrix
    linkage_X_ward = np.array([[3., 4., 0.36265956, 2.],
                               [1., 5., 1.77045373, 2.],
                               [0., 2., 2.55760419, 2.],
                               [6., 8., 9.10208346, 4.],
                               [7., 9., 24.7784379, 6.]])
    reference_children = linkage_X_ward[:, :2]
    reference_distances = linkage_X_ward[:, 2]

    n_samples, n_features = np.shape(X)
    connectivity_X = np.ones((n_samples, n_samples))

    plain = ward_tree(X, return_distance=True)
    connected = ward_tree(X, connectivity=connectivity_X,
                          return_distance=True)

    # check that the labels are the same
    for result in (plain, connected):
        assert_array_equal(reference_children, result[0])

    # check that the distances are correct
    for result in (plain, connected):
        assert_array_almost_equal(reference_distances, result[4])
Пример #27
0
def ward_tree(data):
    """Run scikit-learn's ``ward_tree`` on ``data`` and print the result.

    This function has the same name as the library function it wraps, so
    the original body called itself and recursed until the interpreter gave
    up. The library function is therefore imported under a private alias
    inside the body.
    """
    from sklearn.cluster import ward_tree as _sklearn_ward_tree

    wardtree = _sklearn_ward_tree(data)
    # Function-call form works on both Python 2 and Python 3.
    print(wardtree)
Пример #28
0
    def fit(self, X, y):
        """
        Fits Supervised Clustering.

        A Ward tree is built on the columns of ``X``. Starting from the tree
        roots, the current parcellation is split repeatedly; at each
        iteration the candidate with the best mean cross-validation score is
        kept. The final parcellation is the one, after iteration 20, with
        the largest score increase among those scoring at least 70% of the
        best score. ``self.estimator`` is then fitted on the averaged
        signals of that parcellation.

        Parameters
        ----------
        X : ndarray of shape = (n_samples, n_features)

        y : ndarray of shape = (n_samples)

        Returns
        -------
        self
        """
        # n_components computed here because the user can change connectivity
        if self.connectivity is not None:
            self.n_components = cs_graph_components(self.connectivity)[0]
        else:
            self.n_components = 1

        children, n_components, n_leaves = ward_tree(X.T,
                connectivity=self.connectivity, n_components=self.n_components)
        children = children.tolist()  # Faster with a list
        avg_signals = average_signals(X, children, n_leaves)

        # The first parcellations is the list of the tree roots
        parcellation = tree_roots(children, n_components, n_leaves)
        parcellations = []  # List of the best parcellations
        self.scores_ = []
        if self.verbose >= 2:
            # Print the root parcellation itself; the original printed the
            # still-empty ``parcellations`` list.
            print("\n# First parcellation (=tree roots) : %s" % parcellation)

        ## EXPLORATION LOOP
        for i in range(1, self.n_iterations+1):  # for verbose mode

            if self.verbose:
                print("# Iteration %d" % i)
            iteration_parcellations = split_parcellation(parcellation,
                    children, n_leaves)

            if (len(iteration_parcellations) == 0):
                # No parcellation can be splitted
                print(" UserWARNING : n_iterations is too big :")
                print(" Ending function at iteration %d." % i)
                break

            # Selecting the best parcellation for current iteration
            scores = Parallel(n_jobs=self.n_jobs)(delayed(cross_val_score)
                (estimator=self.estimator, X=avg_signals[:, j], y=y,
                cv=self.cv, n_jobs=1, verbose=0)
                for j in iteration_parcellations)
            scores = np.mean(scores, axis=1)
            indice = np.argmax(scores)
            parcellation = np.copy(iteration_parcellations[indice])
            parcellations.append(np.copy(parcellation))
            self.scores_.append(np.copy(scores[indice]))

        ## SELECTION LOOP
        # We select the parcellation for wich the variation of score is
        # the bigger, only if it score is > score_max / 2
        # Furthermore we select only parcellations obtained after 20 iterations
        # NOTE(review): the comment above says score_max / 2 but the
        # threshold computed below is 7/10 of the best score -- confirm.
        indice_min = 20
        self.score_min_ = 7 * (np.max(self.scores_) / 10)
        best_delta = 0  # renamed from ``max`` to stop shadowing the builtin
        indice = 0
        self.delta_scores_ = [0]
        for i in range(indice_min):
            # Was ``self.delta_scores`` (no trailing underscore), an
            # attribute that is never created, so this line raised
            # AttributeError.
            self.delta_scores_.append(0)

        for i in range(indice_min, len(self.scores_)-1):
            if self.scores_[i+1] >= self.score_min_:
                current_delta = self.scores_[i+1] - self.scores_[i]
                if current_delta > best_delta:
                    best_delta = current_delta
                    indice = i
                self.delta_scores_.append(current_delta)
            else:
                self.delta_scores_.append(0)

        parcellation = parcellations[indice]

        # Computing the corresponding labels array
        self.labels_ = parcellation_to_label(parcellation, children, n_leaves)
        self.estimator.fit(avg_signals[:, parcellation], y)

        # Expose the estimator's coefficients; for a multi-row coef_ only
        # the last row is kept.
        if hasattr(self.estimator, 'coef_'):
            if len(self.estimator.coef_.shape) == 1:
                self.coef_ = self.estimator.coef_
            else:
                self.coef_ = self.estimator.coef_[-1]

        return self