def test_linkage_misc():
    # Misc tests on linkage
    X = np.ones((5, 5))
    assert_raises(ValueError,
                  AgglomerativeClustering(linkage='foobar').fit,
                  X)
    assert_raises(ValueError, linkage_tree, X, linkage='foobar')
    assert_raises(ValueError, linkage_tree, X,
                  connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", UserWarning)
        # Use the copy argument, to raise a warning
        Ward(copy=True).fit(X)
    # We should be getting 2 warnings: one for using Ward that is
    # deprecated, one for using the copy argument
    assert_equal(len(warning_list), 2)

    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", UserWarning)
        # Use the copy argument, to raise a warning
        ward_tree(X, copy=True)
    # We should be getting 1 warning: for using the copy argument
    assert_equal(len(warning_list), 1)

    # Let's test a hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)
    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])
def test_ward_tree_distance():
    """
    Check that children are ordered in the same way for both structured and
    unstructured versions of ward_tree.
    """
    # test on five random datasets
    n, p = 10, 5
    rng = np.random.RandomState(0)

    connectivity = np.ones((n, n))
    for i in range(5):
        X = .1 * rng.normal(size=(n, p))
        X -= 4. * np.arange(n)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]

        out_unstructured = ward_tree(X, return_distance=True)
        out_structured = ward_tree(X, connectivity=connectivity,
                                   return_distance=True)

        # get children
        children_unstructured = out_unstructured[0]
        children_structured = out_structured[0]

        # check if we got the same clusters
        assert_array_equal(children_unstructured, children_structured)

        # check if the distances are the same
        dist_unstructured = out_unstructured[-1]
        dist_structured = out_structured[-1]
        assert_array_almost_equal(dist_unstructured, dist_structured)

    # test on the following dataset where we know the truth
    # taken from scipy/cluster/tests/hierarchy_test_data.py
    X = np.array([[1.43054825, -7.5693489],
                  [6.95887839, 6.82293382],
                  [2.87137846, -9.68248579],
                  [7.87974764, -6.05485803],
                  [8.24018364, -6.09495602],
                  [7.39020262, 8.54004355]])

    # truth
    linkage_X_ward = np.array([[3., 4., 0.36265956, 2.],
                               [1., 5., 1.77045373, 2.],
                               [0., 2., 2.55760419, 2.],
                               [6., 8., 9.10208346, 4.],
                               [7., 9., 24.7784379, 6.]])

    n_samples, n_features = np.shape(X)
    connectivity_X = np.ones((n_samples, n_samples))

    out_X_unstructured = ward_tree(X, return_distance=True)
    out_X_structured = ward_tree(X, connectivity=connectivity_X,
                                 return_distance=True)

    # check that the labels are the same
    assert_array_equal(linkage_X_ward[:, :2], out_X_unstructured[0])
    assert_array_equal(linkage_X_ward[:, :2], out_X_structured[0])

    # check that the distances are correct
    assert_array_almost_equal(linkage_X_ward[:, 2], out_X_unstructured[4])
    assert_array_almost_equal(linkage_X_ward[:, 2], out_X_structured[4])
def _seg_by_hc_single_frame(obs_len, connectivity, data, width=9,
                            hier=False, **kwargs):
    _children, _n_c, _n_leaves, parents, distances = \
        sklhc.ward_tree(data, connectivity=connectivity, return_distance=True)
    reconstructed_z = np.zeros((obs_len - 1, 4))
    reconstructed_z[:, :2] = _children
    reconstructed_z[:, 2] = distances

    criterion = kwargs.get('criterion', 'distance')

    if hier:
        t_list = range(2, 11)
        label_dict = OrderedDict()
        boundary_dict = OrderedDict()
        criterion = 'maxclust'
        for t in t_list:
            boundaries, labels = _agg_segment(reconstructed_z, t, criterion,
                                              width, data)
            label_dict[np.max(labels) + 1] = labels
            boundary_dict[np.max(labels) + 1] = boundaries
        return boundary_dict, label_dict
    else:
        t = 0.7 * np.max(reconstructed_z[:, 2])
        return _agg_segment(reconstructed_z, t, criterion, width, data)
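# A minimal sketch (not one of the collected snippets): the ward_tree output
# used above can be packed into a full scipy-style linkage matrix, including
# the cluster-size column that reconstructed_z leaves at zero. Assumes a
# scikit-learn version where ward_tree(..., return_distance=True) returns
# five values, as unpacked in the function above; the helper name
# ward_tree_to_linkage is hypothetical.
import numpy as np
from sklearn.cluster import ward_tree


def ward_tree_to_linkage(X, connectivity=None):
    children, _, n_leaves, _, distances = ward_tree(
        X, connectivity=connectivity, return_distance=True)
    Z = np.zeros((children.shape[0], 4))
    Z[:, :2] = children
    Z[:, 2] = distances
    # Fill in the number of original observations in each merged cluster.
    counts = np.zeros(children.shape[0])
    for i, (a, b) in enumerate(children):
        count_a = 1 if a < n_leaves else counts[a - n_leaves]
        count_b = 1 if b < n_leaves else counts[b - n_leaves]
        counts[i] = count_a + count_b
    Z[:, 3] = counts
    return Z


# Usage sketch: cut the reconstructed tree into 3 flat clusters with scipy.
#     from scipy.cluster.hierarchy import fcluster
#     X = np.random.RandomState(0).randn(20, 5)
#     labels = fcluster(ward_tree_to_linkage(X), t=3, criterion='maxclust')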
def __build_tree(self, weights):
    # get clusters with ward_tree function
    pairs = ward_tree(weights.T)[0]
    w = weights.T
    n_samples = weights.T.shape[0]
    tree_nodes = {}
    idx = 0
    for pair in pairs:
        w_list = []
        children = []
        for el in pair:
            if el < n_samples:
                norm_weight = w[el] / np.linalg.norm(w[el], ord=2)
                tree_nodes[el] = TreeNode(
                    weights=norm_weight,
                    right_child=None,
                    left_child=None,
                    class_idx=el,
                )
                w_list.append(norm_weight)
            else:
                w_list.append(tree_nodes[el].weight)
            children.append(el)
        tree_nodes[idx + n_samples] = TreeNode(
            weights=(w_list[0] + w_list[1]) / 2.0,
            right_child=tree_nodes[children[1]],
            left_child=tree_nodes[children[0]],
            class_idx=None,
        )
        idx += 1
    return tree_nodes[idx + n_samples - 1]
def test_scikit_vs_scipy():
    """Test scikit ward with full connectivity (i.e. unstructured) vs scipy
    """
    from scipy.sparse import lil_matrix
    n, p, k = 10, 5, 3
    rnd = np.random.RandomState(0)

    connectivity = lil_matrix(np.ones((n, n)))
    for i in range(5):
        X = 0.1 * rnd.normal(size=(n, p))
        X -= 4 * np.arange(n)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]

        out = hierarchy.ward(X)

        children_ = out[:, :2].astype(np.int)
        children, _, n_leaves, _ = ward_tree(X, connectivity)

        cut = _hc_cut(k, children, n_leaves)
        cut_ = _hc_cut(k, children_, n_leaves)
        assess_same_labelling(cut, cut_)

    # Test error management in _hc_cut
    assert_raises(ValueError, _hc_cut, n_leaves + 1, children, n_leaves)
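# The assess_same_labelling helper used in test_scikit_vs_scipy is defined
# elsewhere in the test module. A plausible minimal version (an assumption,
# shown here only so the test reads on its own) checks that the two label
# vectors induce the same partition via their co-clustering matrices:
import numpy as np


def assess_same_labelling(cut1, cut2):
    """Check that two integer label vectors define the same partition."""
    co_clust = []
    for cut in [cut1, cut2]:
        # assumes labels are 0..k-1, as produced by _hc_cut
        n = len(cut)
        k = len(np.unique(cut))
        ecut = np.zeros((n, k))
        ecut[np.arange(n), cut] = 1
        co_clust.append(np.dot(ecut, ecut.T))
    assert (co_clust[0] == co_clust[1]).all()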
def test_unstructured_ward_tree():
    """
    Check that we obtain the correct solution for unstructured ward tree.
    """
    rnd = np.random.RandomState(0)
    X = rnd.randn(50, 100)
    children, n_nodes, n_leaves = ward_tree(X.T)
    n_nodes = 2 * X.shape[1] - 1
    assert_true(len(children) + n_leaves == n_nodes)
def test_unstructured_ward_tree():
    """
    Check that we obtain the correct solution for unstructured ward tree.
    """
    np.random.seed(0)
    X = np.random.randn(50, 100)
    children, n_nodes, n_leaves = ward_tree(X.T)
    n_nodes = 2 * X.shape[1] - 1
    assert_true(len(children) + n_leaves == n_nodes)
def test_ward_tree_children_order():
    # Check that children are ordered in the same way for both structured and
    # unstructured versions of ward_tree.

    # test on five random datasets
    n, p = 10, 5
    rng = np.random.RandomState(0)

    connectivity = np.ones((n, n))
    for i in range(5):
        X = .1 * rng.normal(size=(n, p))
        X -= 4. * np.arange(n)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]

        out_unstructured = ward_tree(X)
        out_structured = ward_tree(X, connectivity=connectivity)

        assert_array_equal(out_unstructured[0], out_structured[0])
def test_structured_ward_tree():
    """
    Check that we obtain the correct solution for structured ward tree.
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    children, n_components, n_leaves = ward_tree(X.T, connectivity)
    n_nodes = 2 * X.shape[1] - 1
    assert_true(len(children) + n_leaves == n_nodes)
def test_height_ward_tree():
    """
    Check that the height of ward tree is sorted.
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    children, n_nodes, n_leaves, parent = ward_tree(X.T, connectivity)
    n_nodes = 2 * X.shape[1] - 1
    assert_true(len(children) + n_leaves == n_nodes)
def test_structured_ward_tree():
    """
    Check that we obtain the correct solution for structured ward tree.
    """
    np.random.seed(0)
    mask = np.ones([10, 10], dtype=np.bool)
    X = np.random.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    children, n_components, n_leaves = ward_tree(X.T, connectivity)
    n_nodes = 2 * X.shape[1] - 1
    assert_true(len(children) + n_leaves == n_nodes)
def test_ward_tree(self):
    iris = datasets.load_iris()
    df = pdml.ModelFrame(iris)

    result = df.cluster.ward_tree()
    expected = cluster.ward_tree(iris.data)

    self.assertEqual(len(result), 4)
    self.assert_numpy_array_almost_equal(result[0], expected[0])
    self.assertEqual(result[1], expected[1])
    self.assertEqual(result[2], expected[2])
    self.assertEqual(result[3], expected[3])

    connectivity = np.ones((len(df), len(df)))
    result = df.cluster.ward_tree(connectivity)
    expected = cluster.ward_tree(iris.data, connectivity)

    self.assert_numpy_array_almost_equal(result[0], expected[0])
    self.assertEqual(result[1], expected[1])
    self.assertEqual(result[2], expected[2])
    self.assert_numpy_array_almost_equal(result[3], expected[3])
def test_structured_linkage_tree():
    """
    Check that we obtain the correct solution for structured linkage trees.
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    # Avoiding a mask with only 'True' entries
    mask[4:7, 4:7] = 0
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    children, n_components, n_leaves, parent = ward_tree(X.T, connectivity)
    n_nodes = 2 * X.shape[1] - 1
    assert_true(len(children) + n_leaves == n_nodes)
def test_unstructured_ward_tree():
    """
    Check that we obtain the correct solution for unstructured ward tree.
    """
    rnd = np.random.RandomState(0)
    X = rnd.randn(50, 100)
    for this_X in (X, X[0]):
        with warnings.catch_warnings(record=True) as warning_list:
            warnings.simplefilter("always", UserWarning)
            # Specify a number of clusters just for the sake of raising a
            # warning and testing the warning code
            children, n_nodes, n_leaves, parent = ward_tree(this_X.T,
                                                            n_clusters=10)
        assert_equal(len(warning_list), 1)
        n_nodes = 2 * X.shape[1] - 1
        assert_equal(len(children) + n_leaves, n_nodes)
def test_structured_ward_tree():
    """
    Check that we obtain the correct solution for structured ward tree.
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    # Avoiding a mask with only 'True' entries
    mask[4:7, 4:7] = 0
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    children, n_components, n_leaves, parent = ward_tree(X.T, connectivity)
    n_nodes = 2 * X.shape[1] - 1
    assert_true(len(children) + n_leaves == n_nodes)
    # Check that ward_tree raises a ValueError with a connectivity matrix
    # of the wrong shape
    assert_raises(ValueError, ward_tree, X.T, np.ones((4, 4)))
def test_scikit_vs_scipy():
    """Test scikit ward with full connectivity (i.e. unstructured) vs scipy
    """
    from scipy.sparse import lil_matrix
    n, p, k = 10, 5, 3

    connectivity = lil_matrix(np.ones((n, n)))
    for i in range(5):
        X = .1 * np.random.normal(size=(n, p))
        X -= 4 * np.arange(n)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]

        out = hierarchy.ward(X)

        children_ = out[:, :2].astype(np.int)
        children, _, n_leaves = ward_tree(X, connectivity)

        cut = _hc_cut(k, children, n_leaves)
        cut_ = _hc_cut(k, children_, n_leaves)
        assess_same_labelling(cut, cut_)
def test_unstructured_linkage_tree():
    # Check that we obtain the correct solution for unstructured linkage trees.
    rng = np.random.RandomState(0)
    X = rng.randn(50, 100)
    for this_X in (X, X[0]):
        # Specify a number of clusters just for the sake of raising a warning
        # and testing the warning code
        with ignore_warnings():
            with pytest.warns(UserWarning):
                children, n_nodes, n_leaves, parent = ward_tree(
                    this_X.T, n_clusters=10)
        n_nodes = 2 * X.shape[1] - 1
        assert len(children) + n_leaves == n_nodes

    for tree_builder in _TREE_BUILDERS.values():
        for this_X in (X, X[0]):
            with ignore_warnings():
                with pytest.warns(UserWarning):
                    children, n_nodes, n_leaves, parent = tree_builder(
                        this_X.T, n_clusters=10)
            n_nodes = 2 * X.shape[1] - 1
            assert len(children) + n_leaves == n_nodes
def test_ward_linkage_tree_return_distance():
    # Test return_distance option on linkage and ward trees

    # test that return_distance, when set to True, gives the same output
    # for both structured and unstructured clustering.
    n, p = 10, 5
    rng = np.random.RandomState(0)

    connectivity = np.ones((n, n))
    for i in range(5):
        X = .1 * rng.normal(size=(n, p))
        X -= 4. * np.arange(n)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]

        out_unstructured = ward_tree(X, return_distance=True)
        out_structured = ward_tree(X, connectivity=connectivity,
                                   return_distance=True)

        # get children
        children_unstructured = out_unstructured[0]
        children_structured = out_structured[0]

        # check if we got the same clusters
        assert_array_equal(children_unstructured, children_structured)

        # check if the distances are the same
        dist_unstructured = out_unstructured[-1]
        dist_structured = out_structured[-1]
        assert_array_almost_equal(dist_unstructured, dist_structured)

        for linkage in ['average', 'complete', 'single']:
            structured_items = linkage_tree(
                X, connectivity=connectivity, linkage=linkage,
                return_distance=True)[-1]
            unstructured_items = linkage_tree(
                X, linkage=linkage, return_distance=True)[-1]
            structured_dist = structured_items[-1]
            unstructured_dist = unstructured_items[-1]
            structured_children = structured_items[0]
            unstructured_children = unstructured_items[0]
            assert_array_almost_equal(structured_dist, unstructured_dist)
            assert_array_almost_equal(structured_children,
                                      unstructured_children)

    # test on the following dataset where we know the truth
    # taken from scipy/cluster/tests/hierarchy_test_data.py
    X = np.array([[1.43054825, -7.5693489],
                  [6.95887839, 6.82293382],
                  [2.87137846, -9.68248579],
                  [7.87974764, -6.05485803],
                  [8.24018364, -6.09495602],
                  [7.39020262, 8.54004355]])

    # truth
    linkage_X_ward = np.array([[3., 4., 0.36265956, 2.],
                               [1., 5., 1.77045373, 2.],
                               [0., 2., 2.55760419, 2.],
                               [6., 8., 9.10208346, 4.],
                               [7., 9., 24.7784379, 6.]])

    linkage_X_complete = np.array(
        [[3., 4., 0.36265956, 2.],
         [1., 5., 1.77045373, 2.],
         [0., 2., 2.55760419, 2.],
         [6., 8., 6.96742194, 4.],
         [7., 9., 18.77445997, 6.]])

    linkage_X_average = np.array(
        [[3., 4., 0.36265956, 2.],
         [1., 5., 1.77045373, 2.],
         [0., 2., 2.55760419, 2.],
         [6., 8., 6.55832839, 4.],
         [7., 9., 15.44089605, 6.]])

    n_samples, n_features = np.shape(X)
    connectivity_X = np.ones((n_samples, n_samples))

    out_X_unstructured = ward_tree(X, return_distance=True)
    out_X_structured = ward_tree(X, connectivity=connectivity_X,
                                 return_distance=True)

    # check that the labels are the same
    assert_array_equal(linkage_X_ward[:, :2], out_X_unstructured[0])
    assert_array_equal(linkage_X_ward[:, :2], out_X_structured[0])

    # check that the distances are correct
    assert_array_almost_equal(linkage_X_ward[:, 2], out_X_unstructured[4])
    assert_array_almost_equal(linkage_X_ward[:, 2], out_X_structured[4])

    linkage_options = ['complete', 'average', 'single']
    X_linkage_truth = [linkage_X_complete, linkage_X_average]
    for (linkage, X_truth) in zip(linkage_options, X_linkage_truth):
        out_X_unstructured = linkage_tree(
            X, return_distance=True, linkage=linkage)
        out_X_structured = linkage_tree(
            X, connectivity=connectivity_X, linkage=linkage,
            return_distance=True)

        # check that the labels are the same
        assert_array_equal(X_truth[:, :2], out_X_unstructured[0])
        assert_array_equal(X_truth[:, :2], out_X_structured[0])

        # check that the distances are correct
        assert_array_almost_equal(X_truth[:, 2], out_X_unstructured[4])
        assert_array_almost_equal(X_truth[:, 2], out_X_structured[4])
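# Cross-check sketch (not part of the test): the hard-coded truth arrays
# above come from scipy's test data, so they should be reproducible, up to
# floating-point rounding, directly from scipy.cluster.hierarchy.linkage on
# the same X. Handy if the constants ever need to be regenerated.
import numpy as np
from scipy.cluster import hierarchy

X = np.array([[1.43054825, -7.5693489],
              [6.95887839, 6.82293382],
              [2.87137846, -9.68248579],
              [7.87974764, -6.05485803],
              [8.24018364, -6.09495602],
              [7.39020262, 8.54004355]])

print(hierarchy.linkage(X, method='ward'))      # should match linkage_X_ward
print(hierarchy.linkage(X, method='complete'))  # should match linkage_X_complete
print(hierarchy.linkage(X, method='average'))   # should match linkage_X_average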
import numpy as np
import pandas
import matplotlib.pyplot as plt
from sklearn import cluster
from scipy.cluster.hierarchy import ward, dendrogram

dataframe_train = pandas.read_csv(r'Data\EASY_TRAIN.csv', header=None)
dataset_train = dataframe_train.values
data_train = dataset_train[:, 0:26].astype(float)
labels_train = dataset_train[:, 26]

arr = list(range(1, 4121))

titles = np.array(labels_train[1:1000])
print(titles[0])

w = cluster.ward_tree(data_train[1:1000, :], return_distance=True)
linkage_matrix = ward(w[0])  # define the linkage_matrix using ward clustering pre-computed distances

fig, ax = plt.subplots(figsize=(64, 100))  # set size
ax = dendrogram(linkage_matrix, orientation="left", labels=titles)

plt.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=True)

plt.tight_layout()  # show plot with tight layout
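# A hedged alternative to the script above: rather than feeding the children
# array returned by cluster.ward_tree into scipy's ward(), the linkage matrix
# for the dendrogram can be computed from the observations directly, so the
# merge heights reflect distances between the samples themselves. The CSV
# path, column indices, and figure size are taken from the script above and
# may need adjusting for your data.
import numpy as np
import pandas
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import ward, dendrogram

frame = pandas.read_csv(r'Data\EASY_TRAIN.csv', header=None)
values = frame.values
data = values[1:1000, 0:26].astype(float)
titles = values[1:1000, 26]

linkage_matrix = ward(data)  # scipy computes the Ward linkage from the data
fig, ax = plt.subplots(figsize=(16, 25))
dendrogram(linkage_matrix, orientation="left", labels=list(titles), ax=ax)
plt.tight_layout()
plt.show()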
def test_ward_linkage_tree_return_distance():
    """Test return_distance option on linkage and ward trees"""
    # test that return_distance, when set to True, gives the same output
    # for both structured and unstructured clustering.
    n, p = 10, 5
    rng = np.random.RandomState(0)

    connectivity = np.ones((n, n))
    for i in range(5):
        X = .1 * rng.normal(size=(n, p))
        X -= 4. * np.arange(n)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]

        out_unstructured = ward_tree(X, return_distance=True)
        out_structured = ward_tree(X, connectivity=connectivity,
                                   return_distance=True)

        # get children
        children_unstructured = out_unstructured[0]
        children_structured = out_structured[0]

        # check if we got the same clusters
        assert_array_equal(children_unstructured, children_structured)

        # check if the distances are the same
        dist_unstructured = out_unstructured[-1]
        dist_structured = out_structured[-1]
        assert_array_almost_equal(dist_unstructured, dist_structured)

        for linkage in ['average', 'complete']:
            structured_items = linkage_tree(
                X, connectivity=connectivity, linkage=linkage,
                return_distance=True)[-1]
            unstructured_items = linkage_tree(
                X, linkage=linkage, return_distance=True)[-1]
            structured_dist = structured_items[-1]
            unstructured_dist = unstructured_items[-1]
            structured_children = structured_items[0]
            unstructured_children = unstructured_items[0]
            assert_array_almost_equal(structured_dist, unstructured_dist)
            assert_array_almost_equal(
                structured_children, unstructured_children)

    # test on the following dataset where we know the truth
    # taken from scipy/cluster/tests/hierarchy_test_data.py
    X = np.array([[1.43054825, -7.5693489],
                  [6.95887839, 6.82293382],
                  [2.87137846, -9.68248579],
                  [7.87974764, -6.05485803],
                  [8.24018364, -6.09495602],
                  [7.39020262, 8.54004355]])

    # truth
    linkage_X_ward = np.array([[3., 4., 0.36265956, 2.],
                               [1., 5., 1.77045373, 2.],
                               [0., 2., 2.55760419, 2.],
                               [6., 8., 9.10208346, 4.],
                               [7., 9., 24.7784379, 6.]])

    linkage_X_complete = np.array(
        [[3., 4., 0.36265956, 2.],
         [1., 5., 1.77045373, 2.],
         [0., 2., 2.55760419, 2.],
         [6., 8., 6.96742194, 4.],
         [7., 9., 18.77445997, 6.]])

    linkage_X_average = np.array(
        [[3., 4., 0.36265956, 2.],
         [1., 5., 1.77045373, 2.],
         [0., 2., 2.55760419, 2.],
         [6., 8., 6.55832839, 4.],
         [7., 9., 15.44089605, 6.]])

    n_samples, n_features = np.shape(X)
    connectivity_X = np.ones((n_samples, n_samples))

    out_X_unstructured = ward_tree(X, return_distance=True)
    out_X_structured = ward_tree(X, connectivity=connectivity_X,
                                 return_distance=True)

    # check that the labels are the same
    assert_array_equal(linkage_X_ward[:, :2], out_X_unstructured[0])
    assert_array_equal(linkage_X_ward[:, :2], out_X_structured[0])

    # check that the distances are correct
    assert_array_almost_equal(linkage_X_ward[:, 2], out_X_unstructured[4])
    assert_array_almost_equal(linkage_X_ward[:, 2], out_X_structured[4])

    linkage_options = ['complete', 'average']
    X_linkage_truth = [linkage_X_complete, linkage_X_average]
    for (linkage, X_truth) in zip(linkage_options, X_linkage_truth):
        out_X_unstructured = linkage_tree(
            X, return_distance=True, linkage=linkage)
        out_X_structured = linkage_tree(
            X, connectivity=connectivity_X, linkage=linkage,
            return_distance=True)

        # check that the labels are the same
        assert_array_equal(X_truth[:, :2], out_X_unstructured[0])
        assert_array_equal(X_truth[:, :2], out_X_structured[0])

        # check that the distances are correct
        assert_array_almost_equal(X_truth[:, 2], out_X_unstructured[4])
        assert_array_almost_equal(X_truth[:, 2], out_X_structured[4])
from sklearn.cluster import ward_tree as sklearn_ward_tree


def ward_tree(data):
    # Thin wrapper: build the Ward tree with scikit-learn and print the
    # result. The import alias avoids the wrapper recursively calling itself.
    wardtree = sklearn_ward_tree(data)
    print(wardtree)
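# Minimal sketch of what the wrapper above prints, assuming a scikit-learn
# version in which ward_tree returns a 4-tuple by default, i.e.
# (children, n_connected_components, n_leaves, parents), as several of the
# tests above unpack.
import numpy as np
from sklearn.cluster import ward_tree

X = np.random.RandomState(0).randn(20, 5)
children, n_components, n_leaves, parents = ward_tree(X)
print(children.shape)  # (n_samples - 1, 2): the pair of nodes merged at each step
print(n_leaves)        # 20: one leaf per sample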
def fit(self, X, y):
    """
    Fits Supervised Clustering.

    Parameters
    ----------
    X : ndarray of shape = (n_samples, n_features)

    Y : ndarray of shape = (n_samples)

    Returns
    -------
    self
    """
    # n_components computed here because the user can change connectivity
    if self.connectivity is not None:
        self.n_components = cs_graph_components(self.connectivity)[0]
    else:
        self.n_components = 1

    children, n_components, n_leaves = ward_tree(
        X.T, connectivity=self.connectivity, n_components=self.n_components)
    children = children.tolist()  # Faster with a list
    avg_signals = average_signals(X, children, n_leaves)

    # The first parcellation is the list of the tree roots
    parcellation = tree_roots(children, n_components, n_leaves)
    parcellations = []  # List of the best parcellations
    self.scores_ = []

    if self.verbose >= 2:
        print("\n# First parcellation (=tree roots) : %s" % parcellation)

    ## EXPLORATION LOOP
    for i in range(1, self.n_iterations + 1):

        # for verbose mode
        if self.verbose:
            print("# Iteration %d" % i)

        iteration_parcellations = split_parcellation(parcellation, children,
                                                     n_leaves)

        if len(iteration_parcellations) == 0:
            # No parcellation can be split further
            print(" UserWARNING : n_iterations is too big :")
            print(" Ending function at iteration %d." % i)
            break

        # Selecting the best parcellation for current iteration
        scores = Parallel(n_jobs=self.n_jobs)(
            delayed(cross_val_score)(estimator=self.estimator,
                                     X=avg_signals[:, j], y=y,
                                     cv=self.cv, n_jobs=1, verbose=0)
            for j in iteration_parcellations)
        scores = np.mean(scores, axis=1)

        indice = np.argmax(scores)
        parcellation = np.copy(iteration_parcellations[indice])
        parcellations.append(np.copy(parcellation))
        self.scores_.append(np.copy(scores[indice]))

    ## SELECTION LOOP
    # We select the parcellation for which the increase in score is the
    # largest, but only if its score is at least 7/10 of the best score.
    # Furthermore, we consider only parcellations obtained after 20 iterations.
    indice_min = 20
    self.score_min_ = 7 * (np.max(self.scores_) / 10)
    max_delta = 0
    indice = 0
    self.delta_scores_ = [0]

    for i in range(indice_min):
        self.delta_scores_.append(0)

    for i in range(indice_min, len(self.scores_) - 1):
        if self.scores_[i + 1] >= self.score_min_:
            current_delta = self.scores_[i + 1] - self.scores_[i]
            if current_delta > max_delta:
                max_delta = current_delta
                indice = i
            self.delta_scores_.append(current_delta)
        else:
            self.delta_scores_.append(0)

    parcellation = parcellations[indice]
    # Computing the corresponding labels array
    self.labels_ = parcellation_to_label(parcellation, children, n_leaves)
    self.estimator.fit(avg_signals[:, parcellation], y)
    if hasattr(self.estimator, 'coef_'):
        if len(self.estimator.coef_.shape) == 1:
            self.coef_ = self.estimator.coef_
        else:
            self.coef_ = self.estimator.coef_[-1]

    return self