def test_scikit_vs_scipy():
    # Test scikit linkage with full connectivity (i.e. unstructured) vs scipy
    n, p, k = 10, 5, 3
    rng = np.random.RandomState(0)

    # Not using a lil_matrix here, just to check that non sparse
    # matrices are well handled
    connectivity = np.ones((n, n))
    for linkage in _TREE_BUILDERS.keys():
        for i in range(5):
            X = .1 * rng.normal(size=(n, p))
            X -= 4. * np.arange(n)[:, np.newaxis]
            X -= X.mean(axis=1)[:, np.newaxis]

            out = hierarchy.linkage(X, method=linkage)

            # np.int was removed in NumPy 1.24; the builtin int is the
            # documented replacement and is what np.int aliased.
            children_ = out[:, :2].astype(int, copy=False)
            children, _, n_leaves, _ = _TREE_BUILDERS[linkage](X, connectivity)

            # Sort the order of child nodes per row for consistency
            children.sort(axis=1)
            assert_array_equal(children, children_, 'linkage tree differs'
                                                    ' from scipy impl for'
                                                    ' linkage: ' + linkage)

            cut = _hc_cut(k, children, n_leaves)
            cut_ = _hc_cut(k, children_, n_leaves)
            assess_same_labelling(cut, cut_)

    # Test error management in _hc_cut
    assert_raises(ValueError, _hc_cut, n_leaves + 1, children, n_leaves)
def test_scikit_vs_scipy():
    # Test scikit linkage with full connectivity (i.e. unstructured) vs scipy
    n, p, k = 10, 5, 3
    rng = np.random.RandomState(0)

    # Not using a lil_matrix here, just to check that non sparse
    # matrices are well handled
    connectivity = np.ones((n, n))
    for linkage in _TREE_BUILDERS.keys():
        for i in range(5):
            X = .1 * rng.normal(size=(n, p))
            X -= 4. * np.arange(n)[:, np.newaxis]
            X -= X.mean(axis=1)[:, np.newaxis]

            out = hierarchy.linkage(X, method=linkage)

            # np.int was removed in NumPy 1.24; builtin int is the
            # documented replacement.
            children_ = out[:, :2].astype(int, copy=False)
            children, _, n_leaves, _ = _TREE_BUILDERS[linkage](X, connectivity)

            # Sort the order of child nodes per row for consistency
            children.sort(axis=1)
            assert_array_equal(
                children, children_,
                'linkage tree differs'
                ' from scipy impl for'
                ' linkage: ' + linkage)

            cut = _hc_cut(k, children, n_leaves)
            cut_ = _hc_cut(k, children_, n_leaves)
            assess_same_labelling(cut, cut_)

    # Test error management in _hc_cut
    assert_raises(ValueError, _hc_cut, n_leaves + 1, children, n_leaves)
def test_scikit_vs_scipy():
    """Test scikit linkage with full connectivity (i.e. unstructured) vs scipy
    """
    n, p, k = 10, 5, 3
    rng = np.random.RandomState(0)

    # Not using a lil_matrix here, just to check that non sparse
    # matrices are well handled
    connectivity = np.ones((n, n))
    for linkage in _TREE_BUILDERS.keys():
        for i in range(5):
            X = .1 * rng.normal(size=(n, p))
            X -= 4. * np.arange(n)[:, np.newaxis]
            X -= X.mean(axis=1)[:, np.newaxis]

            out = hierarchy.linkage(X, method=linkage)

            # np.int was removed in NumPy 1.24; builtin int is the
            # documented replacement.
            children_ = out[:, :2].astype(int)
            children, _, n_leaves, _ = _TREE_BUILDERS[linkage](X, connectivity)

            cut = _hc_cut(k, children, n_leaves)
            cut_ = _hc_cut(k, children_, n_leaves)
            assess_same_labelling(cut, cut_)

    # Test error management in _hc_cut
    assert_raises(ValueError, _hc_cut, n_leaves + 1, children, n_leaves)
def test_scikit_vs_scipy():
    """Test scikit ward with full connectivity (i.e. unstructured) vs scipy
    """
    from scipy.sparse import lil_matrix
    n, p, k = 10, 5, 3
    rnd = np.random.RandomState(0)

    connectivity = lil_matrix(np.ones((n, n)))
    for i in range(5):
        X = 0.1 * rnd.normal(size=(n, p))
        X -= 4 * np.arange(n)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]

        out = hierarchy.ward(X)

        # np.int was removed in NumPy 1.24; builtin int is the
        # documented replacement.
        children_ = out[:, :2].astype(int)
        children, _, n_leaves, _ = ward_tree(X, connectivity)

        cut = _hc_cut(k, children, n_leaves)
        cut_ = _hc_cut(k, children_, n_leaves)
        assess_same_labelling(cut, cut_)

    # Test error management in _hc_cut
    assert_raises(ValueError, _hc_cut, n_leaves + 1, children, n_leaves)
def test_agglomerative_clustering_with_distance_threshold(linkage):
    # Check that we obtain the correct number of clusters with
    # agglomerative clustering with distance_threshold.
    rng = np.random.RandomState(0)
    # np.bool was removed in NumPy 1.24; builtin bool is the documented
    # replacement.
    mask = np.ones([10, 10], dtype=bool)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    connectivity = grid_to_graph(*mask.shape)
    # test when distance threshold is set to 10
    distance_threshold = 10
    for conn in [None, connectivity]:
        clustering = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=distance_threshold,
            connectivity=conn, linkage=linkage)
        clustering.fit(X)
        clusters_produced = clustering.labels_
        num_clusters_produced = len(np.unique(clustering.labels_))

        # test if the clusters produced match the point in the linkage tree
        # where the distance exceeds the threshold
        tree_builder = _TREE_BUILDERS[linkage]
        children, n_components, n_leaves, parent, distances = \
            tree_builder(X, connectivity=conn, n_clusters=None,
                         return_distance=True)
        num_clusters_at_threshold = np.count_nonzero(
            distances >= distance_threshold) + 1
        # test number of clusters produced
        assert num_clusters_at_threshold == num_clusters_produced
        # test clusters produced
        clusters_at_threshold = _hc_cut(n_clusters=num_clusters_produced,
                                        children=children,
                                        n_leaves=n_leaves)
        assert np.array_equiv(clusters_produced, clusters_at_threshold)
def _cut_tree_scipy(Y, k):
    """Cut a scipy linkage matrix ``Y`` into ``k`` clusters.

    The first two columns of a scipy linkage matrix are the merged node
    indices; ``_hc_cut`` expects them as an integer ``children`` array
    together with the number of leaves (one more than the number of merges).
    """
    merges = Y[:, :2].astype(int)
    n_leaves = merges.shape[0] + 1
    return _hc_cut(k, merges, n_leaves)
def predict(self, data, k):
    """Assign ``k`` cluster labels to ``data`` from the stored tree.

    The fitted tree is cut at ``k`` clusters; if ``data`` is the training
    set itself those labels are returned directly, otherwise they are
    propagated to ``data`` by nearest-neighbour prediction.
    """
    labels = _hc_cut(k, self._children, self._n_leaves)
    # XXX: is there a way to avoid this?
    if not np.array_equal(data, self._train_data):
        labels = _predict_knn(self._train_data, data, labels)
    return labels
def complete_linkage(X, connectivity=None, n_clusters=4):
    """Complete-linkage agglomerative clustering of the rows of X.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        Data to cluster.
    connectivity : COO sparse matrix or None
        Optional connectivity graph restricting which pairs may merge.
        When None, the full Euclidean distance matrix is used.
    n_clusters : int
        Number of clusters to cut the tree into.

    Returns
    -------
    labels : array, shape (n_samples,)
        Cluster label of each sample.
    """
    from sklearn.cluster.hierarchical import _hc_cut
    if connectivity is None:
        d = euclidean_distances(X)
    else:
        connectivity = connectivity.copy()
        # Remove the diagonal
        mask = connectivity.row != connectivity.col
        connectivity.row = connectivity.row[mask]
        connectivity.col = connectivity.col[mask]
        connectivity.data = connectivity.data[mask]
        # Squared Euclidean distance along the retained edges
        d_ = X[connectivity.row]
        d_ -= X[connectivity.col]
        d_ **= 2
        d_ = d_.sum(axis=-1)
        # XXX: not necessary: complete_linkage is invariant by increasing
        # function
        d_ = np.sqrt(d_)
        d = connectivity
        d.data = d_
    L = nn_chain_core(d)
    a, b, height = np.array(L).T
    # np.int was removed in NumPy 1.24; builtin int is the documented
    # replacement.
    children = np.c_[a, b].astype(int)
    labels = _hc_cut(n_clusters=n_clusters, children=children,
                     n_leaves=len(X))
    return labels
def cut_tree_scipy(Y, k):
    """
    Given the output Y of a hierarchical clustering solution from scipy
    and a number k, cuts the tree and returns the labels.
    """
    # convert children to correct values for _hc_cut: the first two
    # columns of the linkage matrix are the merged node indices, and the
    # number of leaves is one more than the number of merges.
    children = Y[:, 0:2].astype(int)
    n_leaves = len(children) + 1
    return _hc_cut(k, children, n_leaves)
def test_scikit_vs_scipy():
    """Test scikit ward with full connectivity (i.e. unstructured) vs scipy
    """
    from scipy.sparse import lil_matrix
    n, p, k = 10, 5, 3
    # Seed the generator so the test is deterministic, matching the other
    # variants of this test which use RandomState(0).
    rng = np.random.RandomState(0)
    connectivity = lil_matrix(np.ones((n, n)))
    for i in range(5):
        X = .1 * rng.normal(size=(n, p))
        X -= 4 * np.arange(n)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]

        out = hierarchy.ward(X)

        # np.int was removed in NumPy 1.24; builtin int is the documented
        # replacement.
        children_ = out[:, :2].astype(int)
        children, _, n_leaves = ward_tree(X, connectivity)

        cut = _hc_cut(k, children, n_leaves)
        cut_ = _hc_cut(k, children_, n_leaves)
        assess_same_labelling(cut, cut_)
def cluster(clusterType, vectors, y):
    """Run the clustering/classification method named by ``clusterType``.

    Supervised methods (SVM, RandomForest, DecisionTree,
    LogisticRegression) are evaluated through ``cross_validation`` using
    the labels ``y``; unsupervised ones ignore ``y``. Returns the
    per-sample assignments, or None for an unknown ``clusterType``.
    """
    if clusterType == "KMeans":
        clusterer = KMeansClusterer(
            NUM_CLUSTERS,
            distance=nltk.cluster.util.cosine_distance,
            repeats=25)
        return clusterer.cluster(vectors, assign_clusters=True)

    if clusterType == "GMM":
        mixture = GaussianMixture(n_components=NUM_CLUSTERS)
        return mixture.fit_predict(vectors)

    if clusterType == "SVM":
        classifier = SVC(kernel='rbf', gamma='auto', random_state=0)
        # cross-validation
        return cross_validation(classifier, vectors, y)

    if clusterType == "T2VH":
        ret = hierarchical.ward_tree(vectors, n_clusters=NUM_CLUSTERS)
        children, n_leaves = ret[0], ret[2]
        return hierarchical._hc_cut(NUM_CLUSTERS, children, n_leaves)

    if clusterType == "RandomForest":
        # cross-validation
        return cross_validation(RandomForestClassifier(), vectors, y)

    if clusterType == "DecisionTree":
        # cross-validation
        return cross_validation(DecisionTreeClassifier(), vectors, y)

    if clusterType == "LogisticRegression":
        # cross-validation
        return cross_validation(
            sklearn.linear_model.LogisticRegression(), vectors, y)

    print(clusterType, " is not a predefined cluster type.")
    return
def fit(self, X, y=None):
    """Fit the hierarchical clustering on the data

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training data. Shape [n_samples, n_features], or [n_samples,
        n_samples] if affinity=='precomputed'.

    y : Ignored

    Returns
    -------
    self
    """
    # "pooling_func" only matters for the feature-agglomeration subclass;
    # warn when it is set on a plain clustering estimator.
    if (self.pooling_func != 'deprecated' and
            not isinstance(self, AgglomerationTransform)):
        warnings.warn('Agglomerative "pooling_func" parameter is not used.'
                      ' It has been deprecated in version 0.20 and will be'
                      'removed in 0.22', DeprecationWarning)
    X = check_array(X, ensure_min_samples=2, estimator=self)
    memory = check_memory(self.memory)

    # ---- Parameter validation -------------------------------------------
    if self.n_clusters is not None and self.n_clusters <= 0:
        raise ValueError("n_clusters should be an integer greater than 0."
                         " %s was provided." % str(self.n_clusters))

    # Exactly one of the two stopping criteria must be given (XOR).
    if not ((self.n_clusters is None) ^
            (self.distance_threshold is None)):
        raise ValueError("Exactly one of n_clusters and "
                         "distance_threshold has to be set, and the other "
                         "needs to be None.")

    if (self.distance_threshold is not None
            and not self.compute_full_tree):
        raise ValueError("compute_full_tree must be True if "
                         "distance_threshold is set.")

    if self.linkage == "ward" and self.affinity != "euclidean":
        raise ValueError("%s was provided as affinity. Ward can only "
                         "work with euclidean distances." %
                         (self.affinity, ))

    if self.linkage not in _TREE_BUILDERS:
        raise ValueError("Unknown linkage type %s. "
                         "Valid options are %s" % (self.linkage,
                                                   _TREE_BUILDERS.keys()))
    tree_builder = _TREE_BUILDERS[self.linkage]

    # ---- Connectivity ----------------------------------------------------
    # A callable connectivity is evaluated on X; the result (or the given
    # matrix) is validated as a sparse adjacency structure.
    connectivity = self.connectivity
    if self.connectivity is not None:
        if callable(self.connectivity):
            connectivity = self.connectivity(X)
        connectivity = check_array(
            connectivity, accept_sparse=['csr', 'coo', 'lil'])

    # ---- Decide whether to build the full merge tree ---------------------
    n_samples = len(X)
    compute_full_tree = self.compute_full_tree
    if self.connectivity is None:
        compute_full_tree = True
    if compute_full_tree == 'auto':
        if self.distance_threshold is not None:
            compute_full_tree = True
        else:
            # Early stopping is likely to give a speed up only for
            # a large number of clusters. The actual threshold
            # implemented here is heuristic
            compute_full_tree = self.n_clusters < max(100, .02 * n_samples)
    n_clusters = self.n_clusters
    if compute_full_tree:
        # Passing n_clusters=None tells the tree builder not to stop early.
        n_clusters = None

    # Construct the tree
    kwargs = {}
    if self.linkage != 'ward':
        kwargs['linkage'] = self.linkage
        kwargs['affinity'] = self.affinity

    distance_threshold = self.distance_threshold

    # Merge distances are only needed to apply a distance threshold.
    return_distance = distance_threshold is not None
    out = memory.cache(tree_builder)(X, connectivity,
                                     n_clusters=n_clusters,
                                     return_distance=return_distance,
                                     **kwargs)
    (self.children_,
     self.n_connected_components_,
     self.n_leaves_,
     parents) = out[:4]

    if distance_threshold is not None:
        # Number of clusters = merges at or above the threshold, plus one.
        distances = out[-1]
        self.distances_ = distances
        self.n_clusters_ = np.count_nonzero(
            distances >= distance_threshold) + 1
    else:
        self.n_clusters_ = self.n_clusters

    # Cut the tree
    if compute_full_tree:
        self.labels_ = _hc_cut(self.n_clusters_, self.children_,
                               self.n_leaves_)
    else:
        # The tree was stopped early: labels are the heads of the forest.
        labels = _hierarchical.hc_get_heads(parents, copy=False)
        # copy to avoid holding a reference on the original array
        labels = np.copy(labels[:n_samples])
        # Reassign cluster numbers
        self.labels_ = np.searchsorted(np.unique(labels), labels)
    return self
# Demo script: compare the local nn_chain_core complete-linkage
# implementation against scipy's, by clustering random 2-D points and
# plotting both labelings.
N = 1000
np.random.seed(0)
X = np.random.random((N, 2))
d = euclidean_distances(X)

L = nn_chain_core(X)
a, b, height = np.array(L).T
#order = np.argsort(height, kind='mergesort')
#a = a[order]
#b = b[order]
#height = height[order]

if 1:
    import pylab as pl
    # np.int was removed in NumPy 1.24; builtin int is the documented
    # replacement.
    children = np.c_[a, b].astype(int)
    from sklearn.cluster.hierarchical import _hc_cut, ward_tree
    labels = _hc_cut(n_clusters=4, children=children, n_leaves=N)
    pl.figure(1)
    pl.clf()
    # The 'spectral' colormap was removed in matplotlib 2.2; it is now
    # named 'nipy_spectral'.
    pl.scatter(X[:, 0], X[:, 1], c=labels, cmap=pl.cm.nipy_spectral)
    pl.title('Complete linkage')

if 1:
    from scipy.cluster import hierarchy
    children_s = hierarchy.complete(X)[:, :2].astype(int)
    labels_s = _hc_cut(n_clusters=4, children=children_s, n_leaves=N)
    import pylab as pl
    pl.figure(0)
    pl.clf()
    pl.scatter(X[:, 0], X[:, 1], c=labels_s, cmap=pl.cm.nipy_spectral)
    pl.title('Complete linkage (scipy)')

if 0:
    pl.figure(2)
def compute_stability_fold(samples, train, test, method='ward',
                           max_k=None, stack=False,
                           stability=True, cv_likelihood=False,
                           corr_score=None, ground_truth=None,
                           n_neighbors=1, **kwargs):
    """
    General function to compute the stability on a cross-validation fold.

    Parameters:
    -----------
        samples : list of arrays
            List of arrays containing the samples to cluster, each
            array has shape (n_samples, n_features) in PyMVPA
            terminology. We are clustering the features, i.e., the
            nodes.
        train : list or array
            Indices for the training set.
        test : list or array
            Indices for the test set.
        method : {'complete', 'gmm', 'kmeans', 'ward'}
            Clustering method to use. Default is 'ward'.
        max_k : int or None
            Maximum k to compute the stability testing, starting from 2.
            By default it will compute up to the maximum possible k,
            i.e., the number of points.
        stack : bool
            Whether to stack or average the datasets. Default is False,
            meaning that the datasets are averaged by default.
        stability : bool
            Whether to compute the stability measure described in Lange
            et al., 2004. Default is True.
        cv_likelihood : bool
            Whether to compute the cross-validated likelihood for
            mixture model; only valid if 'gmm' method is used. Default
            is False.
        corr_score : {'pearson','spearman'} or None
            Whether to compute the specified type of correlation score.
            Default is None.
        ground_truth : array or None
            Array containing the ground truth of the clustering of the
            data, useful to compare stability against ground truth for
            simulations.
        n_neighbors : int
            Number of neighbors to use to predict clustering solution
            on test set using K-nearest neighbors. Currently used only
            for methods `complete` and `ward`. Default is 1.
        kwargs : optional
            Keyword arguments being passed to the clustering method
            (only for 'ward' and 'gmm').

    Returns:
    --------
        ks : array
            A (max_k-1,) array, where ks[i] is the `k` of the
            clustering solution for iteration `i`.
        ari : array
            A (max_k-1,) array, where ari[i] is the Adjusted Rand Index
            of the predicted clustering solution on the test set and
            the actual clustering solution of the test set for `k` of
            ks[i].
        ami : array
            A (max_k-1,) array, where ari[i] is the Adjusted Mutual
            Information of the predicted clustering solution on the
            test set and the actual clustering solution of the test set
            for `k` of ks[i].
        stab : array or None
            A (max_k-1,) array, where stab[i] is the stability measure
            described in Lange et al., 2004 for `k` of ks[i]. Note that
            this measure is the un-normalized one. It will be
            normalized later in the process.
        likelihood : array or None
            If method is 'gmm' and cv_likelihood is True, a (max_k-1,)
            array, where likelihood[i] is the cross-validated
            likelihood of the GMM clustering solution for `k` of ks[i].
            Otherwise returns None.
        ari_gt : array or None
            If ground_truth is not None, a (max_k-1,) array, where
            ari_gt[i] is the Adjusted Rand Index of the predicted
            clustering solution on the test set for `k` of ks[i] and
            the ground truth clusters of the data. Otherwise returns
            None.
        ami_gt : array or None
            If ground_truth is not None, a (max_k-1,) array, where
            ami_gt[i] is the Adjusted Mutual Information of the
            predicted clustering solution on the test set for `k` of
            ks[i] and the ground truth clusters of the data. Otherwise
            returns None.
        stab_gt : array or None
            If ground_truth is not None, a (max_k-1,) array, where
            stab_gt[i] is the stability measure of the predicted
            clustering solution on the test set for `k` of ks[i] and
            the ground truth clusters of the data. Otherwise returns
            None.
        corr : array or None
            Average correlation for each fold. TODO
        corr_gt : array or None
            Avg correlation against GT. TODO
    """
    if method not in AVAILABLE_METHODS:
        raise ValueError('Method {0} not implemented'.format(method))

    if cv_likelihood and method != 'gmm':
        raise ValueError(
            "Cross-validated likelihood is only available for 'gmm' method")

    # if max_k is None, set max_k to maximum value
    if not max_k:
        max_k = samples[0].shape[1]

    # preallocate arrays for results; optional metrics are only allocated
    # when requested, and the ground-truth variants only when a ground
    # truth is supplied
    ks = np.zeros(max_k-1, dtype=int)
    ari = np.zeros(max_k-1)
    ami = np.zeros(max_k-1)
    if stability:
        stab = np.zeros(max_k-1)
    if cv_likelihood:
        likelihood = np.zeros(max_k-1)
    if corr_score is not None:
        corr = np.zeros(max_k-1)
    if ground_truth is not None:
        ari_gt = np.zeros(max_k-1)
        ami_gt = np.zeros(max_k-1)
        if stability:
            stab_gt = np.zeros(max_k-1)
        if corr_score is not None:
            corr_gt = np.zeros(max_k-1)

    # get training and test
    train_set = [samples[x] for x in train]
    test_set = [samples[x] for x in test]

    if stack:
        train_ds = np.vstack(train_set)
        test_ds = np.vstack(test_set)
    else:
        # element-wise average across the selected datasets
        train_ds = np.mean(np.dstack(train_set), axis=2)
        test_ds = np.mean(np.dstack(test_set), axis=2)

    # compute clustering on training set
    if method == 'complete':
        train_ds_dist = pdist(train_ds.T, metric='correlation')
        test_ds_dist = pdist(test_ds.T, metric='correlation')
        # I'm computing the full tree and then cutting
        # afterwards to speed computation
        Y_train = complete(train_ds_dist)
        # same on testing set
        Y_test = complete(test_ds_dist)
    elif method == 'ward':
        (children_train, n_comp_train,
         n_leaves_train, parents_train) = ward_tree(train_ds.T, **kwargs)
        # same on testing set
        (children_test, n_comp_test,
         n_leaves_test, parents_test) = ward_tree(test_ds.T, **kwargs)
    elif method == 'gmm' or method == 'kmeans':
        # we'll have to run it for each k
        pass
    else:
        raise ValueError("We shouldn't get here")

    for i_k, k in enumerate(range(2, max_k+1)):
        if method == 'complete':
            # cut the tree with right K for both train and test
            train_label = cut_tree_scipy(Y_train, k)
            test_label = cut_tree_scipy(Y_test, k)
            # train a classifier on this clustering
            knn = KNeighborsClassifier(#algorithm='brute',
                                       #metric='correlation',
                                       n_neighbors=n_neighbors)
            knn.fit(train_ds.T, train_label)
            # predict the clusters in the test set
            prediction_label = knn.predict(test_ds.T)
        elif method == 'ward':
            # cut the tree with right K for both train and test
            train_label = _hc_cut(k, children_train, n_leaves_train)
            test_label = _hc_cut(k, children_test, n_leaves_test)
            # train a classifier on this clustering
            knn = KNeighborsClassifier(n_neighbors=n_neighbors)
            knn.fit(train_ds.T, train_label)
            # predict the clusters in the test set
            prediction_label = knn.predict(test_ds.T)
        elif method == 'gmm':
            gmm = GMM(n_components=k, **kwargs)
            # fit on train and predict test
            gmm.fit(train_ds.T)
            prediction_label = gmm.predict(test_ds.T)
            if cv_likelihood:
                log_prob = np.sum(gmm.score(test_ds.T))
            # fit on test and get labels
            gmm.fit(test_ds.T)
            test_label = gmm.predict(test_ds.T)
        elif method == 'kmeans':
            kmeans = KMeans(n_clusters=k)
            # fit on train and predict test
            kmeans.fit(train_ds.T)
            prediction_label = kmeans.predict(test_ds.T)
            # fit on test and get labels
            kmeans.fit(test_ds.T)
            test_label = kmeans.predict(test_ds.T)
        else:
            raise ValueError("We shouldn't get here")

        # append results
        ks[i_k] = k
        ari[i_k] = adjusted_rand_score(prediction_label, test_label)
        ami[i_k] = adjusted_mutual_info_score(prediction_label, test_label)
        if stability:
            stab[i_k] = stability_score(prediction_label, test_label, k)
        if cv_likelihood:
            likelihood[i_k] = log_prob
        if corr_score is not None:
            corr[i_k] = correlation_score(prediction_label, test_label,
                                          test_ds, corr_score)
        if ground_truth is not None:
            ari_gt[i_k] = adjusted_rand_score(prediction_label, ground_truth)
            ami_gt[i_k] = adjusted_mutual_info_score(prediction_label,
                                                     ground_truth)
            if stability:
                stab_gt[i_k] = stability_score(prediction_label,
                                               ground_truth, k)
            if corr_score is not None:
                corr_gt[i_k] = correlation_score(prediction_label,
                                                 ground_truth,
                                                 test_ds, corr_score)

    # assemble results in a fixed order; slots for metrics that were not
    # computed are filled with None
    results = [ks, ari, ami]
    if stability:
        results.append(stab)
    else:
        results.append(None)
    if cv_likelihood:
        results.append(likelihood)
    else:
        results.append(None)

    if ground_truth is not None:
        results += [ari_gt, ami_gt]
    else:
        results += [None, None]

    if stability and ground_truth is not None:
        results.append(stab_gt)
    else:
        results.append(None)

    if corr_score is not None:
        results.append(corr)
    else:
        results.append(None)

    if corr_score is not None and ground_truth is not None:
        results.append(corr_gt)
    else:
        results.append(None)

    return results