def Judge_1(inputfile, n):
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import mglearn
    from sklearn.metrics.cluster import silhouette_score
    from sklearn.preprocessing import StandardScaler
    from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN

    X = pd.read_csv(inputfile)
    # rescale the data to zero mean and unit variance
    scaler = StandardScaler()
    scaler.fit(X)
    X_scaled = scaler.transform(X)

    fig, axes = plt.subplots(1, 4, figsize=(15, 3),
                             subplot_kw={'xticks': (), 'yticks': ()})
    # algorithms to compare
    algorithms = [KMeans(n_clusters=n), AgglomerativeClustering(n_clusters=n), DBSCAN()]

    # create a random cluster assignment as a baseline
    random_state = np.random.RandomState(seed=0)
    random_clusters = random_state.randint(low=0, high=2, size=len(X))
    axes[0].scatter(X_scaled[:, 0], X_scaled[:, 1], c=random_clusters,
                    cmap=mglearn.cm3, s=60)
    axes[0].set_title('Random assignment: {:.2f}'.format(
        silhouette_score(X_scaled, random_clusters)))

    for ax, algorithm in zip(axes[1:], algorithms):
        clusters = algorithm.fit_predict(X_scaled)
        ax.scatter(X_scaled[:, 0], X_scaled[:, 1], c=clusters, cmap=mglearn.cm3, s=60)
        ax.set_title('{}: {:.2f}'.format(algorithm.__class__.__name__,
                                         silhouette_score(X_scaled, clusters)))
    plt.show()
    return True
def test_silhouette():
    # Tests the Silhouette Coefficient.
    dataset = datasets.load_iris()
    X = dataset.data
    y = dataset.target
    D = pairwise_distances(X, metric='euclidean')
    # Given that the actual labels are used, we can assume that S would be
    # positive.
    silhouette = silhouette_score(D, y, metric='precomputed')
    assert silhouette > 0
    # Test without calculating D
    silhouette_metric = silhouette_score(X, y, metric='euclidean')
    assert_almost_equal(silhouette, silhouette_metric)
    # Test with sampling
    silhouette = silhouette_score(D, y, metric='precomputed',
                                  sample_size=int(X.shape[0] / 2),
                                  random_state=0)
    silhouette_metric = silhouette_score(X, y, metric='euclidean',
                                         sample_size=int(X.shape[0] / 2),
                                         random_state=0)
    assert silhouette > 0
    assert silhouette_metric > 0
    assert_almost_equal(silhouette_metric, silhouette)
    # Test with sparse X
    X_sparse = csr_matrix(X)
    D = pairwise_distances(X_sparse, metric='euclidean')
    silhouette = silhouette_score(D, y, metric='precomputed')
    assert silhouette > 0
def fg1():
    import numpy as np
    import matplotlib.pyplot as plt
    import mglearn
    from sklearn.metrics.cluster import silhouette_score
    from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
    from sklearn.datasets import make_moons
    from sklearn.preprocessing import StandardScaler

    X, y = make_moons(n_samples=500, noise=0.07, random_state=0)
    scaler = StandardScaler()
    scaler.fit(X)
    X_scaled = scaler.transform(X)

    fig, axes = plt.subplots(1, 4, figsize=(15, 3),
                             subplot_kw={'xticks': (), 'yticks': ()})
    algorithms = [KMeans(n_clusters=2), AgglomerativeClustering(n_clusters=2), DBSCAN()]

    # random assignment as a baseline; the titles report the silhouette score,
    # since that is the metric actually computed here
    random_state = np.random.RandomState(seed=0)
    random_clusters = random_state.randint(low=0, high=2, size=len(X))
    axes[0].scatter(X_scaled[:, 0], X_scaled[:, 1], c=random_clusters,
                    cmap=mglearn.cm3, s=60)
    axes[0].set_title("Random assignment - silhouette: {:.2f}".format(
        silhouette_score(X_scaled, random_clusters)))

    for ax, algorithm in zip(axes[1:], algorithms):
        clusters = algorithm.fit_predict(X_scaled)
        ax.scatter(X_scaled[:, 0], X_scaled[:, 1], c=clusters, cmap=mglearn.cm3, s=30)
        ax.set_title("{} - silhouette: {:.2f}".format(
            algorithm.__class__.__name__, silhouette_score(X_scaled, clusters)))
    plt.show()
def test_non_encoded_labels():
    dataset = datasets.load_iris()
    X = dataset.data
    labels = dataset.target
    assert silhouette_score(X, labels * 2 + 10) == silhouette_score(X, labels)
    assert_array_equal(silhouette_samples(X, labels * 2 + 10),
                       silhouette_samples(X, labels))
def apply(self):
    k_range = range(self.k_min, self.k_max)
    if isinstance(self.model, Simlr):
        # uses a generator and np.fromiter for memory efficiency
        go = (silhouette_score(X=self.matrix,
                               labels=KMeans(k).fit_predict(
                                   self.model.set_params(n_clusters=k).fit_predict(self.matrix)))
              for k in k_range)
        silhouette_scores = np.fromiter(go, dtype=float, count=len(k_range))
        max_index = np.argmax(silhouette_scores)
        # the generator does not keep the labels, so refit the best k to obtain them
        best_k = k_range[max_index]
        self.results = KMeans(best_k).fit_predict(
            self.model.set_params(n_clusters=best_k).fit_predict(self.matrix))
    else:
        predicted_labels = [self.model.set_params(n_clusters=k).fit_predict(self.matrix)
                            for k in k_range]
        silhouette_scores = [silhouette_score(X=self.matrix, labels=labels, metric=self.metric)
                             for labels in predicted_labels]
        max_index = np.argmax(silhouette_scores)
        self.results = predicted_labels[max_index]
def test_silhouette_score_integer_precomputed():
    """Check that silhouette_score works for precomputed metrics that are integers.

    Non-regression test for #22107.
    """
    result = silhouette_score([[0, 1, 2], [1, 0, 1], [2, 1, 0]], [0, 0, 1],
                              metric="precomputed")
    assert result == pytest.approx(1 / 6)

    # non-zero values on the diagonal raise an error for integer inputs
    with pytest.raises(ValueError, match="contains non-zero"):
        silhouette_score([[1, 1, 2], [1, 0, 1], [2, 1, 0]], [0, 0, 1],
                         metric="precomputed")
def evaluate_algorithms_with_silhouette_coefficient():
    # rescale the data to zero mean and unit variance
    from sklearn.datasets import make_moons
    X_moons, y_moons = make_moons(n_samples=200, noise=0.05, random_state=seed)
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    scaler.fit(X_moons)
    X_scaled = scaler.transform(X_moons)

    fig, axes = plt.subplots(1, 4, figsize=(10, 5),
                             subplot_kw={'xticks': (), 'yticks': ()})

    # create a random cluster assignment as a baseline
    random_state = np.random.RandomState(seed=0)
    random_clusters = random_state.randint(low=0, high=2, size=len(X_moons))
    axes[0].scatter(X_scaled[:, 0], X_scaled[:, 1], c=random_clusters,
                    cmap=mglearn.cm3, s=60)
    from sklearn.metrics.cluster import silhouette_score
    axes[0].set_title('Random assignment: {:.2f}'.format(
        silhouette_score(X_scaled, random_clusters)))

    from sklearn.cluster import KMeans
    from sklearn.cluster import DBSCAN
    from sklearn.cluster import AgglomerativeClustering
    algorithms = [KMeans(n_clusters=2), AgglomerativeClustering(n_clusters=2), DBSCAN()]
    for ax, algorithm in zip(axes[1:], algorithms):
        clusters = algorithm.fit_predict(X_scaled)
        ax.scatter(X_scaled[:, 0], X_scaled[:, 1], c=clusters, cmap=mglearn.cm3, s=60)
        ax.set_title('{}: {:.2f}'.format(algorithm.__class__.__name__,
                                         silhouette_score(X_scaled, clusters)))

    plt.suptitle("Figure 3-40: evaluating the algorithms on the two_moons dataset "
                 "with the silhouette score")
def cluster(train, val, type, number_of_clusters, plot_folder, classes):
    # todo this should be a class
    if type == "spectral_clustering":
        clustering_model = SpectralClustering(n_clusters=number_of_clusters,
                                              assign_labels="discretize",
                                              random_state=0).fit(train["data"])
    elif type == "kmeans":
        clustering_model = KMeans(n_clusters=number_of_clusters,
                                  random_state=0).fit(train["data"])
    else:
        raise NotImplementedError

    # compute metrics
    accuracies = {}
    random_array = np.random.randint(9, size=train["labels"].shape)
    centroids = find_centroids(number_of_clusters, train, clustering_model.labels_)
    test_classifications = cluster_test(val, centroids)

    visualize_clustering(train, clustering_model.labels_, type + "_training",
                         plot_folder, number_of_clusters, centroids)
    visualize_clustering(val, np.asarray(test_classifications), type + "_validation",
                         plot_folder, number_of_clusters, centroids)

    accuracies["random_score"] = homogeneity_score(train["labels"], random_array)
    accuracies["v_measure_score"] = v_measure_score(train["labels"], clustering_model.labels_)
    accuracies["homogeneity_score"] = homogeneity_score(train["labels"], clustering_model.labels_)
    accuracies["completeness_score"] = completeness_score(train["labels"], clustering_model.labels_)
    accuracies["silhouette_score"] = silhouette_score(train["data"], clustering_model.labels_)
    accuracies["purity_score"], accuracies["contingency_matrix"] = purity_score(
        train["labels"], clustering_model.labels_)

    accuracies["v_measure_score_test"] = v_measure_score(val["labels"], test_classifications)
    accuracies["homogeneity_score_test"] = homogeneity_score(val["labels"], test_classifications)
    accuracies["completeness_score_test"] = completeness_score(val["labels"], test_classifications)
    accuracies["silhouette_score_test"] = silhouette_score(val["data"], test_classifications)
    accuracies["purity_score_test"], accuracies["contingency_matrix_test"] = purity_score(
        val["labels"], test_classifications)

    return accuracies
def main(data, clustering, reduce_dims=True, outpath=None, verbose=True):
    if verbose:
        print("Reading the data...")
        print("Creating the indexes...")
    ind = data[:, 0]
    data = data[:, 1]
    if verbose:
        print('Read {0} rows of data...'.format(ind.shape[0]))

    X, secs = build_and_clean(data, outpath=outpath, verbose=verbose)
    if verbose:
        print("Complete data build in {:0.3f} seconds".format(secs))
        print("New data has {0},{1} dimension".format(X.shape[0], X.shape[1]))
        print("Starting Clustering...")

    if reduce_dims:
        (X, var), secs = feature_reduction(X, 5000, verbose=verbose)
        if verbose:
            print("Complete data build in {:0.3f} seconds".format(secs))
            print("Explained variance of the SVD : {}".format(var))
            print("New data has {0},{1} dimension".format(X.shape[0], X.shape[1]))

    clustering.fit(X)
    pred_lbl = clustering.predict(X)
    if verbose:
        print("Finished Clustering with score {:0.3f}".format(clustering.inertia_))
        print("Computing Calinski_Harabaz Score")
    # print(calinski_harabaz_score(X.toarray(), pred_lbl))
    if verbose:
        print("Computing Silhouette Score")
        print(silhouette_score(X, pred_lbl, sample_size=10000))
def cluster(csv, k):
    data = pd.read_csv(csv)
    # X features
    X = np.array(data.drop(['botname'], axis=1))
    X = scale(X)
    # Choose the number of clusters, the centroid initialisation and the number
    # of iterations; a fixed random state makes the results reproducible.
    clustering = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=6)
    clustering.fit(X)
    X_scaled = X
    result = clustering.fit_predict(X)
    data['Cluster'] = result
    data = data.sort_values(['Cluster'])
    data.to_csv(r"C:\Users\Ronald Scheffler\.spyder-py3\clusterresult" + str(k) + ".csv")
    print(silhouette_score(X_scaled, result))
def _extract_best_optics(self, clusterer):
    max_score = -inf
    best_pred = None
    # Traverse epsilon to detect the best cut
    for my_eps in arange(0.01, 0.5, 0.01):
        pred = cluster_optics_dbscan(reachability=clusterer.reachability_,
                                     core_distances=clusterer.core_distances_,
                                     ordering=clusterer.ordering_,
                                     eps=my_eps)
        if not len(unique(pred)) in (1, len(self.data)):
            score = silhouette_score(X=self.data, labels=pred,
                                     metric=self.distance_metric,
                                     random_state=13712)
            if score > max_score:
                max_score = score
                best_pred = pred
    if best_pred is not None:
        return self._process_noise_as_singletons(best_pred)
    else:
        # All outputs are either one cluster or n clusters
        return self._process_noise_as_singletons(pred)
def get_clustering_metrics(train_data, cluster_labels, ground_truth_labels=None):
    clustering_metric_dict = {}
    clustering_metric_dict['silhouette_score'] = silhouette_score(
        train_data, cluster_labels, random_state=42)
    clustering_metric_dict['calinski_harabasz_score'] = calinski_harabasz_score(
        train_data, cluster_labels)
    clustering_metric_dict['davies_bouldin_score'] = davies_bouldin_score(
        train_data, cluster_labels)

    if ground_truth_labels is not None:
        clustering_metric_dict['v_measure_score'] = v_measure_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict['fowlkes_mallows_score'] = fowlkes_mallows_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict['homogeneity_score'] = homogeneity_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict['normalized_mutual_info_score'] = normalized_mutual_info_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict['adjusted_rand_score'] = adjusted_rand_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict['completeness_score'] = completeness_score(
            ground_truth_labels, cluster_labels)
    return clustering_metric_dict
def get_single_linkage(dataframe):
    dists = pdist(dataframe)
    Z = single(dists)
    # keep (score, k, labels) so the best labelling can be returned
    best_score = (-1, 2, None)
    last_score = -1
    non_improving_iter = 0
    k = 2
    while non_improving_iter < 10:
        labels = fcluster(Z, k, criterion='maxclust')
        if len(np.unique(labels)) > 1:
            res = silhouette_score(dataframe, labels)
            if res > last_score:
                non_improving_iter = 0
            else:
                non_improving_iter += 1
            if res > best_score[0]:
                best_score = (res, k, labels)
            last_score = res
        k += 1
    return best_score[2]
def main():
    args, atom_indices, project, project_root = parse_cmdline()

    # load all of the data from disk
    xyzlist, sampled_frames = load_trajs(project, os.path.dirname(args.project_yaml),
                                         atom_indices, args.stride, args.fraction)
    assignments = io.loadh(args.assignments, 'arr_0')
    # pick only the assignments that had their xyz data loaded
    assignments = np.concatenate([assignments[i, sampled_frames[i]]
                                  for i in range(len(sampled_frames))])

    # make sure we didn't mess up the subsampling and get nonsense data
    assert not np.any(assignments < 0), ('assignments negative? stride/sampling probably '
                                         'messed up. did you use a different stride than '
                                         'you clustered with?')
    # assert np.all(np.unique(assignments) == np.arange(np.max(assignments) + 1)), \
    #     'assignments dont go from 0 to max. did you use a different stride than you clustered with?'

    n_real_atoms = len(atom_indices)
    n_padded_atoms = xyzlist.shape[2]
    assert n_padded_atoms >= n_real_atoms
    pairwise = calculate_pairwise_rmsd(xyzlist, n_real_atoms)

    print('computing silhouette...')
    score = silhouette_score(pairwise, assignments, metric='precomputed')
    print('silhouette score: %f' % score)

    path = os.path.join(args.output, 'silhouette.dat')
    print('saving results to flat text file (append): %s...' % path)
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    with open(path, 'a') as f:
        f.write('%f\n' % score)
def _cluster_ispherical_kmeans(self, init: str = "k-means++"):
    """
    Employ spherical k-means on L2-normalised directional data points in an
    iterative manner to select the best k according to intrinsic clustering
    evaluation measures.

    Parameters
    ----------
    init: str
        The initialisation method - "random" or "k-means++"
    """
    max_sil = -inf
    best_pred = None
    # Note that the k-means++ initialiser may still be using Euclidean distances.
    for ik in range(2, len(self.data) - 1):
        skm = SphericalKMeans(n_clusters=ik, init=init, n_init=500,
                              random_state=13712, normalize=False)
        pred = skm.fit_predict(self.data)
        score = silhouette_score(X=self.data, metric=self.distance_metric,
                                 labels=pred, random_state=13712)
        if score > max_sil:
            max_sil = score
            best_pred = pred
    return best_pred
def cluster(csv):
    data = pd.read_csv(csv)
    # X features
    X = np.array(data.drop(['botname'], axis=1))
    # print(X)
    X = scale(X)
    # Choose the number of clusters; a random state seed makes the results reproducible.
    clustering = MeanShift()
    clustering.fit(X)
    # print(X_scaled)
    X_scaled = X
    # print(X_scaled)
    result = clustering.fit_predict(X)
    data['Cluster'] = result
    data = data.sort_values(['Cluster'])
    data.to_csv(r"C:\Users\Ronald Scheffler\.spyder-py3\meanshiftresult.csv")

    # Evaluation: silhouette score
    print(silhouette_score(X_scaled, result))
    print(data)

    # Class prediction for the training set
    from sklearn.model_selection import train_test_split
    X = np.array(data.drop(['botname'], axis=1))
    y = data['Cluster']  # classes?
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    print(X_test)
    print(y)
def no_label_metrics(input_feature, assigned_label, print_metric, metric='euclidean'):
    """https://scikit-learn.org/stable/modules/clustering.html#clustering-evaluation"""
    no_label_metrics = {}
    no_label_metrics['silhouette_score'] = cluster_metric.silhouette_score(
        input_feature, assigned_label, metric=metric)
    no_label_metrics['calinski_score'] = cluster_metric.calinski_harabaz_score(
        input_feature, assigned_label)
    # no_label_metrics['davie_bouldin_score'] = cluster_metric.davies_bouldin_score(
    #     input_feature, assigned_label)
    if print_metric:
        print('Metrics without true labels')
        print("silhouette score: %s" % no_label_metrics['silhouette_score'])
        print("calinski score: %s" % no_label_metrics['calinski_score'])
        # print("davie bouldin score: %s" % no_label_metrics['davie_bouldin_score'])
    return no_label_metrics
def cluster_number_study(n=50):
    """Check out some basic cluster metrics for different cluster sizes."""
    fnamecsv = './AL_pchange_vars.csv'
    df = pd.read_csv(fnamecsv)
    variables = (df.values)[:, 1:].astype(float)
    # ugly way of looping over columns: standardise each one
    for j in range(len(variables[0, :])):
        variables[:, j] = (variables[:, j] - np.mean(variables[:, j])) / np.std(variables[:, j])

    scores = []
    for i in (2 + np.array(range(n))):
        k = KMeans(n_clusters=i, n_init=50, n_jobs=3).fit(variables)
        y = silhouette_score(variables, k.labels_)
        scores.append((i, y))
    with open('cluster_vs_silhouette.txt', 'w') as f:
        for s in scores:
            f.write(str(s[0]) + "\t" + str(s[1]) + "\n")
    print(scores)

    scores = []
    for i in (2 + np.array(range(n))):
        k = KMeans(n_clusters=i, n_init=50, n_jobs=3).fit(variables)
        # y = silhouette_score(variables, k.labels_)
        y = calinski_harabaz_score(variables, k.labels_)
        scores.append((i, y))
    with open('cluster_vs_calharabaz.txt', 'w') as f:
        for s in scores:
            f.write(str(s[0]) + "\t" + str(s[1]) + "\n")
def calculate_scores(self):
    x, c, labels = self.x, self.c, self.labels
    self.v_measure = v_measure_score(c, labels)
    self.complete = completeness_score(c, labels)
    self.adjusted_mutual = adjusted_mutual_info_score(c, labels)
    self.adjusted_rand = adjusted_rand_score(c, labels)
    self.silhouette = silhouette_score(x, c)
    self.purity, self.partial_purity = self.__purity__()
def _check_silhouette(self, dataset, transformed):
    expected = KMeans().fit_predict(dataset)
    got = KMeans().fit_predict(transformed)
    if type(dataset) is not np.ndarray:
        dataset = dataset.toarray()
    if type(expected) is not np.ndarray:
        expected = expected.toarray()
    if type(got) is not np.ndarray:
        got = got.toarray()
    print("Silhouette Index: expected:", silhouette_score(dataset, expected),
          "got:", silhouette_score(dataset, got))
    print("Calinski-Harabaz Index: expected:", calinski_harabaz_score(dataset, expected),
          "got:", calinski_harabaz_score(dataset, got))
def test_correct_labelsize():
    # Assert 1 < n_labels < n_samples
    dataset = datasets.load_iris()
    X = dataset.data

    # n_labels = n_samples
    y = np.arange(X.shape[0])
    err_msg = (r'Number of labels is %d\. Valid values are 2 '
               r'to n_samples - 1 \(inclusive\)' % len(np.unique(y)))
    with pytest.raises(ValueError, match=err_msg):
        silhouette_score(X, y)

    # n_labels = 1
    y = np.zeros(X.shape[0])
    err_msg = (r'Number of labels is %d\. Valid values are 2 '
               r'to n_samples - 1 \(inclusive\)' % len(np.unique(y)))
    with pytest.raises(ValueError, match=err_msg):
        silhouette_score(X, y)
def evaluation(X_selected, X_test, n_clusters, y):
    """
    This function calculates several clustering quality scores (NMI, silhouette,
    Davies-Bouldin, Calinski-Harabasz and purity) for a k-means clustering fitted
    on the selected features.

    Input
    -----
    X_selected: {numpy array}, shape (n_samples, n_selected_features)
        input data on the selected features
    X_test: {numpy array}, shape (n_samples, n_selected_features)
        data on which the fitted clustering is evaluated
    n_clusters: {int}
        number of clusters
    y: {numpy array}, shape (n_samples,)
        true labels

    Output
    ------
    nmi: {float}
        Normalized Mutual Information
    sil, db, ch: {float}
        Silhouette, Davies-Bouldin and Calinski-Harabasz scores
    pur: {float}
        Purity
    """
    k_means = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, max_iter=300,
                     tol=0.0001, precompute_distances=True, verbose=0,
                     random_state=None, copy_x=True, n_jobs=1)
    k_means.fit(X_selected)
    y_predict = k_means.predict(X_test)

    # calculate NMI
    nmi = normalized_mutual_info_score(y, y_predict, average_method='arithmetic')

    # calculate silhouette score
    try:
        sil = silhouette_score(X_test, y_predict, metric='euclidean')
    except ValueError:
        sil = float('nan')
        app_logger.warning('K-means labels are {0}; but y_predict are: {1}. Silhouette score '
                           'requires predictions in 2 or more clusters.'.format(
                               np.unique(k_means.labels_), np.unique(y_predict)),
                           extra=LOGGER_EXTRA_OBJECT)

    # calculate Davies-Bouldin score
    try:
        db = davies_bouldin_score(X_test, y_predict)
    except ValueError:
        db = float('nan')
        app_logger.warning('K-means labels are {0}; but y_predict are: {1}. Davies-Bouldin score '
                           'requires predictions in 2 or more clusters.'.format(
                               np.unique(k_means.labels_), np.unique(y_predict)),
                           extra=LOGGER_EXTRA_OBJECT)

    # calculate Calinski-Harabasz score
    try:
        ch = calinski_harabasz_score(X_test, y_predict)
    except ValueError:
        ch = float('nan')
        app_logger.warning('K-means labels are {0}; but y_predict are: {1}. Calinski-Harabasz score '
                           'requires predictions in 2 or more clusters.'.format(
                               np.unique(k_means.labels_), np.unique(y_predict)),
                           extra=LOGGER_EXTRA_OBJECT)

    # calculate purity
    pur = purity(y, y_predict)

    return nmi, sil, db, ch, pur
def silTest():
    """Silhouette coefficient."""
    x, y = make_moons(n_samples=200, noise=0.05, random_state=0)
    scaler = StandardScaler()
    scaler.fit(x)
    x_scaled = scaler.transform(x)

    fig, axes = plt.subplots(1, 4, figsize=(15, 3),
                             subplot_kw={'xticks': (), 'yticks': ()})
    algorithms = [KMeans(n_clusters=2), AgglomerativeClustering(n_clusters=2), DBSCAN()]

    # random assignment as a baseline; the titles report the silhouette score
    random_state = np.random.RandomState(seed=0)
    random_clusters = random_state.randint(low=0, high=2, size=len(x))
    axes[0].scatter(x_scaled[:, 0], x_scaled[:, 1], c=random_clusters,
                    cmap=mglearn.cm3, s=60)
    axes[0].set_title("Random assignment - silhouette: {:.2f}".format(
        silhouette_score(x_scaled, random_clusters)))

    for ax, algorithm in zip(axes[1:], algorithms):
        clusters = algorithm.fit_predict(x_scaled)
        ax.scatter(x_scaled[:, 0], x_scaled[:, 1], c=clusters, cmap=mglearn.cm3, s=60)
        ax.set_title("{} - silhouette: {:.2f}".format(
            algorithm.__class__.__name__, silhouette_score(x_scaled, clusters)))
    plt.show()
def in70():
    import matplotlib.pyplot as plt
    import mglearn
    from sklearn.datasets import make_moons
    from sklearn.metrics.cluster import silhouette_score
    x, y = make_moons(n_samples=200, noise=0.05, random_state=0)

    from sklearn.preprocessing import StandardScaler
    std = StandardScaler()
    std.fit(x)
    x_scaled = std.transform(x)

    from sklearn.cluster import KMeans
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.cluster import DBSCAN
    fig, axer = plt.subplots(1, 3, figsize=(15, 3))

    axer[0].scatter(x_scaled[:, 0], x_scaled[:, 1],
                    c=KMeans().fit_predict(x_scaled), cmap=mglearn.cm2, s=60)
    axer[0].set_title('KMeans:{}'.format(
        silhouette_score(x_scaled, KMeans().fit_predict(x_scaled))))

    axer[1].scatter(x_scaled[:, 0], x_scaled[:, 1],
                    c=AgglomerativeClustering().fit_predict(x_scaled), cmap=mglearn.cm2, s=60)
    axer[1].set_title('AgglomerativeClustering:{}'.format(
        silhouette_score(x_scaled, AgglomerativeClustering().fit_predict(x_scaled))))

    axer[2].scatter(x_scaled[:, 0], x_scaled[:, 1],
                    c=DBSCAN().fit_predict(x_scaled), cmap=mglearn.cm2, s=60)
    axer[2].set_title('DBSCAN:{}'.format(
        silhouette_score(x_scaled, DBSCAN().fit_predict(x_scaled))))

    plt.legend(['feature 0', 'feature 1'])
    plt.show()
def test_no_nan():
    # Assert Silhouette Coefficient != nan when there is 1 sample in a class.
    # This tests for the condition that caused issue 960.
    # Note that there is only one sample in cluster 0. This used to cause the
    # silhouette_score to return nan (see bug #960).
    labels = np.array([1, 0, 1, 1, 1])
    # The distance matrix doesn't actually matter.
    D = np.random.RandomState(0).rand(len(labels), len(labels))
    silhouette = silhouette_score(D, labels, metric='precomputed')
    assert_false(np.isnan(silhouette))
def _print_clusteringMetrics(_kMean, _X):
    metrics = [['K-Means clustering', 'Computed values'],
               ['Inertia', _kMean.inertia_],
               ['Entropy', entropy(_kMean.labels_)],
               ['Silhouette Score', silhouette_score(_X, _kMean.labels_, random_state=0)],
               ['Calinski-Harabaz Score', calinski_harabaz_score(_X, _kMean.labels_)]]
    print('\nData Mining - K-Means Clustering - <VORT>', '\n')
    print(_kMean, '\n')
    print(look(metrics))
def test_silhouette():
    # Tests the Silhouette Coefficient.
    dataset = datasets.load_iris()
    X_dense = dataset.data
    X_csr = csr_matrix(X_dense)
    X_dok = sp.dok_matrix(X_dense)
    X_lil = sp.lil_matrix(X_dense)
    y = dataset.target

    for X in [X_dense, X_csr, X_dok, X_lil]:
        D = pairwise_distances(X, metric='euclidean')
        # Given that the actual labels are used, we can assume that S would be
        # positive.
        score_precomputed = silhouette_score(D, y, metric='precomputed')
        assert score_precomputed > 0
        # Test without calculating D
        score_euclidean = silhouette_score(X, y, metric='euclidean')
        assert score_precomputed == pytest.approx(score_euclidean)
        if X is X_dense:
            score_dense_without_sampling = score_precomputed
        else:
            assert score_euclidean == pytest.approx(score_dense_without_sampling)

        # Test with sampling
        score_precomputed = silhouette_score(D, y, metric='precomputed',
                                             sample_size=int(X.shape[0] / 2),
                                             random_state=0)
        score_euclidean = silhouette_score(X, y, metric='euclidean',
                                           sample_size=int(X.shape[0] / 2),
                                           random_state=0)
        assert score_precomputed > 0
        assert score_euclidean > 0
        assert score_euclidean == pytest.approx(score_precomputed)
        if X is X_dense:
            score_dense_with_sampling = score_precomputed
        else:
            assert score_euclidean == pytest.approx(score_dense_with_sampling)
def calc_si(
    self,
    representations: np.ndarray,
    category_labels: List[int],
    metric: str = 'cosine',
):
    """Compute the silhouette score of the representations grouped by category label."""
    print('Computing silhouette scores...')
    return silhouette_score(representations, category_labels, metric)
def _clustering_metrics(labels, X, digits):
    if X is None:
        SIL = None
        DB = None
        CH = None
    else:
        SIL = round(silhouette_score(X, labels), digits)
        DB = round(davies_bouldin_score(X, labels), digits)
        CH = round(calinski_harabasz_score(X, labels), digits)
    return SIL, DB, CH
def get_clustering_cluster_output(df_arr, clustering_method, clustering_options, titles, bow):
    clusters = get_clusters(df_arr, clustering_method, clustering_options)

    cluster_info_df = None
    if clusters is not None and titles is not None and bow is not None:
        cluster_info_df = get_cluster_info_df(10, clusters, titles, bow)

    # the silhouette score is only defined when more than one cluster was found
    cluster_info_score = None
    if clusters is not None and np.unique(clusters).size > 1:
        cluster_info_score = "Silhouette Score: %.2f" % silhouette_score(df_arr.values, clusters)

    return misc.generate_datatable(cluster_info_df, "cluster_info", 1000, "600px"), cluster_info_score
def my_kmeans(feature_vector, no_of_centers=8):
    start = time()
    km = KMeans(n_clusters=no_of_centers).fit(feature_vector)
    end = time()
    labels = km.labels_
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    print('The no of non noisy clusters is {} with no of centers = {}'.format(
        n_clusters, no_of_centers))
    print("Time taken to finish {} seconds".format(end - start))
    if option == 1:
        cluster_entropy(labels)
    else:
        print('The silhouette score is {}'.format(
            silhouette_score(feature_vector, labels, metric='euclidean')))
def get_score(self, name="None"):
    self.pred = self.cluster.labels_
    self.pred = np.where(self.pred > 1000, -1, self.pred)
    self.class_ = np.unique(self.pred)
    score = {}
    score1 = silhouette_score(self.pred.reshape(-1, 1), self.labels)
    score2 = metrics.adjusted_rand_score(self.pred, self.labels)
    score["silhouette score"] = score1
    score["adjusted Rand index"] = score2
    return score
def my_agg_clustering(feature_vector, no_of_centers, metric_name):
    start = time()
    ag_c = AgglomerativeClustering(n_clusters=no_of_centers,
                                   affinity=metric_name).fit(feature_vector)
    end = time()
    labels = ag_c.labels_
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    print('The no of non noisy clusters is {} with no of centers = {} with metric = {}'.format(
        n_clusters, no_of_centers, metric_name))
    print("Time taken to finish {} seconds".format(end - start))
    if option == 1:
        cluster_entropy(labels)
    else:
        print('The silhouette score is {}'.format(
            silhouette_score(feature_vector, labels, metric=metric_name)))
def test_silhouette_paper_example():
    # Explicitly check per-sample results against Rousseeuw (1987)
    # Data from Table 1
    lower = [5.58,
             7.00, 6.50,
             7.08, 7.00, 3.83,
             4.83, 5.08, 8.17, 5.83,
             2.17, 5.75, 6.67, 6.92, 4.92,
             6.42, 5.00, 5.58, 6.00, 4.67, 6.42,
             3.42, 5.50, 6.42, 6.42, 5.00, 3.92, 6.17,
             2.50, 4.92, 6.25, 7.33, 4.50, 2.25, 6.33, 2.75,
             6.08, 6.67, 4.25, 2.67, 6.00, 6.17, 6.17, 6.92, 6.17,
             5.25, 6.83, 4.50, 3.75, 5.75, 5.42, 6.08, 5.83, 6.67, 3.67,
             4.75, 3.00, 6.08, 6.67, 5.00, 5.58, 4.83, 6.17, 5.67, 6.50, 6.92]
    D = np.zeros((12, 12))
    D[np.tril_indices(12, -1)] = lower
    D += D.T

    names = ['BEL', 'BRA', 'CHI', 'CUB', 'EGY', 'FRA', 'IND', 'ISR',
             'USA', 'USS', 'YUG', 'ZAI']

    # Data from Figure 2
    labels1 = [1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 1]
    expected1 = {'USA': .43, 'BEL': .39, 'FRA': .35, 'ISR': .30, 'BRA': .22,
                 'EGY': .20, 'ZAI': .19, 'CUB': .40, 'USS': .34, 'CHI': .33,
                 'YUG': .26, 'IND': -.04}
    score1 = .28

    # Data from Figure 3
    labels2 = [1, 2, 3, 3, 1, 1, 2, 1, 1, 3, 3, 2]
    expected2 = {'USA': .47, 'FRA': .44, 'BEL': .42, 'ISR': .37, 'EGY': .02,
                 'ZAI': .28, 'BRA': .25, 'IND': .17, 'CUB': .48, 'USS': .44,
                 'YUG': .31, 'CHI': .31}
    score2 = .33

    for labels, expected, score in [(labels1, expected1, score1),
                                    (labels2, expected2, score2)]:
        expected = [expected[name] for name in names]
        # we check to 2dp because that's what's in the paper
        assert expected == pytest.approx(
            silhouette_samples(D, np.array(labels), metric='precomputed'), abs=1e-2)
        assert score == pytest.approx(
            silhouette_score(D, np.array(labels), metric='precomputed'), abs=1e-2)
def my_dbscan(feature_vector, metric_name, eps=None, minpts=None):
    start = time()
    if eps is None and minpts is None:
        db = DBSCAN(metric=metric_name).fit(feature_vector)
    elif minpts is None:
        db = DBSCAN(eps=eps, metric=metric_name).fit(feature_vector)
    elif eps is None:
        db = DBSCAN(min_samples=minpts, metric=metric_name).fit(feature_vector)
    else:
        db = DBSCAN(eps=eps, min_samples=minpts, metric=metric_name).fit(feature_vector)
    end = time()
    labels = db.labels_
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)  # ignoring noise if present
    print('The no of non noisy clusters is {} with metric = {}'.format(n_clusters, metric_name))
    print("Time taken to finish {} seconds".format(end - start))
    if option == 1:
        cluster_entropy(labels)
    else:
        print('The silhouette score is {}'.format(
            silhouette_score(feature_vector, labels, metric=metric_name)))
def test_cluster_size_1():
    # Assert Silhouette Coefficient == 0 when there is 1 sample in a cluster
    # (cluster 0). We also test the case where there are identical samples
    # as the only members of a cluster (cluster 2). To our knowledge, this case
    # is not discussed in reference material, and we choose for it a sample
    # score of 1.
    X = [[0.], [1.], [1.], [2.], [3.], [3.]]
    labels = np.array([0, 1, 1, 1, 2, 2])

    # Cluster 0: 1 sample -> score of 0 by Rousseeuw's convention
    # Cluster 1: intra-cluster = [.5, .5, 1]
    #            inter-cluster = [1, 1, 1]
    #            silhouette    = [.5, .5, 0]
    # Cluster 2: intra-cluster = [0, 0]
    #            inter-cluster = [arbitrary, arbitrary]
    #            silhouette    = [1., 1.]
    silhouette = silhouette_score(X, labels)
    assert_false(np.isnan(silhouette))
    ss = silhouette_samples(X, labels)
    assert_array_equal(ss, [0, .5, .5, 0, 1, 1])
'''
-1 denotes noise.
Increasing eps means more points are included in a cluster. This makes clusters
grow, but may also cause several clusters to merge into one.
Increasing min_samples means fewer points qualify as core points, and more
points are labeled as noise.
The eps parameter is somewhat more important, because it determines what it
means for points to be "close".
Setting eps very small means no point is a core sample, which may lead to all
points being labeled as noise.
Setting eps very large may lead to all points forming a single cluster.
min_samples mostly determines whether points in sparse regions are labeled as
outliers or end up forming their own clusters.
If min_samples is increased, any cluster with fewer than min_samples members is
now labeled as noise; min_samples therefore sets the minimum cluster size.
'''
print(clusters)
print(len(set(clusters)))
if len(set(clusters)) > 1:
    print('{} {} {}'.format(eps, min_samples, silhouette_score(X, clusters)))
# 0.5 5 -0.12276159423271887
# 0.7 5 0.3593629426203677
'''
If every point is labeled -1, silhouette_score raises an exception, because
1 < n_labels does not hold:

def check_number_of_labels(n_labels, n_samples):
    if not 1 < n_labels < n_samples:
        raise ValueError("Number of labels is %d. Valid values are 2 "
                         "to n_samples - 1 (inclusive)" % n_labels)
'''
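# A minimal, self-contained sketch of the eps/min_samples sweep that the fragment
# above appears to be taken from. The dataset (make_moons) and the parameter grids
# are assumptions, not part of the original snippet; the point is that the
# silhouette score is only computed when DBSCAN returns more than one distinct
# label, because silhouette_score raises a ValueError otherwise.
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

X, _ = make_moons(n_samples=200, noise=0.05, random_state=0)
X = StandardScaler().fit_transform(X)

for eps in (0.3, 0.5, 0.7):
    for min_samples in (3, 5, 10):
        clusters = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X)
        if len(set(clusters)) > 1:
            print('{} {} {}'.format(eps, min_samples, silhouette_score(X, clusters)))
        else:
            # a single label (all noise or one cluster): silhouette is undefined
            print('{} {} silhouette undefined'.format(eps, min_samples))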
def test_non_numpy_labels():
    dataset = datasets.load_iris()
    X = dataset.data
    y = dataset.target
    assert_equal(
        silhouette_score(list(X), list(y)), silhouette_score(X, y))