def test_calinski_harabaz_score():
    rng = np.random.RandomState(seed=0)

    # Assert message when there is only one label
    assert_raise_message(ValueError, "Number of labels is",
                         calinski_harabaz_score,
                         rng.rand(10, 2), np.zeros(10))

    # Assert message when all points are in different clusters
    assert_raise_message(ValueError, "Number of labels is",
                         calinski_harabaz_score,
                         rng.rand(10, 2), np.arange(10))

    # Assert the value is 1. when all samples are equal
    assert_equal(1., calinski_harabaz_score(np.ones((10, 2)),
                                            [0] * 5 + [1] * 5))

    # Assert the value is 0. when all cluster means are equal
    assert_equal(0., calinski_harabaz_score([[-1, -1], [1, 1]] * 10,
                                            [0] * 10 + [1] * 10))

    # General case (with non numpy arrays)
    X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
         [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    assert_almost_equal(calinski_harabaz_score(X, labels),
                        45 * (40 - 4) / (5 * (4 - 1)))
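The expected value in the general case, 45 * (40 - 4) / (5 * (4 - 1)) = 108, follows from the Calinski-Harabasz definition CH = (tr(B) / (k - 1)) / (tr(W) / (n - k)), with B the between-cluster and W the within-cluster dispersion. A minimal sketch that recomputes it by hand, assuming only numpy:

import numpy as np

X = np.array([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
             [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5, dtype=float)
labels = np.array([0] * 10 + [1] * 10 + [2] * 10 + [3] * 10)

n, k = len(X), len(np.unique(labels))
overall_mean = X.mean(axis=0)
within, between = 0.0, 0.0
for c in np.unique(labels):
    cluster = X[labels == c]
    center = cluster.mean(axis=0)
    within += ((cluster - center) ** 2).sum()                       # tr(W)
    between += len(cluster) * ((center - overall_mean) ** 2).sum()  # tr(B)

ch = (between / (k - 1)) / (within / (n - k))
print(ch)  # ~108.0, i.e. 45 * (40 - 4) / (5 * (4 - 1))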
def cluster_number_study(n=50):
    """Check out some basic cluster metrics for different cluster sizes."""
    fnamecsv = './AL_pchange_vars.csv'
    df = pd.read_csv(fnamecsv)
    variables = (df.as_matrix())[:, 1:].astype(float)
    # Standardize each column (z-score).
    for j in range(len(variables[0, :])):
        variables[:, j] = ((variables[:, j] - np.mean(variables[:, j])) /
                           np.std(variables[:, j]))
    scores = []
    for i in (2 + np.array(range(n))):
        k = KMeans(n_clusters=i, n_init=50, n_jobs=3).fit(variables)
        y = silhouette_score(variables, k.labels_)
        scores.append((i, y))
    with open('cluster_vs_silhouette.txt', 'w') as f:
        for s in scores:
            f.write(str(s[0]) + "\t" + str(s[1]) + "\n")
    print(scores)
    scores = []
    for i in (2 + np.array(range(n))):
        k = KMeans(n_clusters=i, n_init=50, n_jobs=3).fit(variables)
        y = calinski_harabaz_score(variables, k.labels_)
        scores.append((i, y))
    with open('cluster_vs_calharabaz.txt', 'w') as f:
        for s in scores:
            f.write(str(s[0]) + "\t" + str(s[1]) + "\n")
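Once both score files are written, picking a candidate cluster count is just an argmax over the (k, score) pairs. A small sketch, assuming the tab-separated format produced above (best_k is a hypothetical helper, not part of the original code):

def best_k(fname):
    # Parse the "k<TAB>score" lines written by cluster_number_study and
    # return the (k, score) pair with the highest score.
    with open(fname) as f:
        pairs = [line.split('\t') for line in f if line.strip()]
    return max(((int(k), float(s)) for k, s in pairs), key=lambda p: p[1])

print(best_k('cluster_vs_silhouette.txt'))
print(best_k('cluster_vs_calharabaz.txt'))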
def no_label_metrics(input_feature, assigned_label, print_metric,
                     metric='euclidean'):
    """Internal clustering metrics (no ground-truth labels needed).

    See https://scikit-learn.org/stable/modules/clustering.html#clustering-evaluation
    """
    no_label_metrics = {}
    no_label_metrics['silhouette_score'] = \
        cluster_metric.silhouette_score(input_feature, assigned_label,
                                        metric=metric)
    no_label_metrics['calinski_score'] = \
        cluster_metric.calinski_harabaz_score(input_feature, assigned_label)
    # no_label_metrics['davie_bouldin_score'] = \
    #     cluster_metric.davies_bouldin_score(input_feature, assigned_label)
    if print_metric:
        print('Metrics without true labels')
        print("silhouette score: %s" % no_label_metrics['silhouette_score'])
        print("calinski score: %s" % no_label_metrics['calinski_score'])
        # print("davie bouldin score: %s"
        #       % no_label_metrics['davie_bouldin_score'])
    return no_label_metrics
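The commented-out Davies-Bouldin lines need scikit-learn >= 0.20, where davies_bouldin_score was added; unlike the other two metrics, lower is better there. A standalone sketch of all three internal metrics, assuming a scikit-learn in the 0.20-0.22 range (old enough that the calinski_harabaz_score spelling still exists):

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import (silhouette_score, calinski_harabaz_score,
                             davies_bouldin_score)

X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
labels = KMeans(n_clusters=3, random_state=0).fit_predict(X)
print(silhouette_score(X, labels))        # in [-1, 1], higher is better
print(calinski_harabaz_score(X, labels))  # unbounded, higher is better
print(davies_bouldin_score(X, labels))    # >= 0, lower is better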
def _check_silhouette(self, dataset, transformed):
    expected = KMeans().fit_predict(dataset)
    got = KMeans().fit_predict(transformed)
    if type(dataset) is not np.ndarray:
        dataset = dataset.toarray()
    if type(expected) is not np.ndarray:
        expected = expected.toarray()
    if type(got) is not np.ndarray:
        got = got.toarray()
    print("Silhouette Index: expected:",
          silhouette_score(dataset, expected),
          "got:", silhouette_score(dataset, got))
    print("Calinski-Harabaz Index: expected:",
          calinski_harabaz_score(dataset, expected),
          "got:", calinski_harabaz_score(dataset, got))
def _print_clusteringMetrics(_kMean, _X):
    metrics = [['Clustering K-Means', 'Results obtained'],
               ['Inertia', _kMean.inertia_],
               ['Entropy', entropy(_kMean.labels_)],
               ['Silhouette Score',
                silhouette_score(_X, _kMean.labels_, random_state=0)],
               ['Calinski-Harabaz Score',
                calinski_harabaz_score(_X, _kMean.labels_)]]
    print('\nData Mining - Clustering K-Means - <VORT>', '\n')
    print(_kMean, '\n')
    print(look(metrics))
def test_calinski_harabaz_score():
    assert_raises_on_only_one_label(calinski_harabaz_score)
    assert_raises_on_all_points_same_cluster(calinski_harabaz_score)

    # Assert the value is 1. when all samples are equal
    assert_equal(1., calinski_harabaz_score(np.ones((10, 2)),
                                            [0] * 5 + [1] * 5))

    # Assert the value is 0. when all cluster means are equal
    assert_equal(0., calinski_harabaz_score([[-1, -1], [1, 1]] * 10,
                                            [0] * 10 + [1] * 10))

    # General case (with non numpy arrays)
    X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
         [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    # A bare pytest.approx call asserts nothing; compare explicitly.
    assert calinski_harabaz_score(X, labels) == pytest.approx(
        45 * (40 - 4) / (5 * (4 - 1)))
def ECBO(dataSet, c):
    n, m = dataSet.shape
    clusterAssment = np.mat(np.zeros((n, 1)))
    # 1. init membership and centroids
    # initial_U = initial_U(dataSet, c)
    initial_centroids = initCentroids(dataSet, c)
    print('initial centroids:\n', initial_centroids)
    # 2. update membership U
    lastJ, lastU = upUwithSimplex(dataSet, initial_centroids, c, A1, A2)
    last_centroids = calCentroids(dataSet, lastU, c)
    # 3. update centroids
    J, U = upUwithSimplex(dataSet, last_centroids, c, A1, A2)
    centroids = calCentroids(dataSet, U, c)
    # while last_centroids.all() != centroids.all():
    # count = 0
    # while (count < 2):
    lastcentroids = centroids
    J, U = upUwithSimplex(dataSet, lastcentroids, c, A1, A2)
    centroids = calCentroids(dataSet, U, c)
    # count += 1
    # assign data to clusters
    for k in range(n):
        for i in range(c):
            if U[k, i] == 1:
                clusterAssment[k] = i
    # Earlier nearest-centroid assignment, kept for reference:
    # for i in range(n):
    #     minDist = 10000.0
    #     minIdex = 0
    #     for j in range(c):
    #         distance = euclDistance(dataSet[i], centroids[j])
    #         if distance < minDist:
    #             minDist = distance  # update min distance
    #             minIdex = j  # update assignment of data to cluster
    #     if clusterAssment[i, 0] != minIdex:
    #         clusterAssment[i, :] = minIdex
    label_pred = clusterAssment
    # print(label_pred)
    CHI = calinski_harabaz_score(dataSet, label_pred)
    print('final U:\n', np.mat(U))
    print('final centroids:\n', centroids)
    print('objective function:', value(J.objective))
    print('cluster assignment:', clusterAssment)
    print('Calinski-Harabaz Index:', CHI)
    for i in range(c):
        print(np.sum(U[:, i]))
    return U, centroids, J, clusterAssment
def ECBO(dataSet, c):
    n, m = dataSet.shape
    clusterAssment = np.mat(np.zeros((n, 1)))
    # 1. init membership and centroids
    # initial_U = initial_U(dataSet, c)
    initial_centroids = initCentroids(dataSet, c)
    print('initial centroids:\n', initial_centroids)
    # 2. update membership U
    lastJ, lastU = upUwithSimplex(dataSet, initial_centroids, c, k)
    last_centroids = calCentroids(dataSet, lastU, c)
    # 3. update centroids
    J, U = upUwithSimplex(dataSet, last_centroids, c, k)
    centroids = calCentroids(dataSet, U, c)
    # while last_centroids.all() != centroids.all():
    count = 0
    while count < 10:
        lastcentroids = centroids
        J, U = upUwithSimplex(dataSet, lastcentroids, c, k)
        centroids = calCentroids(dataSet, U, c)
        count += 1
    for p in range(n):
        for i in range(c):
            if U[p, i] == 1:
                clusterAssment[p] = i
    label_pred = clusterAssment
    # print(label_pred)
    CHI = calinski_harabaz_score(dataSet, label_pred)
    print('final U:\n', np.mat(U))
    print('final centroids:\n', centroids)
    print('objective function:', value(J.objective))
    print('cluster assignment:', clusterAssment)
    print('Calinski-Harabaz Index:', CHI)
    for i in range(c):
        print(np.sum(U[:, i]))
    return U, centroids, J, clusterAssment
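The commented-out `while last_centroids.all() != centroids.all():` lines in both ECBO variants hint at an intended convergence test, but `.all()` collapses each array to a single boolean, so that comparison never checks whether the centroids actually moved. A self-contained sketch of the usual tolerance-based test (centroids_converged is a hypothetical helper; numpy only):

import numpy as np

def centroids_converged(last_centroids, centroids, tol=1e-6):
    # True when no centroid coordinate moved by more than tol;
    # calling `.all()` on each array separately would NOT test this.
    return np.allclose(np.asarray(last_centroids),
                       np.asarray(centroids), atol=tol)

# Example: identical centroids converge, moved centroids do not.
a = np.array([[0.5, 0.5], [3.5, 3.5]])
print(centroids_converged(a, a))         # True
print(centroids_converged(a, a + 0.01))  # False

Inside ECBO this could replace the fixed `count < 10` condition with `while not centroids_converged(lastcentroids, centroids)` plus an iteration cap.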
def main():
    min_pts = 69
    epsilon = 0.0031894736842105263
    with open('train.dat', 'r') as fh:
        data = fh.read().splitlines()
    data_csr = build_csr(data)
    # tf-idf + TruncatedSVD = LSA (latent semantic analysis)
    X = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True,
                         sublinear_tf=False).fit_transform(data_csr)
    selector = TruncatedSVD(n_components=5, algorithm="arpack",
                            random_state=8)
    X = selector.fit_transform(X)
    X = sparse.csr_matrix(X)
    X = csr_l2normalize(X, copy=True)
    print('Running DBSCAN clustering algorithm now....')
    print("For eps: {}, minPts: {}".format(epsilon, min_pts))
    labels, core_pts, border_pts = DBSCAN(X, eps=epsilon, minPts=min_pts)
    labels = assign_noise_to_centroid(X, labels, recompute_centroid=True)
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print(' Estimated number of clusters: %d' % n_clusters_)
    sil_score = silhouette_score(X, labels, metric='cosine', random_state=8)
    print(" Silhouette Coefficient: %0.5f" % sil_score)
    # calinski_harabaz_score needs a dense array, not a sparse matrix.
    X_dense = X.toarray()
    ch_score = calinski_harabaz_score(X_dense, labels)
    print(" Calinski Harabaz Score: %0.5f" % ch_score)
    print('Writing out predictions now....')
    submission_df = pd.read_csv("format.dat", sep="\t", header=None)
    submission_df[0] = labels
    submission_df.to_csv('submission.txt', sep='\n', index=False,
                         header=False)
def L1_ECBO(dataSet, c):
    n, m = dataSet.shape
    clusterAssment = np.mat(np.zeros((n, 1)))
    initial_centroids = initCentroids(dataSet, c)
    print('initial centroids:\n', initial_centroids)
    lastJ, lastU = upUwithSimplex(dataSet, initial_centroids, c, A1, A2)
    last_centroids = calCentroids(dataSet, lastU, c)
    # 3. update centroids
    J, U = upUwithSimplex(dataSet, last_centroids, c, A1, A2)
    centroids = calCentroids(dataSet, U, c)
    count = 0
    while count < 9:
        lastcentroids = centroids
        J, U = upUwithSimplex(dataSet, lastcentroids, c, A1, A2)
        centroids = calCentroids(dataSet, U, c)
        count += 1
    for p in range(n):
        for i in range(c):
            if U[p, i] == 1:
                clusterAssment[p] = i
    label_pred = clusterAssment
    # print(label_pred)
    CHI = calinski_harabaz_score(dataSet, label_pred)
    print('final membership:', U)
    print('objective function:', J)
    print('final centroids:', centroids, type(centroids))
    # print('cluster assignment:', clusterAssment)
    print('Calinski-Harabaz Index:', CHI)
    for i in range(c):
        print(np.sum(U[:, i]))
    return U, J, centroids, clusterAssment
def L1_kMeans(dataSet, c):
    n, m = dataSet.shape
    clusterAssment = np.mat(np.zeros((n, 1)))
    initial_centroids = initCentroids(dataSet, c)
    print('initial centroids:\n', initial_centroids)
    lastJ, lastU = updateU(dataSet, initial_centroids, c)
    centroids = calCentroids(dataSet, lastU, c)
    J, U = updateU(dataSet, centroids, c)
    centroids = np.array(centroids)
    count = 0
    while count < 10:
        lastJ = J
        centroids = calCentroids(dataSet, U, c)
        J, U = updateU(dataSet, centroids, c)
        count += 1
    print('count:', count)
    for k in range(n):
        for i in range(c):
            if U[k, i] == 1:
                clusterAssment[k] = i
    label_pred = clusterAssment
    # print(label_pred)
    CHI = calinski_harabaz_score(dataSet, label_pred)
    print('final membership:', U)
    print('objective function:', J)
    print('final centroids:', centroids)
    # print('cluster assignment:', clusterAssment)
    print('Calinski-Harabaz Index:', CHI)
    for i in range(c):
        print(np.sum(U[:, i]))
    return U, J, centroids, clusterAssment
def fuzzyCMeans(dataSet, c, m):
    # n, p = dataSet.shape
    # 1. init centroids randomly
    initial_centroids = initCentroids(dataSet, c)
    # 2. update membership U with fixed centroids
    last_U = calUwithCent(dataSet, initial_centroids, c, m)
    last_J = objFuncJ(dataSet, last_U, initial_centroids, c, m)
    # 3. update centroids with fixed U
    centroids = calCentwithU(dataSet, last_U, c, m)
    U = calUwithCent(dataSet, centroids, c, m)
    J = objFuncJ(dataSet, U, centroids, c, m)
    count = 0
    while count < 10:
        centroids = calCentwithU(dataSet, U, c, m)
        U = calUwithCent(dataSet, centroids, c, m)
        J = objFuncJ(dataSet, U, centroids, c, m)
        count += 1
    clusterAssment = getCluster(dataSet, U, c)
    # label_pred = clusterAssment
    CHI = calinski_harabaz_score(dataSet, clusterAssment)
    print('final membership:', U)
    print('objective function:', J)
    print('final centroids:', centroids)
    print('cluster assignment:', clusterAssment)
    print('Calinski-Harabaz Index:', CHI)
    for i in range(c):
        print("cluster size:", np.sum(U[:, i]))
    return U, centroids, J, clusterAssment
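getCluster has to turn the fuzzy membership matrix U into hard labels before calinski_harabaz_score can be applied; the standard way is an argmax over each row. A minimal self-contained sketch of that step (harden_membership is a hypothetical stand-in; getCluster's actual behavior is assumed, not shown, in the original code):

import numpy as np

def harden_membership(U):
    # Assign each sample to the cluster with the largest membership value.
    # U has shape (n_samples, n_clusters), with rows summing to 1.
    return np.asarray(U).argmax(axis=1)

U = np.array([[0.9, 0.1],
              [0.2, 0.8],
              [0.6, 0.4]])
print(harden_membership(U))  # [0 1 0]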
# x = glass.values
# data = x[:, [3, 5]]
# wine = datasets.load_wine()
# x = wine.data
# data = x[:, [0, 11]]
# iris = datasets.load_iris()
# original_x = iris.data
# data = original_x[:, :2] + original_x[:, 2:]
# data = pd.read_csv('D:/Tsukuba/My Research/Program/dataset/4_three_ciecles_with_diffR/three_ciecles_with_diffR.csv')
# x = data.values[1:, :2]
# y = data.values[1:, -1]
# print(x, y)
data = pd.read_csv('D:/Tsukuba/My Research/Program/dataset/diff_var.csv')
x = data.values[:, 1:3]
y = data.values[:, 0]
pso = PSO(n_cluster=3, n_particle=10, data=x)  # max_iter, print_debug
pso.run()
pred_cluster = pso.cluster
ari = adjusted_rand_score(y, pred_cluster)
ch = calinski_harabaz_score(x, pred_cluster)
print('ARI:', ari)
print('CH:', ch)
pso.show_cluter()
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import calinski_harabaz_score, silhouette_score

# Load the data
data = pd.read_csv('data.csv')
# print(data.head())
x_train = data[["2019年国际排名", "2018世界杯", "2015亚洲杯"]]

# Feature engineering: standardization
min_max_scaler = preprocessing.StandardScaler()
x_train = min_max_scaler.fit_transform(x_train)

# Build the KMeans clustering estimator
estimator = KMeans(n_clusters=3)
# Train the model
estimator.fit(x_train)
y_predict = estimator.predict(x_train)

# Model evaluation
# CH index: the higher the value, the better the clustering
print('CH index:', calinski_harabaz_score(x_train, y_predict))
# The mean silhouette coefficient lies in [-1, 1]; higher means better clustering
print('Mean silhouette coefficient:', silhouette_score(x_train, y_predict))

# Merge the cluster labels back into the original data
result = pd.concat((data, pd.DataFrame(y_predict)), axis=1)
result.rename({0: u'聚类'}, axis=1, inplace=True)
print(result)
for K in range(4, 5):
    print('\n\n\nK = ', K)
    kmeans = KMeans(n_clusters=K).fit(numpy_selected_data)
    labels = kmeans.labels_
    save_name = 'Set1_KMeans_K=' + str(K) + '.csv'
    np.savetxt(save_name, labels, delimiter=',')
    label_counter = collections.Counter(labels)
    print(label_counter)
    unique_clusters, centroids = getAveragePlayerFromCluster(
        non_normalized_data, labels)
    print("Start calculating centroids")
    i = 0
    column_names = []
    for _cent in centroids:
        if i == 0:
            new_file = pd.DataFrame(_cent.values).T
            column_names = _cent.index
        else:
            new_file = new_file.append(pd.DataFrame(_cent.values).T)
        i = i + 1
    new_file.columns = column_names
    new_file.to_csv('KMeans_K=' + str(K) + '.csv', index=False)
    unique_clusters = set(labels)
    palette = sns.color_palette('hls', len(unique_clusters))
    cluster_colors = [palette[col] for col in labels]
    print("K-Means Calinski-Harabaz: "
          + str(smc.calinski_harabaz_score(selected_data, labels)))
from sklearn.datasets.samples_generator import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import calinski_harabaz_score

# 1. Generate the data
X, y = make_blobs(n_samples=1000, n_features=2,
                  centers=[(-1, -1), (0, 0), (1, 1), (2, 2)],
                  cluster_std=[0.4, 0.2, 0.2, 0.2])
print(X)

# 2. Cluster with KMeans
estimator = KMeans(n_clusters=4)
y_pred = estimator.fit_predict(X)

# 3. Visualize the clustering result
plt.figure(figsize=(5, 4), dpi=80)
# Scatter plot to inspect the distribution of the data
plt.scatter(X[:, 0], X[:, 1], c=y_pred)
# Show
plt.show()

# Evaluation: CH index for different n_clusters
# 2  3096.7473856516135
# 3  2940.6149446783725
# 4  5866.614435267102
# CH index: the higher the value, the better the clustering.
print('CH index', calinski_harabaz_score(X, y_pred))
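The commented scores for n_clusters = 2, 3, 4 above were presumably collected one run at a time; a short sketch that sweeps the candidate values in one loop (same make_blobs setup as above; exact numbers will vary with the random draw unless a random_state is fixed, as assumed here):

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets.samples_generator import make_blobs
from sklearn.metrics.cluster import calinski_harabaz_score

X, _ = make_blobs(n_samples=1000, n_features=2,
                  centers=[(-1, -1), (0, 0), (1, 1), (2, 2)],
                  cluster_std=[0.4, 0.2, 0.2, 0.2], random_state=0)

# Compare the CH index across candidate cluster counts; higher is better.
for k in (2, 3, 4, 5):
    y_pred = KMeans(n_clusters=k, random_state=0).fit_predict(X)
    print(k, calinski_harabaz_score(X, y_pred))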