def test_non_consecutive_labels():
    # regression tests for labels with gaps
    h, c, v = homogeneity_completeness_v_measure(
        [0, 0, 0, 2, 2, 2], [0, 1, 0, 1, 2, 2])
    assert_almost_equal(h, 0.67, 2)
    assert_almost_equal(c, 0.42, 2)
    assert_almost_equal(v, 0.52, 2)

    h, c, v = homogeneity_completeness_v_measure(
        [0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2])
    assert_almost_equal(h, 0.67, 2)
    assert_almost_equal(c, 0.42, 2)
    assert_almost_equal(v, 0.52, 2)

    ari_1 = adjusted_rand_score([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2])
    ari_2 = adjusted_rand_score([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2])
    assert_almost_equal(ari_1, 0.24, 2)
    assert_almost_equal(ari_2, 0.24, 2)

    ri_1 = rand_score([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2])
    ri_2 = rand_score([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2])
    assert_almost_equal(ri_1, 0.66, 2)
    assert_almost_equal(ri_2, 0.66, 2)
def test_non_consecutive_labels():
    # regression tests for labels with gaps
    h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 2, 2, 2],
                                                 [0, 1, 0, 1, 2, 2])
    assert_almost_equal(h, 0.67, 2)
    assert_almost_equal(c, 0.42, 2)
    assert_almost_equal(v, 0.52, 2)

    h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 1, 1, 1],
                                                 [0, 4, 0, 4, 2, 2])
    assert_almost_equal(h, 0.67, 2)
    assert_almost_equal(c, 0.42, 2)
    assert_almost_equal(v, 0.52, 2)
def do_kr(x, y, nclusters, verbose, n_init):
    start_time = time()
    tracemalloc.start()

    # Fill in missing values in numeric attributes in advance
    xDataFrame = pd.DataFrame(x)
    attrList = [0, 3, 4, 5, 6, 8, 9, 11, 12]  # indices of categorical attributes
    numOfRows = x.shape[0]
    numOfCols = x.shape[1]
    for i in range(0, numOfCols):
        if i not in attrList:
            colTmp = x[:, i].copy()
            colTmp.sort()
            if "?" not in colTmp:
                continue
            missIndex = colTmp.tolist().index("?")
            colTmp = list(map(float, colTmp[0:missIndex]))
            average = round(mean(colTmp), 2)
            for j in range(0, numOfRows):
                if xDataFrame.iloc[j, i] == "?":
                    xDataFrame.iloc[j, i] = average
    x = np.asarray(xDataFrame)

    kr = kpro.KPrototypes(n_clusters=nclusters, max_iter=1, init='random',
                          n_init=n_init, verbose=verbose)
    kr.fit_predict(x, categorical=attrList)

    ari = evaluation.rand(kr.labels_, y)
    nmi = evaluation.nmi(kr.labels_, y)
    purity = evaluation.purity(kr.labels_, y)
    homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(
        y, kr.labels_)

    end_time = time()
    elapsedTime = timedelta(seconds=end_time - start_time).total_seconds()
    memoryUsage = tracemalloc.get_tracemalloc_memory() / 1024 / 1024

    if verbose == 1:
        print("Purity = {:8.3f}".format(purity))
        print("NMI = {:8.3f}".format(nmi))
        print("Homogeneity = {:8.3f}".format(homogeneity))
        print("Completeness = {:8.3f}".format(completeness))
        print("V-measure = {:8.3f}".format(v_measure))
        print("Elapsed Time = {:8.3f} secs".format(elapsedTime))
        print("Memory usage = {:8.3f} MB".format(memoryUsage))

    # snapshot = tracemalloc.take_snapshot()
    # top_stats = snapshot.statistics('lineno')
    # print("[ Top 10 ]")
    # for stat in top_stats[:10]:
    #     print(stat)

    tracemalloc.stop()
    return [
        round(purity, 3), round(nmi, 3), round(homogeneity, 3),
        round(completeness, 3), round(v_measure, 3), round(elapsedTime, 3),
        round(memoryUsage, 3)
    ]
def do_kr(x, y, nclusters=4, verbose=1, use_global_attr_count=1, n_init=10):
    kr = k_center1.KRepresentative(n_clusters=nclusters, init='random',
                                   n_init=n_init, verbose=verbose,
                                   use_global_attr_count=use_global_attr_count)
    kr.fit_predict(x)

    ari = evaluation.rand(kr.labels_, y)
    nmi = evaluation.nmi(kr.labels_, y)
    purity = evaluation.purity(kr.labels_, y)
    homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(
        y, kr.labels_)

    if verbose == 1:
        print("Purity = {:8.3f}".format(purity))
        print("NMI = {:8.3f}".format(nmi))
        print("Homogeneity = {:8.3f}".format(homogeneity))
        print("Completeness = {:8.3f}".format(completeness))
        print("V-measure = {:8.3f}".format(v_measure))

    return [
        round(purity, 3), round(nmi, 3), round(homogeneity, 3),
        round(completeness, 3), round(v_measure, 3)
    ]
def test_not_complete_and_not_homogeneous_labeling():
    # neither complete nor homogeneous but not so bad either
    h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 1, 1, 1],
                                                 [0, 1, 0, 1, 2, 2])
    assert_almost_equal(h, 0.67, 2)
    assert_almost_equal(c, 0.42, 2)
    assert_almost_equal(v, 0.52, 2)
def test_homogeneous_but_not_complete_labeling():
    # homogeneous but not complete clustering
    h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 1, 1, 1],
                                                 [0, 0, 0, 1, 2, 2])
    assert_almost_equal(h, 1.00, 2)
    assert_almost_equal(c, 0.69, 2)
    assert_almost_equal(v, 0.81, 2)
def test_complete_but_not_homogeneous_labeling():
    # complete but not homogeneous clustering
    h, c, v = homogeneity_completeness_v_measure([0, 0, 1, 1, 2, 2],
                                                 [0, 0, 1, 1, 1, 1])
    assert_almost_equal(h, 0.58, 2)
    assert_almost_equal(c, 1.00, 2)
    assert_almost_equal(v, 0.73, 2)
def do_kr(x, y, nclusters=4, verbose=1, n_init=10):
    kr = kmodes.KModes(n_clusters=nclusters, max_iter=1, init='Huang',
                       n_init=n_init, verbose=verbose)
    kr.fit_predict(x)

    ari = evaluation.rand(kr.labels_, y)
    nmi = evaluation.nmi(kr.labels_, y)
    purity = evaluation.purity(kr.labels_, y)
    homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(
        y, kr.labels_)

    if verbose == 1:
        print("Purity = {:8.3f}".format(purity))
        print("NMI = {:8.3f}".format(nmi))
        print("Homogeneity = {:8.3f}".format(homogeneity))
        print("Completeness = {:8.3f}".format(completeness))
        print("V-measure = {:8.3f}".format(v_measure))

    return [
        round(purity, 3), round(nmi, 3), round(homogeneity, 3),
        round(completeness, 3), round(v_measure, 3)
    ]
def do_kr(x, y, nclusters, verbose, use_global_attr_count, n_init):
    start_time = time()
    tracemalloc.start()

    categorical = [0, 3, 4, 5, 6, 8, 9, 11, 12]
    kr = KCMM(categorical, n_clusters=nclusters, init='random',
              n_init=n_init, verbose=verbose,
              use_global_attr_count=use_global_attr_count)
    kr.fit_predict(x)
    # print(kr.labels_)

    ari = evaluation.rand(kr.labels_, y)
    nmi = evaluation.nmi(kr.labels_, y)
    purity = evaluation.purity(kr.labels_, y)
    homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(
        y, kr.labels_)

    end_time = time()
    elapsedTime = timedelta(seconds=end_time - start_time).total_seconds()
    memoryUsage = tracemalloc.get_tracemalloc_memory() / 1024 / 1024

    if verbose == 1:
        print("Purity = {:8.3f}".format(purity))
        print("NMI = {:8.3f}".format(nmi))
        print("Homogeneity = {:8.3f}".format(homogeneity))
        print("Completeness = {:8.3f}".format(completeness))
        print("V-measure = {:8.3f}".format(v_measure))
        print("Elapsed Time = {:8.3f} secs".format(elapsedTime))
        print("Memory usage = {:8.3f} MB".format(memoryUsage))

    tracemalloc.stop()
    return [
        round(purity, 3), round(nmi, 3), round(homogeneity, 3),
        round(completeness, 3), round(v_measure, 3), round(elapsedTime, 3),
        round(memoryUsage, 3)
    ]
def test_not_complete_and_not_homogeneous_labeling():
    # neither complete nor homogeneous but not so bad either
    h, c, v = homogeneity_completeness_v_measure(
        [0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2])
    assert_almost_equal(h, 0.67, 2)
    assert_almost_equal(c, 0.42, 2)
    assert_almost_equal(v, 0.52, 2)
def test_complete_but_not_homogeneous_labeling():
    # complete but not homogeneous clustering
    h, c, v = homogeneity_completeness_v_measure(
        [0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 1, 1])
    assert_almost_equal(h, 0.58, 2)
    assert_almost_equal(c, 1.00, 2)
    assert_almost_equal(v, 0.73, 2)
def test_homogeneous_but_not_complete_labeling():
    # homogeneous but not complete clustering
    h, c, v = homogeneity_completeness_v_measure(
        [0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 2, 2])
    assert_almost_equal(h, 1.00, 2)
    assert_almost_equal(c, 0.69, 2)
    assert_almost_equal(v, 0.81, 2)
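# Not part of the original tests: a minimal sketch, assuming the standard
# definitions h = I(C;K) / H(C) and c = I(C;K) / H(K), showing where the
# 1.00 / 0.69 values asserted above come from.
import numpy as np
from scipy.stats import entropy
from sklearn.metrics import mutual_info_score

labels_true = [0, 0, 0, 1, 1, 1]
labels_pred = [0, 0, 0, 1, 2, 2]
mi = mutual_info_score(labels_true, labels_pred)    # I(C;K) in nats
h_true = entropy(np.bincount(labels_true))          # H(C) = ln 2
h_pred = entropy(np.bincount(labels_pred))          # H(K)
print(mi / h_true)   # homogeneity  -> 1.00 (every cluster is pure)
print(mi / h_pred)   # completeness -> ~0.69 (class 1 is split across clusters)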
def compare(hp_w, hp_c):
    kk = hp_w.keys() & hp_c.keys()
    log.debug("Found %d hours in common", len(kk))
    w = [hp_w[k] for k in kk]
    c = [hp_c[k] for k in kk]
    ari = adjusted_rand_score(w, c)
    h, c, v = homogeneity_completeness_v_measure(w, c)
    log.info("ARI: " + str(ari))
    log.info("H: %f, C: %f, V:%f", h, c, v)
    return ari, h, v, c
def test_non_consecutive_labels():
    # regression tests for labels with gaps
    h, c, v = homogeneity_completeness_v_measure(
        [0, 0, 0, 2, 2, 2], [0, 1, 0, 1, 2, 2])
    assert_almost_equal(h, 0.67, 2)
    assert_almost_equal(c, 0.42, 2)
    assert_almost_equal(v, 0.52, 2)

    h, c, v = homogeneity_completeness_v_measure(
        [0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2])
    assert_almost_equal(h, 0.67, 2)
    assert_almost_equal(c, 0.42, 2)
    assert_almost_equal(v, 0.52, 2)

    ari_1 = adjusted_rand_score([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2])
    ari_2 = adjusted_rand_score([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2])
    assert_almost_equal(ari_1, 0.24, 2)
    assert_almost_equal(ari_2, 0.24, 2)
def test_beta_parameter():
    # test for when beta passed to
    # homogeneity_completeness_v_measure
    # and v_measure_score
    beta_test = 0.2
    h_test = 0.67
    c_test = 0.42
    v_test = (1 + beta_test) * h_test * c_test / (beta_test * h_test + c_test)

    h, c, v = homogeneity_completeness_v_measure(
        [0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2], beta=beta_test
    )
    assert_almost_equal(h, h_test, 2)
    assert_almost_equal(c, c_test, 2)
    assert_almost_equal(v, v_test, 2)

    v = v_measure_score([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2], beta=beta_test)
    assert_almost_equal(v, v_test, 2)
def test_beta_parameter():
    # test for when beta passed to
    # homogeneity_completeness_v_measure
    # and v_measure_score
    beta_test = 0.2
    h_test = 0.67
    c_test = 0.42
    v_test = ((1 + beta_test) * h_test * c_test
              / (beta_test * h_test + c_test))

    h, c, v = homogeneity_completeness_v_measure(
        [0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2], beta=beta_test)
    assert_almost_equal(h, h_test, 2)
    assert_almost_equal(c, c_test, 2)
    assert_almost_equal(v, v_test, 2)

    v = v_measure_score(
        [0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2], beta=beta_test)
    assert_almost_equal(v, v_test, 2)
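# Not part of the original tests: the weighted V-measure asserted above is the
# beta-weighted harmonic mean of homogeneity and completeness. A standalone
# helper (hypothetical name weighted_v_measure) would be:
def weighted_v_measure(h, c, beta=1.0):
    # beta < 1 weights homogeneity more strongly, beta > 1 weights completeness.
    if h + c == 0.0:
        return 0.0
    return (1 + beta) * h * c / (beta * h + c)

# e.g. weighted_v_measure(0.67, 0.42, beta=0.2) reproduces v_test above.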
def compute_external_metrics(labels_true: List[str],
                             labels_pred: List[int]) -> ExternalEvaluation:
    if len(labels_true) == 0 and len(labels_pred) == 0:
        return None

    homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(
        labels_true, labels_pred)
    adjusted_mutual_info = adjusted_mutual_info_score(labels_true, labels_pred)
    adjusted_rand_index = adjusted_rand_score(labels_true, labels_pred)
    fowlkes_mallows = fowlkes_mallows_score(labels_true, labels_pred)
    mat = contingency_matrix(labels_true, labels_pred)
    purity = purity_score(mat)
    inverse_purity = purity_score(mat, inverse=True)

    return ExternalEvaluation(homogeneity=homogeneity,
                              completeness=completeness,
                              v_measure=v_measure,
                              adjusted_mutual_information=adjusted_mutual_info,
                              adjusted_rand_index=adjusted_rand_index,
                              fowlkes_mallows=fowlkes_mallows,
                              purity=purity,
                              inverse_purity=inverse_purity)
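# purity_score is not defined in this snippet; the following is a minimal
# sketch (an assumption, not the author's implementation) consistent with the
# contingency-matrix purity computed in compute_scores further below.
import numpy as np

def purity_score(contingency, inverse=False):
    # Purity: each predicted cluster contributes the size of its largest
    # true-class overlap; inverse purity maximises over clusters per true class.
    axis = 1 if inverse else 0
    return np.sum(np.amax(contingency, axis=axis)) / np.sum(contingency)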
cluster_agg_ap = clusterer_agg_ap.fit_predict(data_agg)
cluster_agg_ap2 = clusterer_agg_ap.fit_predict(data_agg2)
cluster_agg_ap4 = clusterer_agg_ap.fit_predict(data_agg4)
cluster_agg_ap4_w = clusterer_agg_ap.fit_predict(data_agg4_w)
cluster_agg_ap4_ws = clusterer_agg_ap.fit_predict(data_agg4_ws)
cluster_agg_ap4_just_season = clusterer_agg_ap.fit_predict(data_agg4_just_season)
cluster_agg_ap4_just_leaf = clusterer_agg_ap.fit_predict(data_agg4_just_leaf)
cluster_agg_ap4_just_seed = clusterer_agg_ap.fit_predict(data_agg4_just_seed)
cluster_agg_ap4_just_weather = clusterer_agg_ap.fit_predict(data_agg4_just_weather)

mutual_info_score = adjusted_mutual_info_score(labels, cluster_ap)
mutual_info_score_agg = adjusted_mutual_info_score(labels, cluster_agg_ap)

v_score = homogeneity_completeness_v_measure(labels, cluster_ap)
v_score_agg2 = homogeneity_completeness_v_measure(labels, cluster_agg_ap2)
v_score_agg4 = homogeneity_completeness_v_measure(labels, cluster_agg_ap4)
v_score_agg4_w = homogeneity_completeness_v_measure(labels, cluster_agg_ap4_w)
v_score_agg4_ws = homogeneity_completeness_v_measure(labels, cluster_agg_ap4_ws)
v_score_agg4_just_season = homogeneity_completeness_v_measure(labels, cluster_agg_ap4_just_season)
v_score_agg4_just_leaf = homogeneity_completeness_v_measure(labels, cluster_agg_ap4_just_leaf)
v_score_agg4_just_seed = homogeneity_completeness_v_measure(labels, cluster_agg_ap4_just_seed)
v_score_agg4_just_weather = homogeneity_completeness_v_measure(labels, cluster_agg_ap4_just_weather)

print(v_score)
print("DBSCAN evaluation: ",mutual_info_score(digits.target, labels_dbscan)) # AgglomerativeClustering print("AgglomerativeClustering evaluation: ",mutual_info_score(digits.target, labels_Agg)) # <a id='2.7.3'></a> # #### 2.7.3 Thực hiện đáng giá theo homogeneity_completeness_v_measure # - Giá trị trả về trong khoảng 0 >> 1 # - Càng về 1 thì độ khớp của True labels và cluster labels càng cao. # In[140]: # KMeans print("KMeans evaluation: ",homogeneity_completeness_v_measure(digits.target, labels)) # Spectral cluster print("Spectral evaluation: ",homogeneity_completeness_v_measure(digits.target, labels_spectral)) # DBSCAN print("DBSCAN evaluation: ",homogeneity_completeness_v_measure(digits.target, labels_dbscan)) # AgglomerativeClustering print("AgglomerativeClustering evaluation: ",homogeneity_completeness_v_measure(digits.target, labels_Agg)) # <a id='2.8'></a> # ### 2.8 Nhận xét # - Đối với data là chữ số viết tay (Digits data) thì Agglomerative clustering hiệu quả hơn hẳn so với KMeans, Spectral, DBSCAN clustering # - DBSCAN clustering: khó sử dụng bởi parameters: eps và min_samples. Thử nhiều lần giá trị của eps và min_sample mới cho kết quả khả quan.
def test_homogeneity_completeness_v_measure_sparse():
    labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
    h, c, v = homogeneity_completeness_v_measure(labels_a, labels_b)
    h_s, c_s, v_s = homogeneity_completeness_v_measure(labels_a, labels_b,
                                                       sparse=True)
    assert_array_almost_equal([h, c, v], [h_s, c_s, v_s])
labels = read_labels('rain_labels.csv')

# Auto encoder
autoencoder, model_evaluation, model_prediction = auto_encode(
    model_training=model_training,
    model_training_target=model_training,
    data_model=model,
    layer_encoder_2=38,
    layer_encoder_3=16,
    latent_space=2,
    layer_dencoder_1=16,
    layer_dencoder_2=38,
    epochs=50)

print("\n\n K means using the ORIGINAL data-set\n")
k_means_pred_original = k_means(model, labels, 2)
print("\n Homogeneity Completeness V-Measure")
print(homogeneity_completeness_v_measure(labels, k_means_pred_original))

print("\n\n K means using the Auto-ENCODER data-set\n")
k_means_pred_ae = k_means(model_prediction, labels, 2)
print("\n Homogeneity Completeness V-Measure")
print(homogeneity_completeness_v_measure(labels, k_means_pred_ae))

latent_autoencoder, latent_model_prediction = auto_encode_clustering(
    data_model=model,
    latent_layer_encoder_2=38,
    latent_layer_encoder_3=16,
    latent_latent_space=2,
    autoencoder=autoencoder)

print("\n\n K means using the LATENT space data\n")
k_means_pred_latent = k_means(latent_model_prediction, labels, 2)
print("\n Homogeneity Completeness V-Measure")
print(homogeneity_completeness_v_measure(labels, k_means_pred_latent))

latent_autoencoder_softmax, latent_model_prediction_softmax = auto_encode_clustering_softmax(
cluster_ap = clusterer_ap.fit_predict(data)
cluster_agg_ap = clusterer_agg_ap.fit_predict(data_agg)
cluster_agg_ap2 = clusterer_agg_ap.fit_predict(data_agg2)
cluster_agg_ap4 = clusterer_agg_ap.fit_predict(data_agg4)
cluster_agg_ap4_w = clusterer_agg_ap.fit_predict(data_agg4_w)
cluster_agg_ap4_ws = clusterer_agg_ap.fit_predict(data_agg4_ws)
cluster_agg_ap4_just_season = clusterer_agg_ap.fit_predict(data_agg4_just_season)
cluster_agg_ap4_just_leaf = clusterer_agg_ap.fit_predict(data_agg4_just_leaf)
cluster_agg_ap4_just_seed = clusterer_agg_ap.fit_predict(data_agg4_just_seed)
cluster_agg_ap4_just_weather = clusterer_agg_ap.fit_predict(data_agg4_just_weather)

mutual_info_score = adjusted_mutual_info_score(labels, cluster_ap)
mutual_info_score_agg = adjusted_mutual_info_score(labels, cluster_agg_ap)

v_score = homogeneity_completeness_v_measure(labels, cluster_ap)
v_score_agg2 = homogeneity_completeness_v_measure(labels, cluster_agg_ap2)
v_score_agg4 = homogeneity_completeness_v_measure(labels, cluster_agg_ap4)
v_score_agg4_w = homogeneity_completeness_v_measure(labels, cluster_agg_ap4_w)
v_score_agg4_ws = homogeneity_completeness_v_measure(labels, cluster_agg_ap4_ws)
v_score_agg4_just_season = homogeneity_completeness_v_measure(labels, cluster_agg_ap4_just_season)
v_score_agg4_just_leaf = homogeneity_completeness_v_measure(labels, cluster_agg_ap4_just_leaf)
v_score_agg4_just_seed = homogeneity_completeness_v_measure(labels, cluster_agg_ap4_just_seed)
v_score_agg4_just_weather = homogeneity_completeness_v_measure(labels, cluster_agg_ap4_just_weather)

print(v_score)
print(v_score_agg2)
print(v_score_agg4_just_leaf)
print(v_score_agg4_just_seed)
print(v_score_agg4)  # Leaf and seed
def compute_scores(self, x):
    self.cluster_labels = np.ndarray((x.shape[0], ))
    for i in range(0, x.shape[0], self.batch_size):
        predictions = self.kmeans.predict(x[i:(i + self.batch_size)])
        self.cluster_labels[i:(i + self.batch_size)] = predictions
        if (i + self.batch_size) > x.shape[0]:
            predictions = self.kmeans.predict(x[i:x.shape[0]])
            self.cluster_labels[i:x.shape[0]] = predictions

    confusion_matrix = cscores.contingency_matrix(self.labels_true,
                                                  self.labels_pred)
    purity_score = np.sum(np.amax(confusion_matrix, axis=0)) / np.sum(confusion_matrix)
    homogeneity_score, completeness_score, v_measure_score = cscores.homogeneity_completeness_v_measure(
        self.labels_true, self.labels_pred)

    scores = [
        # ['calinski_harabasz_score', 'internal',
        #  cscores.calinski_harabasz_score(x, self.cluster_labels)],
        ['davies_bouldin_score', 'internal',
         metrics.davies_bouldin_score(x, self.cluster_labels)],
        ['silhouette_score', 'internal',
         metrics.silhouette_score(x, self.cluster_labels)],
        # ['silhouette_samples', 'internal',
        #  cscores.silhouette_samples(x, self.cluster_labels)],
        ['purity_score', 'external', purity_score],
        ['adjusted_rand_score', 'external',
         cscores.adjusted_rand_score(self.labels_true, self.labels_pred)],
        ['completeness_score', 'external', completeness_score],
        ['fowlkes_mallows_score', 'external',
         cscores.fowlkes_mallows_score(self.labels_true, self.labels_pred)],
        ['homogeneity_score', 'external', homogeneity_score],
        ['adjusted_mutual_info_score', 'external',
         cscores.adjusted_mutual_info_score(self.labels_true, self.labels_pred)],
        ['mutual_info_score', 'external',
         cscores.mutual_info_score(self.labels_true, self.labels_pred)],
        ['normalized_mutual_info_score', 'external',
         cscores.normalized_mutual_info_score(self.labels_true, self.labels_pred)],
        ['v_measure_score', 'external', v_measure_score]
    ]

    scores = pd.DataFrame(scores, columns=['name', 'type', 'score'])
    scores.to_csv(files.small_images_classes_kmeans_scores, index=False)