def test_synthetic_circles(self):
    """Benchmark KLines against sklearn-style KMeans-with-imputation on two
    concentric circles with heavy noise, then assert the MSE ratio is acceptable.

    Pipeline (per the visible code): generate data -> knock out entries ->
    baseline kmeans_missing reconstruction -> KLines coreset clustering over
    ITER runs -> compare mean squared reconstruction errors.
    """
    print(''' two concentric circles ''')
    N = 10**3
    # noise=1.0 is very large relative to unit circles, so clusters overlap heavily.
    X, y = make_circles(n_samples=N, noise=1.0)
    k = len(np.unique(y))
    # create_incomplete_matrix presumably NaNs out some entries — TODO confirm against its definition.
    X_incomplete = create_incomplete_matrix(X)
    labels, _, X_hat = kmeans_missing(X_incomplete, k)
    sklearn_mse = ((X - X_hat)**2).mean()
    score = metrics.homogeneity_completeness_v_measure(labels, y)
    print(f'sklearn mse: {sklearn_mse}')
    print(f'sklearn scores: {score}')
    # Build lines through the observed coordinates: displacement = observed values
    # (NaN -> 0), span = indicator of the *missing* coordinates (the free direction).
    displacements = np.nan_to_num(X_incomplete)
    spans = np.nan_to_num(X_incomplete)
    spans[spans == 0] = 1  # missing entries (now 0) become the spanning direction
    spans[spans != 1] = 0  # observed entries are fixed (no span)
    L = SetOfLines(spans, displacements, np.ones(N), np.ones(N))
    config = ParameterConfig()
    ## data
    m = 100  # coreset size ~ reduction ratio
    tau = 1e-2
    config.a_b_approx_minimum_number_of_lines = 100  # constant 100, line 2, algo 2 BI-CRITERIA
    config.sample_size_for_a_b_approx = int(m * 1.05)  # |S| >= m, line 3 of algo 2
    # note: there'll be a O(|S|^2) cost while computing algo 1
    config.farthest_to_centers_rate_in_a_b_approx = 4 / 11  # opp of 7/11, line 6, algo 2 BI-CRITERIA
    config.number_of_remains_multiply_factor = int(math.log(N)) // k  # this is `b` in algo 2, other paper, set as random here - how to calculate it?
    config.closest_to_median_rate = (1 - tau) / (2 * k)  # refer line 4, algo 1, other paper
    config.median_sample_size = int(N * 0.05)  # size of q_i, line 3, algo 2, other paper
    config.max_sensitivity_multiply_factor = 100  # for outliers in coresets
    config.number_of_remains = 20
    SAMPLE_SIZE = 50  # keep it < 100, works fast
    ITER = 5
    klines_mse = np.zeros(ITER)
    # NOTE: [[]] * ITER shares one inner list, but each slot is *reassigned*
    # below (scores[i] = ...), so the aliasing is harmless here.
    scores = [[]] * ITER
    for i in range(ITER):
        print(f'Running KLines iter {i+1} of {ITER}')
        X_klines, kl_labels = customStreamer(L, k, m, SAMPLE_SIZE, config)
        klines_mse[i] = ((X - X_klines)**2).mean()
        scores[i] = metrics.homogeneity_completeness_v_measure(kl_labels, y)
    print(f"Klines MSE: {klines_mse.mean()}")
    print(f"Klines scores: {np.array(scores).mean(axis=0)}")
    # KLines may be at most 2x worse than the sklearn baseline on this noisy set.
    assert sklearn_mse / klines_mse.mean() > 0.5
def expectation_maximization(X, y, dataset_name):
    """Sweep GaussianMixture component counts k=2..50 and plot clustering scores.

    For each k, fits a GaussianMixture on the training split and records the
    Fowlkes-Mallows score plus homogeneity/completeness/V-measure on both the
    train and test splits, then renders two matplotlib figures.

    :param X: feature matrix.
    :param y: ground-truth labels (used only for external cluster metrics).
    :param dataset_name: name inserted into the plot titles.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=65)
    train_scores, train_homo, train_completeness, train_v_score = [], [], [], []
    test_scores, test_homo, test_completeness, test_v_score = [], [], [], []
    # One shared k range (was duplicated: a comprehension for plotting plus a
    # separate range() in the loop, which could silently drift apart).
    kvals = list(range(2, 51))
    for k in kvals:
        print("k= {}".format(k))
        clf = GaussianMixture(n_components=k, max_iter=1000)
        # Train on train data, recording score, homogeneity, completeness, and v_measure
        train_pred = clf.fit_predict(X_train)
        train_scores.append(fowlkes_mallows_score(y_train, train_pred))
        homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(y_train, train_pred)
        train_homo.append(homogeneity)
        train_completeness.append(completeness)
        train_v_score.append(v_measure)
        # Evaluate same metrics on test set
        test_pred = clf.predict(X_test)
        test_scores.append(fowlkes_mallows_score(y_test, test_pred))
        homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(y_test, test_pred)
        test_homo.append(homogeneity)
        test_completeness.append(completeness)
        test_v_score.append(v_measure)
    print("done")
    print("generating plots")
    plt.figure()
    # Typo fix: the metric is Fowlkes-Mallows, not "Folkes-Mallows".
    plt.title('Fowlkes-Mallows Score of Expectation Maximization on {} Dataset'.format(dataset_name))
    plt.xlabel('Number of Components')
    plt.ylabel('Fowlkes-Mallows Score')
    plt.plot(kvals, train_scores, label='Training Score')
    plt.plot(kvals, test_scores, label='Test Score')
    plt.legend(loc='upper left')
    plt.show(block=False)
    plt.figure()
    plt.title('Performance Metrics of Expectation Maximization on {} Dataset'.format(dataset_name))
    plt.xlabel('K Value (Number of Clusters)')
    plt.ylabel('Score (Range 0.0 to 1.0)')
    plt.plot(kvals, train_homo, label='Training Homogeneity')
    plt.plot(kvals, test_homo, label='Test Homogeneity')
    plt.plot(kvals, train_completeness, label='Training Completeness')
    plt.plot(kvals, test_completeness, label='Test Completeness')
    plt.plot(kvals, train_v_score, label='Training V-Measure')
    plt.plot(kvals, test_v_score, label='Test V-Measure')
    plt.legend(loc='upper left')
    plt.show(block=False)
def getClusteringEvalPlots(dataset):
    """For each dataset, run KMeans and EM over k=2..10, collect external and
    internal cluster-quality metrics, and save silhouette/score plots to disk.
    """
    noOfClusters = range(2, 11, 1)
    for ds in dataset:
        # NOTE(review): sse starts with a spurious empty-list element and is
        # never plotted below; completeness values are computed but not stored.
        sse = [[]]
        sil = [[[], []]]  # sil[0][0] = KM silhouettes, sil[0][1] = EM silhouettes
        # scores[metric][0] = KM series, scores[metric][1] = EM series;
        # indices: 0=V-measure, 1=AMI, 2=ARI, 3=homogeneity (slot 4 unused).
        scores = [[[], []], [[], []], [[], []], [[], []], [[], []]]
        for cluster in noOfClusters:
            kmLearner = Clustering.KM(n_clusters=cluster)
            kmLearner.getLearner().fit(ds.training_x)
            emLearner = Clustering.EM(n_components=cluster)
            emLearner.getLearner().fit(ds.training_x)
            clustringY_KM = kmLearner.getLearner().predict(ds.training_x)
            clustringY_EM = emLearner.getLearner().predict(ds.training_x)
            homogeneityKM, completenessKM, v_measureKM = homogeneity_completeness_v_measure(ds.training_y, clustringY_KM)
            AMISKM = adjusted_mutual_info_score(ds.training_y, clustringY_KM)
            ARSKM = adjusted_rand_score(ds.training_y, clustringY_KM)
            silhouetteKM = silhouette_score(ds.training_x, clustringY_KM)
            homogeneityEM, completenessEM, v_measureEM = homogeneity_completeness_v_measure(ds.training_y, clustringY_EM)
            AMISEM = adjusted_mutual_info_score(ds.training_y, clustringY_EM)
            ARSEM = adjusted_rand_score(ds.training_y, clustringY_EM)
            silhouetteEM = silhouette_score(ds.training_x, clustringY_EM)
            sse.append(kmLearner.getLearner().inertia_)
            sil[0][0].append(silhouetteKM)
            scores[0][0].append(v_measureKM)
            scores[1][0].append(AMISKM)
            scores[2][0].append(ARSKM)
            scores[3][0].append(homogeneityKM)  # collected but not plotted below
            sil[0][1].append(silhouetteEM)
            scores[0][1].append(v_measureEM)
            scores[1][1].append(AMISEM)
            scores[2][1].append(ARSEM)
            scores[3][1].append(homogeneityEM)  # collected but not plotted below
        # Silhouette plot (KM solid, EM dashed).
        plt.style.use('seaborn-whitegrid')
        plt.plot(noOfClusters, sil[0][0], label='Silhouette Score, KM', marker='o')
        plt.plot(noOfClusters, sil[0][1], label='Silhouette Score, EM', marker='o', linestyle='--')
        plt.ylabel('Silhouette Score', fontsize=12)
        plt.xlabel('K', fontsize=12)
        plt.title('Silhouette Plot for ' + ds.name, fontsize=12, y=1.03)
        plt.legend()
        plt.savefig('Figures/Clustering/Silhouette for ' + ds.name + '.png')
        plt.close()
        # External-metric plot (V-measure / AMI / ARI, KM vs EM).
        plt.style.use('seaborn-whitegrid')
        plt.plot(noOfClusters, scores[0][0], label='V Measure, KM', marker='o')
        plt.plot(noOfClusters, scores[1][0], label='Adj. Mutual Info, KM', marker='o')
        plt.plot(noOfClusters, scores[2][0], label='Adj. Rand. Score, KM', marker='o')
        plt.plot(noOfClusters, scores[0][1], label='V Measure, EM', marker='o', linestyle='--')
        plt.plot(noOfClusters, scores[1][1], label='Adj. Mutual Info, EM', marker='o', linestyle='--')
        plt.plot(noOfClusters, scores[2][1], label='Adj. Rand. Score, EM', marker='o', linestyle='--')
        plt.ylabel('Score', fontsize=12)
        plt.xlabel('K', fontsize=12)
        plt.title('Score Plot for ' + ds.name, fontsize=12, y=1.03)
        plt.legend()
        plt.savefig('Figures/Clustering/Score for ' + ds.name + '.png')
        plt.close()
def test_benchmark_Chainlink(self):
    """Benchmark KLines vs kmeans-with-imputation on the Chainlink dataset
    (loaded from data/Chainlink.npz) and assert the MSE ratio is acceptable.
    """
    print('Clustering Chainlink.npz')
    npzfile = np.load('data/Chainlink.npz')
    X, y = npzfile['X'], npzfile['y']
    (N, _), k = X.shape, np.unique(y).shape[0]
    print(f'#Datapoints {N}')
    # create_incomplete_matrix presumably NaNs out some entries — TODO confirm.
    X_incomplete = create_incomplete_matrix(X)
    labels, _, X_hat = kmeans_missing(X_incomplete, k)
    sklearn_mse = ((X - X_hat)**2).mean()
    score = metrics.homogeneity_completeness_v_measure(labels, y)
    print(f'MSE sklearn: {sklearn_mse}')
    print(f'MSE scores/measures: {score}')
    # Encode each partially-observed point as a line: displacement = observed
    # coordinates (NaN -> 0), span = indicator of the missing coordinates.
    displacements = np.nan_to_num(X_incomplete)
    spans = np.nan_to_num(X_incomplete)
    spans[spans == 0] = 1  # missing entries become the free (spanning) direction
    spans[spans != 1] = 0  # observed entries are fixed
    L = SetOfLines(spans, displacements, np.ones(N), np.ones(N))
    config = ParameterConfig()
    ## data
    m = 60  # coreset size ~ reduction ratio
    tau = 1e-2
    config.a_b_approx_minimum_number_of_lines = 40  # constant 100, line 2, algo 2 BI-CRITERIA
    config.sample_size_for_a_b_approx = int(m * 1.05)  # |S| >= m, line 3 of algo 2
    # note: there'll be a O(|S|^2) cost while computing algo 1
    config.farthest_to_centers_rate_in_a_b_approx = 4 / 11  # opp of 7/11, line 6, algo 2 BI-CRITERIA
    config.number_of_remains_multiply_factor = int(math.log(N)) // k  # this is `b` in algo 2, other paper, set as random here - how to calculate it?
    config.closest_to_median_rate = (1 - tau) / (2 * k)  # refer line 4, algo 1, other paper
    config.median_sample_size = int(N * 0.05)  # size of q_i, line 3, algo 2, other paper
    config.max_sensitivity_multiply_factor = 100  # for outliers in coresets
    config.number_of_remains = 20
    SAMPLE_SIZE = 50
    ITER = 5
    klines_mse = np.zeros(ITER)
    # [[]] * ITER aliases one list, but every slot is reassigned below, so OK.
    scores = [[]] * ITER
    for i in range(ITER):
        print(f'Running KLines iter {i+1} of {ITER}')
        X_klines, kl_labels = customStreamer(L, k, m, SAMPLE_SIZE, config)
        klines_mse[i] = ((X - X_klines)**2).mean()
        scores[i] = metrics.homogeneity_completeness_v_measure(kl_labels, y)
    print(f"Klines MSE: {klines_mse.mean()}")
    print(f"Scores: {np.array(scores).mean(axis=0)}")
    # KLines must reach at least 80% of the sklearn baseline's quality here.
    assert sklearn_mse / klines_mse.mean() > 0.8
def test_avg_clustering_with_model_selection(db_dirs, method, val_dirs_count=2):
    """Grid-search clustering threshold(s) by average V-measure over validation
    dirs, then evaluate with the best threshold via test_avg_clustering.

    Two search modes, selected by the module-level `use_clustering` flag:
    rank-order clustering searches a (distance, rank) pair; otherwise a single
    distance threshold is searched. Both use greedy early stopping: the scan
    breaks as soon as the statistic stops improving.
    """
    bestStatistic, prevStatistic = 0, 0
    # NOTE(review): the parameter is immediately overwritten — all dirs are
    # used for validation AND final evaluation (marked "hack" by the author).
    val_dirs_count = len(db_dirs)  #hack!!!
    if use_clustering == rankorder_clustering:
        bestThreshold = (0, 0)
        for distanceThreshold in np.linspace(1.02, 1.1, 9):
            prevStatistic = 0
            bestChanged = False
            for rankThreshold in range(12, 22, 2):
                currentStatistic = 0
                for i, db_dir in enumerate(db_dirs[:val_dirs_count]):
                    num_of_classes, num_of_clusters, y_true, y_pred = get_clustering_results(
                        db_dir, method, (distanceThreshold, rankThreshold))
                    #bcubed_precision,bcubed_recall,bcubed_fmeasure=BCubed_stat(y_true, y_pred)
                    #currentStatistic+=bcubed_fmeasure
                    homogeneity, completeness, v_measure = metrics.homogeneity_completeness_v_measure(
                        y_true, y_pred)
                    currentStatistic += v_measure
                    #print(num_of_classes)
                currentStatistic /= val_dirs_count  # average V-measure over validation dirs
                print(distanceThreshold, rankThreshold, currentStatistic)
                if currentStatistic > bestStatistic:
                    bestStatistic = currentStatistic
                    bestThreshold = (distanceThreshold, rankThreshold)
                    bestChanged = True
                # Stop scanning rank thresholds once the score stops improving.
                if currentStatistic <= prevStatistic:  #-0.01
                    break
                prevStatistic = currentStatistic
            # If this distance threshold produced no new best, stop entirely.
            if not bestChanged:
                break
    else:
        bestThreshold = 0
        for distanceThreshold in np.linspace(0.6, 1.3, 71):
            currentStatistic = 0
            for i, db_dir in enumerate(db_dirs[:val_dirs_count]):
                num_of_classes, num_of_clusters, y_true, y_pred = get_clustering_results(
                    db_dir, method, distanceThreshold)
                #bcubed_precision,bcubed_recall,bcubed_fmeasure=BCubed_stat(y_true, y_pred)
                #currentStatistic+=bcubed_fmeasure
                homogeneity, completeness, v_measure = metrics.homogeneity_completeness_v_measure(
                    y_true, y_pred)
                currentStatistic += v_measure
                #print(num_of_classes)
            currentStatistic /= val_dirs_count
            #print(distanceThreshold,currentStatistic)
            if currentStatistic > bestStatistic:
                bestStatistic = currentStatistic
                bestThreshold = distanceThreshold
            # Early stop when the score drops noticeably below the previous one.
            if currentStatistic < prevStatistic - 0.01:
                break
            prevStatistic = currentStatistic
    print('method:', method, 'bestParams:', bestThreshold, 'bestStatistic:', bestStatistic)
    #test_avg_clustering(db_dirs[val_dirs_count:],method,bestThreshold)
    test_avg_clustering(db_dirs, method, bestThreshold)  #hack!!!
def test():
    """Benchmark kmodes_fit against sklearn KMeans on synthetic binary data.

    Builds `n_clusters` random binary prototypes, generates 1500 noisy samples
    per prototype (50 random bits flipped each), shuffles, then times and
    scores both clusterers with homogeneity/completeness/V-measure.

    Fixes: the original used Python 2 `print` statements and `xrange`, which
    are a SyntaxError / NameError under Python 3 (the rest of the file already
    uses Python-3-only f-strings).
    """
    from sklearn.metrics import homogeneity_completeness_v_measure
    from sklearn.cluster import KMeans
    from time import time
    n_clusters = 40
    # Random binary prototypes: ~30% of bits set.
    mat = np.random.random([n_clusters, 250])
    mat[mat > 0.7] = 1
    mat[mat <= 0.7] = 0
    mats = []
    labels = []
    n_samples = 1500
    for i in range(mat.shape[0]):
        m = np.zeros([n_samples, mat.shape[1]])
        l = np.zeros(n_samples, dtype=np.int32) + i
        m[:, :] = mat[i]
        # Corrupt each sample by flipping 50 randomly chosen bits.
        for j in range(n_samples):
            inds = np.random.permutation(np.arange(mat.shape[1]))[:50]
            m[j, inds] = 1 - m[j, inds]
        mats.append(m)
        labels.append(l)
    mat = np.concatenate(mats)
    labels = np.concatenate(labels)
    # Shuffle samples and labels together.
    inds = np.random.permutation(np.arange(mat.shape[0]))
    mat = mat[inds]
    labels = labels[inds]
    st = time()
    modes, clusters = kmodes_fit(mat, n_clusters, 20, 3000)
    print("elapsed time:", time() - st)
    st = time()
    clusters_km = KMeans(n_clusters, max_iter=20, n_init=1, tol=0).fit_predict(mat)
    print("elapsed time:", time() - st)
    print(homogeneity_completeness_v_measure(labels, clusters))
    print(homogeneity_completeness_v_measure(labels, clusters_km))
def tracking(self, d_start=gb.D_START_TRACKING, d_end=gb.D_END_TRACKING, path=""):
    """Run FSP and SSP predictors over [d_start, d_end) in hourly chunks and,
    for artificial data, score both label streams against the ground truth.

    :param d_start: tracking window start (default from gb config).
    :param d_end: tracking window end.
    :param path: plot output path (plotting calls are currently commented out).
    :return: ((ARI pair), (AMI pair), (homogeneity pair), (completeness pair),
        (V-measure pair)), plus the raw time/axis/label streams — or (0., 0.)
        when gb.ARTIFICIAL is false and no ground truth exists.
    """
    print("\n --------- tracking ...")
    times_fsp, axes_fsp, labels_fsp = [], [], []
    times_ssp, axes_ssp, labels_ssp = [], [], []
    timedelta = datetime.timedelta(milliseconds=60 * 60 * 1000)  # one hour per chunk
    # read chunk by chunk (each chunk is of 'timedelta' milliseconds)
    date = d_start
    while date < d_end:
        # Shrink the final chunk so it never overruns d_end.
        if date + timedelta >= d_end:
            timedelta = d_end - date
        times, axes, labels = self.predict_fsp(d_start=date, d_end=date + timedelta)
        # self.plot_colored_signals(times, axes, labels, path, figname="_FSP.png")
        times_fsp += times; axes_fsp += axes; labels_fsp += labels
        # update=True: SSP presumably adapts its state as it consumes chunks — TODO confirm.
        times, axes, labels = self.predict_ssp(d_start=date, d_end=date + timedelta, update=True)
        # self.plot_colored_signals(times, axes, labels, path, figname="_SSP.png")
        times_ssp += times; axes_ssp += axes; labels_ssp += labels
        date += timedelta
    # ----------------------------
    if gb.ARTIFICIAL:
        # Ground-truth modes are only available for artificially generated signals.
        times, values, true_labels = self.sigReaders[0].getSignal(start=d_start, end=d_end, dated=gb.DATED, get_modes=True)
        ari_fps = adjusted_rand_score(true_labels, labels_fsp); ari_sps = adjusted_rand_score(true_labels, labels_ssp)
        ami_fps = adjusted_mutual_info_score(true_labels, labels_fsp); ami_sps = adjusted_mutual_info_score(true_labels, labels_ssp)
        ho_fps, com_fps, vm_fps = homogeneity_completeness_v_measure(true_labels, labels_fsp); ho_sps, com_sps, vm_sps = homogeneity_completeness_v_measure(true_labels, labels_ssp)
        print("---------------------------------------------------")
        print("adjusted_rand_score \t (ari_fps, ari_sps)", (ari_fps, ari_sps))
        print("adjusted_mutual_info \t (ami_fps, ami_sps)", (ami_fps, ami_sps))
        print("homogeneity \t (ho_fps, ho_sps)", (ho_fps, ho_sps))
        print("completeness \t (com_fps, com_sps)", (com_fps, com_sps))
        print("v_measure \t (vm_fps, vm_sps)", (vm_fps, vm_sps))
        #return (ari_fps, ari_sps), (ami_fps, ami_sps), (ho_fps, ho_sps), (com_fps, com_sps), (vm_fps, vm_sps)
        return ((ari_fps, ari_sps), (ami_fps, ami_sps), (ho_fps, ho_sps), (com_fps, com_sps), (vm_fps, vm_sps)), (times_fsp, axes_fsp, labels_fsp, times_ssp, axes_ssp, labels_ssp)
    else:
        return 0., 0.
def clustering_performance_evaluation(X, y_pred, y_true): """ this function implement multiple evaluation metrics for clustering analysis. this method will be used in order to asses the quality of a clustering solution based on multiple criteria :param X: input matrix :param y_pred: predicted vector :param y_true: ground truth - if none - one do not have this knowledge :return: a dictionary with all measures """ result = {} result['ARI'] = metrics.adjusted_rand_score(y_true, y_pred) result['AMI'] = metrics.adjusted_mutual_info_score(y_true, y_pred) result['NMI'] = metrics.normalized_mutual_info_score(y_true, y_pred) h, c, v = metrics.homogeneity_completeness_v_measure(y_true, y_pred) result['H**o'] = h result['Comp'] = c result['V'] = v result['FM'] = metrics.fowlkes_mallows_score(y_true, y_pred) result['Sil'] = metrics.silhouette_score(X[['entropy', 'joint_entropy']], y_pred, metric='euclidean') return result
def K_Means_RFE(feature_set, label_set, depth_index, score_spread):
    """A recursive function to extract the best features and score vs # of features"""
    # Base case: a single remaining feature cannot be reduced further.
    if len(feature_set[0]) == 1:
        return (-2, -2, -2), [], score_spread
    # (h, c, v) sentinel below any real score; v-measure lives in [0, 1].
    best_local_score = (-2, -2, -2)
    best_local_subset = []
    # Try dropping each column in turn; keep the subset with the best v-measure.
    for col in range(len(feature_set[0])):
        candidate = numpy.delete(feature_set, col, 1)
        model = KMeans(n_clusters=8, n_init=10).fit(candidate)
        hcv = metrics.homogeneity_completeness_v_measure(label_set, model.labels_)
        if hcv[2] > best_local_score[2]:
            best_local_score = hcv
            best_local_subset = candidate
    score_spread = numpy.insert(score_spread, 0, best_local_score[2])
    print("Now entering depth: ", depth_index + 1)
    # Recurse on the winning subset, then keep whichever level scored higher.
    best_score, best_features, score_spread = K_Means_RFE(
        best_local_subset, label_set, depth_index + 1, score_spread)
    if best_local_score[2] > best_score[2]:
        best_score = best_local_score
        best_features = best_local_subset
    print("Now leaving depth: ", depth_index)
    return best_score, best_features, score_spread
def clustering_metrics(labels_pred, labels_true=None, feature=None):
    """Print clustering evaluation metrics.

    Metrics requiring ground-truth labels:
        ARI (adjusted Rand index): order of arguments does not matter; range
            [-1, 1]; negative values indicate independent labelings, 1 means
            the two labelings match exactly.
        AMI (adjusted mutual information): best value is 1; non-positive when
            uncorrelated with labels_true.
        Homogeneity, completeness, and their harmonic mean V-measure: range
            0 (worst) to 1 (best).
        Fowlkes-Mallows index: geometric mean of precision and recall between
            the two labelings.
    Metrics not requiring ground truth:
        Silhouette coefficient: range [-1, 1]; higher is better (tight,
            well-separated clusters).
        Calinski-Harabasz index: higher is better.

    Fixes: the original used Python 2 `print` statements, a SyntaxError under
    Python 3. Printed (Chinese) message strings are preserved byte-for-byte.
    """
    if labels_true is not None:
        print(u'兰德指数 ARI: ', metrics.adjusted_rand_score(labels_true, labels_pred))
        print(u'互信息 AMI: ', metrics.adjusted_mutual_info_score(labels_true, labels_pred))
        print(u'同质性、完整性、两者的调和平均V-measure: ', metrics.homogeneity_completeness_v_measure(labels_true, labels_pred))
        print(u'Fowlkes-Mallows指数 FMI: ', metrics.fowlkes_mallows_score(labels_true, labels_pred))
    if feature is not None:
        print(u'轮廓系数: ', metrics.silhouette_score(feature, labels_pred, metric='euclidean'))
        # NOTE(review): sklearn renamed this to calinski_harabasz_score in 0.23;
        # kept as-is to match the sklearn version this file was written against.
        print(u'Calinski-Harabaz Index: ', metrics.calinski_harabaz_score(feature, labels_pred))
def score(self: 'Frame2D', score_frame: 'Frame2D', label_ix: int = -1, glcm_radius=None): """ Scores the current frame kmeans with a scoring image :param label_ix: The label index to score against score_frame :param score_frame: The score as Frame2D :param glcm_radius: The radius of GLCM used if applicable. This will crop the Frame2D automatically to fit. :return: A Dictionary of various scoring algorithm results, {'Custom', 'Homogeneity', 'Completeness', 'V Measure'} """ # Convert grayscale to labels if glcm_radius is not None: score_frame = score_frame.crop_glcm(glcm_radius) true = self.labelize(score_frame.data[..., 0]).flatten() pred = self.data[..., label_ix].flatten() score = self.scorer_pair(true, pred)['score'],\ *homogeneity_completeness_v_measure(true, pred) return { "Custom": score[0], "Homogeneity": score[1], "Completeness": score[2], "V Measure": score[3] }
def get_homogeneity_completeness_v_measure(labels_pred, labels_anno):
    """Thin wrapper around sklearn's homogeneity_completeness_v_measure.

    Note the argument flip: the annotation (ground truth) goes first into the
    sklearn call.

    :return: (homogeneity, completeness, v_measure) tuple.
    """
    return metrics.homogeneity_completeness_v_measure(labels_anno, labels_pred)
def results(X_test, y_test, clf=None):
    """Evaluate a (KMeans) clusterer on attack/normal test data: print a
    cluster-vs-label contingency, a majority-vote accuracy, and sklearn
    metrics after mapping string labels to {attack: 1, other: 0}.

    :param X_test: test feature matrix.
    :param y_test: pandas Series of string labels ("attack" vs. other).
    :param clf: fitted clusterer; when None a fresh 4-cluster KMeans is fit on X_test.
    :return: the (possibly newly fitted) clusterer.
    """
    if clf is None:
        clf = cluster.KMeans(n_clusters=4, init='random').fit(X_test)
    preds = clf.predict(X_test)
    ans = pd.DataFrame({'label': y_test.values, 'kmean': preds})
    print(preds)
    print("y_test: ", y_test)
    # Contingency counts indexed by (cluster, label).
    ans = ans.groupby(['kmean', 'label']).size()
    print(ans)
    # NOTE(review): pairing ans[::2] with ans[1::2] assumes every cluster has
    # exactly two label rows in alternating order — fragile if a cluster
    # contains only one label class; verify against actual group output.
    correct = sum([
        anom if anom > norm else norm
        for anom, norm in zip(ans[::2], ans[1::2])
    ])
    print(correct)
    print(sum(ans))
    print("Total accuracy: {0:.1%}".format(correct / sum(ans)))
    # Binarize labels in place: "attack" -> 1, anything else -> 0.
    y_test = y_test.tolist()
    for x in range(len(y_test)):
        if (y_test[x] == "attack"):
            y_test[x] = 1
        else:
            y_test[x] = 0
    # NOTE(review): these metrics assume cluster ids align with {0,1} labels,
    # which holds only for a 2-cluster fit — confirm for n_clusters=4.
    print(homogeneity_completeness_v_measure(y_test, preds))
    print("ac ", metrics.accuracy_score(y_test, preds))
    print(confusion_matrix(y_test, preds))
    return clf
def kmeans_clustering(X_train, y_train, X_test, y_test, genre_list):
    """Fit a 4-cluster KMeans, print external cluster metrics on the test set,
    plot a 2D PCA projection of the training clusters, and persist the model.

    :param genre_list: currently unused in this function body.
    """
    # NOTE(review): the scaler is fitted and applied to X_train (new_data) but
    # KMeans is fit on the *unscaled* X_train — the scaled data is only used
    # for the distance/PCA plotting below. Confirm this mix is intentional.
    scalar = StandardScaler()
    scalar.fit(X_train, y_train)
    new_data = scalar.transform(X_train)
    kmeans = KMeans(init='k-means++', n_init=10, n_clusters=4, max_iter=300)
    # y_train is accepted but ignored by KMeans.fit (unsupervised).
    rVal = kmeans.fit(X_train, y_train)
    kmeans_predictions = kmeans.predict(X_test)
    print("the randomized score is : ",
          metrics.adjusted_rand_score(y_test, kmeans_predictions))
    print("the normalized mutual info score is : ",
          metrics.normalized_mutual_info_score(y_test, kmeans_predictions))
    print("the mutual info score is : ",
          metrics.mutual_info_score(y_test, kmeans_predictions))
    print(
        "the homogenity, completeness and v measure score is : ",
        metrics.homogeneity_completeness_v_measure(y_test, kmeans_predictions))
    print("the fowlkes mallows score is : ",
          metrics.fowlkes_mallows_score(y_test, kmeans_predictions))
    labels = kmeans.labels_
    print(
        "the silhouette score is :",
        metrics.silhouette_score(X_test, kmeans_predictions, metric='euclidean'))
    print(kmeans_predictions)
    print(y_test)
    # Re-assign training points to the nearest center for the plot.
    # NOTE(review): distances mix scaled rows (new_data) with centers learned
    # on unscaled data — likely inconsistent; verify.
    centers = rVal.cluster_centers_
    distances = pairwise_distances(new_data, centers, metric='euclidean')
    clusters = np.argmin(distances, axis=1)
    print(len(clusters))
    plotSamples = PCA(n_components=2).fit_transform(new_data)
    plotClusters(plotSamples, clusters, kmeans)
    joblib.dump(kmeans, 'saved_models/model_kmeans.pkl')
def baseline_cluster(data, act_labels, k, output_folder, experiment_name): start_time = time.time() clusters = np.random.randint(0, k, size=data.shape[0]) end_time = time.time() final_time = end_time - start_time h, c, v = homogeneity_completeness_v_measure(act_labels, clusters) return clusters, h, c, v, final_time
def show_clustering_info(x: np.ndarray, y_true: np.ndarray, y_predicted: np.ndarray, folder: str = 'results',
                         filename: str = 'genes', extension: str = 'xlsx', sheet_name: str = 'results') -> None:
    """ Shows information about the predicted data and saves them to an excel file.

    :param x: the x data.
    :param y_true: the known label values.
    :param y_predicted: the predicted label values.
    :param folder: the folder to save the results excel file.
    :param filename: the name of the excel file.
    :param extension: the file's extension.
    :param sheet_name: the excel's sheet name.
    """
    homogeneity, completeness, v_measure = metrics.homogeneity_completeness_v_measure(y_true, y_predicted)
    # Results dictionary: one single-element list per metric (excel-friendly shape).
    results = {
        'Adjusted Random Index': [metrics.adjusted_rand_score(y_true, y_predicted)],
        'Homogeneity': [homogeneity],
        'Completeness': [completeness],
        'V Measure': [v_measure],
        'Silhouette Coefficient': [metrics.silhouette_score(x, y_predicted)]
    }
    # Log every metric with 4 significant digits.
    logger.log('Model\'s Results:')
    for metric_name, metric_values in results.items():
        for metric_value in metric_values:
            logger.log('{text}: {number:.{points}g}'.format(text=metric_name, number=metric_value, points=4))
    # Persist to excel only when the module-level flag is enabled.
    if SAVE_PRED_RESULTS:
        helpers.utils.create_excel(results, folder, filename, extension, sheet_name)
def generate_eval_dict(gt, pred):
    """Score a predicted segmentation/clustering against ground truth with both
    traditional clustering metrics and this project's structure-aware scores.

    :param gt: ground-truth label sequence.
    :param pred: predicted label sequence.
    :return: dict of metric name -> score.
    """
    # Put all the metrics values in a dictionary and return them
    eval_dict = {}
    # Compute all the traditional metrics
    eval_dict['homogeneity'], eval_dict['completeness'], eval_dict['v_measure'] = \
        homogeneity_completeness_v_measure(gt, pred)
    eval_dict['nmi'] = normalized_mutual_info_score(gt, pred)
    eval_dict['rand'] = adjusted_rand_score(gt, pred)
    # munkres_score expects lists of sequences, hence the wrapping.
    eval_dict['munkres'] = munkres_score([gt], [pred])
    # NOTE(review): 'ari' duplicates 'rand' (same function, same args) —
    # possibly kept for downstream key compatibility; confirm before removing.
    eval_dict['ari'] = adjusted_rand_score(gt, pred)
    # Compute all the new metrics
    eval_dict['rss_substring'] = repeated_structure_score(gt, pred, with_purity=True, substring=True)
    eval_dict['transs'] = transition_structure_score(gt, pred)
    # Same score with arguments swapped (asymmetric metric).
    eval_dict['transs_flip'] = transition_structure_score(pred, gt)
    eval_dict['lass'] = label_agnostic_segmentation_score(gt, pred)
    eval_dict['sss_combined'] = segment_structure_score_new(gt, pred)
    # Temporal structure score swept over several beta trade-off values.
    eval_dict['tss_combined'] = temporal_structure_score_new(gt, pred)
    eval_dict['tss_combined-10'] = temporal_structure_score_new(gt, pred, beta=10.)
    eval_dict['tss_combined-0,1'] = temporal_structure_score_new(gt, pred, beta=0.1)
    eval_dict['tss_combined-5'] = temporal_structure_score_new(gt, pred, beta=5.)
    eval_dict['tss_combined-0,5'] = temporal_structure_score_new(gt, pred, beta=0.5)
    eval_dict['tss_combined-2'] = temporal_structure_score_new(gt, pred, beta=2.)
    eval_dict['tss_combined-0,2'] = temporal_structure_score_new(gt, pred, beta=0.2)
    return eval_dict
def spectral_cluster_evaluate(data, labels, n_cluster, affinity="rbf"):
    """Run spectral clustering and print external/internal quality metrics.

    :param data: similarity matrix (affinity="precomputed") or embedding
        vectors (affinity="rbf").
    :param labels: ground-truth labels.
    :param n_cluster: number of clusters.
    :param affinity: "precomputed" or "rbf".

    Fix: the original computed silhouette scores on an undefined global name
    `embeddings` (the parameter is `data`), raising NameError at runtime;
    both silhouette calls now use `data`.
    """
    metric = "euclidean"
    if affinity == "precomputed":
        # Per sklearn's guidance: if `data` is a distance matrix rather than a
        # similarity matrix, convert it with an RBF kernel first.
        distance_mat = data
        delta = math.sqrt(2)
        data = np.exp(-distance_mat**2 / (2. * delta**2))
        metric = affinity
    clustering = SpectralClustering(n_clusters=n_cluster, affinity=affinity, n_init=50, random_state=42)
    preds = clustering.fit_predict(data)
    h, c, v = metrics.homogeneity_completeness_v_measure(labels, preds)
    # Silhouette of the ground-truth partition vs. the predicted partition.
    s1 = metrics.silhouette_score(data, labels, metric=metric)
    s2 = metrics.silhouette_score(data, preds, metric=metric)
    print(
        f"homogenetiy: {h}, completeness: {c}, v_measure: {v}, silhouette_score label: {s1}, silhouette_score pred: {s2}\n"
    )
def bestClassify(X,Y):
    "Best classifier function"
    # Vectorizer selection flag; the CountVectorizer branch is kept for
    # experimentation parity.
    use_tfidf = True
    if use_tfidf:
        vectorizer = TfidfVectorizer(preprocessor=identity, tokenizer=identity)
    else:
        vectorizer = CountVectorizer(preprocessor=identity, tokenizer=identity)
    clusterer = Pipeline([
        ('vec', vectorizer),
        ('cls', KMeans(n_clusters=6, n_init=10, verbose=1)),
    ])
    clusterer.fit(X)
    prediction = clusterer.predict(X)
    # Collect the true labels observed inside each predicted cluster.
    cluster_truths = defaultdict(list)
    for pred, truth in zip(prediction, Y):
        cluster_truths[pred].append(truth)
    # Map each cluster id to its majority true label.
    labeldict = {}
    for cluster_id, truths in cluster_truths.items():
        labeldict[cluster_id] = Counter(truths).most_common(1)[0][0]
        #print(cluster_id, Counter(truths).most_common(1)[0][0])
    prediction = [labeldict[p] for p in prediction]
    labels = list(labeldict.values())
    print(labels)
    print(confusion_matrix(Y, prediction, labels=labels))
    print("Rand-Index:", adjusted_rand_score(Y, prediction))
    print(homogeneity_completeness_v_measure(Y, prediction))
def eval_2(labels_true, labels_pred, is_show=True):
    """
    Supervised clustering evaluation; all metrics approach 1 for better clusterings.
    :param labels_true: ground-truth labels (empty list when unavailable).
    :param labels_pred: predicted cluster labels.
    :param is_show: whether to print the result summary.
    :return: ([avg_pre, avg_rec, fscore, fmi], info_string); metrics list is
        empty when no ground truth is given.
    """
    # No ground truth: only report cluster counts.
    # NOTE(review): `labels_true == []` works for plain lists but not for
    # numpy arrays — confirm callers always pass lists.
    if labels_true == []:
        info = f"cluster: img_sum:{len(labels_pred)}, id_sum:{len(set(labels_pred))}"
        return [], info
    nmi = 0  # metrics.normalized_mutual_info_score(labels_true, labels_pred)  # normalized mutual information (disabled)
    ari = metrics.adjusted_rand_score(labels_true, labels_pred)  # adjusted Rand index
    # homogeneity (purity), completeness (dispersion), v_measure
    homogeneity, completeness, v_measure_score = metrics.homogeneity_completeness_v_measure(
        labels_true, labels_pred)
    fmi = metrics.fowlkes_mallows_score(labels_true, labels_pred)  # geometric mean
    # Project-local variant returning (precision, recall, f-score) — harmonic mean *****
    avg_pre, avg_rec, fscore = fowlkes_mallows_score(
        labels_true, labels_pred)
    # Weighted f-score variant with precision down-weighted by k.
    k = 0.5
    fscore_2 = 2. * avg_pre * k * avg_rec / (avg_pre * k + avg_rec)
    # Summary strings intentionally kept in Chinese (runtime output).
    s_1 = f"gt: img_sum:{len(labels_true)}, id_sum:{len(set(labels_true))}"
    s_2 = f"cluster: img_sum:{len(labels_pred)}, id_sum:{len(set(labels_pred))}"
    s_3 = "有监督: 纯度, 散度, nmi, v_measure, ari:" + f"{r(homogeneity)}, {r(completeness)}, {r(nmi)}, {r(v_measure_score)}, {r(ari)}"
    s_4 = 'avg_pre, avg_rec, fscore, fmi:' + f"{r(avg_pre)}, {r(avg_rec)}, {r(fscore)}, {r(fmi)}"
    info = f"{s_1}\n{s_2}\n{s_3}\n{s_4}"
    if is_show:
        print(info)
    metric = [avg_pre, avg_rec, fscore, fmi]
    return metric, info
def computeHomogeneityCompleteness(self, labels_families, predicted_clusters):
    """Store homogeneity, completeness and V-measure on the instance.

    When no family labels are available, all three attributes default to 0.
    """
    if labels_families is None:
        self.homogeneity, self.completeness, self.v_measure = 0, 0, 0
    else:
        hcv = metrics.homogeneity_completeness_v_measure(labels_families, predicted_clusters)
        self.homogeneity, self.completeness, self.v_measure = hcv
def prin_clustering(test_rep, test_label, NUM_OF_CLASS):
    """Cluster representations with KMeans and report supervised metrics.

    :param test_rep: sample representations to cluster.
    :param test_label: ground-truth labels.
    :param NUM_OF_CLASS: number of clusters for KMeans.
    :return: dict with ARI, AMI, NMI, FMI and H/C/V scores.
    """
    # Fit KMeans and obtain cluster assignments in one step.
    assignments = KMeans(n_clusters=NUM_OF_CLASS).fit_predict(test_rep)
    d = dict()
    d['ari'] = metrics.adjusted_rand_score(test_label, assignments)            # adjusted Rand index
    d['ami'] = metrics.adjusted_mutual_info_score(test_label, assignments)     # adjusted mutual information
    d['nmi'] = metrics.normalized_mutual_info_score(test_label, assignments)   # normalized mutual information
    d['fmi'] = metrics.fowlkes_mallows_score(test_label, assignments)          # Fowlkes-Mallows index
    # Homogeneity / completeness / V-measure.
    d['H'], d['C'], d['V'] = metrics.homogeneity_completeness_v_measure(test_label, assignments)
    # Silhouette and Davies-Bouldin were disabled in the original and stay off.
    print('ARI:%.4f,AMI:%.4f,HCV:%.4f %.4f %.4f FMI:%.4f NMI:%.4f' %
          (d['ari'], d['ami'], d['H'], d['C'], d['V'], d['fmi'], d['nmi']))
    return d
def cluster_evaluate(embeddings, labels, n_class, metric="euclidean"): """ Unsupervised setting: We assess the ability of each method to embed close together nodes with the same ground-truth structural role. We use agglomerative clustering (with single linkage) to cluster embeddings learned by each method and evaluate the clustering quality via: (1) homogeneity, conditional entropy of ground-truth structural roles given the predicted clustering; (2) completeness, a measure of how many nodes with the same ground-truth structural role are assigned to the same cluster; (3) silhouette score, a measure of intra-cluster distance vs. inter-cluster distance. Supervised setting: We assess the performance of learned embeddings for node classifcation. Using 10-fold cross validation, we predict the structural role (label) of each node in the test set based on its 4-nearest neighbors in the training set as determined by the embedding space. The reported score is then the average accuracy and F1-score over 25 trials. """ clusters = AgglomerativeClustering(n_clusters=n_class, linkage='single', affinity=metric).fit_predict(embeddings) h, c, v = metrics.homogeneity_completeness_v_measure(labels, clusters) s = metrics.silhouette_score(embeddings, clusters) acc = accuracy_score(labels, clusters) macro_f1 = f1_score(labels, clusters, average="macro") print("cluster:", clusters, "labels:", labels) print("accuracy: ", acc) print("macro_score: ", macro_f1) print("homogeneity: ", h) print("completeness: ", c) print("v-score: ", v) print("silhouette: ", s) return h, c, v, s
def results_evaluation_phase2(actual_labels, predicted_labels):
    """Summarize phase-2 clustering output: per-cluster label distributions,
    outlier (-1) details, and homogeneity/completeness/V-measure, ARI and AMI
    computed over the non-outlier points.

    :param actual_labels: ground-truth labels.
    :param predicted_labels: predicted cluster ids; -1 marks outliers.
    :return: (clusters_details, outliers_details, (h, c, v), ari, ami).
    """
    # NOTE(review): start_time is captured but never used below.
    start_time = datetime.now()
    print(' +| Extracting details of the resulting clusters...')
    actual_vs_predicted_labels_df = pd.DataFrame({'actual_labels': actual_labels, 'predicted_labels': predicted_labels})
    clusters_details = []  # The list includes details of each clusters (Number of items & items distribution)
    # Extracts clusters' details
    for c in [x for x in sorted(set(predicted_labels)) if x >= 0]:  # Evaluates each cluster (except outliers)
        # pandas query references the loop variable via @c.
        details = dict(actual_vs_predicted_labels_df.query("predicted_labels == @c ")[
            'actual_labels'].value_counts())  # Counts actual labels inside the cluster
        details = {i: details[i] for i in sorted(details.keys())}  # Sorts directory by keys
        clusters_details.append(' # Cluster [%s] contains %d items. Details:%s' % (str(
            c), list(predicted_labels).count(c), details))  # Adds the cluster's details to the list
    # Extracts outliers details
    if -1 in actual_vs_predicted_labels_df["predicted_labels"].tolist():
        outliers_details = dict(actual_vs_predicted_labels_df.query("predicted_labels == -1")['actual_labels'].value_counts())
        outliers_details = dict(sorted(outliers_details.items(), key=lambda x: x[0]))
    else:
        outliers_details = ""
    # Calculates homogeneity, completeness, Vmeasure, AR, and AMI scores
    warnings.filterwarnings('ignore')  # Ignores outliers
    print(' +| Calculating the clustering evaluation metrics...')
    # Metrics are computed on non-outlier rows only.
    MetricWithoutOtl = actual_vs_predicted_labels_df[actual_vs_predicted_labels_df['predicted_labels'] != -1]
    P2_hom_com_vmet = (homogeneity_completeness_v_measure(MetricWithoutOtl['actual_labels'], MetricWithoutOtl['predicted_labels']))
    P2_AR_Score = (adjusted_rand_score(MetricWithoutOtl['actual_labels'], MetricWithoutOtl['predicted_labels']))
    P2_AMI_Score = (adjusted_mutual_info_score(MetricWithoutOtl['actual_labels'], MetricWithoutOtl['predicted_labels']))
    # Prints the results' summary
    print(' *| Summary of Phase2 clustering results: ({} Clus. | {} Outl. | Homg.:{:.2%} | Comp.:{:.2%} | V-measure:{:.2%} | AR:{:.2%} | AMI:{:.2%})'
          .format(len(MetricWithoutOtl['predicted_labels'].unique()), list(predicted_labels).count(-1),
                  P2_hom_com_vmet[0], P2_hom_com_vmet[1], P2_hom_com_vmet[2], P2_AR_Score, P2_AMI_Score))
    return clusters_details, outliers_details, P2_hom_com_vmet, P2_AR_Score, P2_AMI_Score
def evaluate_clustering_performance(clusters, labels):
    """Evaluate subspace clusters against ground truth, per dimensionality.

    Clusters are grouped by their dimension set; within each group, every
    point belonging to cluster i gets label i+1 (0 = unassigned) and the
    resulting labeling is scored with several external metrics.

    :param clusters: cluster objects exposing `dimensions` and `data_point_ids`.
    :param labels: ground-truth labels, one per data point.
    """
    # Unique dimension sets present across all clusters.
    dimension_sets = set()
    for c in clusters:
        dimension_sets.add(frozenset(c.dimensions))
    for dim in dimension_sets:
        print("\nEvaluating clusters in dimension: ", list(dim))
        # All clusters living in exactly this dimension set.
        clusters_in_dim = [c for c in clusters if c.dimensions == dim]
        # Build a flat labeling: points not in any cluster stay 0.
        clustering_labels = np.zeros(np.shape(labels))
        for idx, c in enumerate(clusters_in_dim):
            clustering_labels[list(c.data_point_ids)] = idx + 1
        print("Number of clusters: ", len(clusters_in_dim))
        print("Adjusted Rand index: ",
              metrics.adjusted_rand_score(labels, clustering_labels))
        print("Mutual Information: ",
              metrics.adjusted_mutual_info_score(labels, clustering_labels))
        print("Homogeneity, completeness, V-measure: ",
              metrics.homogeneity_completeness_v_measure(labels, clustering_labels))
        print("Fowlkes-Mallows: ",
              metrics.fowlkes_mallows_score(labels, clustering_labels))
def get_homogeneity_completeness_vmeasure(standard_file, prediction_file):
    """Get homogeneity, completeness, and V-measure score [Rosenberg2007]_.

    Parameters
    ----------
    standard_file : str
        The ground truth or standard filename.
    prediction_file : str
        The analyzed or predicted filename.

    Returns
    -------
    tuple
        Homogeneity, completeness, and V-measure score.

    References
    ----------
    .. [Rosenberg2007] Andrew Rosenberg and Julia Hirschberg. V-Measure:
                       A conditional entropy-based external cluster evaluation
                       measure. In Proceedings of EMNLP-CoNLL 2007, volume 7,
                       pages 410-420, 2007.
    """
    gold_labels = ExternalEvaluation.get_evaluated(standard_file)
    predicted_labels = ExternalEvaluation.get_evaluated(prediction_file)
    return metrics.homogeneity_completeness_v_measure(gold_labels,
                                                      predicted_labels)
def evaluate_recurrent_defects(ref_df: pd.DataFrame, predictions, remove_ata_zero_section=True):
    """
    Uses sklearn's Adjusted Rand Index, homogeneity, completeness and v-measure to evaluate the
    clustering predictions.
    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html
    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.homogeneity_score.html
    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.completeness_score.html
    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.v_measure_score.html

    :param ref_df: The reference dataframe.
    :param predictions: The predictions. Their format is an iterable collection of sets of defect
        labels belonging to the same cluster, i.e.
        [{'C-6414274-1', 'L-5245081-1'}, {'C-6414294-1', 'C-6414295-1', 'C-6414296-1'}, ...]
        Clusters containing a single element are ignored during evaluation.
    :param remove_ata_zero_section: Remove from the reference all clusters for which the ATA
        section is 0 (recommended)
    :return: A dict with the following keys
        ari_score - Adjusted Rand Index, similarity score between -1.0 and 1.0. Random labelings
            have an ARI close to 0. 1.0 stands for perfect match.
        homogeneity - A clustering result satisfies homogeneity if all of its predicted clusters
            contain only data points that are clustered in the reference.
        completeness - A clustering result satisfies completeness if all the data points that are
            members of the same reference cluster are found in the same predicted cluster.
        v_measure - harmonic mean of homogeneity and completeness
        pred_clusters - a list of predicted cluster labels, useful for debug
        ref_clusters - a list of reference cluster labels, useful for debug
        remove_ata_zero_section - copy of argument remove_ata_zero_section for this function
    """
    # When there is no recurrent id, mark the defect as not clustered.
    filled_df = ref_df.recurrent.fillna(NO_CLUSTER_LABEL)
    if remove_ata_zero_section:
        # BUGFIX: Series.where KEEPS values where the condition is True and
        # replaces them elsewhere. The original condition `section == 0` kept
        # only the ATA-0 clusters and erased everything else -- the opposite
        # of the documented behaviour. Keep clusters whose section is != 0
        # (same keep-semantics as the duplicated() filter below).
        filled_df.where(ref_df.section != 0, NO_CLUSTER_LABEL, inplace=True)
    # remove clusters with a single member, which are not clusters at all
    duplicate_df = filled_df.duplicated(keep=False)
    filled_df.where(duplicate_df, NO_CLUSTER_LABEL, inplace=True)
    ref_clusters = filled_df
    # convert cluster assignments from the predictions in the same order as those from the ref
    pred_clusters = convert_cluster_labels_to_seq(ref_df, predictions)
    # evaluate
    homogeneity, completeness, v_measure_score = homogeneity_completeness_v_measure(
        ref_clusters, pred_clusters)
    ari_score = adjusted_rand_score(ref_clusters, pred_clusters)
    return {
        'ari_score': ari_score,
        'homogeneity': homogeneity,
        'completeness': completeness,
        'v_measure': v_measure_score,
        'pred_clusters': pred_clusters,
        'ref_clusters': ref_clusters,
        'remove_ata_zero_section': remove_ata_zero_section
    }
def test_homogeneity_completeness_vmeasure(self):
    # Reference values come straight from scikit-learn on the linearized
    # label arrays; our implementation must match them exactly.
    true_flat, pred_flat = _linearize(self.labels_true, self.labels_pred)
    sk_scores = skmetrics.homogeneity_completeness_v_measure(true_flat, pred_flat)
    our_scores = self.metrics._homogeneity_completeness_vmeasure(1)
    self.assertEqual(our_scores[0], sk_scores[0])   # homogeneity
    self.assertEqual(our_scores[1], sk_scores[1])   # completeness
    self.assertEqual(sk_scores[2], our_scores[2])   # v-measure
def computeExternalMetrics(labels, predLabels) -> ExternalClusterMetrics:
    """External metrics evaluate clustering performance against labeled data."""
    adj_mutual_info = metrics.adjusted_mutual_info_score(labels, predLabels)
    adj_rand = metrics.adjusted_rand_score(labels, predLabels)
    fowlkes = metrics.fowlkes_mallows_score(labels, predLabels)
    homog, compl, vmeas = metrics.homogeneity_completeness_v_measure(labels, predLabels)
    # Constructor argument order: ami, ars, completeness, fowlkes-mallows,
    # homogeneity, v-measure (presumably alphabetical field order -- confirm
    # against the ExternalClusterMetrics declaration).
    return ExternalClusterMetrics(adj_mutual_info, adj_rand, compl, fowlkes, homog, vmeas)
def meanShift(self, X, axis2):
    # Fit MeanShift with a fixed bandwidth of 7, draw the labelling on the
    # supplied axis, and print the discovered labels plus the external
    # scores against self.labels.
    model = MeanShift(bandwidth=7)  # bandwidth (translated from original comment)
    model.fit(X)
    pred_ms = model.labels_
    axis2.scatter(X[:, 0], X[:, 1], c=pred_ms, cmap='prism')
    axis2.set_title('mean-shift', fontsize=40)
    print('mean-shift:', np.unique(model.labels_))
    print('mean-shift:', homogeneity_completeness_v_measure(self.labels, pred_ms))
def print_results(true_labels, pred_labels, num_clusters):
    # Print a one-line summary of external clustering metrics (Python 2
    # print statements).
    (h, c, v) = metrics.homogeneity_completeness_v_measure(true_labels, pred_labels)
    # NOTE(review): accuracy_score only makes sense if pred_labels were
    # already mapped onto the true label ids by the caller -- confirm.
    print "#Topics=%s (%s). v-measure: %0.3f. h**o: %0.3f. comp: %0.3f. MI: %0.3f. NMI: %0.3f. Acc: %0.3f" \
        % (num_clusters, len(pred_labels), v, h, c,
           metrics.mutual_info_score(true_labels, pred_labels),
           metrics.normalized_mutual_info_score(true_labels, pred_labels),
           metrics.accuracy_score(true_labels, pred_labels))
def evaluate_clusters(true_labels, pred_labels, technique):
    """Print homogeneity, completeness and V-measure for one clustering run."""
    scores = homogeneity_completeness_v_measure(true_labels, pred_labels)
    print('Clustering Evaluation of', technique)
    print(' Homogeneity: ', scores[0])
    print(' Completeness:', scores[1])
    print(' V-Measure: ', scores[2])
def main(argv):
    """Cluster the C50 corpus with 50-means over tf-idf features and report
    rand-index / homogeneity scores against the author labels.

    NOTE(review): reads sys.argv directly rather than the ``argv``
    parameter -- presumably callers pass sys.argv anyway; confirm.
    """
    print("Usage: python LFDassignment3_KMextra_Group10.py <C50trainset> <C50testset>")
    print('Reading Data...')
    # Load both splits from the command-line paths and shuffle them.
    train = read_corpus(sys.argv[1])
    test = read_corpus(sys.argv[2])
    random.shuffle(train)
    random.shuffle(test)
    # Keep only the first 10% of the (shuffled) test documents.
    test = test[:int(0.10 * len(test))]
    Xtrain = [pair[0] for pair in train]
    Xtest = [pair[0] for pair in test]
    Ytrain = [pair[1] for pair in train]
    Ytest = [pair[1] for pair in test]
    # Tf-idf with English stopwords (the CountVectorizer branch is kept for
    # experimentation, mirroring the original flag).
    use_tfidf = True
    if use_tfidf:
        vec = TfidfVectorizer(ngram_range=(1, 3),
                              analyzer='word',
                              preprocessor=preprocessor,
                              tokenizer=identity,
                              stop_words='english',
                              lowercase=True)
    else:
        vec = CountVectorizer(ngram_range=(1, 3),
                              analyzer='word',
                              preprocessor=preprocessor,
                              tokenizer=identity)
    # K-means with one cluster per C50 author.
    clf = KMeans(n_clusters=50, random_state=1000, n_init=1, verbose=0)
    classifier = Pipeline([('vec', vec), ('cls', clf)])
    print('Training Classifier...')
    classifier.fit(Xtrain, Ytrain)
    print('Predicting Test Values...')
    Yguess = classifier.predict(Xtest)
    print('-' * 40)
    # A Pipeline does not expose labels_; keep the original best-effort print.
    try:
        print(classifier.labels_)
    except:
        pass
    print(adjusted_rand_score(Ytest, Yguess))
    print(homogeneity_completeness_v_measure(Ytest, Yguess))
def k_means_results(name, A, B, x_label, y_label, colormap):
    # Fit 2-means on the training pair A = (X, y), print a battery of
    # external/internal metrics, dump predictions to CSV via `ld.save_data`,
    # and save decision-surface scatter plots for train and test data.
    # (Python 2: print statements, xrange.)
    X = A[0]
    y = A[1]
    X_test = B[0]
    y_test = B[1]
    h = .02  # mesh step for the decision-surface plot
    n_clusters = 2
    k_means = KMeans(n_clusters=n_clusters)
    start = time.time()
    fit_results = k_means.fit(X)
    end = time.time()
    print 'Fit Time: ' + str(end - start)
    Y_kmeans = k_means.predict(X)
    ld.save_data('datasets/' + name.replace(' ', '_') + '_train.csv', [Y_kmeans, y])
    figure_identifier = plt.figure()
    colors = ['yellow', 'cyan']
    # NOTE(review): block nesting reconstructed from collapsed source -- the
    # mesh/pcolormesh lines are assumed to sit inside `if colormap:`.
    if colormap:
        cmap_light = ListedColormap(['#FF3EFA', '#AAFFAA'])
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
        Z = k_means.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    # One scatter per cluster id (0 and 1).
    for i in xrange(len(colors)):
        px = X[:, 0][Y_kmeans == i]
        py = X[:, 1][Y_kmeans == i]
        plt.scatter(px, py, c=colors[i])
    # Mark both cluster centres with orange crosses.
    plt.scatter(fit_results.cluster_centers_[0, 0:1], fit_results.cluster_centers_[0, 1:2], s=100, linewidths=4, c='orange', marker='x')
    plt.scatter(fit_results.cluster_centers_[1, 0:1], fit_results.cluster_centers_[1, 1:2], s=100, linewidths=4, c='orange', marker='x')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(name + ' Train Results')
    # plt.show()
    plt.savefig('figures/' + name.replace(' ', '_') + '_Training_results.png')
    figure_identifier.clf()
    plt.close(figure_identifier)
    y_pred = Y_kmeans
    y_true = y
    # NOTE(review): classification-style metrics below assume the arbitrary
    # k-means ids happen to align with the true labels -- confirm.
    print 'Accuracy Score'
    print metrics.accuracy_score(y_true, y_pred)
    print 'Classification Report'
    print metrics.classification_report(y_true, y_pred)
    print 'Confusion Matrix'
    print metrics.confusion_matrix(y_true, y_pred)
    print 'Completeness Score'
    print metrics.completeness_score(y_true, y_pred)
    print 'Homogeneity Score'
    print metrics.homogeneity_score(y_true, y_pred)
    print 'Homogeneity Completeness V Measured'
    print metrics.homogeneity_completeness_v_measure(y_true, y_pred)
    print 'Mutual Information Score'
    print metrics.mutual_info_score(y_true, y_pred)
    print 'Normalized Mutual Info Score'
    print metrics.normalized_mutual_info_score(y_true, y_pred)
    print 'Silhouette Score'
    print metrics.silhouette_score(X, fit_results.labels_)
    print 'Silhouette Samples'
    print metrics.silhouette_samples(X, fit_results.labels_)
    print 'V Measure Score'
    print metrics.v_measure_score(y_true, y_pred)
    print_confusion_matrix('Train', Y_kmeans, y)
    # Repeat the plot for the held-out test set.
    figure_identifier = plt.figure()
    Y_kmeans = k_means.predict(X_test)
    ld.save_data('datasets/' + name.replace(' ', '_') + '_test.csv', [Y_kmeans, y_test])
    colors = ['yellow', 'cyan']
    if colormap:
        cmap_light = ListedColormap(['#FF3EFA', '#AAFFAA'])
        x_min, x_max = X_test[:, 0].min() - 1, X_test[:, 0].max() + 1
        y_min, y_max = X_test[:, 1].min() - 1, X_test[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
        Z = k_means.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    for i in xrange(len(colors)):
        px = X_test[:, 0][Y_kmeans == i]
        py = X_test[:, 1][Y_kmeans == i]
        plt.scatter(px, py, c=colors[i])
    plt.scatter(fit_results.cluster_centers_[0, 0:1], fit_results.cluster_centers_[0, 1:2], s=100, linewidths=4, c='orange', marker='x')
    plt.scatter(fit_results.cluster_centers_[1, 0:1], fit_results.cluster_centers_[1, 1:2], s=100, linewidths=4, c='orange', marker='x')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(name + ' Test Results')
    # plt.show()
    plt.savefig('figures/' + name.replace(' ', '_') + '_Test_results.png')
    print_confusion_matrix('Test', Y_kmeans, y_test)
    figure_identifier.clf()
    plt.close(figure_identifier)
def agglomerative_clustering(embedding_model_name, embedding_type, cluster_label_ground_truth_file, cluster_n, method='ward', metric='euclidean', plot=False):
    # Hierarchically cluster pre-computed embeddings (scipy linkage/fcluster),
    # write the flat clusters to disk, optionally score them against ground
    # truth and plot the dendrogram. Returns the Silhouette Coefficient.
    # (Python 2: xrange, list-returning map. `dataset` and `logger` are
    # module-level names defined outside this view.)
    embedding_file = 'data/{}/embeddings/{}_{}.npy'.format(dataset, embedding_model_name, embedding_type)
    embeddings = np.load(embedding_file)
    logger.info('Loaded embeddings from {}'.format(embedding_file))
    # Start clustering.
    logger.info('Start clustering ({}, {})...'.format(cluster_n, method))
    t0 = time()
    clustering = linkage(embeddings, method=method, metric=metric)
    logger.info('Clustering time: {}s'.format(time() - t0))
    # Row labels for the embeddings, one non-empty line per row.
    embedding_labels = []
    embedding_label_file = 'data/{}/embeddings/{}_{}_labels.txt'.format(dataset, embedding_model_name, embedding_type)
    embedding_label_in = codecs.open(embedding_label_file)
    for row in embedding_label_in:
        if row:
            label = row.strip()
            if label:
                embedding_labels.append(label)
    embedding_label_in.close()
    cluster_label_prediction = fcluster(clustering, cluster_n, criterion='maxclust')  # 1-based index
    # logger.info('Cluster label prediction: {}'.format(cluster_label_prediction))
    # Group row indices by (0-based) cluster id.
    clusters_agg = {}
    for i in xrange(len(cluster_label_prediction)):
        clusters_agg.setdefault(cluster_label_prediction[i] - 1, []).append(i)
    clustering_clusters_file = 'data/{}/clustering/{}_clusters.txt'.format(dataset, embedding_type)
    cluster_out = codecs.open(clustering_clusters_file, 'w')
    for i in xrange(len(clusters_agg)):
        cluster_out.write(u'{}\n'.format(','.join([embedding_labels[j] for j in clusters_agg[i]])))
    cluster_out.close()
    logger.info('Clustering labels saved at {}'.format(clustering_clusters_file))
    if cluster_label_ground_truth_file:
        # Read cluster label ground truth (one comma-separated int row per line).
        cluster_label_ground_truth = []
        with open(cluster_label_ground_truth_file) as f:
            for line in f:
                if line:
                    cluster_label_ground_truth.append(map(int, line.strip().split(',')))
        # Compute Adjusted Rand Index / AMI / (homogeneity, completeness,
        # v-measure) per ground-truth grouping.
        for i in xrange(len(cluster_label_ground_truth)):
            ari = metrics.adjusted_rand_score(cluster_label_ground_truth[i], cluster_label_prediction)
            logger.info('Ajusted Rand Index for cluster group {}: {}'.format(i, ari))
            ami = metrics.adjusted_mutual_info_score(cluster_label_ground_truth[i], cluster_label_prediction)
            logger.info('Ajusted Mutual Information Score for cluster group {}: {}'.format(i, ami))
            chv = metrics.homogeneity_completeness_v_measure(cluster_label_ground_truth[i], cluster_label_prediction)
            logger.info('V-measure score for cluster group {}: {}'.format(i, chv))
    # Compute Silhouette Coefficient
    t0 = time()
    sc_score = metrics.silhouette_score(embeddings, cluster_label_prediction, metric=metric)
    logger.info('Silhouette Coefficient: {}'.format(sc_score))
    logger.info('SC computation time: {}s'.format(time() - t0))
    if plot:
        plt.rc('lines', linewidth=2)
        plt.figure()
        plt.title('{} Clustering'.format('Relation'), fontsize=28)
        plt.yticks([])
        dendrogram(
            clustering,
            leaf_rotation=90.,   # rotates the x axis labels
            leaf_font_size=14.,  # font size for the x axis labels
            labels=embedding_labels
        )
        plt.gcf().subplots_adjust(bottom=0.25)
        plt.show()
        # plt.savefig('data/{}/{}_clustering_dendrogram.png'.format(dataset, type), dpi=300)
    return sc_score
def print_metrics(true_clustering, cluster):
    # Print three external clustering metrics from sklearn
    # (Python 2 print statements).
    # try some metrics from sklearn
    print "\n"
    print "adjusted rand score [-1.0 (bad) to 1.0 (good)]\n", metrics.adjusted_rand_score(true_clustering, cluster)
    print "mutual information based score [0.0 (bad) to 1.0 (good)]\n", metrics.adjusted_mutual_info_score(true_clustering, cluster)
    print "homogeneity, completeness, v measure [0.0 (bad) to 1.0 (good)]\n", metrics.homogeneity_completeness_v_measure(true_clustering, cluster)
def main(self, argv=sys.argv):
    # Grid-search SVC hyperparameters on ARFF train/dev sets and persist the
    # final model. (Python 2: print statements; sklearn-era APIs such as
    # class_weight='auto' and cs.cross_val_score.)
    # NOTE(review): the default `argv=sys.argv` is bound once at definition
    # time, and the kernel is read from sys.argv[3] rather than argv -- confirm.
    #######
    try:
        kernel = sys.argv[3]
    except:  # catch *all* exceptions
        kernel = 'rbf'
    print('Training data loading....')
    data = arff.load(open(argv[1], 'rb'))
    labeled_set = data['data']
    # Last column of each row is the class label.
    train_set = np.asarray([fila[0:len(fila)-1] for fila in labeled_set])
    train_set_labels = np.asarray([fila[-1] for fila in labeled_set])
    atts = data['attributes']
    atts_names = [fila[0] for fila in atts]
    att_values = [fila[1] for fila in atts]
    labels = np.array(att_values[len(att_values)-1])
    print 'TRAIN DATA SHAPE'
    print train_set.shape
    print 'Attributes NUM'
    print len(atts_names)
    print 'LABELS FOR CLASS'
    print labels
    print('Develop data loading....')
    datadev_set = arff.load(open(argv[2], 'rb'))
    dev_labeled_set = datadev_set['data']
    dev_set = np.asarray([fila[0:len(fila)-1] for fila in dev_labeled_set])
    dev_set_labels = np.asarray([fila[-1] for fila in dev_labeled_set])
    # NOTE(review): dev attributes are read from `data` (the TRAIN file),
    # not `datadev_set` -- possibly intentional (same schema), confirm.
    dev_atts = data['attributes']
    dev_atts_names = [fila[0] for fila in dev_atts]
    dev_att_values = [fila[1] for fila in dev_atts]
    dev_labels = np.array(dev_att_values[len(dev_att_values)-1])
    print 'DEV DATA SHAPE'
    print dev_set.shape
    print 'DEV Attributes NUM'
    print len(dev_atts_names)
    print 'LABELS FOR DEV CLASS'
    print dev_labels
    ####
    print('Preprocesing data...')
    # Parse rows into dicts so the categorical attributes can be vectorized
    # (translated from the original Spanish comment).
    print('Parsing categorical data...')
    dict_list = []
    N, F = train_set.shape
    for n in range(N):
        d = {}
        for f in range(F):
            feature = atts_names[f]
            d[feature] = train_set[n, f]
        dict_list.append(d)
    dev_dict_list = []
    N, F = dev_set.shape
    for n in range(N):
        d = {}
        for f in range(F):
            feature = dev_atts_names[f]
            d[feature] = dev_set[n, f]
        dev_dict_list.append(d)
    # Fit vectorizer for each dict
    # NOTE(review): fit_transform is re-run per ROW, so the feature space can
    # differ between rows; usually one fit over the whole list is intended -- confirm.
    v = DictVectorizer(sparse=False, dtype=np.float16)
    v_train_set = v.fit_transform(dict_list[0])
    for i in range(1, len(dict_list)):
        train_set_instance = v.fit_transform(dict_list[i])
        v_train_set = np.vstack((v_train_set, train_set_instance))
    v_dev_set = v.fit_transform(dev_dict_list[0])
    for j in range(1, len(dev_dict_list)):
        v_dev_set_instance = v.fit_transform(dev_dict_list[j])
        v_dev_set = np.vstack((v_dev_set, v_dev_set_instance))
    v_train_set = np.asarray(v_train_set)
    v_dev_set = np.asarray(v_dev_set)
    # transform non-numerical labels to numerical
    le = preprocessing.LabelEncoder()
    le.fit(train_set_labels)
    train_numeric_labels = le.transform(train_set_labels)
    le.fit(dev_set_labels)
    dev_numeric_labels = le.transform(dev_set_labels)
    #########
    # (A commented-out single-model fit/eval block was removed here; see VCS
    # history of the original file if needed.)
    ##########
    cBest = 0.
    gBest = 0.
    dBest = 0.
    print('Start scaning data for Polinomial kernel....')
    f1Aux = 0.0
    f1Best = 0.0
    # Degree only matters for poly kernels, so scan a smaller grid for rbf.
    if kernel == 'rbf':
        maxD = 3
    else:
        maxD = 5
    for d in range(2, maxD):  # 2,5
        for i in range(-15, 12):  # -15,12
            c = 2**i
            for j in range(-3, 5):  # -3,5
                g = 2**j
                print("Hyperparameters: coef0 = %r gamma = %r degree = %d...." % (c, g, d))
                # fit the model
                model = svm.SVC(kernel=kernel, gamma=g, coef0=c, degree=d, class_weight='auto').fit(v_train_set, train_numeric_labels, sample_weight=None)
                # make predictions
                print "Making predictions..."
                expected = dev_numeric_labels
                predicted = model.predict(v_dev_set)
                print "Making Hold out evaluation with dev set..."
                # Keep the best hyperparameters by F1 on the dev set (pos_label=0).
                f1Aux = metrics.f1_score(expected, predicted, pos_label=0)
                print("New F1Score = %r" % f1Aux)
                if f1Aux > f1Best:
                    print("Maximun F1Score = %r" % f1Aux)
                    f1Best = f1Aux
                    print('Hyperparameters has been changed New degree = %d New coef0= %r New gamma = %r ' % (d, c, g))
                    cBest = c
                    gBest = g
                    dBest = d
    # summarize the fit of the model
    print('Optimized hyperparameters from %s kernel are : coef0 = %r gamma = %r degree = %d' % (kernel, cBest, gBest, dBest))
    # Concat train+dev
    X_all = np.vstack((v_train_set, v_dev_set))
    expected_all = np.concatenate((train_numeric_labels, dev_numeric_labels), axis=0)
    # "Dis-honest": evaluate on the same data the model was trained on.
    print('Start Dis-honest evaluation with train for test')
    model = svm.SVC(kernel='rbf', gamma=gBest, coef0=cBest, degree=dBest).fit(v_train_set, train_numeric_labels, sample_weight=None)
    predicted = model.predict(v_train_set)
    print(metrics.classification_report(train_numeric_labels, predicted, labels=None))
    print('Start Hold-Hout evaluation with train ,dev for test')
    model = svm.SVC(kernel='rbf', gamma=gBest, coef0=cBest, degree=dBest).fit(v_train_set, train_numeric_labels, sample_weight=None)
    predicted = model.predict(v_dev_set)  # make predictions
    print(metrics.classification_report(dev_numeric_labels, predicted, labels=None))
    print(metrics.confusion_matrix(dev_numeric_labels, predicted))
    print
    print(metrics.f1_score(dev_numeric_labels, predicted, pos_label=0))
    print(metrics.homogeneity_completeness_v_measure(dev_numeric_labels, predicted))
    print "Making 10-FCV with train+dev..."
    scores = cs.cross_val_score(model, X_all, expected_all, metrics.f1_score, cv=10, n_jobs=-1, verbose=True)
    print("F1score weighted: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    scores = cs.cross_val_score(model, X_all, expected_all, metrics.classification_report, cv=10, n_jobs=-1, verbose=True)
    for score in scores:
        print(score)
    # Persist the final model under Modelos/ with a timestamped name.
    if not os.path.isdir('Modelos'):
        os.mkdir('Modelos')
    date = time.strftime("%H%M%d%m%Y")
    jl.dump(model, 'Modelos/CSVM' + kernel + date + '.pkl')
    return model
def compare_clusters(prog, argv):
    """CLI entry point: compare a predicted equivalence-class (cluster) file
    against a ground-truth one.

    Always logs homogeneity / completeness / V-measure; optional flags add
    adjusted Rand, (adjusted/normalised) mutual information, purity,
    precision/recall and Fowlkes-Mallows. Returns 0.

    BUGFIX: the extracted source contained the corrupted identifier ``h**o``
    (a profanity-censored ``homo``), which is not a valid assignment target
    (`h**o, comp, vm = ...` is a syntax error). The identifier and the
    matching censored output string are restored to ``homo``.
    """
    parser = argparse.ArgumentParser(prog=prog, description='Compare Equivalence Classes')
    parser.add_argument('classes', metavar='eqclass', type=str, nargs=2,
                        help='Ground Truth / Prediction')
    parser.add_argument('-ar', action='store_true', default=False, help='Adjusted rand score')
    parser.add_argument('-mi', action='store_true', default=False, help='Mutual info score')
    parser.add_argument('-ami', action='store_true', default=False, help='Adjusted mutual info score')
    parser.add_argument('-nmi', action='store_true', default=False, help='Normalised mutual info score')
    parser.add_argument('-pur', action='store_true', default=False, help='Purity')
    parser.add_argument('-pr', action='store_true', default=False, help='Classic Precision/Recall')
    parser.add_argument('-fm', action='store_true', default=False, help='Fowlkes-Mallow score')
    parser.add_argument('-remove-identical', action='store_true', default=False,
                        help='Remove identical clusters before comparing')
    parser.add_argument('-f', type=str, help='Write results to filename')
    parser.add_argument('-test', action='store_true', default=False, help='run tests')
    args = parser.parse_args(argv)
    # These are the converted example from:
    # https://nlp.stanford.edu/IR-book/html/htmledition/evaluation-of-clustering-1.html
    if args.test:
        ground_truth = Cluster()
        prediction = Cluster()
        prediction.insert(0, 1, 2, 3, 4, 5)
        prediction.insert(6, 7, 8, 9, 10, 11)
        prediction.insert(12, 13, 14, 15, 16)
        ground_truth.insert(0, 2, 3, 4, 5, 6, 12, 14)
        ground_truth.insert(1, 7, 8, 9, 11)
        ground_truth.insert(10, 13, 15, 16)
    else:
        ground_truth = Cluster.from_file(args.classes[0], must_exist=True)
        prediction = Cluster.from_file(args.classes[1], must_exist=True)
    # remove identical clusters, if desired
    if args.remove_identical:
        identical = list()
        for cluster in ground_truth:
            cand = prediction.get_cluster(list(cluster)[0])
            if not cand:
                continue
            elif cand == cluster:
                identical.append(cand)
        log.info('Removing %d identical clusters (%d elements)...'
                 % (len(identical), sum([len(x) for x in identical])))
        for cluster in identical:
            for element in cluster:
                ground_truth.remove_key(element)
                prediction.remove_key(element)
        ground_truth.optimize()
        prediction.optimize()
    if (args.pr):
        prec_rec(ground_truth, prediction)
    # intermix all keys: pad each side with singletons for keys only the
    # other side knows, so both label sequences cover the same elements
    ground_truth_keys = ground_truth.get_keys()
    prediction_keys = prediction.get_keys()
    missing = ground_truth_keys - prediction_keys
    log.info('%d keys missing in prediction' % len(missing))
    for key in missing:
        prediction.insert_single(key)
    missing = prediction_keys - ground_truth_keys
    log.info('%d keys missing in ground truth' % len(missing))
    for key in missing:
        ground_truth.insert_single(key)
    # Sort by key so both label lists are aligned element-by-element.
    gt = list(sorted(ground_truth.lookup.items()))
    t = list(sorted(prediction.lookup.items()))
    gt = [x[1] for x in gt]
    t = [x[1] for x in t]
    log.info('Number of equiv classes: %d' % len(ground_truth))
    homo, comp, vm = metrics.homogeneity_completeness_v_measure(gt, t)
    log.info("Homogeneity: %0.3f" % homo)
    log.info("Completeness: %0.3f" % comp)
    log.info("V-measure: %0.3f" % vm)
    if args.ar:
        ar = metrics.adjusted_rand_score(gt, t)
        log.info("Adjusted rand score: %0.3f" % ar)
    if args.mi:
        mi = metrics.mutual_info_score(gt, t)
        log.info("Mutual info score: %0.3f" % mi)
    if args.ami:
        ami = metrics.adjusted_mutual_info_score(gt, t)
        log.info("Adjusted mutual info score: %0.3f" % ami)
    if args.nmi:
        nmi = metrics.normalized_mutual_info_score(gt, t)
        log.info("Normalised mutual info score: %0.3f" % nmi)
    if args.pur:
        # Purity: for each ground-truth class, count the best overlap with
        # any predicted cluster, then normalise by the number of elements.
        elements = len(gt)
        hits = 0
        for w in ground_truth:
            this = 0
            for element in w:
                tmp = prediction[element]
                foo = len(w & tmp)
                if foo > this:
                    this = foo
            hits += this
        purity = hits / elements
        log.info('Purity: %0.3f' % purity)
    if args.fm:
        fm = metrics.fowlkes_mallows_score(gt, t)
        log.info("Fowlkes-Mallows score: %0.3f" % fm)
    if args.f:
        with open(args.f, 'w') as f:
            f.write("homo: %0.3f\n" % homo)
            f.write("comp: %0.3f\n" % comp)
            f.write("vm: %0.3f\n" % vm)
            if args.ar:
                f.write("ar: %0.3f\n" % ar)
            if args.mi:
                f.write("mi: %0.3f\n" % mi)
            if args.nmi:
                f.write("nmi: %0.3f\n" % nmi)
            if args.ami:
                f.write("ami: %0.3f\n" % ami)
            if args.pur:
                f.write("pur: %0.3f\n" % purity)
            if args.fm:
                f.write("fm: %0.3f\n" % fm)
    return 0
# NOTE(review): script fragment (Python 2) -- it begins mid-expression: the
# vectorizer constructor call (presumably TfidfVectorizer) opens before this
# view, and `twenty_train`, `n_feathers`, `stop_word`, `RandomState` are
# defined earlier.
    max_features=n_feathers, stop_words=stop_word, use_idf=True
)
# vectorizer = Pipeline((
#     ('hasher', hasher),
#     ('tf_idf', TfidfTransformer())
# ))
X = vectorizer.fit_transform(twenty_train.data)
labels = twenty_train.target
true_k = np.unique(labels).shape[0]  # one cluster per distinct target label
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
            verbose=False, random_state=RandomState(42))
X_kmean = km.fit(X)
# print km.cluster_centers_
print '##########################'
air_score = metrics.adjusted_rand_score(twenty_train.target, km.labels_)
all_three_score = metrics.homogeneity_completeness_v_measure(twenty_train.target, km.labels_)
print air_score
print all_three_score
print metrics.silhouette_score(X, km.labels_, metric='euclidean')
# Sample output observed by the original author:
# 0.172990728537
# (0.23396974800824874, 0.34894426413758112, 0.28011816442240145)
# 0.00810690704347
def homogeneity_completeness(label_pair):
    """Return [homogeneity, completeness, v_measure] for a (true, pred) pair.

    Accepts a single 2-tuple argument, like the original Python 2
    tuple-parameter signature did; the unpacking now happens in the body.
    """
    true_labels, pred_labels = label_pair
    scores = metrics.homogeneity_completeness_v_measure(true_labels, pred_labels)
    return list(scores)
# NOTE(review): script fragment (Python 2) -- `fig`, `ax`, `X`, `labels`,
# `c2` and `pp1` are defined before this view.
pp2 = ax.scatter(c2[:, 0], c2[:, 1], cmap='prism', s=50, color='g')
ax.legend((pp1, pp2), ('class 1', 'class2'), fontsize=35)
fig.savefig('classes.png')
# start figure
fig.clf()  # reset plt
fig, ((axis1, axis2), (axis3, axis4)) = plt.subplots(2, 2, sharex='col', sharey='row')
# k-means
kmeans = KMeans(n_clusters=2)
kmeans.fit(X)
pred_kmeans = kmeans.labels_
# axis1 = fig.add_subplot(211)
print 'kmeans:', np.unique(kmeans.labels_)
print 'kmeans:', homogeneity_completeness_v_measure(labels, pred_kmeans)
plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels_, cmap='prism')  # plot points with cluster dependent colors
axis1.scatter(X[:, 0], X[:, 1], c=kmeans.labels_, cmap='prism')
# axis1.set_xlabel('x',fontsize=40)
axis1.set_ylabel('y', fontsize=40)
axis1.set_title('k-means', fontsize=20)
# plt.show()
# mean-shift
ms = MeanShift(bandwidth=7)
ms.fit(X)
pred_ms = ms.labels_
axis2.scatter(X[:, 0], X[:, 1], c=pred_ms, cmap='prism')
axis2.set_title('mean-shift', fontsize=20)
X_tfidf = transformer.fit_transform(X_counts) # print vectorizer.get_feature_names() print len(vectorizer.get_feature_names()) km = KMeans(n_clusters=3, init='k-means++', max_iter=800, n_init=200) X_kmean = km.fit(X_tfidf) X_kmean_r = X_kmean.transform(X_tfidf) # print(X_kmean_r) # print X_kmean.labels_ # print X_kmean.labels_ # print km.labels_ # print np.asarray(myLabel, dtype=np.int) # print km.labels_ air_score = metrics.adjusted_rand_score(myLabel, km.labels_) all_three_score = metrics.homogeneity_completeness_v_measure(myLabel, km.labels_) print "ARI 计算真实与预测的结果相似度: %s" % air_score #相似度 print "Mutual Information based scores 使用labels_true和labels_pred 来计算之间的一致性: %s" % metrics.adjusted_mutual_info_score(myLabel, km.labels_) # print '' print "Homogeneity 同质性 每个簇中的成员只包含唯一个类型: %s" % all_three_score[0] print "completeness 完整性 一个类型中得全部成员都被分配到同一个簇中: %s" % all_three_score[1] print "V-measure 相等于上面的NIMI的标签熵之和的归一化 normalized by sum of label entropies: %s" % all_three_score[2] print "Silhouette Coefficient 轮廓系数: %s" % metrics.silhouette_score(X_tfidf, np.asarray(myLabel, dtype=np.int), metric='euclidean') fig, ax = pl.subplots() for c, i, in zip("rgb", [0, 1, 2]): pl.scatter(X_kmean_r[np.asarray(X_kmean.labels_) == i, 0], X_kmean_r[np.asarray(X_kmean.labels_) == i, 1], c=c,label='Dimension 1 vs Dimension2') ax.set_xlabel('Dimension 0 ') ax.set_ylabel('Dimension 1 ') ax.set_title('term words scatter plot of 3 cluster Dimension 0 vs Dimension 1')
def compareClustering(groundTruth,modelCluster): print "adjusted random score (different from assigning random classes?)= ",metrics.adjusted_rand_score(groundTruthCustering,modelCluster) print "adjusted mutual Information based scores (tends to increase with number of clusters)= ",metrics.adjusted_mutual_info_score(groundTruthCustering,modelCluster) print "homogenity, completeness, v-measure scores = ",metrics.homogeneity_completeness_v_measure(groundTruthCustering,modelCluster)
# NOTE(review): script fragment -- the statements up to the stats
# assignments run inside enclosing loops over clustering solutions `k` and
# cluster ids `i` that begin before this view; `df`, `clusters`,
# `pred_treatment_labels`, `pred_infection_labels`, `Vic_assays`,
# `num_clusters` and `arange` are also defined there.
c_inds = np.where(clusters[k] == i)
# Majority vote: label the whole cluster 1 iff >= 50% of its members are treated.
cluster_treatment_label = (
    1
    if df["treatment_label"].values[c_inds].sum()
    / len(df["treatment_label"].values[c_inds])
    >= 0.5
    else 0
)
pred_treatment_labels[k][c_inds] = cluster_treatment_label
cluster_infection_label = (
    1
    if df["case_control_label"].values[c_inds].sum()
    / len(df["case_control_label"].values[c_inds])
    >= 0.5
    else 0
)
pred_infection_labels[k][c_inds] = cluster_infection_label
cluster_treatment_stats[k] = metrics.homogeneity_completeness_v_measure(
    df["treatment_label"].values, pred_treatment_labels[k]
)
# BUGFIX: infection stats were computed against pred_treatment_labels,
# leaving the just-built pred_infection_labels unused -- score the
# infection predictions instead.
cluster_infection_stats[k] = metrics.homogeneity_completeness_v_measure(
    df["case_control_label"].values, pred_infection_labels[k]
)
# compute one way anova over clustering solutions:
for p in ["Vic_HA", "Vic_NA"]:
    for assay in Vic_assays:
        group_samples = {}
        for i in arange(1, num_clusters + 1):
            group_samples[i] = np.asarray(df[assay].loc[clusters[p] == i])
            # Drop NaNs before the one-way ANOVA.
            group_samples[i] = group_samples[i][~np.isnan(group_samples[i])]
        (F, p_anova) = scipy.stats.f_oneway(*group_samples.values())
        print(p, assay, F, p_anova)
# NOTE(review): script fragment (Python 3) -- `vec`, `km`, `X`, `Y` and
# `numpy` (imported under its full name) are defined before this view.
classifier = Pipeline([('vec', vec), ('cls', km)])
classifier.fit(X)
Yguess = classifier.predict(X)
# Map each cluster id to the majority gold label among its members, so the
# confusion matrix below can be drawn in gold-label space.
labelDict = {}
clusterCombos = defaultdict(list)
for pred, gold in zip(Yguess, Y):
    clusterCombos[pred].append(gold)
for pred, gold in clusterCombos.items():
    labelDict[pred] = Counter(gold).most_common(1)[0][0]
predList = [labelDict[label] for label in Yguess]
print("Rand index: {}".format(adjusted_rand_score(Y, Yguess)))
print("V-measure: {}".format(v_measure_score(Y, Yguess)))
print("All three: {}".format(homogeneity_completeness_v_measure(Y, Yguess)))
cm = confusion_matrix(Y, predList, labels=list(set(Y)))
print(cm)
# Heat-map style rendering of the confusion matrix.
plt.figure()
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix of binary label K-Means classification')
plt.colorbar()
tick_marks = numpy.arange(len(list(set(Y))))
plt.xticks(tick_marks, list(set(Y)), rotation=45)
plt.yticks(tick_marks, list(set(Y)))
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
# NOTE(review): script fragment (Python 2) -- `featureListFromFile`,
# `featureList` and `k_means` are created before this view; the first loop
# below continues an outer `for actor in ...` loop begun there.
for feature in actor['featureVectors']:
    featureList.append(feature)
k_means.fit(featureList)
# Ground truth: every feature vector of actor i gets label i.
groundTruthCuster = []
i = 0
for actor in featureListFromFile['features']:
    for feature in actor['featureVectors']:
        groundTruthCuster.append(i)
    i = i + 1
# Model labels: predict the cluster of each feature vector in the same order.
modelCluster = []
for actor in featureListFromFile['features']:
    for feature in actor['featureVectors']:
        cluster = k_means.predict(feature)
        for c in cluster:
            modelCluster.append(c)
print "adjusted random index (different from assigning random classes?)= ", metrics.adjusted_rand_score(groundTruthCuster, modelCluster)
print "adjusted mutual Information based scores (tends to increase with number of clusters)= ", metrics.adjusted_mutual_info_score(groundTruthCuster, modelCluster)
print "homogenity, completeness, v-measure scores = ", metrics.homogeneity_completeness_v_measure(groundTruthCuster, modelCluster)
def k_means_results(name, A, B, x_label, y_label, colormap):
    """Fit 2-cluster KMeans on A, report metrics, and save train/test plots.

    A and B are indexed as (features, labels) pairs: A[0]/A[1] are the
    training features/labels, B[0]/B[1] the test ones. `name` is used in
    plot titles and output filenames; `colormap` toggles the decision-region
    background. Python 2 code (print statements, xrange, map).
    """
    X = A[0]
    y = A[1]
    X_test = B[0]
    y_test = B[1]
    h = 0.02  # mesh step for the decision-region grid
    n_clusters = 2
    k_means = KMeans(n_clusters=n_clusters)
    start = time.time()
    fit_results = k_means.fit(X)
    end = time.time()
    print "Fit Time: " + str(end - start)
    Y_kmeans = k_means.predict(X)
    y_pred = Y_kmeans
    y_true = y
    print "Train Accuracy Score Default"
    print metrics.accuracy_score(y_true, y_pred)
    # `flip` is defined elsewhere — presumably swaps the two cluster ids so
    # accuracy can be read under either labeling; TODO confirm.
    y_pred = map(flip, Y_kmeans)
    print "Train Accuracy Score Flip Labels"
    print metrics.accuracy_score(y_true, y_pred)
    # All metrics below use the flipped labels (y_pred was rebound above).
    print "Classification Report"
    print metrics.classification_report(y_true, y_pred)
    print "Confusion Matrix"
    print metrics.confusion_matrix(y_true, y_pred)
    print "Completeness Score"
    print metrics.completeness_score(y_true, y_pred)
    print "Homogeneity Score"
    print metrics.homogeneity_score(y_true, y_pred)
    print "Homogeneity Completeness V Measured"
    print metrics.homogeneity_completeness_v_measure(y_true, y_pred)
    print "Mutual Information Score"
    print metrics.mutual_info_score(y_true, y_pred)
    print "Normalized Mutual Info Score"
    print metrics.normalized_mutual_info_score(y_true, y_pred)
    print "Silhouette Score"
    print metrics.silhouette_score(X, fit_results.labels_)
    print "Silhouette Samples"
    print metrics.silhouette_samples(X, fit_results.labels_)
    print "V Measure Score"
    print metrics.v_measure_score(y_true, y_pred)
    # ---- training-set scatter plot ----
    figure_identifier = plt.figure()
    colors = ["yellow", "cyan"]
    if colormap:
        # Paint the KMeans decision regions over a dense mesh as background.
        cmap_light = ListedColormap(["#FF3EFA", "#AAFFAA"])
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
        Z = k_means.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    # One scatter per cluster, colored by predicted cluster id.
    for i in xrange(len(colors)):
        px = X[:, 0][Y_kmeans == i]
        py = X[:, 1][Y_kmeans == i]
        plt.scatter(px, py, c=colors[i])
    # Mark both cluster centers with orange crosses.
    plt.scatter(
        fit_results.cluster_centers_[0, 0:1],
        fit_results.cluster_centers_[0, 1:2],
        s=100,
        linewidths=4,
        c="orange",
        marker="x",
    )
    plt.scatter(
        fit_results.cluster_centers_[1, 0:1],
        fit_results.cluster_centers_[1, 1:2],
        s=100,
        linewidths=4,
        c="orange",
        marker="x",
    )
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(name + " Train Results")
    # plt.show()
    plt.savefig("figures/" + name.replace(" ", "_") + "_Training_results.png")
    figure_identifier.clf()
    plt.close(figure_identifier)
    print_confusion_matrix("Train", Y_kmeans, y)
    # ---- test-set pass: same plotting sequence on X_test/y_test ----
    figure_identifier = plt.figure()
    Y_kmeans = k_means.predict(X_test)
    y_pred = Y_kmeans
    y_true = y_test
    print "Test Accuracy Score Default"
    print metrics.accuracy_score(y_true, y_pred)
    y_pred = map(flip, Y_kmeans)
    print "Test Accuracy Score Flip Labels"
    print metrics.accuracy_score(y_true, y_pred)
    colors = ["yellow", "cyan"]
    if colormap:
        cmap_light = ListedColormap(["#FF3EFA", "#AAFFAA"])
        x_min, x_max = X_test[:, 0].min() - 1, X_test[:, 0].max() + 1
        y_min, y_max = X_test[:, 1].min() - 1, X_test[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
        Z = k_means.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    for i in xrange(len(colors)):
        px = X_test[:, 0][Y_kmeans == i]
        py = X_test[:, 1][Y_kmeans == i]
        plt.scatter(px, py, c=colors[i])
    plt.scatter(
        fit_results.cluster_centers_[0, 0:1],
        fit_results.cluster_centers_[0, 1:2],
        s=100,
        linewidths=4,
        c="orange",
        marker="x",
    )
    plt.scatter(
        fit_results.cluster_centers_[1, 0:1],
        fit_results.cluster_centers_[1, 1:2],
        s=100,
        linewidths=4,
        c="orange",
        marker="x",
    )
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(name + " Test Results")
    # plt.show()
    plt.savefig("figures/" + name.replace(" ", "_") + "_Test_results.png")
    print_confusion_matrix("Test", Y_kmeans, y_test)
    figure_identifier.clf()
    plt.close(figure_identifier)
def test_homogeneity_completeness_v_measure(self):
    """The df accessor must agree with sklearn's homogeneity_completeness_v_measure."""
    reference = metrics.homogeneity_completeness_v_measure(self.target, self.pred)
    observed = self.df.metrics.homogeneity_completeness_v_measure()
    self.assertEqual(observed, reference)
# NOTE(review): fragment — `time_dict`, `arr_df`, `ind_dict`, the result
# dicts, `num_clusters`, and `assays` are defined before this chunk, and the
# final nested loop continues past it.
post_inds = time_dict['Post']
p_labels = np.unique(arr_df[post_inds].group_label.values)
for k in ind_dict.keys():
    # use Andrew's package which allows clustering using Spearman distances
    # (sch.linkage and pdist do not support this for some reason, unlike Matlab)
    (dMat[k], Z_struct[k], dend[k]) = hcp.computeHCluster(arr_df[post_inds][ind_dict[k]], method='complete', metric='spearman')
    clusters[k] = sch.fcluster(Z_struct[k], t=num_clusters, criterion='maxclust')
    # compute cluster homogeneity and completeness (purity and accuracy) for
    # the treatment label: assign each cluster its modal group_label ...
    pred_treatment_labels[k] = np.zeros(shape=(arr_df[post_inds].shape[0]))
    for i in np.arange(1, num_clusters+1):
        c_inds = np.where(clusters[k] == i)
        val, ind = scipy.stats.mode(arr_df[post_inds]['group_label'].values[c_inds])
        pred_treatment_labels[k][c_inds] = val[0]
    # ... then score the modal assignment against the true labels
    # (signature: labels_true, labels_pred).
    cluster_treatment_stats[k] = metrics.homogeneity_completeness_v_measure(arr_df[post_inds]['group_label'].values, pred_treatment_labels[k])
# compute pairwise statistics of clusters using alternate assays as values:
prot_stats = {}
for p in ['SHA_ha', 'SHA_na']:
    p_values = {assay: np.zeros(shape=(num_clusters, num_clusters)) for assay in assays}
    q_values = {assay: np.zeros(shape=(num_clusters, num_clusters)) for assay in assays}
    stats_df = pd.DataFrame()
    for assay in assays:
        # Rank-sum test for every unordered cluster pair (i+1, j+1);
        # fcluster ids are 1-based, hence the +1 offsets.
        res = []
        c_inds = []
        for i in np.arange(num_clusters):
            for j in np.arange(i+1, num_clusters):
                res.append(scipy.stats.ranksums(arr_df[assay].loc[clusters[p] == i+1], arr_df[assay].loc[clusters[p] == j+1]))
                c_inds.append((i+1, j+1))
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, homogeneity_completeness_v_measure
import pylab

# k-means clustering of a tab-separated dataset.
# Usage: python kmeans.py dataset k
if len(sys.argv) < 3:
    sys.exit('Usage: python kmeans.py dataset k')

## Data preprocessing
data = parse_tab(sys.argv[1])
k = int(sys.argv[2])
classes = [example[-1] for example in data]  # last column holds the class label
examples = data_to_na(data)

## Clustering
# NOTE(review): `k=` is the pre-0.11 scikit-learn keyword; current versions
# spell it KMeans(n_clusters=k). Left unchanged to match the API this ran on.
kmeans = KMeans(k=k, random_state=0)
kmeans.fit(examples)
codebook = kmeans.cluster_centers_
labels = kmeans.predict(examples)

## Performance evaluation
ari = adjusted_rand_score(labels, classes)  # ARI is symmetric in its arguments
# BUG FIX: the signature is (labels_true, labels_pred); the original passed
# the predicted labels first, which swapped the reported homogeneity and
# completeness values (V-measure is unaffected, being symmetric).
homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(classes, labels)
print('ARI: {0}'.format(ari))
print('Homogeneity: {0}'.format(homogeneity))
print('Completeness: {0}'.format(completeness))
print('V-measure: {0}'.format(v_measure))

pylab.figure(1)
pylab.scatter(examples.T[0], examples.T[1], c=labels)
pylab.show()
# In[27]: km2 = KMeans(n_clusters=2, random_state=42).fit(X) km2_labels = km2.labels_ km5 = KMeans(n_clusters=5, random_state=42).fit(X) km5_labels = km5.labels_ # ## Homogeneity, Completeness and V-measure # In[28]: km2_hcv = np.round(metrics.homogeneity_completeness_v_measure(y, km2_labels), 3) km5_hcv = np.round(metrics.homogeneity_completeness_v_measure(y, km5_labels), 3) print('Homogeneity, Completeness, V-measure metrics for num clusters=2: ', km2_hcv) print('Homogeneity, Completeness, V-measure metrics for num clusters=5: ', km5_hcv) # ## Silhouette Coefficient # In[29]: from sklearn import metrics km2_silc = metrics.silhouette_score(X, km2_labels, metric='euclidean') km5_silc = metrics.silhouette_score(X, km5_labels, metric='euclidean')
def evaluateClusteringModel(self, predTarget):
    """Print homogeneity, completeness, and v-measure of predTarget against self.testTarget."""
    from sklearn.metrics import homogeneity_completeness_v_measure

    print('Clustering report--homogeneity, completeness, v-measure')
    scores = homogeneity_completeness_v_measure(self.testTarget, predTarget)
    print(scores)
# NOTE(review): fragment — `k_means`, `X`, `y_true`, and `result` are defined
# before this chunk. Python 2 code (print statements).
y_pred = k_means.predict(X)
# NOTE(review): this list receives the accuracy value but is never printed
# here — the 'Accuracy Score' header below is followed only by the append.
# The name also shadows metrics.accuracy_score conceptually; verify intent.
accuracy_score = []
#http://scikit-learn.org/stable/modules/classes.html
print 'Accuracy Score'
accuracy_score.append(metrics.accuracy_score(y_true, y_pred))
print 'Classification Report'
print metrics.classification_report(y_true, y_pred)
print 'Confusion Matrix'
print metrics.confusion_matrix(y_true, y_pred)
print 'Completeness Score'
print metrics.completeness_score(y_true,y_pred)
print 'Homogeneity Score'
print metrics.homogeneity_score(y_true,y_pred)
print 'Homogeneity Completeness V Measured'
print metrics.homogeneity_completeness_v_measure(y_true,y_pred)
print 'Mutual Information Score'
print metrics.mutual_info_score(y_true,y_pred)
print 'Normalized Mutual Info Score'
print metrics.normalized_mutual_info_score(y_true,y_pred)
print 'Silhouette Score'
# Silhouette uses the fitted labels from `result` (an earlier fit), not y_pred.
print metrics.silhouette_score(X,result.labels_)
print 'Silhouette Samples'
print metrics.silhouette_samples(X,result.labels_)
print 'V Measure Score'
print metrics.v_measure_score(y_true,y_pred)
# Standardize the features and start a fresh 2-cluster model for the next run.
stdsc = StandardScaler()
X_scaled = stdsc.fit_transform(X)
k_means = KMeans(n_clusters=2)