def Porownaj_algorytmy(data, klasy, labels, method, baza):
    """Computes the AM, AR and FM indices for all algorithms except the one I wrote."""
    wektor = []
    test = [0] * len(method)
    i = 0
    # scipy linkage-based algorithms
    for name in method:
        Z = linkage(data, name)
        test[i] = cluster.hierarchy.cut_tree(Z, klasy)
        test[i] = [y for x in test[i] for y in x]
        wektor.append([fowlkes_mallows_score(labels, test[i]),
                       adjusted_mutual_info_score(labels, test[i]),
                       adjusted_rand_score(labels, test[i]),
                       baza])
        i += 1
    # genieclust algorithm
    wynikMG = genieclust.genie.Genie(n_clusters=klasy).fit_predict(data)
    wektor.append([fowlkes_mallows_score(labels, wynikMG),
                   adjusted_mutual_info_score(labels, wynikMG),
                   adjusted_rand_score(labels, wynikMG),
                   baza])
    # MeanShift
    wynikCL = MeanShift(bandwidth=klasy).fit(data).labels_
    wektor.append([fowlkes_mallows_score(labels, wynikCL),
                   adjusted_mutual_info_score(labels, wynikCL),
                   adjusted_rand_score(labels, wynikCL),
                   baza])
    # AgglomerativeClustering
    wynikFA = AgglomerativeClustering(n_clusters=klasy).fit(data).labels_
    wektor.append([fowlkes_mallows_score(labels, wynikFA),
                   adjusted_mutual_info_score(labels, wynikFA),
                   adjusted_rand_score(labels, wynikFA),
                   baza])
    # KMeans
    wynikKM = KMeans(n_clusters=klasy, random_state=123).fit(data).labels_
    wektor.append([fowlkes_mallows_score(labels, wynikKM),
                   adjusted_mutual_info_score(labels, wynikKM),
                   adjusted_rand_score(labels, wynikKM),
                   baza])
    # Row labels follow the append order above.
    index = ["single", 'complete', 'average', 'weighted', 'centroid', 'median', 'ward',
             "genieclust", "MeanShift", "AgglomerativeClustering", "KMeans"]
    return pd.DataFrame(wektor, index=index, columns=["FM", "AM", "AR", "Dane"])
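# A minimal usage sketch for Porownaj_algorytmy above (illustration only, not
# from the original source). It assumes the function's own dependencies
# (scipy's linkage/cut_tree, the sklearn clusterers and metrics, genieclust,
# pandas) are already imported; make_blobs is an added assumption used purely
# to produce toy data.
from sklearn.datasets import make_blobs

X_toy, y_toy = make_blobs(n_samples=150, centers=3, random_state=123)
linkage_methods = ["single", "complete", "average", "weighted",
                   "centroid", "median", "ward"]
summary = Porownaj_algorytmy(X_toy, klasy=3, labels=y_toy,
                             method=linkage_methods, baza="blobs")
print(summary)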
def test_fowlkes_mallows_score():
    # General case
    score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2])
    assert_almost_equal(score, 4.0 / np.sqrt(12.0 * 6.0))
    # Perfect match but where the label names changed
    perfect_score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1], [1, 1, 1, 0, 0, 0])
    assert_almost_equal(perfect_score, 1.0)
    # Worst case
    worst_score = fowlkes_mallows_score([0, 0, 0, 0, 0, 0], [0, 1, 2, 3, 4, 5])
    assert_almost_equal(worst_score, 0.0)
def computeFowklesMallowsIndex(inputDirTDA, inputFileWavelet, sheetNameWavelet):
    # labels_TDA = ['m59', 'm39', 'm102', 'm6', 'm47', 'm8', 'm4', 'm98', 'm2', 'm40', 'm3']
    # clustering_TDA, labelsTDA = tdaClustering("../Results/CohomologyOPPregJNP/")
    clustering_TDA, labelsTDA = tdaClustering(inputDirTDA)
    # labels_Wavelet = ['m39', 'm40', 'm47', 'm98', 'm102', 'm2', 'm3', 'm4', 'm6', 'm8', 'm59']
    clustering_Wavelet, labelsWavelet = waveletClustering(inputFileWavelet, sheetNameWavelet)  # 167
    # [2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 3] -- cutoff 175
    # [2, 2, 1, 1, 1, 1, 4, 4, 3, 3, 5] -- cutoff 160
    # [2, 2, 1, 1, 1, 1, 3, 3, 3, 3, 4] -- cutoff 167
    # Reorganize the labels of the wavelet clustering according to the TDA
    # labels so that they are in the same order.
    reOrganizedWavelet = []
    dicWavelet = {}
    for pair in zip(labelsWavelet, clustering_Wavelet):
        dicWavelet[pair[0]] = pair[1]
    for l in labelsTDA:
        reOrganizedWavelet.append(dicWavelet[l])
    print(reOrganizedWavelet)
    score = fowlkes_mallows_score(clustering_TDA, reOrganizedWavelet)
    # score = fowlkes_mallows_score([1, 1, 0, 0], [0, 0, 1, 1])
    return score
def true_label_metrics(true_label, assigned_label, print_metric):
    """https://scikit-learn.org/stable/modules/clustering.html#clustering-evaluation"""
    true_label_metrics = {}
    true_label_metrics['adjusted_rand_score'] = \
        cluster_metric.adjusted_rand_score(true_label, assigned_label)
    # true_label_metrics['adjusted_mutual_info_score'] = \
    #     cluster_metric.adjusted_mutual_info_score(true_label, assigned_label)
    # true_label_metrics['homogeneity_completeness_v_measure'] = \
    #     cluster_metric.homogeneity_completeness_v_measure(true_label, assigned_label)
    true_label_metrics['fowlkes_mallows_score'] = \
        cluster_metric.fowlkes_mallows_score(true_label, assigned_label)
    if print_metric:
        print("Metric with true label")
        print("adjusted rand score: %s" % true_label_metrics['adjusted_rand_score'])
        # print("adjusted mutual info score: %s"
        #       % true_label_metrics['adjusted_mutual_info_score'])
        # print("homogeneity completeness v measure:")
        # print(true_label_metrics['homogeneity_completeness_v_measure'])
        print("fowlkes_mallows: %s" % true_label_metrics['fowlkes_mallows_score'])
    return true_label_metrics
def get_clustering_metrics(train_data, cluster_labels, ground_truth_labels=None):
    clustering_metric_dict = {}
    clustering_metric_dict['silhouette_score'] = silhouette_score(
        train_data, cluster_labels, random_state=42)
    clustering_metric_dict['calinski_harabasz_score'] = calinski_harabasz_score(
        train_data, cluster_labels)
    clustering_metric_dict['davies_bouldin_score'] = davies_bouldin_score(
        train_data, cluster_labels)
    if ground_truth_labels is not None:
        clustering_metric_dict['v_measure_score'] = v_measure_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict['fowlkes_mallows_score'] = fowlkes_mallows_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict['homogeneity_score'] = homogeneity_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict['normalized_mutual_info_score'] = normalized_mutual_info_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict['adjusted_rand_score'] = adjusted_rand_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict['completeness_score'] = completeness_score(
            ground_truth_labels, cluster_labels)
    return clustering_metric_dict
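# A hedged usage sketch for get_clustering_metrics above (not from the original
# source). It assumes the metric functions used inside the function are imported
# from sklearn.metrics; make_blobs and KMeans here are added assumptions used
# only to generate toy data and predicted labels.
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

X_demo, y_demo = make_blobs(n_samples=200, centers=3, random_state=42)
pred_demo = KMeans(n_clusters=3, n_init=10, random_state=42).fit_predict(X_demo)
print(get_clustering_metrics(X_demo, pred_demo, ground_truth_labels=y_demo))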
def _score_clustering(self, labels, metric='vm'):
    # Score the clustering against the true model labels.
    if metric == 'fm':
        score = fowlkes_mallows_score(self.true_labels_, labels)
    elif metric == 'ami':
        score = adjusted_mutual_info_score(self.true_labels_, labels)
    else:
        # v-measure, computed only over points actually assigned to a cluster (label > 0)
        score = v_measure_score(self.true_labels_[labels > 0], labels[labels > 0])
    return score
def _clustering_evaluation(label, labels_true, digits):
    if labels_true is None:
        FM = None
        ARI = None
    else:
        ARI = round(adjusted_rand_score(labels_true, label), digits)
        FM = round(fowlkes_mallows_score(labels_true, label), digits)
    return ARI, FM
def get_landmarking(dataset_name, df):
    start = time.time()
    record = {'dataset': dataset_name.split('.')[0]}
    results = []
    n_samples = int(len(df) * 0.1) if len(df) > 400 else min(df.shape[0], 40)
    data = df.sample(n=n_samples, replace=False)
    labels = get_dbscan(data)
    k = len(np.unique(labels))
    labels2 = get_Kmeans(data, k, 40)
    full_tree = DecisionTreeClassifier()
    full_tree.fit(data, labels)
    worst_attr = np.argmin(full_tree.feature_importances_)
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3)
    best_stump = DecisionTreeClassifier(max_depth=1)
    random_stump = DecisionTreeClassifier(splitter="random", max_depth=1)
    worst_stump = DecisionTreeClassifier(max_depth=1)
    elite_knn = KNeighborsClassifier(n_neighbors=1)
    one_knn = KNeighborsClassifier(n_neighbors=1, algorithm="auto", weights="uniform",
                                   p=2, metric="minkowski")
    nb = GaussianNB()
    lda = LinearDiscriminantAnalysis()
    best_stump.fit(X_train, y_train)
    random_stump.fit(X_train, y_train)
    worst_stump.fit(X_train.iloc[:, worst_attr].values.reshape(-1, 1), y_train)
    elite_knn.fit(X_train, y_train)
    one_knn.fit(X_train, y_train)
    # lda.fit(X_train, y_train)
    nb.fit(X_train, y_train)
    record['LM1'] = np.log2(df.shape[0])
    record['LM2'] = np.log2(df.shape[1])
    record['LM3'] = accuracy_score(best_stump.predict(X_test), y_test)
    # record['LM4'] = f1_score(best_stump.predict(X_test), y_test, average='weighted')
    record['LM5'] = accuracy_score(random_stump.predict(X_test), y_test)
    # record['LM6'] = f1_score(random_stump.predict(X_test), y_test, average='weighted')
    # record['LM7'] = model.inertia_
    record['LM8'] = accuracy_score(elite_knn.predict(X_test), y_test)
    # record['LM9'] = f1_score(elite_knn.predict(X_test), y_test, average='weighted')
    # record['LM10'] = accuracy_score(lda.predict(X_test), y_test)
    # record['LM11'] = f1_score(lda.predict(X_test), y_test, average='weighted')
    record['LM12'] = accuracy_score(nb.predict(X_test), y_test)
    # record['LM13'] = f1_score(nb.predict(X_test), y_test, average='weighted')
    record['LM14'] = accuracy_score(one_knn.predict(X_test), y_test)
    # record['LM15'] = f1_score(one_knn.predict(X_test), y_test, average='weighted')
    record['LM16'] = accuracy_score(
        worst_stump.predict(X_test.iloc[:, worst_attr].values.reshape(-1, 1)), y_test)
    # record['LM17'] = f1_score(worst_stump.predict(X_test.iloc[:, worst_attr].values.reshape(-1, 1)), y_test, average='weighted')
    record['LM18'] = adjusted_rand_score(labels, labels2)
    record['LM19'] = adjusted_mutual_info_score(labels, labels2)
    record['LM20'] = completeness_score(labels, labels2)
    record['LM21'] = fowlkes_mallows_score(labels, labels2)
    end = time.time()
    return record, (df.shape[0], df.shape[1], end - start)
def test_int_overflow_mutual_info_fowlkes_mallows_score():
    # Test overflow in mutual_info_classif and fowlkes_mallows_score.
    # With ~75k samples the pair counts exceed the 32-bit integer range,
    # so the scores must still come out finite.
    x = np.array([1] * (52632 + 2529) + [2] * (14660 + 793) +
                 [3] * (3271 + 204) + [4] * (814 + 39) + [5] * (316 + 20))
    y = np.array([0] * 52632 + [1] * 2529 + [0] * 14660 + [1] * 793 +
                 [0] * 3271 + [1] * 204 + [0] * 814 + [1] * 39 +
                 [0] * 316 + [1] * 20)
    assert_all_finite(mutual_info_score(x, y))
    assert_all_finite(fowlkes_mallows_score(x, y))
def cluster_performance(y_true, y_pred):
    '''
    Returns the FM index and the Rand index.
    :param y_true: cluster assignment of the reference model, as an ndarray
    :param y_pred: cluster assignment produced by the clustering model, as an ndarray
    :return: FM index, Rand index
    '''
    FM = fowlkes_mallows_score(y_true, y_pred)
    Rand = adjusted_rand_score(y_true, y_pred)
    return FM, Rand
def cluster_performance(y_true, y_pred):
    '''
    Returns the FM index and the Rand index.
    :param y_true: cluster assignment of the reference model, as an ndarray
    :param y_pred: cluster assignment produced by the clustering model, as an ndarray
    :return: FM index, Rand index
    '''
    #********* Begin *********#
    return fowlkes_mallows_score(y_true, y_pred), adjusted_rand_score(y_true, y_pred)
def print_stats(x, y, quiet=True):
    ari = adjusted_rand_score(x, y)
    ami = adjusted_mutual_info_score(x, y)
    fms = fowlkes_mallows_score(x, y)
    if not quiet:
        print("ARI: {}".format(ari), file=sys.stderr)
        print("AMI: {}".format(ami), file=sys.stderr)
        print("FMS: {}".format(fms), file=sys.stderr)
    return ari, ami, fms
def cluster_performance(y_true, y_pred):
    """
    Returns the FM index and the Rand index.
    :param y_true: cluster assignment of the reference model, as an ndarray
    :param y_pred: cluster assignment produced by the clustering model, as an ndarray
    :return: FM index, Rand index
    """
    # ********* Begin *********#
    rand = adjusted_rand_score(y_true, y_pred)
    fm = fowlkes_mallows_score(y_true, y_pred)
    return fm, rand
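# A small hedged example of calling cluster_performance above (illustration
# only; it assumes fowlkes_mallows_score and adjusted_rand_score are imported
# from sklearn.metrics as in the exercise). With a perfect clustering both
# indices equal 1.0; here one point is misassigned, so both drop below 1.0.
import numpy as np

y_true_demo = np.array([0, 0, 1, 1, 2, 2])
y_pred_demo = np.array([0, 0, 1, 2, 2, 2])
fm_demo, rand_demo = cluster_performance(y_true_demo, y_pred_demo)
print(fm_demo, rand_demo)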
def test_fowlkes_mallows_score_properties():
    # handcrafted example
    labels_a = np.array([0, 0, 0, 1, 1, 2])
    labels_b = np.array([1, 1, 2, 2, 0, 0])
    expected = 1. / np.sqrt((1. + 3.) * (1. + 2.))
    # FMI = TP / sqrt((TP + FP) * (TP + FN))
    score_original = fowlkes_mallows_score(labels_a, labels_b)
    assert_almost_equal(score_original, expected)
    # symmetric property
    score_symmetric = fowlkes_mallows_score(labels_b, labels_a)
    assert_almost_equal(score_symmetric, expected)
    # permutation property
    score_permuted = fowlkes_mallows_score((labels_a + 1) % 3, labels_b)
    assert_almost_equal(score_permuted, expected)
    # symmetric and permutation (both together)
    score_both = fowlkes_mallows_score(labels_b, (labels_a + 2) % 3)
    assert_almost_equal(score_both, expected)
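# A hedged cross-check of the handcrafted example in the test above (not part
# of the original test file): count co-clustered pairs directly and reproduce
# FMI = TP / sqrt((TP + FP) * (TP + FN)) = 1 / sqrt(4 * 3).
from itertools import combinations

import numpy as np

labels_a = np.array([0, 0, 0, 1, 1, 2])
labels_b = np.array([1, 1, 2, 2, 0, 0])
tp = fp = fn = 0
for i, j in combinations(range(len(labels_a)), 2):
    same_a = labels_a[i] == labels_a[j]   # pair co-clustered in labels_a
    same_b = labels_b[i] == labels_b[j]   # pair co-clustered in labels_b
    tp += same_a and same_b
    fp += same_a and not same_b
    fn += same_b and not same_a
print(tp / np.sqrt((tp + fp) * (tp + fn)))  # matches `expected` above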
def Porownaj_algorytmy2(data, klasy, labels, baza):
    """Computes the AM, AR and FM indices for the algorithm I wrote."""
    wektor = []
    # my algorithm
    wynikM = spectral_clustering(data, k=klasy, M=5)
    wektor.append([fowlkes_mallows_score(labels, wynikM),
                   adjusted_mutual_info_score(labels, wynikM),
                   adjusted_rand_score(labels, wynikM),
                   baza])
    index = ["Moj"]
    return pd.DataFrame(wektor, index=index, columns=["FM", "AM", "AR", "Dane"])
def evaluate(self):
    eval_result_dict = {}
    eval_result_dict['ami'] = adjusted_mutual_info_score(self.data['true_y'], self.data['pred_y'])
    eval_result_dict['rand'] = adjusted_rand_score(self.data['true_y'], self.data['pred_y'])
    eval_result_dict['comp'] = completeness_score(self.data['true_y'], self.data['pred_y'])
    eval_result_dict['fow'] = fowlkes_mallows_score(self.data['true_y'], self.data['pred_y'])
    eval_result_dict['hom'] = homogeneity_score(self.data['true_y'], self.data['pred_y'])
    eval_result_dict['nmi'] = normalized_mutual_info_score(self.data['true_y'], self.data['pred_y'])
    eval_result_dict['v_score'] = v_measure_score(self.data['true_y'], self.data['pred_y'])
    return eval_result_dict
def cluster_hac(num_k):
    feature_ds, label_ds = read_dataset()
    user_max_id = num_k - 1
    sub_feature_ds = []
    sub_label_ds = []
    for i in range(0, len(label_ds)):
        if label_ds[i] <= user_max_id:
            sub_feature_ds.append(feature_ds[i])
            sub_label_ds.append(label_ds[i])
    feature_array = np.array(sub_feature_ds)
    x_scalar = StandardScaler()
    x = x_scalar.fit_transform(feature_array)
    pca = PCA(n_components=0.999)
    components = pca.fit_transform(x)
    hac = AgglomerativeClustering(n_clusters=num_k, linkage='average')
    hac.fit_predict(components)
    print(fowlkes_mallows_score(hac.labels_, sub_label_ds))
def cluster_kmeans(num_k):
    feature_ds, label_ds = read_dataset()
    user_max_id = num_k - 1
    sub_feature_ds = []
    sub_label_ds = []
    for i in range(0, len(label_ds)):
        if label_ds[i] <= user_max_id:
            sub_feature_ds.append(feature_ds[i])
            sub_label_ds.append(label_ds[i])
    feature_array = np.array(sub_feature_ds)
    x_scalar = StandardScaler()
    x = x_scalar.fit_transform(feature_array)
    pca = PCA(n_components=0.999)
    components = pca.fit_transform(x)
    kmeans = KMeans(n_clusters=num_k, random_state=0)
    kmeans.fit_predict(components)
    print(fowlkes_mallows_score(kmeans.labels_, sub_label_ds))
def compute_external_metrics(labels_true: List[str], labels_pred: List[int]) -> ExternalEvaluation:
    if len(labels_true) == 0 and len(labels_pred) == 0:
        return None
    homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(labels_true, labels_pred)
    adjusted_mutual_info = adjusted_mutual_info_score(labels_true, labels_pred)
    adjusted_rand_index = adjusted_rand_score(labels_true, labels_pred)
    fowlkes_mallows = fowlkes_mallows_score(labels_true, labels_pred)
    mat = contingency_matrix(labels_true, labels_pred)
    purity = purity_score(mat)
    inverse_purity = purity_score(mat, inverse=True)
    return ExternalEvaluation(homogeneity=homogeneity,
                              completeness=completeness,
                              v_measure=v_measure,
                              adjusted_mutual_information=adjusted_mutual_info,
                              adjusted_rand_index=adjusted_rand_index,
                              fowlkes_mallows=fowlkes_mallows,
                              purity=purity,
                              inverse_purity=inverse_purity)
def externalEval(self, y_pred, true_label):
    true_label = np.array(true_label)
    n_cluster = len(set(true_label))
    y_pred_modi = y_pred.copy()
    # Group sample indices by their predicted cluster.
    result = [[] for i in range(len(set(y_pred)))]
    for i in range(len(y_pred)):
        result[y_pred[i]].append(i)
    dict1 = dict.fromkeys([i for i in range(n_cluster)], None)
    for i in list(dict1.keys()):
        dict1[i] = []
    nummostnum = 0
    # Map each predicted cluster to the majority true label among its members.
    for i in range(len(result)):
        if len(true_label[result[i]]) > 0:
            mostnum = Counter(true_label[result[i]]).most_common(1)[0][0]
            nummostnum += Counter(true_label[result[i]]).most_common(1)[0][1]
            dict1[mostnum] += (result[i])
    # Rewrite the predicted labels with the majority-vote mapping.
    for r in list(dict1.keys()):
        for i in dict1[r]:
            y_pred_modi[i] = r
    nmi = normalized_mutual_info_score(true_label, y_pred)
    purity = nummostnum / len(y_pred_modi)
    fowlkes_mallows = fowlkes_mallows_score(true_label, y_pred_modi)
    return nmi, purity, fowlkes_mallows
def kmeans(data, k):
    # Note: initialize_centroids, closest_centroid and the global `digits`
    # dataset are assumed to be defined elsewhere in this module.
    centroid = initialize_centroids(data, k)
    a = np.zeros((k, k))
    b = np.zeros(k)
    c1 = np.zeros((k, k))
    d = np.zeros(k)
    clusnew = np.zeros(len(data))
    i = 1
    while i < 100:
        clusters = closest_centroid(data, centroid, k)
        for l in range(0, k):
            centroid[l, :] = np.mean(data[(np.where(clusters == l))], axis=0)
        i = i + 1
        print(i)
    # Relabel the clusters via the confusion matrix so they line up with the true digits.
    c = confusion_matrix(clusters, digits.target)
    for j in range(0, k):
        c1[j, :] = c[:, (np.argmax(c[j, :]))]
        clusnew[clusters == (np.argmax(c[j, :]))] = j
        d[j] = sum(c1[:, j])
    c1[:, (np.argmin(d))] = -1
    print('Confusion Matrix: ', c1)
    print('Fowlkes Mallows Score: ', fowlkes_mallows_score(digits.target, clusnew))
output_file = sys.argv[1]
correct_file = sys.argv[2]
values = np.loadtxt(correct_file, dtype=int)
num_lines = sum(1 for line in open(correct_file))
result = np.zeros(num_lines)
cur_clus = -1
with open(output_file) as f:
    content = f.readlines()
for i in range(0, len(content)):
    if content[i][0] == '#':
        cur_clus += 1
        continue
    result[int(content[i])] = cur_clus
net_score = fowlkes_mallows_score(values, result)
'''
unique, counts = np.unique(result, return_counts=True)
print np.asarray((unique, counts)).T
unique, counts = np.unique(values, return_counts=True)
print np.asarray((unique, counts)).T
'''
logs = open('DBSCANLogs.txt', 'a')
logs.write(str(net_score) + '\n')
logs.close()
def _eval_clustering(self, labels_true, labels_predicted):
    # To address when COP-KMeans fails to satisfy all constraints at a k:
    if labels_predicted is None:
        # return a dictionary of None values to expose in the final output
        return {"nmi": None,
                "ami": None,
                "ari": None,
                "fms": None,
                "v_measure": None,
                "bcubed_precision": None,
                "bcubed_recall": None,
                "bcubed_fscore": None,
                "Silhouette": None,
                "Calinski_harabasz": None,
                "Davies_Bouldin": None}

    nmi = normalized_mutual_info_score(labels_true, labels_predicted,
                                       average_method="max")
    ami = adjusted_mutual_info_score(labels_true, labels_predicted,
                                     average_method="arithmetic")
    ari = adjusted_rand_score(labels_true, labels_predicted)
    v_measure = v_measure_score(labels_true, labels_predicted, beta=1.0)
    fms = fowlkes_mallows_score(labels_true, labels_predicted)

    # Reshape labels for BCubed measures
    true_dict = self._reshape_labels_as_dicts(labels_true)
    pred_dict = self._reshape_labels_as_dicts(labels_predicted)
    bcubed_precision = bcubed.precision(cdict=pred_dict, ldict=true_dict)
    bcubed_recall = bcubed.recall(cdict=pred_dict, ldict=true_dict)
    bcubed_f1 = bcubed.fscore(bcubed_precision, bcubed_recall)

    # =====================================================================
    # Unsupervised Metrics
    # =====================================================================
    if labels_predicted.nunique() not in (1, len(self.data)):
        sil = silhouette_score(X=self.data, labels=labels_predicted,
                               metric=self.distance_metric,
                               random_state=13712)
        ch = calinski_harabasz_score(X=self.data, labels=labels_predicted)
        dv = davies_bouldin_score(X=self.data, labels=labels_predicted)
    else:
        sil = None
        ch = None
        dv = None

    ret = {}
    ret.update({"nmi": round(nmi, 4),
                "ami": round(ami, 4),
                "ari": round(ari, 4),
                "fms": round(fms, 4),
                "v_measure": round(v_measure, 4),
                "bcubed_precision": round(bcubed_precision, 4),
                "bcubed_recall": round(bcubed_recall, 4),
                "bcubed_fscore": round(bcubed_f1, 4),
                "Silhouette": round(sil, 4) if sil is not None else None,
                "Calinski_harabasz": round(ch, 4) if ch is not None else None,
                "Davies_Bouldin": round(dv, 4) if dv is not None else None
                # Here goes the unsupervised indices
                })
    return ret
def _fm(labels, labels_true, digits):
    return round(fowlkes_mallows_score(labels_true, labels), digits)
data_copy = copy.copy(data)
# Drop the class column
inputs = data.drop('species', axis=1)

# Test from n_clusters = 2 until n_clusters = 6
for n_clusters in range(2, 6 + 1):
    # Fowlkes-Mallows and Silhouette evaluation:
    agglo = Agglomerative(n_clusters=n_clusters)
    agglo.fit(inputs)
    labels = np.array(agglo.predict(inputs))
    print("n_clusters =", n_clusters)
    print("Using the Fowlkes-Mallows method: ")
    fowlkes_mallows = fowlkes_mallows_score(labels, target)
    print("Fowlkes Mallows Score:", fowlkes_mallows)
    print("Using the Silhouette method:")
    silhouette_avg = silhouette_score(inputs, labels)
    print("Average silhouette score:", silhouette_avg)
    print()
    print()
    silhouette_values_per_point = silhouette_samples(inputs, labels)
    # Visualize Silhouette subplot
    # 1 row and 2 columns: Left -> silhouette plot and Right -> cluster visualization
    fig, silhouette_viz = plt.subplots(1)
    fig.set_size_inches(18, 7)
def report_clustering(distance_file, biom_file, metadata_file, num_clusters, verbose, L=2, output_file=None):
    if not isinstance(distance_file, list):
        distance_matrix = CSV.read(distance_file)
    else:
        distance_matrix = distance_file
    if output_file is not None:
        f = open(output_file, 'w')
    output_matrix = []
    AgglomerativeCluster = AgglomerativeClustering(n_clusters=num_clusters,
                                                   affinity='precomputed',
                                                   linkage='complete').fit_predict(distance_matrix)
    KMedoidsCluster = KMedoids(n_clusters=num_clusters, metric='precomputed',
                               method='pam', init='heuristic').fit_predict(distance_matrix)
    PCoA_Samples = BW.extract_samples(biom_file)
    metadata = meta.extract_metadata(metadata_file)
    region_names = []
    for i in range(len(PCoA_Samples)):
        if metadata[PCoA_Samples[i]]['body_site'] not in region_names:
            region_names.append(metadata[PCoA_Samples[i]]['body_site'])
        PCoA_Samples[i] = region_names.index(metadata[PCoA_Samples[i]]['body_site'])
    if verbose and L == 1:
        print('Printing results for L1-UniFrac:')
    elif verbose and L == 2:
        print('Printing results for L2-UniFrac:')
    if verbose:
        print('Metric\t\t\t\t\t\t\tAgglomerativeClustering\t\tKMedoids')
    if output_file is not None:
        if L == 1:
            f.write('Printing results for L1-UniFrac:\n')
        elif L == 2:
            f.write('Printing results for L2-UniFrac:\n')
        f.write('Metric\t\t\t\tAgglomerativeClustering\t\t\tKMedoids\n')
    if L == 1:
        output_matrix.append(['Printing results for L1-UniFrac:'])
    if L == 2:
        output_matrix.append(['Printing results for L2-UniFrac:'])
    output_matrix.append(['Metric', 'AgglomerativeClustering', 'KMedoids'])
    RI1 = rand_score(PCoA_Samples, AgglomerativeCluster)
    RI2 = rand_score(PCoA_Samples, KMedoidsCluster)
    if verbose:
        print(f'Rand Index Score: {RI1}\t\t\t{RI2}')
    ARI1 = adjusted_rand_score(PCoA_Samples, AgglomerativeCluster)
    ARI2 = adjusted_rand_score(PCoA_Samples, KMedoidsCluster)
    if verbose:
        print(f'Adjusted Rand Index Score: {ARI1}\t\t\t{ARI2}')
    NMI1 = normalized_mutual_info_score(PCoA_Samples, AgglomerativeCluster)
    NMI2 = normalized_mutual_info_score(PCoA_Samples, KMedoidsCluster)
    if verbose:
        print(f'Normalized Mutual Index Score: {NMI1}\t\t\t{NMI2}')
    AMI1 = adjusted_mutual_info_score(PCoA_Samples, AgglomerativeCluster)
    AMI2 = adjusted_mutual_info_score(PCoA_Samples, KMedoidsCluster)
    if verbose:
        print(f'Adjusted Mutual Info Score: {AMI1}\t\t\t{AMI2}')
    FM1 = fowlkes_mallows_score(PCoA_Samples, AgglomerativeCluster)
    FM2 = fowlkes_mallows_score(PCoA_Samples, KMedoidsCluster)
    if verbose:
        print(f'Fowlkes Mallows Score: {FM1}\t\t\t{FM2}')
    if output_file is not None:
        f.write(f'Rand Index Score: {RI1}\t\t\t{RI2}\n')
        f.write(f'Adjusted Rand Index Score: {ARI1}\t\t\t{ARI2}\n')
        f.write(f'Normalized Mutual Index Score: {NMI1}\t\t\t{NMI2}\n')
        f.write(f'Adjusted Mutual Info Score: {AMI1}\t\t\t{AMI2}\n')
        f.write(f'Fowlkes Mallows Score: {FM1}\t\t\t{FM2}\n')
    output_matrix.append(['Rand Index Score:', RI1, RI2])
    output_matrix.append(['Adjusted Rand Index Score:', ARI1, ARI2])
    output_matrix.append(['Normalized Mutual Index Score:', NMI1, NMI2])
    output_matrix.append(['Adjusted Mutual Info Score:', AMI1, AMI2])
    output_matrix.append(['Fowlkes Mallows Score:', FM1, FM2])
    return output_matrix
from time import time

import numpy as np
from scipy import ndimage
from matplotlib import pyplot as plt
from sklearn import manifold, datasets
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import confusion_matrix
from sklearn.metrics.cluster import fowlkes_mallows_score
from sklearn.preprocessing import scale

digits = datasets.load_digits(n_class=10)
X = scale(digits.data)
y = digits.target
n_samples, n_features = X.shape

X_red = manifold.SpectralEmbedding(n_components=2).fit_transform(X)
clusnew = np.zeros(len(X))
clustering = AgglomerativeClustering(linkage='ward', n_clusters=10)
t0 = time()
clustering.fit(X_red)
print("%s : %.2fs" % ('ward', time() - t0))

# Relabel the clusters via the confusion matrix so they align with the true digits.
c1 = np.zeros((10, 10))
d = np.zeros(10)
c = confusion_matrix(clustering.labels_, y)
for j in range(0, 10):
    c1[j, :] = c[:, (np.argmax(c[j, :]))]
    clusnew[clustering.labels_ == (np.argmax(c[j, :]))] = j
    d[j] = sum(c1[:, j])
c1[:, (np.argmin(d))] = -1
print('Confusion Matrix: ', c1)
print('Fowlkes Mallows Score: ', fowlkes_mallows_score(y, clusnew))
with open(fi, 'w') as outfile:
    json.dump(label_test.tolist(), outfile)
fi = os.getcwd() + "/svm/json/label_train.json"
with open(fi, 'w') as outfile:
    json.dump(label_train.tolist(), outfile)

print("\nLinear SVC: ")
Classifier = svm.SVC(kernel='linear', probability=True)
Classifier.fit(feature_train, label_train)
joblib.dump(Classifier, 'linear_2.pkl')
print("predicting..")
predict = Classifier.predict(feature_test)
print("Expected output:", label_test)
print("Predicted output:", predict)
print("Confusion Matrix:\n", metrics.confusion_matrix(label_test, predict))
print("Fowlkes Mallows Score", fowlkes_mallows_score(label_test, predict))
try:
    print("Precision Score", precision_score(label_test, predict))
    print("Recall Score", recall_score(label_test, predict))
    print("F-measure", f1_score(label_test, predict))
    # exit()
except:
    pass

print("\nRBF SVC: ")
Classifier = svm.SVC(kernel='rbf')
Classifier.fit(feature_train, label_train)
joblib.dump(Classifier, 'rbf_2.pkl')
print("predicting..")
predict = Classifier.predict(feature_test)
def compute_scores(self, x):
    self.cluster_labels = np.ndarray((x.shape[0], ))
    for i in range(0, x.shape[0], self.batch_size):
        predictions = self.kmeans.predict(x[i:(i + self.batch_size)])
        self.cluster_labels[i:(i + self.batch_size)] = predictions
        if (i + self.batch_size) > x.shape[0]:
            predictions = self.kmeans.predict(x[i:x.shape[0]])
            self.cluster_labels[i:x.shape[0]] = predictions
    confusion_matrix = cscores.contingency_matrix(self.labels_true, self.labels_pred)
    purity_score = np.sum(np.amax(confusion_matrix, axis=0)) / np.sum(confusion_matrix)
    homogeneity_score, completeness_score, v_measure_score = cscores.homogeneity_completeness_v_measure(
        self.labels_true, self.labels_pred)
    scores = [
        # ['calinski_harabasz_score', 'internal', cscores.calinski_harabasz_score(x, self.cluster_labels)],
        ['davies_bouldin_score', 'internal',
         metrics.davies_bouldin_score(x, self.cluster_labels)],
        ['silhouette_score', 'internal',
         metrics.silhouette_score(x, self.cluster_labels)],
        # ['silhouette_samples', 'internal', cscores.silhouette_samples(x, self.cluster_labels)],
        ['purity_score', 'external', purity_score],
        ['adjusted_rand_score', 'external',
         cscores.adjusted_rand_score(self.labels_true, self.labels_pred)],
        ['completeness_score', 'external', completeness_score],
        ['fowlkes_mallows_score', 'external',
         cscores.fowlkes_mallows_score(self.labels_true, self.labels_pred)],
        ['homogeneity_score', 'external', homogeneity_score],
        ['adjusted_mutual_info_score', 'external',
         cscores.adjusted_mutual_info_score(self.labels_true, self.labels_pred)],
        ['mutual_info_score', 'external',
         cscores.mutual_info_score(self.labels_true, self.labels_pred)],
        ['normalized_mutual_info_score', 'external',
         cscores.normalized_mutual_info_score(self.labels_true, self.labels_pred)],
        ['v_measure_score', 'external', v_measure_score]
    ]
    scores = pd.DataFrame(scores, columns=['name', 'type', 'score'])
    scores.to_csv(files.small_images_classes_kmeans_scores, index=False)
n_samples, n_features = X.shape
np.random.seed(0)
k = 10
labels_y = list(set(y))

print(50 * "_")
print("KMeans clustering (implementation of algo from question 1a)")
no_of_iterations = 10
dat = {i: 0 for i in range(n_samples)}
t0 = time()
k_centers, dat = k_means(X, dat, k, no_of_iterations)
y_pred1 = [value for key, value in dat.items()]
c_m1 = confusion_matrix(y, y_pred1, labels_y)
print("PROTOCOL1: The cluster predictions for 10 clusters, i.e. k = 10 are:\n", getClusterRepresentatives(c_m1, k))
print("PROTOCOL2: Confusion Matrix: \n", c_m1)
print("PROTOCOL3: Fowlkes-Mallows score:", fowlkes_mallows_score(y, y_pred1))
print("Time taken: %.2fs" % (time() - t0))

print(50 * "_")
print("KMeans clustering (using sklearn)")
clustering1 = KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=300, tol=0.0001,
                     precompute_distances='auto', verbose=0, random_state=None,
                     copy_x=True, n_jobs=1, algorithm='auto')
t01 = time()
y_pred11 = clustering1.fit_predict(X)
c_m11 = confusion_matrix(y, y_pred11)
print("PROTOCOL1: The cluster predictions for 10 clusters, i.e. k = 10 are:\n", getClusterRepresentatives(c_m11, k))
print("PROTOCOL2: Confusion Matrix: \n", c_m11)
print("PROTOCOL3: Fowlkes-Mallows score:", fowlkes_mallows_score(y, y_pred11))
print("Time taken: %.2fs" % (time() - t01))

print(50 * "_")
print("Agglomerative Clustering with Ward linkage")