def test_adjusted_mutual_info_score():
    """Check MI, expected MI and AMI (natural-log base) against known values."""
    first = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    second = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])

    # Mutual information computed straight from the label vectors.
    assert_almost_equal(
        mutual_info_score(first, second, log_base='e'), 0.41022, 5)

    # Same value when a sparse contingency table is supplied ...
    sparse_table = contingency_matrix(first, second, sparse=True)
    assert_almost_equal(
        mutual_info_score(first, second, contingency=sparse_table,
                          log_base='e'),
        0.41022, 5)

    # ... and when a dense one is supplied.
    dense_table = contingency_matrix(first, second)
    assert_almost_equal(
        mutual_info_score(first, second, contingency=dense_table,
                          log_base='e'),
        0.41022, 5)

    # Expected mutual information of the dense table.
    total = dense_table.sum()
    assert_almost_equal(
        expected_mutual_information(dense_table, total, log_base='e'),
        0.15042, 5)

    # Adjusted mutual information.
    assert_almost_equal(
        adjusted_mutual_info_score(first, second, log_base='e'), 0.27502, 5)
    # Relabelled but otherwise identical partitions score exactly 1.
    assert_equal(adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3]), 1.0)

    # Very large input: each label vector repeated 110 times.
    first_big = np.array(list(first) * 110)
    second_big = np.array(list(second) * 110)
    big_score = adjusted_mutual_info_score(first_big, second_big, log_base='e')
    # This is not accurate to more than 2 places
    assert_almost_equal(big_score, 0.37, 2)
def test_adjusted_mutual_info_score():
    """Check MI, expected MI and AMI against known reference values."""
    first = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    second = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])

    # Mutual information computed straight from the label vectors.
    assert_almost_equal(mutual_info_score(first, second), 0.41022, 5)

    # Same value when a sparse contingency table is supplied ...
    sparse_table = contingency_matrix(first, second, sparse=True)
    assert_almost_equal(
        mutual_info_score(first, second, contingency=sparse_table),
        0.41022, 5)

    # ... and when a dense one is supplied.
    dense_table = contingency_matrix(first, second)
    assert_almost_equal(
        mutual_info_score(first, second, contingency=dense_table),
        0.41022, 5)

    # Expected mutual information of the dense table.
    total = dense_table.sum()
    assert_almost_equal(
        expected_mutual_information(dense_table, total), 0.15042, 5)

    # Adjusted mutual information.
    assert_almost_equal(
        adjusted_mutual_info_score(first, second), 0.27821, 5)
    # Relabelled but otherwise identical partitions score (approximately) 1.
    relabelled = adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3])
    assert relabelled == pytest.approx(1.0)

    # Very large input: each label vector repeated 110 times.
    first_big = np.array(list(first) * 110)
    second_big = np.array(list(second) * 110)
    big_score = adjusted_mutual_info_score(first_big, second_big)
    assert_almost_equal(big_score, 0.38, 2)
def test_adjusted_mutual_info_score():
    """Check MI, expected MI and AMI against known reference values."""
    first = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    second = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])

    # Mutual information computed straight from the label vectors.
    assert_almost_equal(mutual_info_score(first, second), 0.41022, 5)

    # Same value when a sparse contingency table is supplied ...
    sparse_table = contingency_matrix(first, second, sparse=True)
    assert_almost_equal(
        mutual_info_score(first, second, contingency=sparse_table),
        0.41022, 5)

    # ... and when a dense one is supplied.
    dense_table = contingency_matrix(first, second)
    assert_almost_equal(
        mutual_info_score(first, second, contingency=dense_table),
        0.41022, 5)

    # Expected mutual information of the dense table.
    total = dense_table.sum()
    assert_almost_equal(
        expected_mutual_information(dense_table, total), 0.15042, 5)

    # Adjusted mutual information.
    assert_almost_equal(
        adjusted_mutual_info_score(first, second), 0.27502, 5)
    # Relabelled but otherwise identical partitions score exactly 1.
    assert_equal(adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3]), 1.0)

    # Very large input: each label vector repeated 110 times.
    first_big = np.array(list(first) * 110)
    second_big = np.array(list(second) * 110)
    big_score = adjusted_mutual_info_score(first_big, second_big)
    # This is not accurate to more than 2 places
    assert_almost_equal(big_score, 0.37, 2)
def Porownaj_algorytmy(data, klasy, labels, method, baza):
    """
    Compute the FM, AM and AR indices for every reference algorithm
    (everything except the author's own implementation).

    Parameters
    ----------
    data : feature matrix passed to each clustering algorithm.
    klasy : number of clusters requested from each algorithm.
    labels : reference labels the clusterings are scored against.
    method : iterable of linkage method names passed to ``linkage``.
    baza : dataset tag stored in the "Dane" column of every row.

    Returns
    -------
    pandas.DataFrame with one row per algorithm and the columns
    "FM" (Fowlkes-Mallows), "AM" (adjusted mutual information),
    "AR" (adjusted Rand) and "Dane".  The index holds the linkage method
    names followed by "genieclust", "AgglomerativeClustering", "KMeans"
    and "MeanShift".
    """
    def _score_row(predicted):
        # One output row: the three indices plus the dataset tag.
        return [fowlkes_mallows_score(labels, predicted),
                adjusted_mutual_info_score(labels, predicted),
                adjusted_rand_score(labels, predicted),
                baza]

    method = list(method)
    wektor = []

    # Hierarchical linkage algorithms.
    for name in method:
        Z = linkage(data, name)
        # cut_tree yields one row per sample; flatten it to a plain list.
        przypisania = [y for x in cluster.hierarchy.cut_tree(Z, klasy)
                       for y in x]
        wektor.append(_score_row(przypisania))

    # The remaining rows are computed in exactly the order of the index
    # labels below.  Previously the rows were appended as genieclust,
    # MeanShift, Agglomerative, KMeans while the labels read genieclust,
    # Agglomerative, KMeans, MeanShift, so three rows were mislabeled.

    # genieclust
    wynikMG = genieclust.genie.Genie(n_clusters=klasy).fit_predict(data)
    wektor.append(_score_row(wynikMG))

    # AgglomerativeClustering
    wynikFA = AgglomerativeClustering(n_clusters=klasy).fit(data).labels_
    wektor.append(_score_row(wynikFA))

    # KMeans
    wynikKM = KMeans(n_clusters=klasy, random_state=123).fit(data).labels_
    wektor.append(_score_row(wynikKM))

    # MeanShift
    # NOTE(review): the cluster count is used as the bandwidth here - confirm
    # this is intended.
    wynikCL = MeanShift(bandwidth=klasy).fit(data).labels_
    wektor.append(_score_row(wynikCL))

    # Row names follow the methods actually run instead of a hard-coded list.
    index = method + ["genieclust", "AgglomerativeClustering", "KMeans",
                      "MeanShift"]
    return pd.DataFrame(wektor, index=index,
                        columns=["FM", "AM", "AR", "Dane"])
def test_exactly_zero_info_score():
    """Check numerical stability when information is exactly zero."""
    for i in np.logspace(1, 4, 4).astype(int):
        # A single cluster versus one cluster per sample.
        labels_a, labels_b = (np.ones(i, dtype=int), np.arange(i, dtype=int))
        assert normalized_mutual_info_score(labels_a, labels_b) == 0.0
        assert v_measure_score(labels_a, labels_b) == 0.0
        assert adjusted_mutual_info_score(labels_a, labels_b) == 0.0
        for method in ["min", "geometric", "arithmetic", "max"]:
            # average_method is keyword-only in current scikit-learn, so it
            # must not be passed positionally.
            assert adjusted_mutual_info_score(
                labels_a, labels_b, average_method=method) == 0.0
            assert normalized_mutual_info_score(
                labels_a, labels_b, average_method=method) == 0.0
def test_exactly_zero_info_score():
    """Check numerical stability when information is exactly zero."""
    # The builtin int replaces the np.int alias, which was removed from NumPy.
    for i in np.logspace(1, 4, 4).astype(int):
        # A single cluster versus one cluster per sample.
        labels_a, labels_b = (np.ones(i, dtype=int), np.arange(i, dtype=int))
        assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
        assert_equal(v_measure_score(labels_a, labels_b), 0.0)
        assert_equal(adjusted_mutual_info_score(labels_a, labels_b), 0.0)
        for method in ["min", "geometric", "arithmetic", "max"]:
            # average_method is keyword-only in current scikit-learn, so it
            # must not be passed positionally.
            assert adjusted_mutual_info_score(
                labels_a, labels_b, average_method=method) == 0.0
            assert normalized_mutual_info_score(
                labels_a, labels_b, average_method=method) == 0.0
def clustering(label_file, embedding_file, embedding_dim, clusters):
    """Cluster stored embeddings with KMeans and score them against labels.

    The embedding file is read as raw float32 values and reshaped to rows of
    ``embedding_dim`` columns.  The label file is space-delimited; the second
    field of every row is parsed as the integer label.  Both NMI and AMI are
    printed; only the NMI is returned.
    """
    print(
        'performing kmeans clustering -------------------------------------------'
    )
    raw_values = np.fromfile(embedding_file, np.float32)
    embeddings = raw_values.reshape(-1, embedding_dim)
    #embeddings = read_embeddings_avg(embedding_file)

    node_labels = KMeans(n_clusters=clusters,
                         random_state=0).fit(embeddings).labels_

    with open(label_file, 'r') as handle:
        labels = [int(row[1]) for row in csv.reader(handle, delimiter=' ')]

    nmi_score = normalized_mutual_info_score(node_labels, labels)
    adj_score = adjusted_mutual_info_score(node_labels, labels)
    for value in (nmi_score, adj_score):
        print(value)
    return nmi_score
def show_result(self, prediction, msg):
    """Print and return clustering quality scores for ``prediction``.

    Scores are computed against ``self.train_labels``.  A banner with ``msg``
    and the confusion matrix are printed first, then each score.

    Returns a dict with the keys 'Homogeneity', 'Completeness', 'V-measure',
    'RAND' (adjusted Rand score) and 'Mutual' (adjusted mutual information).
    """
    new_line(50)
    print(msg)
    new_line(50)

    real = self.train_labels

    # Single-argument print(...) calls behave the same on Python 2 and 3;
    # the former print statements were a SyntaxError on Python 3.
    print("Confusion Matrix: ")
    print(str(confusion_matrix(real, prediction)))

    homo_score = homogeneity_score(real, prediction)
    complete_score = completeness_score(real, prediction)
    v_score = v_measure_score(real, prediction)
    rand_score = adjusted_rand_score(real, prediction)
    mutual_info = adjusted_mutual_info_score(real, prediction)

    print("Homogeneity Score: %0.3f" % homo_score)
    print("Completeness Score: %0.3f" % complete_score)
    print("V-measure: %0.3f" % v_score)
    print("Adjusted Rand Score: %0.3f" % rand_score)
    print("Adjusted Mutual Info Score: %0.3f\n" % mutual_info)

    return {
        'Homogeneity': homo_score,
        'Completeness': complete_score,
        'V-measure': v_score,
        'RAND': rand_score,
        'Mutual': mutual_info
    }
def cluster_acc(y_true, y_pred):
    """
    Calculate the accuracy of a clustering.

    Since the index of each cluster might be different in y_true and y_pred,
    this function finds the linear assignment which maximizes the accuracy.
    This means some of the clusters might remain without a matching label.

    :param y_true: ground truth labeling (1-D numpy array)
    :param y_pred: cluster ids calculated from the model (1-D numpy array of
        integers, same size as y_true)
    :return: tuple ``(acc, ami, nmi, w, y_pred_new)`` where ``acc`` is the
        accuracy under the best assignment, ``ami`` / ``nmi`` are the adjusted
        and normalized mutual information, ``w`` is the count matrix of
        (predicted cluster, ground-truth cluster index) pairs, and
        ``y_pred_new`` holds, per sample, the ground-truth cluster index
        assigned to its predicted cluster, or -1 when that cluster was left
        unassigned
    """
    assert y_pred.size == y_true.size

    y_true_unique = np.unique(y_true)
    # Column position of every sample's label inside y_true_unique.
    true_cluster_idx = np.nonzero(y_true[:, None] == y_true_unique)[1]

    D = max(y_pred.max() + 1, len(y_true_unique))  # number of clusters
    # w[p, t] counts the samples predicted as cluster p whose truth index is t.
    w = np.zeros((D, len(y_true_unique)), dtype=np.int64)
    for pred, true_idx in zip(y_pred, true_cluster_idx):
        w[pred, true_idx] += 1

    # Maximising matched counts == minimising (w.max() - w).
    ind = linear_assignment(w.max() - w)

    # Map each assigned predicted cluster to its ground-truth index once,
    # instead of scanning `ind` per sample and storing a length-1 array into
    # a scalar slot.  Clusters without a partner map to -1.
    assigned = {int(row): int(col) for row, col in ind}
    y_pred_new = np.array([assigned.get(int(p), -1) for p in y_pred],
                          dtype=int)

    acc = sum([w[i, j] for i, j in ind]) * 1.0 / y_pred.size
    ami = adjusted_mutual_info_score(y_true, y_pred)
    nmi = normalized_mutual_info_score(y_true, y_pred)
    return acc, ami, nmi, w, y_pred_new
def get_evaluation_metric_value(m_predicted_Y_file_path, m_actual_Y_file_path, m_evaluation_metric):
    """Load predicted/actual values from two files and score them.

    The scorer is chosen by ``m_evaluation_metric``; any value other than
    ACCURACY, PRECISION, RECALL, F1_SCORE or ADJUSTED_MUTUTAL_INFO falls back
    to mean squared error.  Any exception raised while importing or running
    the scorer is re-raised as ``Exception("SKLEARN_ERROR", original)``.
    """
    print(m_predicted_Y_file_path, m_actual_Y_file_path)
    # NOTE(review): the helper is named "pred_and_actual" but its result is
    # unpacked as (y_true, y_pred) - confirm the order, it matters for
    # precision and recall.
    y_true, y_pred = get_pred_and_actual_y_arrays(m_predicted_Y_file_path, m_actual_Y_file_path)

    try:
        # Each branch lazily imports only the scorer it needs.
        if m_evaluation_metric == EvaluationMetricType.ACCURACY:
            from sklearn.metrics import accuracy_score as scorer
        elif m_evaluation_metric == EvaluationMetricType.PRECISION:
            from sklearn.metrics import precision_score as scorer  #TODO
        elif m_evaluation_metric == EvaluationMetricType.RECALL:
            from sklearn.metrics import recall_score as scorer  #TODO
        elif m_evaluation_metric == EvaluationMetricType.F1_SCORE:
            from sklearn.metrics import f1_score as scorer  #TODO
        elif m_evaluation_metric == EvaluationMetricType.ADJUSTED_MUTUTAL_INFO:
            from sklearn.metrics.cluster import adjusted_mutual_info_score as scorer
        else:
            from sklearn.metrics import mean_squared_error as scorer
        return scorer(y_true, y_pred)
    except Exception as e:
        raise Exception("SKLEARN_ERROR", e)
def plot_MI_distance(data, ax, marker, dmax=100, method='NMI'):
    """Plot, on log-log axes, the mutual information between column 0 and
    every later column of ``data``.

    Parameters
    ----------
    data : 2-D array, or a 1-D array whose length is a multiple of ``dmax``
        (it is then reshaped to rows of ``dmax`` columns).
    ax : axes object providing ``plot``.
    marker : format string forwarded to ``ax.plot``.
    dmax : row length used only to reshape 1-D input; for 2-D input the
        number of columns of ``data`` is used instead.
    method : 'NMI', 'AMI' or 'self_NMI' - selects the scoring function.

    Returns
    -------
    The first line object returned by ``ax.plot``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names (previously this
        surfaced as an UnboundLocalError on ``MIs``).
    """
    if method == 'NMI':
        score = normalized_mutual_info_score
    elif method == 'AMI':
        score = adjusted_mutual_info_score
    elif method == 'self_NMI':
        score = MI
    else:
        raise ValueError(
            "Unknown method %r; expected 'NMI', 'AMI' or 'self_NMI'" % (method,))

    if len(data.shape) == 1:
        assert (len(data) % dmax == 0)
        data = data.reshape(len(data) // dmax, dmax)
    N, dmax = data.shape
    print(dmax)

    # Entry 0 (column 0 against itself) is left at zero and never plotted.
    MIs = np.zeros(dmax)
    for d in range(1, dmax):
        MIs[d] = score(data[:, 0], data[:, d])

    # return ax.plot(range(1,dmax), MIs[1:dmax],'o')
    return ax.plot(np.log10(range(1, dmax)), np.log10(MIs[1:dmax]), marker)[0]
def correlation(self, X, Y, heatmap=False):
    """Cluster ``X`` with KMeans and report agreement with labels ``Y``.

    The number of clusters equals the number of distinct values in ``Y``.
    NMI, ARI, purity, AMI and homogeneity are printed.  When ``heatmap`` is
    true, samples are regrouped class by class (classes are expected to be
    the integers 0..nb_classes-1, which the assertion checks), their cosine
    similarity matrix is drawn, and the figure is saved as
    'heatmap_<self.name>_dim<n_features>.png'.
    """
    nb_classes = len(set(Y))
    # print(...) with one argument works on both Python 2 and Python 3.
    print(nb_classes)
    km = KMeans(n_clusters=nb_classes, random_state=0).fit(X)
    label_kmeans = km.labels_
    purity = metric.compute_purity(label_kmeans, Y)
    nmi = normalized_mutual_info_score(Y, label_kmeans)
    ari = adjusted_rand_score(Y, label_kmeans)
    homogeneity = homogeneity_score(Y, label_kmeans)
    ami = adjusted_mutual_info_score(Y, label_kmeans)
    print('NMI = {}, ARI = {}, Purity = {},AMI = {}, Homogeneity = {}'.
          format(nmi, ari, purity, ami, homogeneity))

    if heatmap:
        x_ticks = [''] * len(Y)
        y_ticks = [''] * len(Y)
        idx = []
        for i in range(nb_classes):
            # Indices of class i, computed once (formerly built twice).
            idx += [j for j, item in enumerate(Y) if item == i]
            # Label the last sample of each class block on the x axis.
            x_ticks[len(idx) - 1] = str(i)
        assert len(idx) == len(Y)
        X = X[idx, :]
        #similarity_mat = pairwise_distances(X,metric='cosine')
        similarity_mat = cosine_similarity(X)
        #sns.heatmap(similarity_mat,cmap='Blues')
        _, ax = plt.subplots()
        #ax.set_yticks(range(len(y_ticks)))
        ax.set_yticklabels(y_ticks)
        ax.set_xticks(range(len(x_ticks)))
        ax.set_xticklabels(x_ticks)
        im = ax.imshow(similarity_mat, cmap='Blues')
        plt.colorbar(im)
        plt.savefig('heatmap_%s_dim%d.png' % (self.name, X.shape[1]), dpi=600)
def elem_clustering(self,X,X_dec,y):
    """Split per-element errors into two KMeans clusters and, when the
    clusters reproduce ``y`` perfectly, derive a classification threshold.

    Returns ``(mse, thresh, clst, clf)``; ``thresh`` and ``clf`` are None
    when AMI and ARI are not both exactly 1.0.
    """
    mse = self.elem_calc_mse(X, X_dec)
    errors_column = mse[:, np.newaxis]

    # Refit until cluster 0 is the one with the smaller centre.
    while True:
        clst = KMeans(n_clusters=2).fit(errors_column)
        if clst.cluster_centers_[0] < clst.cluster_centers_[1]:
            break

    ami = adjusted_mutual_info_score(y, clst.labels_)
    ari = adjusted_rand_score(y, clst.labels_)
    print(
        "adjusted_mutual_info_score : {0}\nadjusted_rand_score : {1}"
        .format(ami, ari)
    )

    if not (ami == 1.0 and ari == 1.0):
        print ("Cannot define classification border threshold!!")
        return mse, None, clst, None

    # Threshold is the mean of the two cluster centres.
    thresh = sum(clst.cluster_centers_) / clst.n_clusters
    clf = KNeighborsClassifier(n_neighbors=2).fit(errors_column, y)
    print(
        "Classification border was formed.\nCluster center = {0}\nClassification border threshold = {1}"
        .format(clst.cluster_centers_, thresh))
    return mse, thresh, clst, clf
def calculate_NMI(cluster_assignments, true_classes):
    """
    Calculate the NMI (normalized mutual information) metric.

    Let C denote the set of clusters obtained from the ground truth and C'
    the set obtained from an algorithm. Their mutual information MI(C, C')
    is defined as:

        MI(C, C') = sum_{ci in C, cj' in C'}
                        p(ci, cj') * log2( p(ci, cj') / (p(ci) * p(cj')) )

    where p(ci) and p(cj') are the probabilities that a data sample
    arbitrarily selected from the data set belongs to the clusters ci and
    cj', respectively, and p(ci, cj') is the joint probability that the
    sample belongs to both ci and cj' at the same time. The NMI is then:

        NMI(C, C') = MI(C, C') / max(H(C), H(C'))

    where H(C) and H(C') are the entropies of C and C'. NMI ranges from 0
    to 1: NMI = 1 if the two sets of clusters are identical, and NMI = 0 if
    the two sets are independent. The ratio does not depend on the base of
    the logarithm.

    Args:
        cluster_assignments (numpy array): cluster ids indicating the
            clustering assignment of each data point, in data set order
        true_classes (numpy array): class ids indicating the true label of
            each data point, in data set order

    Returns:
        A number between 0 and 1.
    """
    # This function used to return the *adjusted* mutual information, which
    # is a different metric (it can be negative) and contradicts both the
    # function name and the definition above.  Imported locally so the fix
    # does not depend on the module's import block.
    from sklearn.metrics import normalized_mutual_info_score

    return normalized_mutual_info_score(
        cluster_assignments, true_classes, average_method='max')
def ami(true_list, pred_list):
    """
    Return the adjusted mutual information (mutual information corrected
    for agreement occurring by chance) between the true and the predicted
    alignment.  Every entry of both inputs is converted with ``int`` first.
    """
    true_ints = list(map(int, true_list))
    pred_ints = list(map(int, pred_list))
    return adjusted_mutual_info_score(true_ints, pred_ints)
def calculate_scores(self):
    """Compute the clustering scores and store them on the instance.

    ``self.c`` is scored against ``self.labels`` for the label-based
    measures, and against the data ``self.x`` for the silhouette score.
    Purity values come from ``self.__purity__()``.
    """
    points = self.x
    assigned = self.c
    reference = self.labels

    self.v_measure = v_measure_score(assigned, reference)
    self.complete = completeness_score(assigned, reference)
    self.adjusted_mutual = adjusted_mutual_info_score(assigned, reference)
    self.adjusted_rand = adjusted_rand_score(assigned, reference)
    self.silhouette = silhouette_score(points, assigned)

    purity_pair = self.__purity__()
    self.purity, self.partial_purity = purity_pair
def cluster_eval(labels_true, labels_infer):
    """Score an inferred clustering against the true labels.

    Returns the tuple ``(nmi, ari, homogeneity)``.

    Purity and adjusted mutual information used to be computed here as well,
    but were only consumed by a commented-out print and never returned, so
    that work has been dropped.
    """
    nmi = normalized_mutual_info_score(labels_true, labels_infer)
    ari = adjusted_rand_score(labels_true, labels_infer)
    homogeneity = homogeneity_score(labels_true, labels_infer)
    return nmi, ari, homogeneity
def print_scores(labels, predicted, svd):
    """Print five clustering scores, record them, and return homogeneity.

    The scores (homogeneity, completeness, V-measure, adjusted Rand, adjusted
    mutual information) are appended as one list to the module-level
    ``svd_all`` when ``svd`` is truthy, otherwise to ``nmf_all``.
    """
    # Each score is computed once and reused for both printing and recording
    # (previously every metric was evaluated twice, homogeneity three times).
    homogeneity = homogeneity_score(labels, predicted)
    completeness = completeness_score(labels, predicted)
    v_measure = v_measure_score(labels, predicted)
    rand = adjusted_rand_score(labels, predicted)
    mutual_info = adjusted_mutual_info_score(labels, predicted)

    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print("Homogeneity: " + str(homogeneity))
    print("completeness: " + str(completeness))
    print("V-measure: " + str(v_measure))
    print("RAND score: " + str(rand))
    print("Mutual Info: " + str(mutual_info))

    ret = [homogeneity, completeness, v_measure, rand, mutual_info]
    if svd:
        svd_all.append(ret)
    else:
        nmf_all.append(ret)
    return homogeneity
def test_exactly_zero_info_score():
    """Check numerical stability when information is exactly zero"""
    # The builtin int replaces the np.int alias, which was removed from NumPy.
    for i in np.logspace(1, 4, 4).astype(int):
        # A single cluster versus one cluster per sample.
        labels_a, labels_b = np.ones(i, dtype=int), np.arange(i, dtype=int)
        assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
        assert_equal(v_measure_score(labels_a, labels_b), 0.0)
        assert_equal(adjusted_mutual_info_score(labels_a, labels_b), 0.0)
        assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
def evaluateEmbeddingsKMeans(feature_data):
    """Cluster the features with KMeans and print AMI and NMI.

    ``feature_data`` must provide 'features' and 'labels'.  The number of
    clusters is the number of distinct label values ("oracle k").
    """
    X = feature_data['features']
    y = feature_data['labels']

    distinct_labels = set(y.flatten())
    clusterer = KMeans(n_clusters=len(distinct_labels), random_state=7123)
    kmeans_preds = clusterer.fit_predict(X)

    NMI = normalized_mutual_info_score(kmeans_preds, y)
    AMI = adjusted_mutual_info_score(kmeans_preds, y)
    for template, value in (("KMeans (w/ oracle k) AMI Score: {}", AMI),
                            ("KMeans (w/ oracle k) NMI Score: {}", NMI)):
        print(template.format(value))
def calc_performance_score(self, algo_type: str, predicted, y_train):
    """Print homogeneity, completeness and adjusted mutual information of
    ``predicted`` against ``y_train``, each prefixed with ``algo_type`` and
    formatted to two decimals."""
    homogeneity = homogeneity_score(y_train, predicted)
    completeness = completeness_score(y_train, predicted)
    adjusted_mi = adjusted_mutual_info_score(y_train, predicted)

    report = (
        (' homo_score ', homogeneity),
        (' complete_socre ', completeness),
        (' adjusted_mute_info_score ', adjusted_mi),
    )
    for tag, value in report:
        print(algo_type + tag + "{:.2f}".format(value))
def get_landmarking(dataset_name, df):
    """Compute landmarking meta-features for one dataset.

    A sample of ``df`` is labeled with ``get_dbscan`` and, separately, with
    ``get_Kmeans``; several simple classifiers are then trained to predict
    the first labeling, and their test accuracies plus agreement scores
    between the two labelings are stored under 'LM<n>' keys.

    Returns ``(record, (n_rows, n_columns, elapsed_seconds))`` where
    ``record`` also carries 'dataset' (``dataset_name`` up to its first '.').

    NOTE(review): no random seed is set anywhere in this function, so the
    sample, the split and the random stump presumably vary between runs -
    confirm whether that is intended.
    """
    start = time.time()
    record = {'dataset': dataset_name.split('.')[0]}
    results = []  # never used below
    # 10% of the rows when there are more than 400 rows, otherwise at most 40.
    n_samples = int(len(df)*0.1) if len(df) > 400 else min(df.shape[0], 40)
    data = df.sample(n=n_samples, replace=False)
    labels = get_dbscan(data)
    # The number of distinct first-labeling values is forwarded to get_Kmeans.
    k = len(np.unique(labels))
    labels2 = get_Kmeans(data, k, 40)
    # An unrestricted tree is fitted only to find the least important feature.
    full_tree = DecisionTreeClassifier()
    full_tree.fit(data, labels)
    worst_attr = np.argmin(full_tree.feature_importances_)
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3)
    best_stump = DecisionTreeClassifier(max_depth=1)
    random_stump = DecisionTreeClassifier(splitter="random", max_depth=1)
    worst_stump = DecisionTreeClassifier(max_depth=1)
    elite_knn = KNeighborsClassifier(n_neighbors=1)
    one_knn = KNeighborsClassifier(n_neighbors=1, algorithm="auto", weights="uniform", p=2, metric="minkowski")
    nb = GaussianNB()
    lda = LinearDiscriminantAnalysis()  # constructed but never fitted (see disabled lines)
    best_stump.fit(X_train, y_train)
    random_stump.fit(X_train, y_train)
    # The worst stump only sees the single least-important column.
    worst_stump.fit(X_train.iloc[:, worst_attr].values.reshape(-1, 1), y_train)
    elite_knn.fit(X_train, y_train)
    one_knn.fit(X_train, y_train)
    # lda.fit(X_train, y_train)
    nb.fit(X_train, y_train)
    # Dataset size features use the full frame, not the sample.
    record['LM1'] = np.log2(df.shape[0])
    record['LM2'] = np.log2(df.shape[1])
    record['LM3'] = accuracy_score(best_stump.predict(X_test), y_test)
    # record['LM4'] = f1_score(best_stump.predict(X_test), y_test, average='weighted')
    record['LM5'] = accuracy_score(random_stump.predict(X_test), y_test)
    # record['LM6'] = f1_score(random_stump.predict(X_test), y_test, average='weighted')
    # record['LM7'] = model.inertia_
    record['LM8'] = accuracy_score(elite_knn.predict(X_test), y_test)
    # record['LM9'] = f1_score(elite_knn.predict(X_test), y_test, average='weighted')
    # record['LM10'] = accuracy_score(lda.predict(X_test), y_test)
    # record['LM11'] = f1_score(lda.predict(X_test), y_test, average='weighted')
    record['LM12'] = accuracy_score(nb.predict(X_test), y_test)
    # record['LM13'] = f1_score(nb.predict(X_test), y_test, average='weighted')
    record['LM14'] = accuracy_score(one_knn.predict(X_test), y_test)
    # record['LM15'] = f1_score(one_knn.predict(X_test), y_test, average='weighted')
    record['LM16'] = accuracy_score(worst_stump.predict(X_test.iloc[:, worst_attr].values.reshape(-1, 1)), y_test)
    # record['LM17'] = f1_score(worst_stump.predict(X_test.iloc[:, worst_attr].values.reshape(-1, 1)), y_test, average='weighted')
    # Agreement between the two labelings of the sampled data.
    record['LM18'] = adjusted_rand_score(labels, labels2)
    record['LM19'] = adjusted_mutual_info_score(labels, labels2)
    record['LM20'] = completeness_score(labels, labels2)
    record['LM21'] = fowlkes_mallows_score(labels, labels2)
    end = time.time()
    return record, (df.shape[0], df.shape[1], end-start)
def print_scores(labels, predicted):
    """Print the confusion matrix and five clustering scores.

    Returns the scores as a list in the order homogeneity, completeness,
    V-measure, adjusted Rand score, adjusted mutual information.
    """
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print("Contingency: ")
    print(str(confusion_matrix(labels, predicted)))

    # Each score is computed once and reused for printing (previously every
    # metric was evaluated twice).
    homogeneity = homogeneity_score(labels, predicted)
    completeness = completeness_score(labels, predicted)
    v_measure = v_measure_score(labels, predicted)
    rand = adjusted_rand_score(labels, predicted)
    mutual_info = adjusted_mutual_info_score(labels, predicted)
    ret = [homogeneity, completeness, v_measure, rand, mutual_info]

    print("Homogeneity: " + str(homogeneity))
    print("completeness: " + str(completeness))
    print("V-measure: " + str(v_measure))
    print("RAND score: " + str(rand))
    print("Mutual Info: " + str(mutual_info))
    return ret
def validate( measure, classes, clustering ):
    """Score ``clustering`` against ``classes`` with the named measure.

    ``measure`` is "nmi", "ami" or "ari".  Any other value is logged as an
    error and None is returned.
    """
    if measure not in ("nmi", "ami", "ari"):
        log.error("Unknown validation measure: %s" % measure )
        return None
    if measure == "nmi":
        score = normalized_mutual_info_score( classes, clustering )
    elif measure == "ami":
        score = adjusted_mutual_info_score( classes, clustering )
    else:
        score = adjusted_rand_score( classes, clustering )
    return score
def five_measure_scores(label_true, label_pred):
    """Print homogeneity, completeness, adjusted Rand, V-measure and
    adjusted mutual information of ``label_pred`` against ``label_true``,
    one per line, in that order."""
    report = (
        ("Homogeneity_score = %f", homogeneity_score),
        ("Completeness_score = %f", completeness_score),
        ("Adjusted_rand_score = %f", adjusted_rand_score),
        ("V_measure_score = %f", v_measure_score),
        ("Adjusted_mutual_info_score = %f", adjusted_mutual_info_score),
    )
    for template, scorer in report:
        print(template % scorer(label_true, label_pred))
def adjusted_mutual_info(comm1, comm2):
    """
    Adjusted mutual information between two community assignments.

    comm1: community 1
    comm2: community 2
    Each object's ``membership`` attribute is passed on as the list of
    community labels, where index i is the integer community label of node i.
    """
    first_labels = comm1.membership
    second_labels = comm2.membership
    return adjusted_mutual_info_score(first_labels, second_labels)
def test_exactly_zero_info_score():
    """Check numerical stability when information is exactly zero"""
    # np.logspace yields floats; array sizes must be integers, so cast first.
    # The builtin int also replaces the np.int alias removed from NumPy.
    for i in np.logspace(1, 4, 4).astype(int):
        # A single cluster versus one cluster per sample.
        labels_a, labels_b = np.ones(i, dtype=int),\
            np.arange(i, dtype=int)
        assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
        assert_equal(v_measure_score(labels_a, labels_b), 0.0)
        assert_equal(adjusted_mutual_info_score(labels_a, labels_b), 0.0)
        assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
def compute_nmi(preprocessed_brain, transformation_matrix):
    """Score the alignment of the warped histology image against the MR data.

    All MR volumes listed in ``preprocessed_brain.file_paths`` are summed,
    the histology image at ``preprocessed_brain.histo_path`` is averaged over
    its third axis and warped with ``transformation_matrix`` to the MR shape,
    and both are truncated to integers and flattened before scoring.

    NOTE(review): despite the name, the value returned is the *adjusted*
    mutual information score, not a normalized one - confirm which is wanted.
    """
    # np.asanyarray(img.dataobj) is nibabel's documented replacement for the
    # deprecated img.get_data() and keeps the same array type.
    ref = np.asanyarray(nib.load(preprocessed_brain.file_paths["t2s"]).dataobj)
    sum_of_mr = np.zeros(ref.shape, dtype=ref.dtype)
    for modality in preprocessed_brain.file_paths:
        sum_of_mr += np.asanyarray(
            nib.load(preprocessed_brain.file_paths[modality]).dataobj)
    # NaN is the only value not equal to itself: zero out NaNs.
    sum_of_mr[sum_of_mr != sum_of_mr] = 0

    grayscale_histo = np.mean(np.array(Image.open(preprocessed_brain.histo_path)), axis=2)
    w_grayscale_histo = warp(grayscale_histo, transformation_matrix, output_shape=sum_of_mr.shape)

    return adjusted_mutual_info_score(np.ravel(w_grayscale_histo.astype(int)),
                                      np.ravel(sum_of_mr.astype(int)))
def print_stats(x, y, quiet=True):
    """Return ``(ari, ami, fms)`` for the two labelings ``x`` and ``y``.

    Unless ``quiet`` is true, each score is also written to standard error.
    """
    scores = (
        adjusted_rand_score(x, y),
        adjusted_mutual_info_score(x, y),
        fowlkes_mallows_score(x, y),
    )
    if not quiet:
        templates = ("ARI: {}", "AMI: {}", "FMS: {}")
        for template, value in zip(templates, scores):
            print(template.format(value), file=sys.stderr)
    return scores
def get_mutual_info(self, outcome_dict):
    """Gets mutual information given hash strings and outcomes using sklearn
    function.

    ``outcome_dict`` maps each hash string to an iterable of outcomes.  Every
    outcome is paired with the hash it came from, so both label sequences
    handed to sklearn always have the same length.  Previously the hashes
    were passed once each while the outcomes were flattened, which only
    lined up when every hash had exactly one outcome.

    NOTE(review): pairing each outcome with its own hash is assumed to be the
    intended semantics - confirm against the caller.
    """
    hashes = []
    outcomes = []
    for hash_string, outcome_values in outcome_dict.items():
        for outcome in outcome_values:
            hashes.append(hash_string)
            outcomes.append(outcome)
    return adjusted_mutual_info_score(hashes, outcomes)
def correlation(self, X, Y, heatmap=False):
    """Cluster ``X`` with KMeans and print its agreement with labels ``Y``.

    The number of clusters equals the number of distinct values in ``Y``.
    ``heatmap`` is accepted but not used by this implementation.
    """
    nb_classes = len(set(Y))
    print(nb_classes)

    label_kmeans = KMeans(n_clusters=nb_classes, random_state=0).fit(X).labels_

    purity = metric.compute_purity(label_kmeans, Y)
    nmi = normalized_mutual_info_score(Y, label_kmeans)
    ari = adjusted_rand_score(Y, label_kmeans)
    homogeneity = homogeneity_score(Y, label_kmeans)
    ami = adjusted_mutual_info_score(Y, label_kmeans)

    summary = 'NMI = {}, ARI = {}, Purity = {},AMI = {}, Homogeneity = {}'.format(
        nmi, ari, purity, ami, homogeneity)
    print(summary)
def compute_score(predict_labels, labels, verbose=0):
    """Compute clustering scores.

    Returns ``[ari, ami, nmi]``; when ``verbose`` equals 1 the three values
    are also printed under a small header.
    """
    scorers = (adjusted_rand_score,
               adjusted_mutual_info_score,
               normalized_mutual_info_score)
    scores = [scorer(predict_labels, labels) for scorer in scorers]
    if verbose == 1:
        print('-'*30)
        print('ari\tami\tnmi')
        print('{:.4f} {:.4f} {:.4f}'.format(*scores))
    return scores
def Porownaj_algorytmy2(data, klasy, labels, baza):
    """
    Compute the FM, AM and AR indices for the author's own algorithm.

    Returns a one-row DataFrame indexed "Moj" with the columns "FM", "AM",
    "AR" and "Dane" (the dataset tag ``baza``).
    """
    # The author's own algorithm.
    wynikM = spectral_clustering(data, k=klasy, M=5)
    wiersz = [
        fowlkes_mallows_score(labels, wynikM),
        adjusted_mutual_info_score(labels, wynikM),
        adjusted_rand_score(labels, wynikM),
        baza,
    ]
    return pd.DataFrame([wiersz], index=["Moj"],
                        columns=["FM", "AM", "AR", "Dane"])
def evaluate( self, partition, clustered_ids ):
    """Compare ``partition`` with the known classes of ``clustered_ids``.

    Returns a dict with "external-nmi", "external-ami" and "external-ari",
    or an empty dict when ``self.has_class_info()`` is false.
    """
    # no class info?
    if not self.has_class_info():
        return {}

    # Look up each clustered id's class to get two comparable clusterings.
    classes_subset = np.array(
        [self.class_map[clustered_ids[row]]
         for row in range(len(clustered_ids))],
        dtype=float)

    return {
        "external-nmi": normalized_mutual_info_score( classes_subset, partition ),
        "external-ami": adjusted_mutual_info_score( classes_subset, partition ),
        "external-ari": adjusted_rand_score( classes_subset, partition ),
    }
def sklearn_measures(U, V):
    """Compare two clusterings given as membership matrices.

    Labels are taken as the column index of each non-zero entry of ``U`` and
    ``V``.  Entropies and the scores are printed, and the result is returned
    as ``[names, values]`` with names ['ari', 'nmi', 'ami', 'vm'].
    """
    # http://scikit-learn.org/stable/modules/classes.html#clustering-metrics
    import sklearn.metrics.cluster as sym
    U_labels = np.nonzero(U)[1]
    V_labels = np.nonzero(V)[1]
    # The print statements are written as single-argument print(...) calls
    # with explicit formatting so the output is the same on Python 2 and 3.
    print("%s %s" % (U_labels, V_labels))
    # V2_labels = np.nonzero(V2)[1]
    # NOTE(review): the value labeled 'entro(U,V)=' is mutual_info_score.
    print("entro(U)= %s entro(V)= %s entro(U,V)= %s" % (
        sym.entropy(U_labels),
        sym.entropy(V_labels),
        sym.mutual_info_score(U_labels, V_labels)))
    res = [['ari', 'nmi', 'ami', 'vm'],
           [sym.adjusted_rand_score(U_labels, V_labels),
            sym.normalized_mutual_info_score(U_labels, V_labels),
            sym.adjusted_mutual_info_score(U_labels, V_labels),
            sym.v_measure_score(U_labels, V_labels)]]
    print(res)
    return res
def crossvalidate(profiles, true_group_name, holdout_group_name=None,
                  train=NNClassifier, distance='cosine'):
    """Hold-out cross-validation of a profile classifier.

    Profiles without a true label are dropped (``profiles.data`` and
    ``profiles._keys`` are modified in place).  For every hold-out group a
    model is trained on the remaining profiles and used to classify the
    held-out ones.  When ``holdout_group_name`` is not given, each profile is
    its own hold-out group.

    Returns ``(nmi_score, ami_score)`` between true and predicted label
    indices over all held-out profiles.
    """
    profiles.assert_not_isnan()
    keys = profiles.keys()
    true_labels = profiles.regroup(true_group_name)
    # Keep only the profiles that have a true label.
    profiles.data = np.array([d for k, d in zip(keys, profiles.data)
                              if tuple(k) in true_labels])
    profiles._keys = [k for k in keys if tuple(k) in true_labels]
    keys = profiles.keys()

    labels = list(set(true_labels.values()))
    # labels are unique, so a dict gives the same positions as labels.index
    # without a linear scan per lookup.
    label_index = {label: i for i, label in enumerate(labels)}

    if holdout_group_name:
        holdouts = profiles.regroup(holdout_group_name)
    else:
        holdouts = dict((k, k) for k in keys)

    true_indices = []
    pred_indices = []
    for ho in set(holdouts.values()):
        test_set_mask = np.array([tuple(holdouts[k]) == ho for k in keys],
                                 dtype=bool)
        training_features = profiles.data[~test_set_mask, :]
        training_labels = [label_index[true_labels[tuple(k)]]
                           for k, m in zip(keys, ~test_set_mask) if m]

        model = train(training_features, training_labels, distance=distance)
        for k, f, m in zip(keys, profiles.data, test_set_mask):
            if not m:
                continue
            # Look the key up as a tuple, like every other true_labels access
            # in this function (this one used the raw key before).
            true = true_labels[tuple(k)]
            predicted = labels[model.classify(f)]
            true_indices.append(label_index[true])
            pred_indices.append(label_index[predicted])

    true_indices = np.array(true_indices)
    pred_indices = np.array(pred_indices)

    nmi_score = normalized_mutual_info_score(true_indices, pred_indices)
    ami_score = adjusted_mutual_info_score(true_indices, pred_indices)
    return nmi_score, ami_score
# Affinity-propagation comparison: cluster the raw data and several
# aggregated variants, then score each clustering against `labels`.

# One clusterer for raw feature data, one for precomputed affinities.
clusterer_ap = cluster.AffinityPropagation()
clusterer_agg_ap = cluster.AffinityPropagation(affinity="precomputed")

cluster_ap = clusterer_ap.fit_predict(data)
# The same precomputed-affinity clusterer is refitted for every variant;
# NOTE(review): each data_agg* is presumably a square affinity matrix - confirm.
cluster_agg_ap = clusterer_agg_ap.fit_predict(data_agg)
cluster_agg_ap2 = clusterer_agg_ap.fit_predict(data_agg2)
cluster_agg_ap4 = clusterer_agg_ap.fit_predict(data_agg4)
cluster_agg_ap4_w = clusterer_agg_ap.fit_predict(data_agg4_w)
cluster_agg_ap4_ws = clusterer_agg_ap.fit_predict(data_agg4_ws)
cluster_agg_ap4_just_season = clusterer_agg_ap.fit_predict(data_agg4_just_season)
cluster_agg_ap4_just_leaf = clusterer_agg_ap.fit_predict(data_agg4_just_leaf)
cluster_agg_ap4_just_seed = clusterer_agg_ap.fit_predict(data_agg4_just_seed)
cluster_agg_ap4_just_weather = clusterer_agg_ap.fit_predict(data_agg4_just_weather)

# Adjusted mutual information for the raw and the first aggregated clustering.
# NOTE(review): the name `mutual_info_score` would shadow sklearn's function of
# the same name if that is imported in this module - verify.
mutual_info_score = adjusted_mutual_info_score(labels,cluster_ap)
mutual_info_score_agg = adjusted_mutual_info_score(labels,cluster_agg_ap)

# (homogeneity, completeness, V-measure) for each clustering; note there is
# no v_score for cluster_agg_ap itself.
v_score = homogeneity_completeness_v_measure(labels,cluster_ap)
v_score_agg2 = homogeneity_completeness_v_measure(labels,cluster_agg_ap2)
v_score_agg4 = homogeneity_completeness_v_measure(labels,cluster_agg_ap4)
v_score_agg4_w = homogeneity_completeness_v_measure(labels,cluster_agg_ap4_w)
v_score_agg4_ws = homogeneity_completeness_v_measure(labels,cluster_agg_ap4_ws)
v_score_agg4_just_season = homogeneity_completeness_v_measure(labels,cluster_agg_ap4_just_season)
v_score_agg4_just_leaf = homogeneity_completeness_v_measure(labels,cluster_agg_ap4_just_leaf)
v_score_agg4_just_seed = homogeneity_completeness_v_measure(labels,cluster_agg_ap4_just_seed)
v_score_agg4_just_weather = homogeneity_completeness_v_measure(labels,cluster_agg_ap4_just_weather)

# Only these two results are printed here.
print(v_score)
print(v_score_agg2)
def vollmers(info, threshold, reco_info=None, debug=False):
    """
    Single-linkage lineage clustering in the style of the Vollmers paper.

    From Vollmers paper:
        Lineage Clustering. IGH sequences were clustered into IGH lineages
        according to similarity in their junctional region. Lineages were
        created according to the following steps. A lineage is formed and
        populated with one IGH sequence (seed). Then, all IGH sequences in the
        lineages (initially only the seed) are compared with all other IGH
        sequences of the same length using the same V and J segments. If their
        junctional regions (untemplated nucleotides and D segments) are at
        least 90% identical, the IGH sequence is added to the lineage. This
        process is repeated until the lineage does not grow.

    NOTE I'm interpreting this to mean
      - if *any* sequence already in the cluster is 90% to the prospective
        sequence, it's added to the cluster
      - 'sequences the same length' means cdr3 the same length (entire
        sequence the same length only made sense for their primers)
      - since the 90% is on d + insertions, also have to not merge if the
        d + insertions aren't the same length

    Parameters:
        info: dict mapping each unique id to a dict with (at least) the keys
            'cdr3_length', 'v_gene', 'j_gene', 'vd_insertion', 'd_qr_seq' and
            'dj_insertion'.
        threshold: required identity fraction; two junctions match when
            utils.hamming_fraction(...) is not greater than 1 - threshold.
        reco_info: optional dict mapping unique id to a dict with key
            'reco_id' (the true cluster); when given, the adjusted mutual
            information between true and inferred clusters is computed and a
            summary line is printed.
        debug: print progress messages while clustering.

    Returns:
        (adj_mi, partition): adj_mi is -1 when reco_info is None, otherwise
        the adjusted mutual information score; partition is a list of
        clusters, each a list of unique ids.
    """
    id_clusters = {}  # map from cluster id to list of query names

    def get_d_plus_insertions(uid):
        """Junctional region: vd insertion + d segment + dj insertion."""
        return info[uid]['vd_insertion'] + info[uid]['d_qr_seq'] + info[uid]['dj_insertion']

    def from_same_lineage(cluster_id, uid):
        """True if <uid> matches at least one sequence already in the cluster."""
        for clid in id_clusters[cluster_id]:  # it only has to match one of 'em
            # same cdr3 length, v gene, and j gene
            if any(info[clid][key] != info[uid][key]
                   for key in ('cdr3_length', 'v_gene', 'j_gene')):
                continue
            cl_seq = get_d_plus_insertions(clid)
            u_seq = get_d_plus_insertions(uid)
            if len(cl_seq) != len(u_seq):
                continue
            if utils.hamming_fraction(cl_seq, u_seq) > 1. - threshold:
                continue
            return True  # if we get to here, it's a match
        return False

    def check_unclustered_seqs(clid):
        """Move every unclustered sequence that matches cluster <clid> into it."""
        still_unclustered = []
        for unique_id in unclustered_seqs:
            # a sequence is removed from the unclustered list as soon as it is
            # assigned, so it can never already be in the cluster
            assert unique_id not in id_clusters[clid]
            if from_same_lineage(clid, unique_id):
                if debug:
                    print(' adding %s' % (unique_id,))
                id_clusters[clid].append(unique_id)
            else:
                still_unclustered.append(unique_id)
        # update in place so the enclosing scope sees the shortened list
        unclustered_seqs[:] = still_unclustered

    def add_cluster(clid):
        """Seed a new cluster with the first unclustered sequence and grow it."""
        if debug:
            print(' starting cluster %d' % clid)
        id_clusters[clid] = [unclustered_seqs.pop(0)]
        while True:
            last_size = len(id_clusters[clid])
            check_unclustered_seqs(clid)
            if last_size == len(id_clusters[clid]):  # stop when cluster stops growing
                break
            if debug:
                print(' running again (%d --> %d)' % (last_size, len(id_clusters[clid])))

    # ----------------------------------------------------------------------------------------
    # the business
    # list(...) gives a real, mutable list on both python 2 and python 3
    # (on python 3 dict.keys() is a view that cannot be indexed or modified)
    unclustered_seqs = list(info)
    last_cluster_id = 0
    while unclustered_seqs:
        add_cluster(last_cluster_id)
        last_cluster_id += 1

    adj_mi = -1
    if reco_info is not None:
        true_cluster_list, inferred_cluster_list = [], []
        for clid, uids in id_clusters.items():
            for uid in uids:
                true_cluster_list.append(reco_info[uid]['reco_id'])
                inferred_cluster_list.append(clid)
        adj_mi = adjusted_mutual_info_score(true_cluster_list, inferred_cluster_list)
        print(' threshold %.2f: %d clusters (%d true) adj_mi: %.3f' % (threshold, len(set(inferred_cluster_list)), len(set(true_cluster_list)), adj_mi))

    partition = list(id_clusters.values())  # convert to list of lists (no clid info)
    return adj_mi, partition
def compute_stability_fold(samples, train, test, method='ward', max_k=None,
                           stack=False, stability=True, cv_likelihood=False,
                           corr_score=None, ground_truth=None, n_neighbors=1,
                           **kwargs):
    """
    Compute clustering-stability measures on one cross-validation fold.

    Parameters:
    -----------
    samples : list of arrays
        Samples to cluster; each array has shape (n_samples, n_features) in
        PyMVPA terminology. The features (i.e., the nodes) are what gets
        clustered.
    train : list or array
        Indices into `samples` for the training set.
    test : list or array
        Indices into `samples` for the test set.
    method : {'complete', 'gmm', 'kmeans', 'ward'}
        Clustering method. Default is 'ward'.
    max_k : int or None
        Largest k to evaluate, starting from 2. When not given, the number of
        features of the first sample is used.
    stack : bool
        If True the datasets are stacked vertically, otherwise (default) they
        are averaged.
    stability : bool
        Whether to compute the stability measure described in
        Lange et al., 2004. Default is True.
    cv_likelihood : bool
        Whether to compute the cross-validated likelihood of the mixture
        model; only valid with method 'gmm'. Default is False.
    corr_score : {'pearson','spearman'} or None
        Type of correlation score to compute, if any. Default is None.
    ground_truth : array or None
        Ground-truth clustering of the data, to compare the predicted
        solution against (e.g. for simulations).
    n_neighbors : int
        Number of neighbors of the K-nearest-neighbors classifier that
        transfers the training clustering to the test set. Only used for
        methods 'complete' and 'ward'. Default is 1.
    kwargs : optional
        Extra keyword arguments forwarded to the clustering routine (only for
        'ward' and 'gmm').

    Returns:
    --------
    A list with the following ten entries, in this order. Every array has
    shape (max_k-1,) and entry `i` refers to the solution with k = ks[i].

    ks : array
        The k of each iteration.
    ari : array
        Adjusted Rand Index between the clustering predicted on the test set
        and the clustering computed on the test set.
    ami : array
        Adjusted Mutual Information between the same two clusterings, i.e.
        ami[i] for the `k` of ks[i].
    stab : array or None
        Un-normalized stability measure of Lange et al., 2004 (normalized
        later in the process); None unless `stability` is set.
    likelihood : array or None
        Cross-validated likelihood of the GMM solution; None unless
        `cv_likelihood` is set.
    ari_gt : array or None
        Adjusted Rand Index of the predicted clustering against
        `ground_truth`; None without ground truth.
    ami_gt : array or None
        Adjusted Mutual Information of the predicted clustering against
        `ground_truth`; None without ground truth.
    stab_gt : array or None
        Stability measure of the predicted clustering against
        `ground_truth`; None unless both `stability` and `ground_truth` are
        given.
    corr : array or None
        Correlation score of predicted vs. test clustering; None unless
        `corr_score` is given.
    corr_gt : array or None
        Correlation score of predicted clustering vs. `ground_truth`; None
        unless both `corr_score` and `ground_truth` are given.
    """
    if method not in AVAILABLE_METHODS:
        raise ValueError('Method {0} not implemented'.format(method))
    if cv_likelihood and method != 'gmm':
        raise ValueError(
            "Cross-validated likelihood is only available for 'gmm' method")

    # without an explicit max_k, go up to the number of features
    if not max_k:
        max_k = samples[0].shape[1]

    n_ks = max_k - 1
    with_gt = ground_truth is not None
    with_corr = corr_score is not None

    # result containers: an array when the measure is requested, else None
    ks = np.zeros(n_ks, dtype=int)
    ari = np.zeros(n_ks)
    ami = np.zeros(n_ks)
    stab = np.zeros(n_ks) if stability else None
    likelihood = np.zeros(n_ks) if cv_likelihood else None
    corr = np.zeros(n_ks) if with_corr else None
    ari_gt = np.zeros(n_ks) if with_gt else None
    ami_gt = np.zeros(n_ks) if with_gt else None
    stab_gt = np.zeros(n_ks) if (with_gt and stability) else None
    corr_gt = np.zeros(n_ks) if (with_gt and with_corr) else None

    # assemble the training and test datasets
    train_set = [samples[idx] for idx in train]
    test_set = [samples[idx] for idx in test]
    if stack:
        train_ds = np.vstack(train_set)
        test_ds = np.vstack(test_set)
    else:
        train_ds = np.mean(np.dstack(train_set), axis=2)
        test_ds = np.mean(np.dstack(test_set), axis=2)

    # Hierarchical methods: build the full tree once here and cut it for
    # every k below, which is cheaper than re-clustering per k.
    if method == 'complete':
        Y_train = complete(pdist(train_ds.T, metric='correlation'))
        Y_test = complete(pdist(test_ds.T, metric='correlation'))
    elif method == 'ward':
        (children_train, n_comp_train,
         n_leaves_train, parents_train) = ward_tree(train_ds.T, **kwargs)
        (children_test, n_comp_test,
         n_leaves_test, parents_test) = ward_tree(test_ds.T, **kwargs)
    elif method not in ('gmm', 'kmeans'):
        # 'gmm' and 'kmeans' have to be refit for each k inside the loop
        raise ValueError("We shouldn't get here")

    for i_k, k in enumerate(range(2, max_k + 1)):
        if method in ('complete', 'ward'):
            # cut both trees at k clusters
            if method == 'complete':
                train_label = cut_tree_scipy(Y_train, k)
                test_label = cut_tree_scipy(Y_test, k)
            else:
                train_label = _hc_cut(k, children_train, n_leaves_train)
                test_label = _hc_cut(k, children_test, n_leaves_test)
            # carry the training clustering over to the test set with KNN
            knn = KNeighborsClassifier(n_neighbors=n_neighbors)
            knn.fit(train_ds.T, train_label)
            prediction_label = knn.predict(test_ds.T)
        elif method == 'gmm':
            gmm = GMM(n_components=k, **kwargs)
            # fit on train, predict on test
            gmm.fit(train_ds.T)
            prediction_label = gmm.predict(test_ds.T)
            if cv_likelihood:
                log_prob = np.sum(gmm.score(test_ds.T))
            # refit on test to get the test set's own labels
            gmm.fit(test_ds.T)
            test_label = gmm.predict(test_ds.T)
        elif method == 'kmeans':
            kmeans = KMeans(n_clusters=k)
            # fit on train, predict on test
            kmeans.fit(train_ds.T)
            prediction_label = kmeans.predict(test_ds.T)
            # refit on test to get the test set's own labels
            kmeans.fit(test_ds.T)
            test_label = kmeans.predict(test_ds.T)
        else:
            raise ValueError("We shouldn't get here")

        # predicted vs. actual test clustering
        ks[i_k] = k
        ari[i_k] = adjusted_rand_score(prediction_label, test_label)
        ami[i_k] = adjusted_mutual_info_score(prediction_label, test_label)
        if stability:
            stab[i_k] = stability_score(prediction_label, test_label, k)
        if cv_likelihood:
            likelihood[i_k] = log_prob
        if with_corr:
            corr[i_k] = correlation_score(prediction_label, test_label,
                                          test_ds, corr_score)

        # predicted clustering vs. ground truth
        if with_gt:
            ari_gt[i_k] = adjusted_rand_score(prediction_label, ground_truth)
            ami_gt[i_k] = adjusted_mutual_info_score(prediction_label,
                                                     ground_truth)
            if stability:
                stab_gt[i_k] = stability_score(prediction_label,
                                               ground_truth, k)
            if with_corr:
                corr_gt[i_k] = correlation_score(prediction_label,
                                                 ground_truth, test_ds,
                                                 corr_score)

    return [ks, ari, ami, stab, likelihood,
            ari_gt, ami_gt, stab_gt, corr, corr_gt]
def __call__(self, test_label, predicted_label, **kwargs):
    """Score a predicted labelling against the test labelling.

    Returns the result of ``adjusted_mutual_info_score(test_label,
    predicted_label)``.  Any extra keyword arguments are accepted but not
    used.
    """
    score = adjusted_mutual_info_score(test_label, predicted_label)
    return score