def test_adjusted_mutual_info_score():
    """Check MI, EMI and AMI against known values, passing ``log_base='e'``.

    The mutual information is computed three ways (from the labels, from a
    sparse contingency matrix and from a dense one) and must agree with the
    same reference value each time.
    """
    labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
    # Mutual information
    mi = mutual_info_score(labels_a, labels_b, log_base='e')
    assert_almost_equal(mi, 0.41022, 5)
    # with provided sparse contingency
    C = contingency_matrix(labels_a, labels_b, sparse=True)
    mi = mutual_info_score(labels_a, labels_b, contingency=C, log_base='e')
    assert_almost_equal(mi, 0.41022, 5)
    # with provided dense contingency
    C = contingency_matrix(labels_a, labels_b)
    mi = mutual_info_score(labels_a, labels_b, contingency=C, log_base='e')
    assert_almost_equal(mi, 0.41022, 5)
    # Expected mutual information (computed from the dense contingency)
    n_samples = C.sum()
    emi = expected_mutual_information(C, n_samples, log_base='e')
    assert_almost_equal(emi, 0.15042, 5)
    # Adjusted mutual information
    ami = adjusted_mutual_info_score(labels_a, labels_b, log_base='e')
    assert_almost_equal(ami, 0.27502, 5)
    ami = adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3])
    # AMI is the result of floating-point arithmetic, so compare with a
    # tolerance instead of requiring bit-exact equality with 1.0.
    assert_almost_equal(ami, 1.0)
    # Test with a very large array
    a110 = np.array([list(labels_a) * 110]).flatten()
    b110 = np.array([list(labels_b) * 110]).flatten()
    ami = adjusted_mutual_info_score(a110, b110, log_base='e')
    # This is not accurate to more than 2 places
    assert_almost_equal(ami, 0.37, 2)
def test_adjusted_mutual_info_score():
    # Compare MI, EMI and AMI with known reference values.
    first = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    second = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
    reference_mi = 0.41022

    # MI straight from the labels
    assert_almost_equal(mutual_info_score(first, second), reference_mi, 5)

    # MI from a precomputed sparse contingency matrix
    sparse_table = contingency_matrix(first, second, sparse=True)
    assert_almost_equal(
        mutual_info_score(first, second, contingency=sparse_table),
        reference_mi, 5)

    # MI from a precomputed dense contingency matrix
    dense_table = contingency_matrix(first, second)
    assert_almost_equal(
        mutual_info_score(first, second, contingency=dense_table),
        reference_mi, 5)

    # Expected mutual information
    total = dense_table.sum()
    assert_almost_equal(
        expected_mutual_information(dense_table, total), 0.15042, 5)

    # Adjusted mutual information
    assert_almost_equal(adjusted_mutual_info_score(first, second), 0.27821, 5)
    perfect = adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3])
    assert perfect == pytest.approx(1.0)

    # Same labelings repeated 110 times
    first_x110 = np.tile(first, 110)
    second_x110 = np.tile(second, 110)
    assert_almost_equal(
        adjusted_mutual_info_score(first_x110, second_x110), 0.38, 2)
def test_adjusted_mutual_info_score():
    """Check MI, EMI and AMI against known reference values.

    The mutual information is computed three ways (from the labels, from a
    sparse contingency matrix and from a dense one) and must agree with the
    same reference value each time.
    """
    labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
    # Mutual information
    mi = mutual_info_score(labels_a, labels_b)
    assert_almost_equal(mi, 0.41022, 5)
    # with provided sparse contingency
    C = contingency_matrix(labels_a, labels_b, sparse=True)
    mi = mutual_info_score(labels_a, labels_b, contingency=C)
    assert_almost_equal(mi, 0.41022, 5)
    # with provided dense contingency
    C = contingency_matrix(labels_a, labels_b)
    mi = mutual_info_score(labels_a, labels_b, contingency=C)
    assert_almost_equal(mi, 0.41022, 5)
    # Expected mutual information (computed from the dense contingency)
    n_samples = C.sum()
    emi = expected_mutual_information(C, n_samples)
    assert_almost_equal(emi, 0.15042, 5)
    # Adjusted mutual information
    ami = adjusted_mutual_info_score(labels_a, labels_b)
    assert_almost_equal(ami, 0.27502, 5)
    ami = adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3])
    # AMI is the result of floating-point arithmetic, so compare with a
    # tolerance instead of requiring bit-exact equality with 1.0.
    assert_almost_equal(ami, 1.0)
    # Test with a very large array
    a110 = np.array([list(labels_a) * 110]).flatten()
    b110 = np.array([list(labels_b) * 110]).flatten()
    ami = adjusted_mutual_info_score(a110, b110)
    # This is not accurate to more than 2 places
    assert_almost_equal(ami, 0.37, 2)
def test_v_measure_and_mutual_information(seed=36):
    """Check relation between v_measure, entropy and mutual information.

    For several sample sizes, two random labelings are drawn and
    ``v_measure_score`` is compared (to 0 decimals) with
    ``2 * MI / (H(a) + H(b))``.

    Parameters
    ----------
    seed : int
        Seed of the ``RandomState`` that is re-created for every size.
    """
    # ``np.int`` has been removed from NumPy; the builtin ``int`` selects the
    # same dtype.
    for i in np.logspace(1, 4, 4).astype(int):
        random_state = np.random.RandomState(seed)
        # ``randint`` excludes its upper bound, so 11 keeps the inclusive
        # 0..10 range of the deprecated ``random_integers(0, 10, i)``.
        labels_a, labels_b = (random_state.randint(0, 11, i),
                              random_state.randint(0, 11, i))
        assert_almost_equal(v_measure_score(labels_a, labels_b),
                            2.0 * mutual_info_score(labels_a, labels_b) /
                            (entropy(labels_a) + entropy(labels_b)), 0)
def test_v_measure_and_mutual_information(seed=36):
    """Check relation between v_measure, entropy and mutual information.

    For several sample sizes, two random labelings are drawn and
    ``v_measure_score`` is compared (to 0 decimals) with
    ``2 * MI / (H(a) + H(b))``.

    Parameters
    ----------
    seed : int
        Seed of the ``RandomState`` that is re-created for every size.
    """
    # ``np.int`` has been removed from NumPy; the builtin ``int`` selects the
    # same dtype.
    for i in np.logspace(1, 4, 4).astype(int):
        random_state = np.random.RandomState(seed)
        labels_a, labels_b = (random_state.randint(0, 10, i),
                              random_state.randint(0, 10, i))
        assert_almost_equal(v_measure_score(labels_a, labels_b),
                            2.0 * mutual_info_score(labels_a, labels_b) /
                            (entropy(labels_a) + entropy(labels_b)), 0)
def test_int_overflow_mutual_info_score():
    # Integer-overflow check: the score must stay finite for label arrays
    # with these class counts.
    x_counts = [52632 + 2529, 14660 + 793, 3271 + 204, 814 + 39, 316 + 20]
    y_counts = [52632, 2529, 14660, 793, 3271, 204, 814, 39, 316, 20]
    # x holds the values 1..5 in runs; y alternates runs of 0 and 1.
    x = np.repeat([1, 2, 3, 4, 5], x_counts)
    y = np.repeat([0, 1] * 5, y_counts)

    score = mutual_info_score(x.ravel(), y.ravel(), log_base='e')
    assert_all_finite(score)
def test_v_measure_and_mutual_information(seed=36):
    """Check relation between v_measure, entropy and mutual information.

    For several sample sizes, two random labelings are drawn and
    ``v_measure_score`` is compared (to 0 decimals) with
    ``2 * MI / (H(a) + H(b))``.

    Parameters
    ----------
    seed : int
        Seed of the ``RandomState`` that is re-created for every size.
    """
    # ``np.logspace`` yields floats; a sample size has to be an integer, so
    # convert before using it as ``size``.
    for i in np.logspace(1, 4, 4).astype(int):
        random_state = np.random.RandomState(seed)
        # ``randint`` excludes its upper bound, so 11 keeps the inclusive
        # 0..10 range of the deprecated ``random_integers(0, 10, i)``.
        labels_a, labels_b = (random_state.randint(0, 11, i),
                              random_state.randint(0, 11, i))
        assert_almost_equal(v_measure_score(labels_a, labels_b),
                            2.0 * mutual_info_score(labels_a, labels_b) /
                            (entropy(labels_a) + entropy(labels_b)), 0)
def test_int_overflow_mutual_info_score():
    # Integer-overflow check: the score must stay finite for label arrays
    # with these class counts.
    x_counts = [52632 + 2529, 14660 + 793, 3271 + 204, 814 + 39, 316 + 20]
    y_counts = [52632, 2529, 14660, 793, 3271, 204, 814, 39, 316, 20]
    # x holds the values 1..5 in runs; y alternates runs of 0 and 1.
    x = np.repeat([1, 2, 3, 4, 5], x_counts)
    y = np.repeat([0, 1] * 5, y_counts)

    score = mutual_info_score(x.ravel(), y.ravel())
    assert_all_finite(score)
def _compute_mi(x, y, x_discrete, y_discrete, n_neighbors=3):
    """Dispatch to the MI routine matching the discreteness flags.

    Both flags set -> ``mutual_info_score``; exactly one set ->
    ``_compute_mi_cd`` with the non-discrete variable first; none set ->
    ``_compute_mi_cc``.
    """
    if x_discrete:
        if y_discrete:
            return mutual_info_score(x, y)
        # Only x is discrete: the non-discrete variable goes first.
        return _compute_mi_cd(y, x, n_neighbors)
    if y_discrete:
        return _compute_mi_cd(x, y, n_neighbors)
    return _compute_mi_cc(x, y, n_neighbors)
def test_int_overflow_mutual_info_fowlkes_mallows_score():
    # Integer-overflow check: both scores must stay finite for label arrays
    # with these class counts.
    x_counts = [52632 + 2529, 14660 + 793, 3271 + 204, 814 + 39, 316 + 20]
    y_counts = [52632, 2529, 14660, 793, 3271, 204, 814, 39, 316, 20]
    # x holds the values 1..5 in runs; y alternates runs of 0 and 1.
    x = np.repeat([1, 2, 3, 4, 5], x_counts)
    y = np.repeat([0, 1] * 5, y_counts)

    for score_func in (mutual_info_score, fowlkes_mallows_score):
        assert_all_finite(score_func(x, y))
def test_v_measure_and_mutual_information(seed=36):
    """Check relation between v_measure, entropy and mutual information.

    For several sample sizes, two random labelings are drawn and
    ``v_measure_score`` is compared (to 0 decimals) with
    ``2 * MI / (H(a) + H(b))``, where MI and both entropies are requested
    with ``log_base='e'``.

    Parameters
    ----------
    seed : int
        Seed of the ``RandomState`` that is re-created for every size.
    """
    # ``np.int`` has been removed from NumPy; the builtin ``int`` selects the
    # same dtype.
    for i in np.logspace(1, 4, 4).astype(int):
        random_state = np.random.RandomState(seed)
        labels_a, labels_b = (random_state.randint(0, 10, i),
                              random_state.randint(0, 10, i))
        v_m = v_measure_score(labels_a, labels_b)
        mi = mutual_info_score(labels_a, labels_b, log_base='e')
        h_a = entropy(labels_a, log_base='e')
        h_b = entropy(labels_b, log_base='e')
        assert_almost_equal(v_m, 2.0 * mi / (h_a + h_b), 0)
def _compute_mi(x, y, x_discrete, y_discrete, n_neighbors=3):
    """Compute mutual information between two variables.

    Thin dispatcher: picks the estimator to call from the ``x_discrete`` /
    ``y_discrete`` flags and forwards ``n_neighbors`` to the estimators that
    take it.
    """
    if not x_discrete and not y_discrete:
        return _compute_mi_cc(x, y, n_neighbors)
    if x_discrete and y_discrete:
        return mutual_info_score(x, y)
    # Exactly one variable is discrete; the other one is passed first.
    non_discrete, discrete = (y, x) if x_discrete else (x, y)
    return _compute_mi_cd(non_discrete, discrete, n_neighbors)
def test_v_measure_and_mutual_information(seed=36):
    # v_measure must agree with 2 * MI / (H(a) + H(b)) (to 0 decimals) and
    # with the arithmetic-average NMI, for several sample sizes.
    for n_labels in np.logspace(1, 4, 4).astype(int):
        rng = np.random.RandomState(seed)
        labels_a = rng.randint(0, 10, n_labels)
        labels_b = rng.randint(0, 10, n_labels)

        v_measure = v_measure_score(labels_a, labels_b)
        mi = mutual_info_score(labels_a, labels_b)
        entropy_sum = entropy(labels_a) + entropy(labels_b)
        assert_almost_equal(v_measure, 2.0 * mi / entropy_sum, 0)

        v_measure_again = v_measure_score(labels_a, labels_b)
        nmi = normalized_mutual_info_score(labels_a, labels_b,
                                           average_method='arithmetic')
        assert_almost_equal(v_measure_again, nmi)
def sklearn_measures(U, V):
    """Print and return scikit-learn clustering agreement scores for U and V.

    The labels compared are the column indices of the nonzero entries of
    ``U`` and ``V`` (``np.nonzero(...)[1]``).
    NOTE(review): this presumably expects 2-D one-hot membership matrices
    with one nonzero per row -- confirm against the callers.

    Returns
    -------
    res : list
        ``[['ari', 'nmi', 'ami', 'vm'], [ari, nmi, ami, vm]]``.
    """
    # http://scikit-learn.org/stable/modules/classes.html#clustering-metrics
    import sklearn.metrics.cluster as sym
    U_labels = np.nonzero(U)[1]
    V_labels = np.nonzero(V)[1]
    # Single-argument print(...) calls produce the same text on Python 2 and
    # Python 3; the former print statements were a SyntaxError on Python 3.
    print('%s %s' % (U_labels, V_labels))
    # V2_labels = np.nonzero(V2)[1]
    # NOTE(review): the value labelled 'entro(U,V)=' is mutual_info_score,
    # not a joint entropy.
    print('entro(U)= %s entro(V)= %s entro(U,V)= %s' % (
        sym.entropy(U_labels),
        sym.entropy(V_labels),
        sym.mutual_info_score(U_labels, V_labels)))
    res = [
        ['ari', 'nmi', 'ami', 'vm'],
        [sym.adjusted_rand_score(U_labels, V_labels),
         sym.normalized_mutual_info_score(U_labels, V_labels),
         sym.adjusted_mutual_info_score(U_labels, V_labels),
         sym.v_measure_score(U_labels, V_labels)],
    ]
    print(res)
    return res
def _ami(ab_cts, average_method='arithmetic'):
    """Adjusted mutual information computed from a table of joint counts.

    Adapted from the scikit-learn AMI implementation so that it accepts a
    counts/contingency table instead of per-instance labels:
    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_mutual_info_score.html

    Parameters
    ----------
    ab_cts : np.ndarray [len(a_classes) x len(b_classes)]
        Counts for each combination of classes of the random variables a
        and b, arranged as a rectangular array.
    average_method : str
        Forwarded to ``_generalized_average``; see the sklearn documentation
        for details.

    Returns
    -------
    ami : float
        Adjusted mutual information score for variables a and b.
    """
    n_samples = np.sum(ab_cts)

    # Marginal relative frequencies of a (row sums) and b (column sums).
    row_totals = np.sum(ab_cts, axis=1)
    a_freq = row_totals / np.sum(row_totals)
    col_totals = np.sum(ab_cts, axis=0)
    b_freq = col_totals / np.sum(col_totals)

    # MI of the two variables, taken directly from the joint counts.
    mi = mutual_info_score(None, None, contingency=ab_cts)
    # Expected value of the mutual information.
    emi = expected_mutual_information(ab_cts, n_samples)
    # Entropies of the marginals and their generalized average.
    h_true = _entropy(a_freq)
    h_pred = _entropy(b_freq)
    normalizer = _generalized_average(h_true, h_pred, average_method)

    # Keep the denominator at least machine epsilon away from zero while
    # preserving its sign.
    eps = np.finfo('float64').eps
    denominator = normalizer - emi
    denominator = (min(denominator, -eps) if denominator < 0
                   else max(denominator, eps))
    return (mi - emi) / denominator
def test_adjusted_mutual_info_score():
    """Compute the Adjusted Mutual Information and test against known values"""
    labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
    # Mutual information
    mi = mutual_info_score(labels_a, labels_b)
    assert_almost_equal(mi, 0.41022, 5)
    # Expected mutual information
    C = contingency_matrix(labels_a, labels_b)
    n_samples = np.sum(C)
    emi = expected_mutual_information(C, n_samples)
    assert_almost_equal(emi, 0.15042, 5)
    # Adjusted mutual information
    ami = adjusted_mutual_info_score(labels_a, labels_b)
    assert_almost_equal(ami, 0.27502, 5)
    ami = adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3])
    # AMI is the result of floating-point arithmetic, so compare with a
    # tolerance instead of requiring bit-exact equality with 1.0.
    assert_almost_equal(ami, 1.0)
    # Test with a very large array
    a110 = np.array([list(labels_a) * 110]).flatten()
    b110 = np.array([list(labels_b) * 110]).flatten()
    ami = adjusted_mutual_info_score(a110, b110)
    # This is not accurate to more than 2 places
    assert_almost_equal(ami, 0.37, 2)
def test_mutual_info_score_positive_constant_label(labels_true, labels_pred):
    # Non-regression test for #16355: the score must not be negative.
    score = mutual_info_score(labels_true, labels_pred)
    assert score >= 0
def test_mutual_info_score_positive_constant_label(labels_true, labels_pred):
    # MI must be exactly 0 when one or both labelings are constant.
    # Non-regression test for #16355.
    score = mutual_info_score(labels_true, labels_pred)
    assert score == 0
#
#######
## Step1. Initialization
#######
# Load the feature matrix and the class labels written earlier to
# ``output_dir`` (both names are defined outside this fragment).
f = np.loadtxt(output_dir+'/'+filename+'_dat.txt')
cls = np.loadtxt(output_dir+'/'+filename+'_cls.txt')
s=[]          # indices of the selected features
mi_stack=[]   # MI of every feature column with the class labels
# A real list is required: ``fi.remove(i)`` below fails on a Python 3 range.
fi=list(range(len(f.T)))   # indices of the features not selected yet
########
## Step2 & 3. Compute MI w.r.t classes and find the 1st feature:
########
for i in range(len(f.T)):
    mi = mutual_info_score(f.T[i],cls)
    mi_stack.append(mi)
# Single-argument print(...) calls give the same output on Python 2 and 3.
print('Evaluating the normalized mutual information coefficient w.r.t. the classes')
# NOTE(review): ``max`` shadows the builtin; the name is kept because later
# parts of the script (outside this fragment) may refer to it.
max=np.max(mi_stack)
# Every feature that ties for the maximum MI is moved from ``fi`` to ``s``.
for i in range(len(f.T)):
    if mi_stack[i] == max:
        s.append(i)
        fi.remove(i)
print('(max_mi,max_mi_index): %s' % ((max, s[0]),))
print('The rest feature index: %s' % (fi,))
###########
## Step4. Greedy Selection: Repeat until |S|=k.
## a) Calculate MI btw features: I(f_i;f_s) for all pairs (f_i,f_s).
def CalPred(dataset, K, r, Probabilities, Predictions, valid_data):
    """Score ``valid_data`` with K randomized tree models and sum the results.

    For each of the K rounds: draw a row subsample of ``dataset``, build a
    pairwise mutual-information matrix over its columns, zero a random block
    of that matrix, take the maximum-weight spanning tree, orient it with
    ``dfs`` and accumulate log2-probabilities of the validation rows along
    the tree. ``Predictions[k]`` is set to ``Probabilities[k]`` times the
    mean accumulated log2-probability, and ``Predictions.sum()`` is returned.

    NOTE(review): the block nesting was reconstructed from a whitespace-
    mangled source -- confirm against the original file.
    NOTE(review): columns are indexed by label (``Bags_K[i]``, ``M_info[i][j]``,
    ``prob_x_1[i]``), so this presumably assumes integer column labels
    0..n-1 and binary (0/1) data -- TODO confirm.
    """
    # Bags_K = np.zeros((len(dataset), dataset.shape[1]))
    for k in range(K):
        # Row subsample of 63.2% of the data, drawn without replacement.
        samples = random.sample(range(0, len(dataset)), int(0.632 * len(dataset)))
        # NOTE(review): this zero array is overwritten on the next line.
        Bags_K = np.zeros((len(samples), dataset.shape[1]))
        Bags_K = dataset.iloc[samples, :]
        # Smoothed per-column frequency of the value 1 (+2 / +4), computed on
        # the full dataset rather than on the subsample.
        prob_x_1 = (dataset[dataset == 1].count(axis=0) + 2) / (len(dataset) + 4)
        prob_x_0 = 1 - prob_x_1
        # len(acc) - acc.groupby(0)[1].sum()
        M_info = np.zeros((len(Bags_K.columns), len(Bags_K.columns)))
        # Pick 2*r distinct column positions and split them into two groups
        # of r; the MI entries from group temp1 to group temp2 are zeroed below.
        random1 = random.sample(range(0, len(Bags_K.columns)), r * 2)
        temp1 = random1[r:]
        temp2 = random1[:r]
        from sklearn.metrics.cluster import mutual_info_score
        # Pairwise mutual information between all columns of the subsample.
        for i in Bags_K.columns:
            # print(i)
            for j in Bags_K.columns:
                M_info[i][j] = mutual_info_score(Bags_K[i].values, Bags_K[j].values)
        for i in temp1:
            for j in temp2:
                M_info[i][j] = 0
        from scipy.sparse import csr_matrix, find
        from scipy.sparse.csgraph import minimum_spanning_tree, depth_first_tree
        X = csr_matrix(M_info)
        # Negating the weights before and after turns the minimum spanning
        # tree into a maximum-weight spanning tree.
        Tcsr = -minimum_spanning_tree(-X)
        # print(Tcsr)
        # Array1 = Tcsr.toarray().astype(float)
        maxTree = Tcsr.toarray()
        # # # Y = csr_matrix(Array1)
        # Tcsr_depth = depth_first_tree(Y, 1, directed = False)
        # Array2 = Tcsr_depth.toarray().astype(float)
        # really = np.column_stack(((find(Array2))[0], (find(Array2))[1]))
        # t2G and dfs are defined elsewhere; presumably they build a graph
        # from the tree matrix and return, for a random start node, a 2-D
        # array whose entry [0, i] is the parent of node i (-1 for the
        # root) -- TODO confirm.
        G = t2G(maxTree)
        parents = dfs(G, random.randint(0, len(G) - 1))
        # row = Bags_K.iloc[:,[really[0][0], really[0][1]]].header(None)

        def check(X, i, j):
            # Returns 1 when the two-element row X equals (i, j), else 0.
            count = 0
            if (X[0] == i and X[1] == j):
                count += 1
            return count

        # Accumulated log2-probability of every validation row.
        prediction = np.zeros(len(valid_data))
        for i in range(parents.shape[1]):
            # print(i)
            parent = parents[0, i]
            # NOTE(review): when parent == -1 this selects the last column
            # via negative indexing; the resulting CPD is not used in that
            # branch below.
            table = dataset.iloc[:, [parent, i]]
            # CPD[a][b]: smoothed relative frequency of (parent == a,
            # column i == b) over the full dataset.
            CPD = np.zeros((2, 2))
            CPD[0][0] = (np.apply_along_axis(check, 1, table, 0, 0).sum() + 2)
            CPD[0][0] = CPD[0][0] / (len(dataset) + 4)
            CPD[0][1] = (np.apply_along_axis(check, 1, table, 0, 1).sum() + 2)
            CPD[0][1] = CPD[0][1] / (len(dataset) + 4)
            CPD[1][0] = (np.apply_along_axis(check, 1, table, 1, 0).sum() + 2)
            CPD[1][0] = CPD[1][0] / (len(dataset) + 4)
            CPD[1][1] = (np.apply_along_axis(check, 1, table, 1, 1).sum() + 2)
            CPD[1][1] = CPD[1][1] / (len(dataset) + 4)
            for j in range(len(valid_data)):
                if parent == -1:
                    # No parent: use the marginal probability of column i.
                    if valid_data.iloc[j, i] == 1:
                        prediction[j] += np.log2(prob_x_1[i])
                    else:
                        prediction[j] += np.log2(prob_x_0[i])
                elif parent > -1:
                    # Joint frequency divided by the parent's marginal.
                    if (valid_data.iloc[j, parent] == 0 and valid_data.iloc[j, i] == 0):
                        prediction[j] += np.log2(CPD[0][0] / (prob_x_0[parent]))
                    elif (valid_data.iloc[j, parent] == 0 and valid_data.iloc[j, i] == 1):
                        prediction[j] += np.log2(CPD[0][1] / (prob_x_0[parent]))
                    elif (valid_data.iloc[j, parent] == 1 and valid_data.iloc[j, i] == 0):
                        prediction[j] += np.log2(CPD[1][0] / (prob_x_1[parent]))
                    elif (valid_data.iloc[j, parent] == 1 and valid_data.iloc[j, i] == 1):
                        prediction[j] += np.log2(CPD[1][1] / (prob_x_1[parent]))
        # Weight this round's mean log2-probability by Probabilities[k].
        Predictions[k] = Probabilities[k] * (prediction.sum() / len(valid_data))
    return Predictions.sum()
def info_var(z, zh):
    """Variation of information of z and zh, following M. Meila (2007).

    Returned as ``entropy(z) + entropy(zh) - 2 * mutual_info_score(z, zh)``.
    """
    h_z = entropy(z)
    h_zh = entropy(zh)
    shared = mutual_info_score(z, zh)
    return h_z + h_zh - 2 * shared
def info_var(z, zh):
    """Variation of information between two labelings (M. Meila, 2007).

    Sum of the two entropies minus twice the mutual information.
    """
    entropy_sum = entropy(z) + entropy(zh)
    return entropy_sum - 2 * mutual_info_score(z, zh)
# Single-argument print(...) calls are used throughout: they print the same
# text on Python 2 and Python 3, whereas the former print statements were a
# SyntaxError on Python 3.
print(ipy.entropy(labels_pred))

## test comparison from scikit-learn
from sklearn.metrics.cluster import entropy
print(entropy(labels_true))
print(entropy(labels_pred))

print("## test mutual information")
print(ipy.mutual_information(labels_true, labels_true))
print(ipy.mutual_information(labels_pred, labels_pred))
print(ipy.mutual_information(labels_true, labels_pred))

## test comparison from scikit-learn
from sklearn.metrics.cluster import mutual_info_score
print(mutual_info_score(labels_true, labels_true))
print(mutual_info_score(labels_pred, labels_pred))
print(mutual_info_score(labels_true, labels_pred))

print("## test variation of information")
print(ipy.information_variation(labels_true, labels_pred))

print("## test normalized mutual information")
print(ipy.normalized_mutual_information([0, 0, 0, 0], [0, 1, 2, 3]))
print(ipy.normalized_mutual_information([0, 0, 1, 1], [1, 1, 0, 0]))
print(ipy.normalized_mutual_information([0, 0, 1, 1], [0, 0, 1, 1]))

## test comparison from scikit-learn
from sklearn.metrics.cluster import normalized_mutual_info_score
print(normalized_mutual_info_score([0, 0, 0, 0], [0, 1, 2, 3]))
print(normalized_mutual_info_score([0, 0, 1, 1], [1, 1, 0, 0]))
# DBSCAN print("DBSCAN evaluation: ",adjusted_mutual_info_score(digits.target, labels_dbscan)) # AgglomerativeClustering print("AgglomerativeClustering evaluation: ",adjusted_mutual_info_score(digits.target, labels_Agg)) # <a id='2.7.2'></a> # #### 2.7.2 Thực hiện đáng giá theo mutual_info_score # In[139]: # KMeans print("KMeans evaluation: ",mutual_info_score(digits.target, labels)) # Spectral cluster print("Spectral evaluation: ",mutual_info_score(digits.target, labels_spectral)) # DBSCAN print("DBSCAN evaluation: ",mutual_info_score(digits.target, labels_dbscan)) # AgglomerativeClustering print("AgglomerativeClustering evaluation: ",mutual_info_score(digits.target, labels_Agg)) # <a id='2.7.3'></a> # #### 2.7.3 Thực hiện đáng giá theo homogeneity_completeness_v_measure # - Giá trị trả về trong khoảng 0 >> 1 # - Càng về 1 thì độ khớp của True labels và cluster labels càng cao.
def MI_score(clusters1, clusters2):
    """Return ``mutual_info_score`` for the two cluster labelings."""
    score = mutual_info_score(clusters1, clusters2)
    return score
def NMI(X, Y):
    """Return the normalized mutual information between labelings X and Y.

    The previous body returned the raw ``mutual_info_score``, which is not
    normalized, so the value did not match what the function name promises.

    Parameters
    ----------
    X, Y : array-like
        Two labelings of the same samples, forwarded unchanged to
        scikit-learn.

    Returns
    -------
    float
        ``sklearn.metrics.normalized_mutual_info_score(X, Y)``.
    """
    # Imported locally so the block does not depend on the file's import list.
    from sklearn.metrics import normalized_mutual_info_score
    return normalized_mutual_info_score(X, Y)
def cluster_eval(config, net, test_dataloader, tf3, crop_transform, preprocessing_pool, sobel):
    """Evaluate the clustering produced by ``net`` on the test data.

    The network is put in eval mode for the evaluation and switched back to
    train mode before returning.

    Returns
    -------
    tuple
        ``(nmi, mutual_information, conditional_entropy,
        ground_truth_clusters_entropy, predicted_clusters_entropy, accuracy,
        confusion_matrix)``. ``accuracy`` is ``None`` unless
        ``config.gt_k == config.output_k_B``.
    """
    net.eval()

    # Predicted clusters and ground truth for the whole test set.
    all_predictions, targets = _clustering_get_data(
        config, net, test_dataloader, tf3, crop_transform, preprocessing_pool,
        sobel=sobel, using_IR=False, verbose=False)
    predictions = all_predictions[0]
    num_samples = predictions.shape[0]

    # Accuracy is only defined when there are as many predicted clusters as
    # ground-truth ones.
    accuracy = None
    if config.gt_k == config.output_k_B:
        match = _hungarian_match(predictions, targets,
                                 config.gt_k, config.output_k_B)
        found = torch.zeros(config.gt_k)
        remapped = torch.zeros(num_samples, dtype=predictions.dtype).cuda()
        for pred_i, target_i in match:
            target_value = torch.from_numpy(
                np.array(target_i)).cuda().int().item()
            remapped[torch.eq(predictions, int(pred_i))] = target_value
            found[pred_i] = 1
        assert (found.sum() == config.gt_k)  # each output_k must get mapped
        num_correct = int((remapped == targets).sum())
        accuracy = num_correct / float(num_samples)

    predictions = predictions.cpu().numpy()
    targets = targets.cpu().numpy()

    confusion_matrix = compute_cluster_confusion_matrix(
        predictions, targets, config.output_k_B, config.gt_k)

    # Entropies of the cluster-size distributions.
    prediction_counts = np.unique(predictions, return_counts=True)[1]
    predicted_clusters_entropy = scipy.stats.entropy(prediction_counts)
    target_counts = np.unique(targets, return_counts=True)[1]
    ground_truth_clusters_entropy = scipy.stats.entropy(target_counts)

    # Information scores.
    mutual_information = mutual_info_score(predictions, targets)
    conditional_entropy = -(mutual_information - ground_truth_clusters_entropy)
    nmi = normalized_mutual_info_score(predictions, targets)

    net.train()

    return (nmi, mutual_information, conditional_entropy,
            ground_truth_clusters_entropy, predicted_clusters_entropy,
            accuracy, confusion_matrix)
def compute_scores(self, x):
    """Predict cluster labels for ``x`` in batches and write a score table.

    The labels predicted by ``self.kmeans`` are stored in
    ``self.cluster_labels`` and used for the internal scores. The external
    scores compare ``self.labels_true`` with ``self.labels_pred``.
    NOTE(review): ``self.labels_pred`` is not set here; presumably it is
    assigned elsewhere -- confirm it matches ``self.cluster_labels``.

    The resulting table (columns ``name``, ``type``, ``score``) is written
    to ``files.small_images_classes_kmeans_scores`` as CSV. Nothing is
    returned.

    Parameters
    ----------
    x : array-like
        Samples to cluster; sliced along its first axis in steps of
        ``self.batch_size``.
    """
    n_samples = x.shape[0]
    self.cluster_labels = np.empty((n_samples, ))
    for start in range(0, n_samples, self.batch_size):
        stop = start + self.batch_size
        # Slicing clips at the end of ``x``, so the final (possibly shorter)
        # batch needs no special case; it used to be predicted a second time.
        self.cluster_labels[start:stop] = self.kmeans.predict(x[start:stop])

    confusion_matrix = cscores.contingency_matrix(self.labels_true,
                                                  self.labels_pred)
    # Purity: sum of the column-wise maxima over the total count.
    purity_score = np.sum(np.amax(confusion_matrix, axis=0)) / np.sum(confusion_matrix)
    homogeneity_score, completeness_score, v_measure_score = \
        cscores.homogeneity_completeness_v_measure(self.labels_true,
                                                   self.labels_pred)

    scores = [
        #['calinski_harabasz_score', 'internal', cscores.calinski_harabasz_score(x, self.cluster_labels)],
        ['davies_bouldin_score', 'internal',
         metrics.davies_bouldin_score(x, self.cluster_labels)],
        ['silhouette_score', 'internal',
         metrics.silhouette_score(x, self.cluster_labels)],
        #['silhouette_samples', 'internal', cscores.silhouette_samples(x, self.cluster_labels)],
        ['purity_score', 'external', purity_score],
        ['adjusted_rand_score', 'external',
         cscores.adjusted_rand_score(self.labels_true, self.labels_pred)],
        ['completeness_score', 'external', completeness_score],
        ['fowlkes_mallows_score', 'external',
         cscores.fowlkes_mallows_score(self.labels_true, self.labels_pred)],
        ['homogeneity_score', 'external', homogeneity_score],
        ['adjusted_mutual_info_score', 'external',
         cscores.adjusted_mutual_info_score(self.labels_true, self.labels_pred)],
        ['mutual_info_score', 'external',
         cscores.mutual_info_score(self.labels_true, self.labels_pred)],
        ['normalized_mutual_info_score', 'external',
         cscores.normalized_mutual_info_score(self.labels_true, self.labels_pred)],
        ['v_measure_score', 'external', v_measure_score],
    ]

    scores = pd.DataFrame(scores, columns=['name', 'type', 'score'])
    scores.to_csv(files.small_images_classes_kmeans_scores, index=False)
# Dataset prefix taken from the first command-line argument; the training
# and test tables are read from "<prefix>.ts.data" and "<prefix>.test.data".
file = sys.argv[1]
dataset = pd.read_csv(file + ".ts.data", header=None)
test_data = pd.read_csv(file + ".test.data", header=None)
# Smoothed per-column frequency of the value 1 (+2 in the numerator, +4 in
# the denominator) and its complement.
prob_x_1 = (dataset[dataset == 1].count(axis=0) + 2) / (len(dataset) + 4)
prob_x_0 = 1 - prob_x_1
# Pairwise mutual information between all columns. With header=None the
# column labels are the integers 0..n-1, so they double as matrix indices.
M_info = np.zeros((len(dataset.columns), len(dataset.columns)))
from sklearn.metrics.cluster import mutual_info_score
for i in dataset.columns:
    print(i)
    for j in dataset.columns:
        M_info[i][j] = mutual_info_score(dataset[i].values, dataset[j].values)
from scipy.sparse import csr_matrix, find
from scipy.sparse.csgraph import minimum_spanning_tree, depth_first_tree
X = csr_matrix(M_info)
# Negating the weights before and after turns the minimum spanning tree into
# a maximum-weight spanning tree.
Tcsr = -minimum_spanning_tree(-X)
print(Tcsr)
Array1 = Tcsr.toarray().astype(float)
#Y = csr_matrix(A)
# Depth-first tree over the undirected spanning tree, starting at node 1.
Tcsr_depth = depth_first_tree(Array1, 1, directed=False)
print(Tcsr_depth)
Array2 = Tcsr_depth.toarray().astype(float)
# One row per nonzero entry of Array2: (row index, column index).
really = np.column_stack(((find(Array2))[0], (find(Array2))[1]))
def sklearn_measures(U, V):
    """Print and return scikit-learn clustering agreement scores for U and V.

    The labels compared are the column indices of the nonzero entries of
    ``U`` and ``V`` (``np.nonzero(...)[1]``).
    NOTE(review): this presumably expects 2-D one-hot membership matrices
    with one nonzero per row -- confirm against the callers.

    Returns
    -------
    res : list
        ``[['ari', 'nmi', 'ami', 'vm'], [ari, nmi, ami, vm]]``.
    """
    # http://scikit-learn.org/stable/modules/classes.html#clustering-metrics
    import sklearn.metrics.cluster as sym
    U_labels = np.nonzero(U)[1]
    V_labels = np.nonzero(V)[1]
    # Single-argument print(...) calls produce the same text on Python 2 and
    # Python 3; the former print statements were a SyntaxError on Python 3.
    print('%s %s' % (U_labels, V_labels))
    # V2_labels = np.nonzero(V2)[1]
    # NOTE(review): the value labelled 'entro(U,V)=' is mutual_info_score,
    # not a joint entropy.
    print('entro(U)= %s entro(V)= %s entro(U,V)= %s' % (
        sym.entropy(U_labels),
        sym.entropy(V_labels),
        sym.mutual_info_score(U_labels, V_labels)))
    res = [
        ['ari', 'nmi', 'ami', 'vm'],
        [sym.adjusted_rand_score(U_labels, V_labels),
         sym.normalized_mutual_info_score(U_labels, V_labels),
         sym.adjusted_mutual_info_score(U_labels, V_labels),
         sym.v_measure_score(U_labels, V_labels)],
    ]
    print(res)
    return res