def test_v_measure_and_mutual_information(seed=36):
    # Check relation between v_measure, entropy and mutual information:
    # for random labelings of growing size the V-measure must agree with
    # 2 * MI / (H(a) + H(b)) (compared to 0 decimals) and with the
    # arithmetic-mean normalized mutual information.
    #
    # ``seed`` re-seeds the generator for every sample size.
    #
    # The builtin ``int`` replaces ``np.int``, a deprecated alias of the
    # builtin that has been removed from NumPy.
    for n_labels in np.logspace(1, 4, 4).astype(int):
        random_state = np.random.RandomState(seed)
        labels_a = random_state.randint(0, 10, n_labels)
        labels_b = random_state.randint(0, 10, n_labels)
        assert_almost_equal(
            v_measure_score(labels_a, labels_b),
            2.0 * mutual_info_score(labels_a, labels_b)
            / (entropy(labels_a) + entropy(labels_b)),
            0)
        avg = 'arithmetic'
        assert_almost_equal(
            v_measure_score(labels_a, labels_b),
            normalized_mutual_info_score(labels_a, labels_b,
                                         average_method=avg))
def test_int_input():
    # Integer-valued input (dense or CSR) must give float64 cluster centers
    # and a perfect labeling for every KMeans / MiniBatchKMeans variant.
    X_list = [[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 10]]
    expected_labels = [0, 1, 1, 0, 0, 1]
    for dtype in (np.int32, np.int64):
        X_int = np.array(X_list, dtype=dtype)
        X_int_csr = sp.csr_matrix(X_int)
        init_int = X_int[:2]

        fitted_models = [KMeans(n_clusters=2).fit(X_int)]
        fitted_models.append(
            KMeans(n_clusters=2, init=init_int, n_init=1).fit(X_int))
        # mini batch kmeans is very unstable on such a small dataset hence
        # we use many inits
        for data in (X_int, X_int_csr):
            fitted_models.append(
                MiniBatchKMeans(n_clusters=2, n_init=10,
                                batch_size=2).fit(data))
        # same estimators again, this time with an explicit integer init
        for data in (X_int, X_int_csr):
            fitted_models.append(
                MiniBatchKMeans(n_clusters=2, batch_size=2, init=init_int,
                                n_init=1).fit(data))

        for model in fitted_models:
            assert_equal(model.cluster_centers_.dtype, np.float64)

        scores = np.array([v_measure_score(expected_labels, model.labels_)
                           for model in fitted_models])
        assert_array_equal(scores, np.ones(len(scores)))
def test_k_means_function():
    # test calling the k_means function directly
    import sys
    # cStringIO only exists on Python 2; fall back to io.StringIO elsewhere.
    try:
        from cStringIO import StringIO
    except ImportError:
        from io import StringIO

    # catch output; the try/finally guarantees stdout is restored even when
    # k_means raises, so a failure here cannot swallow later test output
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters,
                                                   verbose=True)
    finally:
        sys.stdout = old_stdout

    centers = cluster_centers
    assert_equal(centers.shape, (n_clusters, n_features))
    assert_equal(np.unique(labels).shape[0], n_clusters)

    # check that the labels assignments are perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(inertia, 0.0)

    # check warning when centers are passed
    with warnings.catch_warnings(record=True) as w:
        k_means(X, n_clusters=n_clusters, init=centers)
        assert_equal(len(w), 1)

    # too many clusters desired
    assert_raises(ValueError, k_means, X, n_clusters=X.shape[0] + 1)
def test_k_means_function():
    # Exercise the functional k_means API directly.
    # Verbose output is captured by swapping stdout for an in-memory buffer
    # for the duration of the call.
    saved_stdout, sys.stdout = sys.stdout, StringIO()
    try:
        fit_result = k_means(X, n_clusters=n_clusters, sample_weight=None,
                             verbose=True)
    finally:
        sys.stdout = saved_stdout
    centers, labels, inertia = fit_result

    assert_equal(centers.shape, (n_clusters, n_features))
    n_distinct_labels = np.unique(labels).shape[0]
    assert_equal(n_distinct_labels, n_clusters)

    # the labeling must be perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(inertia, 0.0)

    # a warning is expected when explicit centers are passed
    assert_warns(RuntimeWarning, k_means, X, n_clusters=n_clusters,
                 sample_weight=None, init=centers)

    # asking for more clusters than samples is an error
    too_many = X.shape[0] + 1
    assert_raises(ValueError, k_means, X, n_clusters=too_many,
                  sample_weight=None)

    # kmeans for algorithm='elkan' raises TypeError on sparse matrix
    assert_raise_message(TypeError,
                         "algorithm='elkan' not supported for "
                         "sparse input X",
                         k_means, X=X_csr, n_clusters=2, sample_weight=None,
                         algorithm="elkan")
def test_fitted_model(self):
    # Fit a three-word codebook on three well separated blobs and check the
    # dictionary shape, the labeling and the bag-of-features descriptor.

    # non centered, sparse centers
    centers = np.array([
        [0.0, 5.0, 0.0, 0.0, 0.0],
        [1.0, 1.0, 4.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 5.0, 1.0],
    ])
    n_samples = 100
    n_clusters, n_features = centers.shape
    X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                cluster_std=1., random_state=42)

    cbook = CoodeBook(n_words=3)
    # TODO: is the reassignment needed, or is a bare cbook.fit(X) enough?
    cbook = cbook.fit(X)

    # check that the number of clusters centers and distinct labels match
    # the expectation
    centers = cbook.get_dictionary()
    assert_equal(centers.shape, (n_clusters, n_features))

    labels = cbook.predict(X)
    assert_equal(np.unique(labels).shape[0], n_clusters)

    # check that the labels assignment are perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(cbook.cluster_core.inertia_, 0.0)

    # check that the descriptor looks like the homogeneous PDF used
    # to create the original samples
    cbook_hist = cbook.get_BoF_descriptor(X)
    expected_value = 1.0 / cbook.n_words
    for bin_value in cbook_hist[0]:
        # Compare the absolute deviation: the signed difference alone would
        # let a bin far *below* the expected value pass unnoticed.
        deviation = abs(round(bin_value - expected_value, 3))
        assert_less(deviation, 0.01)
def test_k_means_function():
    # Exercise the functional k_means API directly.
    # The verbose output is diverted into an in-memory buffer while the
    # function runs; stdout is always restored afterwards.
    saved_stdout, sys.stdout = sys.stdout, StringIO()
    try:
        fit_result = k_means(X, n_clusters=n_clusters, verbose=True)
    finally:
        sys.stdout = saved_stdout
    centers, labels, inertia = fit_result

    assert_equal(centers.shape, (n_clusters, n_features))
    n_distinct_labels = np.unique(labels).shape[0]
    assert_equal(n_distinct_labels, n_clusters)

    # the labeling must be perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(inertia, 0.0)

    # a warning is expected when explicit centers are passed
    assert_warns(RuntimeWarning, k_means, X, n_clusters=n_clusters,
                 init=centers)

    # asking for more clusters than samples is an error
    too_many = X.shape[0] + 1
    assert_raises(ValueError, k_means, X, n_clusters=too_many)
def calculate_scores(self):
    """Compute the clustering scores and store them as attributes.

    Reads ``self.x``, ``self.c`` and ``self.labels`` and sets
    ``v_measure``, ``complete``, ``adjusted_mutual``, ``adjusted_rand``,
    ``silhouette``, ``purity`` and ``partial_purity`` on the instance.
    """
    x, c, labels = self.x, self.c, self.labels

    # Scores that compare ``c`` against ``labels``.
    pairwise_metrics = (
        ("v_measure", v_measure_score),
        ("complete", completeness_score),
        ("adjusted_mutual", adjusted_mutual_info_score),
        ("adjusted_rand", adjusted_rand_score),
    )
    for attribute, metric in pairwise_metrics:
        setattr(self, attribute, metric(c, labels))

    # NOTE(review): the silhouette is computed on ``c`` rather than on
    # ``labels`` -- confirm this is the intended partition.
    self.silhouette = silhouette_score(x, c)

    purity_pair = self.__purity__()
    self.purity, self.partial_purity = purity_pair
def test_exactly_zero_info_score():
    """Check numerical stability when information is exactly zero"""
    # One labeling is constant and the other gives every sample its own
    # label, so every information-based score must be exactly 0.
    # The builtin ``int`` replaces the removed ``np.int`` alias.
    for n_labels in np.logspace(1, 4, 4).astype(int):
        labels_a = np.ones(n_labels, dtype=int)
        labels_b = np.arange(n_labels, dtype=int)
        assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
        assert_equal(v_measure_score(labels_a, labels_b), 0.0)
        assert_equal(adjusted_mutual_info_score(labels_a, labels_b), 0.0)
        assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
def test_k_means_perfect_init():
    # Fit KMeans from the exact blob centers ten times and collect the PY_*
    # check results of every run.
    # Returns the list of collected results, or the integer 50 when anything
    # raises along the way.
    # NOTE(review): the meaning of the 50 sentinel is not visible here --
    # presumably a harness error code; confirm against the caller.
    try:
        p_suite = []  # PY_suite(suite_name=u'perfect_init')
        for _ in range(10):
            X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                        cluster_std=1., random_state=42)
            estimator = KMeans(init=centers.copy(), n_clusters=n_clusters,
                               random_state=42, n_init=1)
            km = estimator.fit(X)
            expected_shape = (n_clusters, n_features)
            p_suite.extend([
                PY_raises(ValueError, km.fit, [[0., 1.]]),
                PY_equals(v_measure_score(true_labels, km.labels_), 1.0),
                PY_equals(km.cluster_centers_.shape, expected_shape),
                PY_equals(v_measure_score(true_labels, km.labels_), 1.0),
                PY_greater(km.inertia_, 0.0),
            ])
        return p_suite
    except Exception:
        return 50
def test_k_means_plus_plus_init_not_precomputed():
    # Fit KMeans with k-means++ init and precompute_distances=False ten
    # times and collect the PY_* check results of every run.
    # Returns the list of collected results, or the integer 50 when anything
    # raises along the way.
    # NOTE(review): the meaning of the 50 sentinel is not visible here --
    # presumably a harness error code; confirm against the caller.
    try:
        p_suite = []  # PY_suite(suite_name=u'plus_plus_init_not_precomputed')
        for _ in range(10):
            X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                        cluster_std=1., random_state=42)
            estimator = KMeans(init="k-means++", n_clusters=n_clusters,
                               random_state=42, precompute_distances=False)
            km = estimator.fit(X)
            expected_shape = (n_clusters, n_features)
            p_suite.extend([
                PY_raises(ValueError, km.fit, [[0., 1.]]),
                PY_equals(v_measure_score(true_labels, km.labels_), 1.0),
                PY_equals(km.cluster_centers_.shape, expected_shape),
                PY_equals(v_measure_score(true_labels, km.labels_), 1.0),
                PY_greater(km.inertia_, 0.0),
            ])
        return p_suite
    except Exception:
        return 50
def test_k_means_random_init_sparse():
    # Fit KMeans with random init on a CSR copy of the blobs ten times and
    # collect the PY_* check results of every run.
    # Returns the list of collected results, or the integer 50 when anything
    # raises along the way.
    # NOTE(review): the meaning of the 50 sentinel is not visible here --
    # presumably a harness error code; confirm against the caller.
    try:
        p_suite = []  # PY_suite(suite_name=u'init_random_sparse')
        for _ in range(10):
            X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                        cluster_std=1., random_state=42)
            X_csr = sp.csr_matrix(X)
            estimator = KMeans(init="random", n_clusters=n_clusters,
                               random_state=42)
            km = estimator.fit(X_csr)
            expected_shape = (n_clusters, n_features)
            p_suite.extend([
                PY_raises(ValueError, km.fit, [[0., 1.]]),
                PY_equals(v_measure_score(true_labels, km.labels_), 1.0),
                PY_equals(km.cluster_centers_.shape, expected_shape),
                PY_equals(v_measure_score(true_labels, km.labels_), 1.0),
                PY_greater(km.inertia_, 0.0),
            ])
        return p_suite
    except Exception:
        return 50
def test_accuracy(self):
    """Compare the V-measure of this KMeans against scikit-learn's.

    Both estimators are fitted on the same 100000-sample, 10-center blob
    data set; the test passes when the local estimator scores no more than
    0.1 (absolute V-measure) below scikit-learn.
    """
    from sklearn.cluster import KMeans as skKMeans

    n_samples, centers = 100000, 10
    X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                cluster_std=1., random_state=42)

    kmeans_h2o = KMeans(n_gpus=1, n_clusters=centers, random_state=42)
    kmeans_h2o.fit(X)

    kmeans_sk = skKMeans(n_init=1, n_clusters=centers, init='random',
                         random_state=42)
    kmeans_sk.fit(X)

    accuracy_h2o = v_measure_score(kmeans_h2o.labels_, true_labels)
    accuracy_sk = v_measure_score(kmeans_sk.labels_, true_labels)

    # We want to be either better or at most 0.1 worse than scikit-learn;
    # anything else means something is probably broken.
    shortfall = accuracy_sk - accuracy_h2o
    assert shortfall <= 0.1
def test_v_measure_and_mutual_information(seed=36):
    """Check relation between v_measure, entropy and mutual information"""
    # np.logspace yields floats; sizes must be integers, so cast first.
    for n_labels in np.logspace(1, 4, 4).astype(int):
        random_state = np.random.RandomState(seed)
        # random_integers is deprecated. randint's upper bound is exclusive,
        # so 11 keeps the inclusive [0, 10] range it used to draw from.
        labels_a = random_state.randint(0, 11, n_labels)
        labels_b = random_state.randint(0, 11, n_labels)
        assert_almost_equal(
            v_measure_score(labels_a, labels_b),
            2.0 * mutual_info_score(labels_a, labels_b)
            / (entropy(labels_a) + entropy(labels_b)),
            0)
def test_mini_batch_k_means_random_init_partial_fit():
    # Feed X to MiniBatchKMeans in ten chunks through partial_fit and check
    # that predicting on the full data set recovers the true labeling.
    estimator = MiniBatchKMeans(n_clusters=n_clusters, init="random",
                                random_state=42)

    # online learning: one partial_fit call per chunk
    chunks = np.array_split(X, 10)
    for chunk in chunks:
        estimator.partial_fit(chunk)

    # label the complete dataset with the incrementally trained model
    predicted = estimator.predict(X)
    score = v_measure_score(true_labels, predicted)
    assert_equal(score, 1.0)
def test_scaled_weights():
    # Multiplying every sample weight by the same constant must leave both
    # the labeling and the cluster centers unchanged.
    halved_weight = 0.5 * np.ones(n_samples)
    estimators = (
        KMeans(n_clusters=n_clusters, random_state=42),
        MiniBatchKMeans(n_clusters=n_clusters, random_state=42),
    )
    for estimator in estimators:
        est_1 = clone(estimator).fit(X)
        est_2 = clone(estimator).fit(X, sample_weight=halved_weight)

        agreement = v_measure_score(est_1.labels_, est_2.labels_)
        assert_almost_equal(agreement, 1.0)

        centers_1 = _sort_centers(est_1.cluster_centers_)
        centers_2 = _sort_centers(est_2.cluster_centers_)
        assert_almost_equal(centers_1, centers_2)
def test_unit_weights_vs_no_weights():
    # Fitting without sample weights must match fitting with every weight
    # set to one, for both the labeling and the cluster centers.
    unit_weight = np.ones(n_samples)
    estimators = (
        KMeans(n_clusters=n_clusters, random_state=42),
        MiniBatchKMeans(n_clusters=n_clusters, random_state=42),
    )
    for estimator in estimators:
        est_1 = clone(estimator).fit(X)
        est_2 = clone(estimator).fit(X, sample_weight=unit_weight)

        agreement = v_measure_score(est_1.labels_, est_2.labels_)
        assert_almost_equal(agreement, 1.0)

        centers_1 = _sort_centers(est_1.cluster_centers_)
        centers_2 = _sort_centers(est_2.cluster_centers_)
        assert_almost_equal(centers_1, centers_2)
def _check_fitted_model(km):
    # Shared assertions for a fitted estimator: center shape, number of
    # distinct labels, perfect labeling and a strictly positive score_.
    expected_shape = (n_clusters, n_features)
    assert_equal(km.cluster_centers_.shape, expected_shape)

    predicted = km.labels_
    n_distinct_labels = np.unique(predicted).shape[0]
    assert_equal(n_distinct_labels, n_clusters)

    # the labeling must be perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, predicted), 1.0)
    assert_greater(km.score_, 0.0)
def _check_fitted_model(km):
    # Shared assertions for a fitted estimator: center shape, number of
    # distinct labels, perfect labeling, positive inertia, and a ValueError
    # when refitting on a single sample.
    expected_shape = (n_clusters, n_features)
    assert_equal(km.cluster_centers_.shape, expected_shape)

    predicted = km.labels_
    n_distinct_labels = np.unique(predicted).shape[0]
    assert_equal(n_distinct_labels, n_clusters)

    # the labeling must be perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, predicted), 1.0)
    inertia_is_positive = km.inertia_ > 0.0
    assert_true(inertia_is_positive)

    # check error on dataset being too small
    single_sample = [[0., 1.]]
    assert_raises(ValueError, km.fit, single_sample)
def test_k_means_perfect_init():
    # Fit KMeans from the exact blob centers ten times and collect the PY_*
    # check results of every run.
    # Returns the list of collected results, or the integer 50 when anything
    # raises along the way.
    # NOTE(review): the meaning of the 50 sentinel is not visible here --
    # presumably a harness error code; confirm against the caller.
    try:
        p_suite = []  # PY_suite(suite_name=u'perfect_init')
        for _ in range(10):
            X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                        cluster_std=1., random_state=42)
            estimator = KMeans(init=centers.copy(), n_clusters=n_clusters,
                               random_state=42, n_init=1)
            km = estimator.fit(X)
            expected_shape = (n_clusters, n_features)
            p_suite.extend([
                PY_raises(ValueError, km.fit, [[0., 1.]]),
                PY_equals(v_measure_score(true_labels, km.labels_), 1.0),
                PY_equals(km.cluster_centers_.shape, expected_shape),
                PY_equals(v_measure_score(true_labels, km.labels_), 1.0),
                PY_greater(km.inertia_, 0.0),
            ])
        return p_suite
    except Exception:
        return 50
def test_k_means_plus_plus_init_not_precomputed():
    # Fit KMeans with k-means++ init and precompute_distances=False ten
    # times and collect the PY_* check results of every run.
    # Returns the list of collected results, or the integer 50 when anything
    # raises along the way.
    # NOTE(review): the meaning of the 50 sentinel is not visible here --
    # presumably a harness error code; confirm against the caller.
    try:
        p_suite = []  # PY_suite(suite_name=u'plus_plus_init_not_precomputed')
        for _ in range(10):
            X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                        cluster_std=1., random_state=42)
            estimator = KMeans(init="k-means++", n_clusters=n_clusters,
                               random_state=42, precompute_distances=False)
            km = estimator.fit(X)
            expected_shape = (n_clusters, n_features)
            p_suite.extend([
                PY_raises(ValueError, km.fit, [[0., 1.]]),
                PY_equals(v_measure_score(true_labels, km.labels_), 1.0),
                PY_equals(km.cluster_centers_.shape, expected_shape),
                PY_equals(v_measure_score(true_labels, km.labels_), 1.0),
                PY_greater(km.inertia_, 0.0),
            ])
        return p_suite
    except Exception:
        return 50
def evaluate(data, net, t, landmarks):
    """Embed ``data`` with ``net``, then print and plot quality indicators.

    Prints the time elapsed since the module-level ``start_time``, the
    ``score`` of the embedding against ``t``, and the V-measure of a
    two-cluster KMeans fitted on the embedding. The first two embedding
    columns are also drawn as a scatter plot colored by ``t``.

    ``landmarks`` is accepted but not used by this function.
    """
    network_input = torch.from_numpy(data).float()
    embedding = net(network_input, False)
    print(time.time() - start_time)

    targets = t.astype(float)
    embedding = embedding.detach().numpy()

    print('New score metric')
    print(score(embedding, targets))

    two_color_map = colors.ListedColormap(['red', 'blue'])
    plt.scatter(embedding[:, 0], embedding[:, 1], c=targets,
                cmap=two_color_map, marker='o')

    clusterer = KMeans(n_clusters=2)
    clusterer.fit(embedding)
    print(v_measure_score(targets, clusterer.labels_))
def test_unit_weights_vs_no_weights():
    # Fitting without sample weights must match fitting with every weight
    # set to one, for both the labeling and the cluster centers.
    unit_weight = np.ones(n_samples)
    estimators = (
        KMeans(n_clusters=n_clusters, random_state=42),
        MiniBatchKMeans(n_clusters=n_clusters, random_state=42),
    )
    for estimator in estimators:
        est_1 = clone(estimator).fit(X)
        est_2 = clone(estimator).fit(X, sample_weight=unit_weight)

        agreement = v_measure_score(est_1.labels_, est_2.labels_)
        assert_almost_equal(agreement, 1.0)

        centers_1 = _sort_centers(est_1.cluster_centers_)
        centers_2 = _sort_centers(est_2.cluster_centers_)
        assert_almost_equal(centers_1, centers_2)
def clusterEvaluation(trueY, fittedY):
    """Return a dict of clustering-agreement scores for two labelings.

    Keys of the returned dict:
      NMI -- normalized_mutual_info_score
      ARS -- adjusted_rand_score
      HS  -- homogeneity_score
      CS  -- completeness_score
      VM  -- v_measure_score
    """
    return {
        'NMI': normalized_mutual_info_score(trueY, fittedY),
        'ARS': adjusted_rand_score(trueY, fittedY),
        'HS': homogeneity_score(trueY, fittedY),
        'CS': completeness_score(trueY, fittedY),
        'VM': v_measure_score(trueY, fittedY),
    }
def run_kmeans(Xtrain, Ytrain, Xtest, Ytest, K=6, n_init=1, verbose=1, plotTSNE=False):
    """Vectorize the texts, cluster them with KMeans and print the scores.

    A vectorizer + KMeans pipeline is fitted on ``Xtrain``; adjusted Rand
    and V-measure scores are printed for the train labels and for the
    predictions on ``Xtest``. When ``plotTSNE`` is true, ``perform_tsne``
    is called with the train cluster labels.
    """
    # let's use the TF-IDF vectorizer
    tfidf = True

    # The texts are already preprocessed and tokenized, so a dummy function
    # serves as both tokenizer and preprocessor.
    vectorizer_class = TfidfVectorizer if tfidf else CountVectorizer
    vec = vectorizer_class(preprocessor=identity, tokenizer=identity)

    ######## RUN K-MEANS ########
    km = KMeans(n_clusters=K, n_init=n_init, verbose=verbose)
    steps = [('vec', vec), ('cls', km)]
    classifier = Pipeline(steps)
    classifier.fit(Xtrain)

    train_clusters = km.labels_
    print("\n########## Development scores on train set:")
    print("adjusted rand score: ", adjusted_rand_score(Ytrain, train_clusters))
    print("v measure: ", v_measure_score(Ytrain, train_clusters))

    Yguess = classifier.predict(Xtest)
    print("\n########## Generalization scores on test set:")
    print("adjusted rand score: ", adjusted_rand_score(Ytest, Yguess))
    print("v measure: ", v_measure_score(Ytest, Yguess))

    if not plotTSNE:
        return
    # perform_tsne(Xtrain, Ytrain, clusterLabels=True)  # tSNE with gold labels
    perform_tsne(Xtrain, km.labels_, vec=vec, clusterLabels=True)  # tSNE clustering
def test_exactly_zero_info_score():
    # Check numerical stability when information is exactly zero.
    # One labeling is constant and the other gives every sample its own
    # label, so every information-based score must be exactly 0.
    # The builtin ``int`` replaces the removed ``np.int`` alias.
    for n_labels in np.logspace(1, 4, 4).astype(int):
        labels_a = np.ones(n_labels, dtype=int)
        labels_b = np.arange(n_labels, dtype=int)
        assert_equal(normalized_mutual_info_score(labels_a, labels_b,
                                                  max_n_classes=1e4), 0.0)
        assert_equal(v_measure_score(labels_a, labels_b,
                                     max_n_classes=1e4), 0.0)
        assert_equal(adjusted_mutual_info_score(labels_a, labels_b,
                                                max_n_classes=1e4), 0.0)
        assert_equal(normalized_mutual_info_score(labels_a, labels_b,
                                                  max_n_classes=1e4), 0.0)
def test_scaled_weights():
    # Multiplying every sample weight by the same constant must leave both
    # the labeling and the cluster centers unchanged.
    halved_weight = 0.5 * np.ones(n_samples)
    estimators = (
        KMeans(n_clusters=n_clusters, random_state=42),
        MiniBatchKMeans(n_clusters=n_clusters, random_state=42),
    )
    for estimator in estimators:
        est_1 = clone(estimator).fit(X)
        est_2 = clone(estimator).fit(X, sample_weight=halved_weight)

        agreement = v_measure_score(est_1.labels_, est_2.labels_)
        assert_almost_equal(agreement, 1.0)

        centers_1 = _sort_centers(est_1.cluster_centers_)
        centers_2 = _sort_centers(est_2.cluster_centers_)
        assert_almost_equal(centers_1, centers_2)
def test_exactly_zero_info_score():
    # Numerical stability check: with one constant labeling and one
    # all-distinct labeling, every information-based score must be exactly
    # zero, including each supported averaging method.
    default_metrics = (
        normalized_mutual_info_score,
        v_measure_score,
        adjusted_mutual_info_score,
        normalized_mutual_info_score,
    )
    averaged_metrics = (adjusted_mutual_info_score,
                        normalized_mutual_info_score)
    for size in np.logspace(1, 4, 4).astype(int):
        labels_a = np.ones(size, dtype=int)
        labels_b = np.arange(size, dtype=int)
        for metric in default_metrics:
            assert metric(labels_a, labels_b) == 0.0
        for method in ["min", "geometric", "arithmetic", "max"]:
            for metric in averaged_metrics:
                assert metric(labels_a, labels_b, method) == 0.0
def testKMeansFunction(self):
    # Exercise the functional k_means API directly on three well separated
    # blobs (non centered, sparse centers).
    centers = np.array([
        [0.0, 5.0, 0.0, 0.0, 0.0],
        [1.0, 1.0, 4.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 5.0, 1.0],
    ])
    n_samples = 100
    n_clusters, n_features = centers.shape
    X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                cluster_std=1., random_state=42)

    # Verbose output is diverted into an in-memory buffer during the call.
    saved_stdout, sys.stdout = sys.stdout, StringIO()
    try:
        fit_result = k_means(X, n_clusters=n_clusters, sample_weight=None,
                             verbose=True, init='k-means++')
    finally:
        sys.stdout = saved_stdout
    centers, labels, inertia = fit_result

    assert centers.shape == (n_clusters, n_features)

    labels = labels.fetch()
    n_distinct_labels = np.unique(labels).shape[0]
    assert n_distinct_labels == n_clusters

    # the labeling must be perfect (up to a permutation)
    assert v_measure_score(true_labels, labels) == 1.0
    assert inertia > 0.0

    # a warning is expected when explicit centers are passed
    assert_warns(RuntimeWarning, k_means, X, n_clusters=n_clusters,
                 sample_weight=None, init=centers)

    # asking for more clusters than samples is an error
    with pytest.raises(ValueError):
        k_means(X, n_clusters=X.shape[0] + 1, sample_weight=None,
                init='k-means++')
def test_exactly_zero_info_score():
    # Check numerical stability when information is exactly zero.
    # One labeling is constant and the other gives every sample its own
    # label, so every information-based score must be exactly 0.
    # The builtin ``int`` replaces the removed ``np.int`` alias.
    for n_labels in np.logspace(1, 4, 4).astype(int):
        labels_a = np.ones(n_labels, dtype=int)
        labels_b = np.arange(n_labels, dtype=int)
        assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
        assert_equal(v_measure_score(labels_a, labels_b), 0.0)
        assert_equal(adjusted_mutual_info_score(labels_a, labels_b), 0.0)
        assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
        for method in ["min", "geometric", "arithmetic", "max"]:
            # The averaging method is passed by keyword so the call does
            # not depend on the positional order of optional parameters.
            assert adjusted_mutual_info_score(
                labels_a, labels_b, average_method=method) == 0.0
            assert normalized_mutual_info_score(
                labels_a, labels_b, average_method=method) == 0.0
def print_five_measures(target, predicted):
    """Print five clustering-agreement scores for two labelings.

    For each measure a title line is printed, followed by the value the
    metric returns for ``(target, predicted)``. Nothing is returned.
    """
    measures = (
        ('homogeneity score:', homogeneity_score),
        ('completeness score:', completeness_score),
        ('V-measure:', v_measure_score),
        ('adjusted rand score:', adjusted_rand_score),
        # the label used to read 'adjuted' (typo)
        ('adjusted mutual info score:', adjusted_mutual_info_score),
    )
    for title, metric in measures:
        print(title)
        print(metric(target, predicted))
def evaluate(clusters, typedict): """Given the predicted clusters and type dictionary, this function calculates homogeneity, completeness, and V-measure assuming the gold tags are the most frequent tags for each type in the type dict input: clusters (dict of int:Cluster): Clusters by id typedict (dict of str:Word): Word by wordform return: (float): homogeneity score (float): completeness score (float): V measure""" # The instructor completed this function in 7 line including the return golds = [] preds = [] # Your code here return homogeneity_score(golds, preds), completeness_score( golds, preds), v_measure_score(golds, preds, beta=2.0)
def test_copac(self):
    """ Minimal test that COPAC runs at all. """
    copac_params = dict(k=40, mu=10, eps=2, alpha=0.85)

    y_pred = COPAC(**copac_params).fit_predict(self.X)
    # The V-measure must equal the reference value stored on the test case.
    assert_equal(self.v, v_measure_score(self.y, y_pred))

    # A separately fitted estimator must expose the same labels_ attribute.
    refitted = COPAC(**copac_params)
    refitted.fit(self.X)
    assert_array_equal(refitted.labels_, y_pred)
def _check_fitted_model(km):
    # Shared assertions for a fitted estimator: center shape, number of
    # distinct labels, perfect labeling, positive inertia, and a ValueError
    # when refitting on a single sample.
    expected_shape = (n_clusters, n_features)
    assert_equal(km.cluster_centers_.shape, expected_shape)

    predicted = km.labels_
    n_distinct_labels = np.unique(predicted).shape[0]
    assert_equal(n_distinct_labels, n_clusters)

    # the labeling must be perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, predicted), 1.0)
    assert_greater(km.inertia_, 0.0)

    # check error on dataset being too small
    single_sample = [[0., 1.]]
    assert_raises(ValueError, km.fit, single_sample)
def test_clustering():
    # For every cluster-prediction file, print the V-measure and AMI that
    # random baselines (uniform and exponential) achieve with the same
    # number of clusters. The cluster count is parsed from the file name.
    matched = load_matched_data(MATCHED_DATA_FILE)
    pattern = os.path.join(CLUSTERS_PREDICTION_DIR, '*.fth')
    for filename in glob.glob(pattern):
        count_token = filename.split('_clusters_')[1].split('_')[0]
        num_clusters = int(count_token)

        matched['cluster_uniform'] = random_unif_pred(num_clusters,
                                                      matched.shape[0])
        matched['cluster_exp'] = random_exp_pred(num_clusters,
                                                 matched.shape[0])

        print(filename)
        print("(uniform) V-measure:",
              v_measure_score(matched.property_decoded,
                              matched.cluster_uniform))
        print("(uniform) AMI:",
              adjusted_mutual_info_score(matched.property_decoded,
                                         matched.cluster_uniform))
        print("(exp) V-measure:",
              v_measure_score(matched.property_decoded, matched.cluster_exp))
        print("(exp) AMI:",
              adjusted_mutual_info_score(matched.property_decoded,
                                         matched.cluster_exp))
        print()
def sklearn_measures(U, V):
    """Print and return clustering-agreement scores for two memberships.

    Labels are taken as the second index array of ``np.nonzero`` for each
    input (assumes U and V are 2-D membership matrices -- TODO confirm).

    Returns a two-row list: the metric names ``['ari', 'nmi', 'ami', 'vm']``
    and the matching scores.
    """
    # http://scikit-learn.org/stable/modules/classes.html#clustering-metrics
    import sklearn.metrics.cluster as sym

    U_labels = np.nonzero(U)[1]
    V_labels = np.nonzero(V)[1]

    # Single-argument print(...) calls give the same text on Python 2 and 3;
    # the former print statements are a SyntaxError on Python 3.
    print('%s %s' % (U_labels, V_labels))
    # NOTE(review): the value labelled 'entro(U,V)=' is the mutual
    # information score -- confirm the label is what is wanted.
    print('entro(U)= %s entro(V)= %s entro(U,V)= %s' % (
        sym.entropy(U_labels),
        sym.entropy(V_labels),
        sym.mutual_info_score(U_labels, V_labels)))

    res = [
        ['ari', 'nmi', 'ami', 'vm'],
        [sym.adjusted_rand_score(U_labels, V_labels),
         sym.normalized_mutual_info_score(U_labels, V_labels),
         sym.adjusted_mutual_info_score(U_labels, V_labels),
         sym.v_measure_score(U_labels, V_labels)],
    ]
    print(res)
    return res
def sklearn_measures(U, V):
    """Print and return clustering-agreement scores for two memberships.

    Labels are taken as the second index array of ``np.nonzero`` for each
    input (assumes U and V are 2-D membership matrices -- TODO confirm).

    Returns a two-row list: the metric names ``['ari', 'nmi', 'ami', 'vm']``
    and the matching scores.
    """
    # http://scikit-learn.org/stable/modules/classes.html#clustering-metrics
    import sklearn.metrics.cluster as sym

    U_labels = np.nonzero(U)[1]
    V_labels = np.nonzero(V)[1]

    # Single-argument print(...) calls give the same text on Python 2 and 3;
    # the former print statements are a SyntaxError on Python 3.
    print('%s %s' % (U_labels, V_labels))
    # NOTE(review): the value labelled 'entro(U,V)=' is the mutual
    # information score -- confirm the label is what is wanted.
    print('entro(U)= %s entro(V)= %s entro(U,V)= %s' % (
        sym.entropy(U_labels),
        sym.entropy(V_labels),
        sym.mutual_info_score(U_labels, V_labels)))

    res = [
        ['ari', 'nmi', 'ami', 'vm'],
        [sym.adjusted_rand_score(U_labels, V_labels),
         sym.normalized_mutual_info_score(U_labels, V_labels),
         sym.adjusted_mutual_info_score(U_labels, V_labels),
         sym.v_measure_score(U_labels, V_labels)],
    ]
    print(res)
    return res
def _check_fitted_model(km):
    # Shared assertions for a fitted estimator: center shape, number of
    # distinct labels, perfect labeling, positive inertia, and the error
    # message raised when refitting on a single sample.
    expected_shape = (n_clusters, n_features)
    assert km.cluster_centers_.shape == expected_shape

    predicted = km.labels_
    n_distinct_labels = np.unique(predicted).shape[0]
    assert n_distinct_labels == n_clusters

    # the labeling must be perfect (up to a permutation)
    assert v_measure_score(true_labels, predicted) == 1.0
    assert km.inertia_ > 0.0

    # check error on dataset being too small
    expected_message = ("n_samples=1 should be >= n_clusters=%d"
                        % km.n_clusters)
    assert_raise_message(ValueError, expected_message, km.fit, [[0., 1.]])
def k_means_clustering(training_data, target_labels, title='Contingency Matrix', n_clusters=20, random_state=0, max_iter=1000, n_init=30):
    """Fit KMeans, plot the contingency matrix and report five measures.

    Returns a ``(results, km)`` tuple: ``results`` maps the measure names
    ("homogeneity", "completeness", "v_measure", "adjusted_rand_index",
    "adjusted_mutual_info") to their values and ``km`` is the fitted
    estimator.
    """
    start = time.time()
    km = KMeans(n_clusters=n_clusters, random_state=random_state,
                max_iter=max_iter, n_init=n_init)
    km.fit(training_data)
    elapsed = time.time() - start
    print("Finished clustering in %f seconds" % elapsed)

    predicted = km.labels_
    cm = contingency_matrix(target_labels, predicted)
    # reorder to maximize along diagonal
    rows, cols = linear_sum_assignment(cm, maximize=True)
    reordered_cm = cm[rows[:, np.newaxis], cols]

    print("Show Contingency Matrix:")
    plot_contingency_table_20(reordered_cm, title=title)

    print("Report 5 Measures for K-Means Clustering")
    results = {
        "homogeneity": homogeneity_score(target_labels, predicted),
        "completeness": completeness_score(target_labels, predicted),
        "v_measure": v_measure_score(target_labels, predicted),
        "adjusted_rand_index": adjusted_rand_score(target_labels, predicted),
        "adjusted_mutual_info": adjusted_mutual_info_score(target_labels,
                                                           predicted),
    }

    report_lines = (
        ("Homogeneity Score: %f", "homogeneity"),
        ("Completeness Score: %f", "completeness"),
        ("V-Measure Score: %f", "v_measure"),
        ("Adjusted Rand Index: %f", "adjusted_rand_index"),
        ("Adjusted Mutual Information: %f", "adjusted_mutual_info"),
    )
    for template, key in report_lines:
        print(template % results[key])

    return results, km
def compute_result(self, loss, preds, targets, stage):
    """Cluster the predictions with KMeans and log three agreement scores.

    ``preds`` is clustered into 7 groups; completeness, homogeneity and
    V-measure against ``targets`` are logged on a ``pl.EvalResult`` built
    from ``loss`` under the keys ``{stage}_completeness``, ``{stage}_hm``
    and ``{stage}_nmi``. The result object is returned.
    """
    # Cluster embedded values using k-means.
    embeddings = preds.cpu().numpy()
    clusterer = KMeans(n_clusters=7, random_state=0).fit(embeddings)
    assignments = clusterer.predict(embeddings)
    labels = targets.cpu().numpy()

    # the value logged as "nmi" is computed with v_measure_score
    metrics = (
        ("completeness", completeness_score),
        ("hm", homogeneity_score),
        ("nmi", v_measure_score),
    )
    logged = [(suffix, torch.Tensor([metric(labels, assignments)]))
              for suffix, metric in metrics]

    result = pl.EvalResult(loss)
    for suffix, value in logged:
        result.log(f"{stage}_{suffix}", value, prog_bar=True)
    return result
def test():
    """Evaluate the module-level ``model`` on the module-level ``data``.

    Encodes the graph, clusters the embedding into 7 groups with KMeans and
    returns ``(auc, ap, completeness, hm, nmi)``: ``auc`` and ``ap`` come
    from ``model.test`` on the test edges; the other three are
    completeness, homogeneity and V-measure against ``data.y``.
    """
    model.eval()
    z = model.encode(data.x, data.train_pos_edge_index)

    # Cluster embedded values using k-means.
    embeddings = z.cpu().numpy()
    clusterer = KMeans(n_clusters=7, random_state=0).fit(embeddings)
    assignments = clusterer.predict(embeddings)
    labels = data.y.cpu().numpy()

    # the value returned as "nmi" is computed with v_measure_score
    cluster_metrics = (completeness_score, homogeneity_score, v_measure_score)
    completeness, hm, nmi = [metric(labels, assignments)
                             for metric in cluster_metrics]

    link_scores = model.test(z, data.test_pos_edge_index,
                             data.test_neg_edge_index)
    auc, ap = link_scores
    return auc, ap, completeness, hm, nmi
def evaluate(self):
    """Return a dict of clustering scores for the stored labelings.

    Compares ``self.data['true_y']`` with ``self.data['pred_y']`` and
    returns the scores under the keys 'ami', 'rand', 'comp', 'fow', 'hom',
    'nmi' and 'v_score'.
    """
    true_y = self.data['true_y']
    pred_y = self.data['pred_y']
    scorers = (
        ('ami', adjusted_mutual_info_score),
        ('rand', adjusted_rand_score),
        ('comp', completeness_score),
        ('fow', fowlkes_mallows_score),
        ('hom', homogeneity_score),
        ('nmi', normalized_mutual_info_score),
        ('v_score', v_measure_score),
    )
    eval_result_dict = {}
    for key, scorer in scorers:
        eval_result_dict[key] = scorer(true_y, pred_y)
    return eval_result_dict
def test_beta_parameter():
    # beta must be honoured both by homogeneity_completeness_v_measure and
    # by v_measure_score: the V-measure is checked against the weighted
    # harmonic mean of the reference homogeneity and completeness.
    labels_true = [0, 0, 0, 1, 1, 1]
    labels_pred = [0, 1, 0, 1, 2, 2]

    beta_test = 0.2
    h_test, c_test = 0.67, 0.42
    numerator = (1 + beta_test) * h_test * c_test
    v_test = numerator / (beta_test * h_test + c_test)

    scores = homogeneity_completeness_v_measure(labels_true, labels_pred,
                                                beta=beta_test)
    h, c, v = scores
    for actual, expected in ((h, h_test), (c, c_test), (v, v_test)):
        assert_almost_equal(actual, expected, 2)

    v_direct = v_measure_score(labels_true, labels_pred, beta=beta_test)
    assert_almost_equal(v_direct, v_test, 2)
def test_k_means_function():
    # Exercise the functional k_means API directly.
    # Any output is diverted into an in-memory buffer while the function
    # runs; stdout is always restored afterwards.
    saved_stdout, sys.stdout = sys.stdout, StringIO()
    try:
        fit_result = k_means(X, n_clusters=n_clusters)
    finally:
        sys.stdout = saved_stdout
    centers, labels, score = fit_result

    assert_equal(centers.shape, (n_clusters, n_features))
    n_distinct_labels = np.unique(labels).shape[0]
    assert_equal(n_distinct_labels, n_clusters)

    # the labeling must be perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(score, 0.0)
def _check_fitted_model(self, km, n_clusters, n_features, true_labels):
    # Shared assertions for a fitted estimator: center shape, number of
    # distinct labels, perfect labeling, positive inertia, and the error
    # message raised when refitting on a single sample.
    expected_shape = (n_clusters, n_features)
    self.assertEqual(km.cluster_centers_.shape, expected_shape)

    predicted = km.labels_.fetch()
    n_distinct_labels = np.unique(predicted).shape[0]
    self.assertEqual(n_distinct_labels, n_clusters)

    # the labeling must be perfect (up to a permutation)
    self.assertEqual(v_measure_score(true_labels, predicted), 1.0)
    self.assertGreater(km.inertia_, 0.0)

    # check error on dataset being too small
    expected_message = ("n_samples=1 should be >= n_clusters=%d"
                        % km.n_clusters)
    assert_raise_message(ValueError, expected_message, km.fit, [[0., 1.]])
def _eval(self, ind, X, Y):
    """Score an individual by clustering ``X`` with its distance metric.

    The pairwise distance matrix of ``X`` is built with ``ind["fenotype"]``
    as the ``cdist`` metric, clustered into ``self.classes`` groups with
    single-linkage agglomerative clustering, and the V-measure of the
    resulting labels against ``Y`` is returned.
    """
    # ``is None`` is an identity test; ``== None`` would dispatch to the
    # stored object's ``__eq__`` instead.
    if ind["fenotype"] is None:
        # presumably fills in ind["fenotype"] -- verify against expand()
        self.distance_creator.expand(ind)

    # evaluation using a pre-constructed distance matrix
    # with sklearn's agglomerative clustering algorithm
    # as was allowed in the Duvidas TP1 Moodle thread.
    d_matrix = cdist(X, X, metric=ind["fenotype"])
    d_matrix = numpy.nan_to_num(d_matrix)
    clusterer = AgglomerativeClustering(n_clusters=self.classes,
                                        affinity="precomputed",
                                        linkage="single")

    # predicts and adapts the cluster numbers to be compatible with the
    # numbers given in the test CSV
    pred = clusterer.fit_predict(d_matrix) + numpy.ones(len(X))

    # score the generated clusters against the expected labels
    return v_measure_score(Y, pred)
def test_weighted_vs_repeated():
    # a sample weight of N should yield the same result as an N-fold
    # repetition of the sample
    #
    # The weights come from a seeded generator: drawing them from the
    # global, unseeded NumPy RNG made the test data differ on every run.
    rng = np.random.RandomState(42)
    sample_weight = rng.randint(1, 5, size=n_samples)
    X_repeat = np.repeat(X, sample_weight, axis=0)
    estimators = [
        KMeans(init="k-means++", n_clusters=n_clusters, random_state=42),
        KMeans(init="random", n_clusters=n_clusters, random_state=42),
        KMeans(init=centers.copy(), n_clusters=n_clusters, random_state=42),
        MiniBatchKMeans(n_clusters=n_clusters, batch_size=10,
                        random_state=42),
    ]
    for estimator in estimators:
        est_weighted = clone(estimator).fit(X, sample_weight=sample_weight)
        est_repeated = clone(estimator).fit(X_repeat)
        repeated_labels = np.repeat(est_weighted.labels_, sample_weight)
        assert_almost_equal(
            v_measure_score(est_repeated.labels_, repeated_labels), 1.0)
        # the centers are only compared for the non mini-batch estimators
        if not isinstance(estimator, MiniBatchKMeans):
            assert_almost_equal(_sort_centers(est_weighted.cluster_centers_),
                                _sort_centers(est_repeated.cluster_centers_))
def test_clustering():
    # For every cluster-prediction file, attach the predicted clusters to
    # the matched data and print the V-measure and AMI against the decoded
    # property. Files with no predictions are reported and skipped.
    pattern = os.path.join(CLUSTERS_PREDICTION_DIR, '*.fth')
    for filename in glob.glob(pattern):
        print()
        print('Looking at the', filename)

        clusterized = load_model_predictions(filename)
        if not clusterized.shape[0]:
            print('Empty predictions file.')
            continue

        matched = load_matched_data(MATCHED_DATA_FILE)
        matched = clusters4matched(matched, clusterized)

        has_cluster = matched.cluster.notna()
        print("Matched pairs are of shape", matched[has_cluster].shape)
        print("V-measure:",
              v_measure_score(matched.property_decoded, matched.cluster))
        print("AMI:",
              adjusted_mutual_info_score(matched.property_decoded,
                                         matched.cluster))
def test_weighted_vs_repeated():
    # a sample weight of N should yield the same result as an N-fold
    # repetition of the sample
    #
    # The weights come from a seeded generator: drawing them from the
    # global, unseeded NumPy RNG made the test data differ on every run.
    rng = np.random.RandomState(42)
    sample_weight = rng.randint(1, 5, size=n_samples)
    X_repeat = np.repeat(X, sample_weight, axis=0)
    estimators = [
        KMeans(init="k-means++", n_clusters=n_clusters, random_state=42),
        KMeans(init="random", n_clusters=n_clusters, random_state=42),
        KMeans(init=centers.copy(), n_clusters=n_clusters, random_state=42),
        MiniBatchKMeans(n_clusters=n_clusters, batch_size=10,
                        random_state=42),
    ]
    for estimator in estimators:
        est_weighted = clone(estimator).fit(X, sample_weight=sample_weight)
        est_repeated = clone(estimator).fit(X_repeat)
        repeated_labels = np.repeat(est_weighted.labels_, sample_weight)
        assert_almost_equal(
            v_measure_score(est_repeated.labels_, repeated_labels), 1.0)
        # the centers are only compared for the non mini-batch estimators
        if not isinstance(estimator, MiniBatchKMeans):
            assert_almost_equal(_sort_centers(est_weighted.cluster_centers_),
                                _sort_centers(est_repeated.cluster_centers_))
def test_beta_parameter():
    # beta must be honoured both by homogeneity_completeness_v_measure and
    # by v_measure_score: the V-measure is checked against the weighted
    # harmonic mean of the reference homogeneity and completeness.
    labels_true = [0, 0, 0, 1, 1, 1]
    labels_pred = [0, 1, 0, 1, 2, 2]

    beta_test = 0.2
    h_test, c_test = 0.67, 0.42
    numerator = (1 + beta_test) * h_test * c_test
    v_test = numerator / (beta_test * h_test + c_test)

    scores = homogeneity_completeness_v_measure(labels_true, labels_pred,
                                                beta=beta_test)
    h, c, v = scores
    for actual, expected in ((h, h_test), (c, c_test), (v, v_test)):
        assert_almost_equal(actual, expected, 2)

    v_direct = v_measure_score(labels_true, labels_pred, beta=beta_test)
    assert_almost_equal(v_direct, v_test, 2)
tokenizer=number_aware_tokenizer) cocluster = SpectralCoclustering(n_clusters=len(categories), svd_method='arpack', random_state=0) kmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000, random_state=0) print("Vectorizing...") X = vectorizer.fit_transform(newsgroups.data) print("Coclustering...") start_time = time() cocluster.fit(X) y_cocluster = cocluster.row_labels_ print("Done in {:.2f}s. V-measure: {:.4f}".format( time() - start_time, v_measure_score(y_cocluster, y_true))) print("MiniBatchKMeans...") start_time = time() y_kmeans = kmeans.fit_predict(X) print("Done in {:.2f}s. V-measure: {:.4f}".format( time() - start_time, v_measure_score(y_kmeans, y_true))) feature_names = vectorizer.get_feature_names() document_names = list(newsgroups.target_names[i] for i in newsgroups.target) def bicluster_ncut(i): rows, cols = cocluster.get_indices(i) if not (np.any(rows) and np.any(cols)):