def aggregate_stats(infiles, outfile): """ Combine all the aggstats into a single file Compute summary statistics """ res = [] for infile in infiles: d = pickle.load(open(infile, 'r')) print "The file is", infile assigndf = d['df'] meta = d['meta'] neurons = meta['neurons'] m = extract_metadata(infile) if len(m) == 0: # skip the stupid non-replicated ones continue for k, v in m.iteritems(): assigndf[k] = v assigndf['true_assign_role'] = [np.array(neurons['role']) for _ in range(len(assigndf))] # compute the statistics assigndf['ari'] = assigndf.apply(lambda x : metrics.adjusted_rand_score(x['true_assign'], irm.util.canonicalize_assignment(x['assign'])), axis=1) assigndf['homogeneity'] = assigndf.apply(lambda x : metrics.homogeneity_score(x['true_assign'], irm.util.canonicalize_assignment(x['assign'])), axis=1) assigndf['completeness'] = assigndf.apply(lambda x : metrics.completeness_score(x['true_assign'], irm.util.canonicalize_assignment(x['assign'])), axis=1) # don't consider the ones where the role is "none" as these are multi-role ones neurons.ix[neurons['role'].isnull(), 'role'] = 'I' assigndf['role_ari'] = assigndf.apply(lambda x : metrics.adjusted_rand_score(neurons['role'], irm.util.canonicalize_assignment(x['assign'])), axis=1) assigndf['role_homogeneity'] = assigndf.apply(lambda x : metrics.homogeneity_score(neurons['role'], irm.util.canonicalize_assignment(x['assign'])), axis=1) assigndf['role_completeness'] = assigndf.apply(lambda x : metrics.completeness_score(neurons['role'], irm.util.canonicalize_assignment(x['assign'])), axis=1) assigndf['type_n_true'] = assigndf.apply(lambda x : len(np.unique(x['true_assign'])), axis=1) assigndf['type_n_learned'] = assigndf.apply(lambda x : len(np.unique(x['assign'])), axis=1) assigndf['auc'] = assigndf.apply(lambda x: metrics.roc_auc_score(x['heldout_link_truth'], x['heldout_link_predprob']), axis=1) #assigndf['f1'] = assigndf.apply(lambda x: metrics.f1_score(x['heldout_link_truth'], x['heldout_link_predprob']), 
axis=1) # # fraction of mass in top N types res.append(assigndf) alldf = pandas.concat(res) pickle.dump(alldf, open(outfile, 'w'), -1)
def kmeans(input_file, n_clusters, Output): lvltrace.lvltrace("LVLEntree dans kmeans unsupervised") ncol=tools.file_col_coma(input_file) data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1)) X = data[:,1:] y = data[:,0] sample_size, n_features = X.shape k_means=cluster.KMeans(init='k-means++', n_clusters=n_clusters, n_init=10) k_means.fit(X) reduced_data = k_means.transform(X) values = k_means.cluster_centers_.squeeze() labels = k_means.labels_ k_means_cluster_centers = k_means.cluster_centers_ print "#########################################################################################################\n" #print y #print labels print "K-MEANS\n" print('homogeneity_score: %f'%metrics.homogeneity_score(y, labels)) print('completeness_score: %f'%metrics.completeness_score(y, labels)) print('v_measure_score: %f'%metrics.v_measure_score(y, labels)) print('adjusted_rand_score: %f'%metrics.adjusted_rand_score(y, labels)) print('adjusted_mutual_info_score: %f'%metrics.adjusted_mutual_info_score(y, labels)) print('silhouette_score: %f'%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size)) print('\n') print "#########################################################################################################\n" results = Output+"kmeans_scores.txt" file = open(results, "w") file.write("K-Means Scores\n") file.write("Homogeneity Score: %f\n"%metrics.homogeneity_score(y, labels)) file.write("Completeness Score: %f\n"%metrics.completeness_score(y, labels)) file.write("V-Measure: %f\n"%metrics.v_measure_score(y, labels)) file.write("The adjusted Rand index: %f\n"%metrics.adjusted_rand_score(y, labels)) file.write("Adjusted Mutual Information: %f\n"%metrics.adjusted_mutual_info_score(y, labels)) file.write("Silhouette Score: %f\n"%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size)) file.write("\n") file.write("True Value, Cluster numbers, Iteration\n") for n in xrange(len(y)): file.write("%f, %f, 
%i\n"%(y[n],labels[n],(n+1))) file.close() import pylab as pl from itertools import cycle # plot the results along with the labels k_means_cluster_centers = k_means.cluster_centers_ fig, ax = plt.subplots() im=ax.scatter(X[:, 0], X[:, 1], c=labels, marker='.') for k in xrange(n_clusters): my_members = labels == k cluster_center = k_means_cluster_centers[k] ax.plot(cluster_center[0], cluster_center[1], 'w', color='b', marker='x', markersize=6) fig.colorbar(im) plt.title("Number of clusters: %i"%n_clusters) save = Output + "kmeans.png" plt.savefig(save) lvltrace.lvltrace("LVLsortie dans kmeans unsupervised")
def run_clustering(clusterer, data, labels):
    """
    Fit a pre-configured clustering algorithm on *data* and log how well
    its cluster assignments agree with the ground-truth *labels*.

    clusterer: parameterized sklearn-style clustering estimator
    data:      array-like dataset input
    labels:    vector of ground-truth labels
    """
    # Time the fit operation
    start = time()
    clusterer.fit(data)
    elapsed = time() - start

    # Score the fitted assignment against the ground truth
    predicted = clusterer.labels_
    homogeneity = metrics.homogeneity_score(labels, predicted)
    completeness = metrics.completeness_score(labels, predicted)
    v_measure = metrics.v_measure_score(labels, predicted)
    adjusted_rand = metrics.adjusted_rand_score(labels, predicted)
    adjusted_mutual = metrics.adjusted_mutual_info_score(labels, predicted)

    # Report through the logging subsystem
    logging.info(" |- Execution time: %fs" % elapsed)
    logging.info(" |- Homogeneity: %0.3f" % homogeneity)
    logging.info(" |- Completeness: %0.3f" % completeness)
    logging.info(" |- V-measure: %0.3f" % v_measure)
    logging.info(" |- Adjusted Rand-Index: %.3f" % adjusted_rand)
    logging.info(" |- Adjusted Mutual Info: %.3f" % adjusted_mutual)
def test_KMeans_scores(self):
    """ModelFrame metric accessors must match plain sklearn.metrics on digits."""
    digits = datasets.load_digits()
    df = pdml.ModelFrame(digits)

    # scaling through the accessor must equal sklearn preprocessing
    scaled = pp.scale(digits.data)
    df.data = df.data.pp.scale()
    self.assert_numpy_array_almost_equal(df.data.values, scaled)

    reference = cluster.KMeans(init='k-means++', n_clusters=10,
                               n_init=10, random_state=self.random_state)
    wrapped = df.cluster.KMeans(init='k-means++', n_clusters=10,
                                n_init=10, random_state=self.random_state)
    reference.fit(scaled)
    df.fit_predict(wrapped)

    # label-based metrics: ModelFrame accessor vs. direct sklearn call
    # (homogeneity is deliberately checked twice, as in the original suite)
    for metric_name in ('homogeneity_score', 'completeness_score',
                        'v_measure_score', 'adjusted_rand_score',
                        'homogeneity_score'):
        expected = getattr(m, metric_name)(digits.target, reference.labels_)
        self.assertEqual(getattr(df.metrics, metric_name)(), expected)

    # silhouette needs the data matrix and sampling parameters
    expected = m.silhouette_score(scaled, reference.labels_,
                                  metric='euclidean', sample_size=300,
                                  random_state=self.random_state)
    result = df.metrics.silhouette_score(metric='euclidean', sample_size=300,
                                         random_state=self.random_state)
    self.assertAlmostEqual(result, expected)
def clustering(dataset):
    """
    KMeans-cluster *dataset* into its known number of classes, print
    agreement metrics against the true targets, then list the ten
    highest-weight terms of every cluster centroid.
    """
    vectorizer = dataset.vectorizer
    docs = dataset.X
    n_groups = dataset.n_classes
    truth = dataset.target

    km = cluster.KMeans(n_clusters=n_groups, max_iter=100, n_init=1)
    print("Clustering sparse data with %s" % km)
    started = time()
    km.fit(docs)
    print("done in %0.3fs" % (time() - started))
    print()

    assigned = km.labels_
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(truth, assigned))
    print("Completeness: %0.3f" % metrics.completeness_score(truth, assigned))
    print("V-measure: %0.3f" % metrics.v_measure_score(truth, assigned))
    print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(truth, assigned))
    # NOTE(review): the true labels (not km.labels_) are passed to
    # silhouette_score here, mirroring the original — confirm intended.
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(docs, truth, sample_size=1000))
    print()

    print("Top terms per cluster:")
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    sizes = np.sum(assigned[:, np.newaxis] == np.arange(n_groups), axis=0)
    for cluster_id in range(n_groups):
        print("Cluster %d (%d):" % (cluster_id, sizes[cluster_id]), end='')
        for term_id in order_centroids[cluster_id, :10]:
            print(' %s' % terms[term_id], end='')
        print()
def compare(method1, method2, fig=False): X1 = np.load('{0}_{1}_X_2d.npy'.format(species, method1)) X2 = np.load('{0}_{1}_X_2d.npy'.format(species, method2)) print 'n_cluster\tHomo\tCompl\tNMI\tARI' for i in range(2, 6): clust1 = Clustering(species, method1, X1, None, n_clusters=i) clust2 = Clustering(species, method2, X2, None, n_clusters=i) clust1.agglomerative(linkage='ward') clust2.agglomerative(linkage='ward') label1 = clust1.pred_labels('ward') label2 = clust2.pred_labels('ward') if i == 3 and fig: names = np.unique(label1) figName = '{0}_{1}_on_{2}'.format(species, method1, method2) plot2d(X2, label1, names, figName, figName) names = np.unique(label2) figName = '{0}_{1}_on_{2}'.format(species, method2, method1) plot2d(X1, label2, names, figName, figName) print '{0}\t{1}\t{2}\t{3}\t{4}\n'.format(i, metrics.homogeneity_score(label1, label2), metrics.completeness_score(label1, label2), metrics.normalized_mutual_info_score(label1, label2), metrics.adjusted_rand_score(label1, label2))
def cluster_evaluation(D, y_true, n_clusters, eps=0.8, min_samples=10): ############################################################################## # Extract Y true labels_true = y_true ############################################################################## # transform distance matrix into a similarity matrix S = 1 - D ############################################################################## # compute DBSCAN #db = DBSCAN(eps=eps, min_samples=min_samples).fit(S) db = Ward(n_clusters=n_clusters).fit(S) #core_samples = db.core_sample_indices_ labels = db.labels_ # number of clusters in labels, ignoring noise if present n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) print 'Number of clusters: %d' % n_clusters_ print 'Homogeneity: %0.3f' % metrics.homogeneity_score(labels_true, labels) print 'Completeness: %0.3f' % metrics.completeness_score(labels_true, labels) print 'V-meassure: %0.3f' % metrics.v_measure_score(labels_true, labels) print 'Adjusted Rand Index: %0.3f' % metrics.adjusted_rand_score(labels_true, labels) print 'Adjusted Mutual Information: %0.3f' % metrics.adjusted_mutual_info_score(labels_true, labels) print 'Silhouette Coefficient: %0.3f' % metrics.silhouette_score(D, labels, metric='precomputed')
def cluster(algorithm, data, topics, make_silhouette=False): print str(algorithm) clusters = algorithm.fit_predict(data) labels = algorithm.labels_ print 'Homogeneity: %0.3f' % metrics.homogeneity_score(topics, labels) print 'Completeness: %0.3f' % metrics.completeness_score(topics, labels) print 'V-measure: %0.3f' % metrics.v_measure_score(topics, labels) print 'Adjusted Rand index: %0.3f' % metrics.adjusted_rand_score(topics, labels) print 'Silhouette test: %0.3f' % metrics.silhouette_score(data, labels) print ' ***************** ' silhouettes = metrics.silhouette_samples(data, labels) num_clusters = len(set(clusters)) print 'num clusters: %d' % num_clusters print 'num fitted: %d' % len(clusters) # Make a silhouette plot if the flag is set if make_silhouette: order = numpy.lexsort((-silhouettes, clusters)) indices = [numpy.flatnonzero(clusters[order] == num_clusters) for k in range(num_clusters)] ytick = [(numpy.max(ind)+numpy.min(ind))/2 for ind in indices] ytickLabels = ["%d" % x for x in range(num_clusters)] cmap = cm.jet( numpy.linspace(0,1,num_clusters) ).tolist() clr = [cmap[i] for i in clusters[order]] fig = plt.figure() ax = fig.add_subplot(111) ax.barh(range(data.shape[0]), silhouettes[order], height=1.0, edgecolor='none', color=clr) ax.set_ylim(ax.get_ylim()[::-1]) plt.yticks(ytick, ytickLabels) plt.xlabel('Silhouette Value') plt.ylabel('Cluster') plt.savefig('cluster.png')
def predictAffinityPropagation(X, labels_true):
    """
    Fit Affinity Propagation on X, print clustering metrics against
    *labels_true*, then draw each cluster with its exemplar and member
    spokes.
    """
    #ranX, ranY = shuffle(X, y, random_state=0)
    af = AffinityPropagation(preference=-50).fit(X)
    center_idx = af.cluster_centers_indices_
    labels = af.labels_
    n_clusters_ = len(center_idx)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, metric='sqeuclidean'))

    plt.close('all')
    plt.figure(1)
    plt.clf()
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        members = labels == k
        exemplar = X[center_idx[k]]
        # members as dots, exemplar as a big circle, spokes to each member
        plt.plot(X[members, 0], X[members, 1], col + '.')
        plt.plot(exemplar[0], exemplar[1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=14)
        for point in X[members]:
            plt.plot([exemplar[0], point[0]], [exemplar[1], point[1]], col)
    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()
def bench_k_means(estimator, name, data, sample_size, labels, postIds):
    """
    LSA-reduce *data* (500 components + L2 normalization), fit the k-means
    *estimator*, print a one-line metric summary, then extract and persist
    the cluster -> posts mapping.

    FIXES: removed the dead first `t0 = time()` (it was immediately
    overwritten, so the timer semantics are unchanged) and the unused
    `val` binding of `estimator.fit`.
    """
    data = sparse.csr_matrix(data)
    print("Performing dimensionality reduction using LSA")
    # timer covers LSA + normalization + fit, as in the original
    t0 = time()
    lsa = TruncatedSVD(500)
    data = lsa.fit_transform(data)
    data = Normalizer(copy=False).fit_transform(data)
    print("done in %fs" % (time() - t0))
    print()
    estimator.fit(data)
    print('% 9s %.2fs %i %.3f %.3f %.3f %.3f %.3f '
          % (name, (time() - t0), estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels, estimator.labels_)))
    print("Parsing USer File:")
    parseUserFile()
    print("extracting User File:")
    clusterDict = extractCluster(postIds, estimator.labels_)
    print("writing Cluster Data to File")
    writeCluterToFile(clusterDict)
def bench_k_means(estimator, name, data, target_labels, sample_size):
    """For benchmarking K-Means estimators. Prints different clustering
    metrics and train accuracy.

    ARGS
        estimator: K-Means clustering algorithm <sklearn.cluster.KMeans>
        name: estimator name <str>
        data: array-like or sparse matrix, shape=(n_samples, n_features)
        target_labels: labels of data points <number array>
        sample_size: size of the sample to use when computing the
            Silhouette Coefficient <int>
    """
    start = time()
    estimator.fit(data)
    elapsed = time() - start

    fitted = estimator.labels_
    _, _, train_accuracy = compute_residuals_and_rsquared(fitted, target_labels)

    # assemble the benchmark row, then emit it as one tab-separated line
    row = (name, elapsed, estimator.inertia_,
           metrics.homogeneity_score(target_labels, fitted),
           metrics.completeness_score(target_labels, fitted),
           metrics.v_measure_score(target_labels, fitted),
           metrics.adjusted_rand_score(target_labels, fitted),
           metrics.adjusted_mutual_info_score(target_labels, fitted),
           metrics.silhouette_score(data, fitted, metric='euclidean',
                                    sample_size=sample_size),
           train_accuracy)
    print('% 9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' % row)
def bench_k_means(estimator, data, labels):
    """
    Fit *estimator* on *data*, print timing plus clustering metrics versus
    the ground-truth *labels*, and return the metric values.

    Returns:
        [homogenity, completeness, v_measure, adjusted_rand_score,
         adjusted_mutual_info_score, silhouette_score]

    FIX: the silhouette score was computed twice (once for the return
    value and again inside the print) — it is expensive (O(n^2) pairwise
    distances), so the stored value is now reused.
    """
    t0 = time()
    estimator.fit(data)
    print("time to fit: {:.5}".format(time() - t0))

    homogenity = metrics.homogeneity_score(labels, estimator.labels_)
    completeness = metrics.completeness_score(labels, estimator.labels_)
    v_measure = metrics.v_measure_score(labels, estimator.labels_)
    print("homogenity {:.5}, completeness {:.5}, v_measure_score {:.5}".format(
        homogenity, completeness, v_measure)
    )

    adj_rand_score = metrics.adjusted_rand_score(labels, estimator.labels_)
    print("adjusted_rand_score {:.5}".format(adj_rand_score))

    adj_mutual_info_score = metrics.adjusted_mutual_info_score(labels, estimator.labels_)
    print("adjusted_mutual_info_score {:.5}".format(adj_mutual_info_score))

    silhouette_score = metrics.silhouette_score(
        data, estimator.labels_, metric='euclidean'
    )
    # reuse the value computed above instead of recomputing it
    print("silhouette_score {:.5}".format(silhouette_score))

    return [
        homogenity,
        completeness,
        v_measure,
        adj_rand_score,
        adj_mutual_info_score,
        silhouette_score,
    ]
def cluster(Z, K=4, algo='kmeans'):
    """
    Cluster the rows of DataFrame Z after mean-imputing missing values.

    algo='kmeans' (default): fit KMeans with K clusters and return the model.
    algo='dbscan':           run DBSCAN and print metrics (currently broken —
                             see NOTE below); returns None.
    """
    descr = Z.columns
    # impute missing values, then work on the dense matrix
    X = Imputer().fit_transform(Z)
    ##############################################################################
    if algo == 'dbscan':
        # Compute DBSCAN
        db = DBSCAN(eps=0.3, min_samples=10).fit(X)
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        labels = db.labels_
        # Number of clusters in labels, ignoring noise if present.
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        print('Estimated number of clusters: %d' % n_clusters_)
        # NOTE(review): `labels_true` is not defined anywhere in this function,
        # so every metric line below raises NameError when this branch runs —
        # a ground-truth parameter is needed before it can work.
        print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
        print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
        print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
        print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
        print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
        print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))
    elif algo == 'kmeans':
        km = KMeans(n_clusters=K)
        km.fit(X)
        print(km.labels_)
        return km
def get_result(km, labels):
    """Score a fitted clustering *km* against ground-truth *labels*.

    Returns (homogeneity, completeness, v-measure, ARI, AMI)."""
    predicted = km.labels_
    homo_score = metrics.homogeneity_score(labels, predicted)
    complete_score = metrics.completeness_score(labels, predicted)
    v_score = metrics.v_measure_score(labels, predicted)
    rand_score = metrics.adjusted_rand_score(labels, predicted)
    mutual_info = metrics.adjusted_mutual_info_score(labels, predicted)
    return (homo_score, complete_score, v_score, rand_score, mutual_info)
def run(self):
    """
    Sweep KMeans over the cluster counts in self.clusters: collect the
    mean point-to-nearest-center distance (elbow criterion) and clustering
    metrics against self.y; for self.targetcluster (with self.stats set)
    dump per-point assignments to cluster.csv; optionally plot the curves.
    """
    meandist=[]
    homogeneity_scores=[]
    completeness_scores=[]
    rand_scores=[]
    # NOTE(review): silhouettes is never appended to — self.plot below
    # always receives it empty; confirm whether that is intentional.
    silhouettes=[]
    for k in self.clusters:
        model = KMeans(n_clusters=k, max_iter=5000, init='k-means++')
        labels = model.fit_predict(self.X)
        if k == self.targetcluster and self.stats:
            # dump rows of [features | cluster label | true label] for inspection
            nd_data = np.concatenate((self.X, np.expand_dims(labels, axis=1),np.expand_dims(self.y, axis=1)), axis=1)
            pd_data = pd.DataFrame(nd_data)
            pd_data.to_csv("cluster.csv", index=False, index_label=False, header=False)
            print model.cluster_centers_
            # per-cluster composition: size and the fraction of each true
            # class (assumes binary 0/1 labels in self.y — TODO confirm)
            for i in range (0,3):
                print "Cluster {}".format(i)
                cluster = pd_data.loc[pd_data.iloc[:,-2]==i].iloc[:,-2:]
                print cluster.shape[0]
                print float(cluster.loc[cluster.iloc[:,-1]==0].shape[0])/cluster.shape[0]
                print float(cluster.loc[cluster.iloc[:,-1]==1].shape[0])/cluster.shape[0]
        # mean distance of each point to its closest center (elbow curve)
        meandist.append(sum(np.min(cdist(self.X, model.cluster_centers_, 'euclidean'), axis=1))/ self.X.shape[0])
        homogeneity_scores.append(metrics.homogeneity_score(self.y, labels))
        completeness_scores.append(metrics.completeness_score(self.y, labels))
        rand_scores.append(metrics.adjusted_rand_score(self.y, labels))
    if self.gen_plot:
        #self.visualize()
        self.plot(meandist, homogeneity_scores, completeness_scores, rand_scores, silhouettes)
def cluster(model, uids):
    """
    Affinity-propagation-cluster the doc2vec vectors of *uids*, cache the
    fitted model to data/af.pick, and print clustering metrics (the uids
    themselves are used as the "true" labels).

    FIX: pickle is a binary format — the cache file was opened with 'w'
    and never closed; it is now written with 'wb' inside a `with` block.
    """
    # Generate sample data
    X = []
    for uid in uids:
        X.append(model.docvecs[uid])
    labels_true = uids

    # Compute Affinity Propagation
    af = AffinityPropagation(preference=-50).fit(X)
    with open('data/af.pick', 'wb') as cache:
        pickle.dump(af, cache)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    n_clusters_ = len(cluster_centers_indices)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
def affin_test():
    """
    Load cached training data, keep a 10% subsample, standardize it, and
    print Affinity Propagation clustering metrics against the true labels.
    """
    with open('traindata.pkl', 'rb') as savefile:
        (x_train, y_train, t1) = cPickle.load(savefile)

    # keep only 10% of the data (test_size=0.9 discards the rest)
    x_train, X_valid, y_train, y_valid = cross_validation.train_test_split(
        x_train, y_train, test_size=0.9, random_state=42)

    labels_true = y_train
    x_train = StandardScaler().fit_transform(x_train)
    af = AffinityPropagation(preference=-50).fit(x_train)
    exemplar_idx = af.cluster_centers_indices_
    labels = af.labels_
    n_clusters_ = len(exemplar_idx)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(x_train, labels, metric='sqeuclidean'))
def bestClassify(X, Y):
    "Best classifier function"
    use_tfidf = True
    if use_tfidf:
        vec = TfidfVectorizer(preprocessor=identity, tokenizer=identity,
                              sublinear_tf=True)
    else:
        vec = CountVectorizer(preprocessor=identity, tokenizer=identity)
    km = KMeans(n_clusters=2, n_init=100, verbose=1)
    clusterer = Pipeline([('vec', vec), ('cls', km)])
    prediction = clusterer.fit_predict(X, Y)

    # collect the gold labels that landed in each cluster…
    checker = defaultdict(list)
    for pred, truth in zip(prediction, Y):
        checker[pred].append(truth)
    # …and map every cluster id to its majority gold label
    labeldict = {}
    for pred, label in checker.items():
        labeldict[pred] = Counter(label).most_common(1)[0][0]
        #print(pred, Counter(label).most_common(1)[0][0])

    prediction = [labeldict[p] for p in prediction]
    labels = list(labeldict.values())
    print(labels)
    print(confusion_matrix(Y, prediction, labels=labels))
    print("Homogeneity:", homogeneity_score(Y, prediction))
    print("Completeness:", completeness_score(Y, prediction))
    print("V-measure:", v_measure_score(Y, prediction))
    print("Rand-Index:", adjusted_rand_score(Y, prediction))
def kmeans_setup(data):
    """
    Build and fit a KMeans estimator on *data* — seeded from PCA components
    when the module-level flag pca_f is set, otherwise k-means++ — print a
    benchmark line when debugging, and return the fitted estimator.
    """
    if pca_f == 1:
        # seed the centroids with the top principal components
        seeds = PCA(n_components=num_clusters).fit(data).components_
        name = 'PCA'
    else:
        seeds = 'k-means++'
        name = 'k-means++'

    start = time()
    estimator = KMeans(init=seeds, n_clusters=num_clusters,
                       n_init=num_init, max_iter=num_iterations)
    estimator.fit(data)

    if debug == True:
        sample_size = 300
        print('% 9s %.2fs %i %.3f %.3f %.3f %.3f %.3f %.3f'
              % (name, (time() - start), estimator.inertia_,
                 metrics.homogeneity_score(labels, estimator.labels_),
                 metrics.completeness_score(labels, estimator.labels_),
                 metrics.v_measure_score(labels, estimator.labels_),
                 metrics.adjusted_rand_score(labels, estimator.labels_),
                 metrics.adjusted_mutual_info_score(labels, estimator.labels_),
                 metrics.silhouette_score(data, estimator.labels_,
                                          metric='euclidean',
                                          sample_size=sample_size)))
    return estimator
def clustering_by_kmeans(vectorizer, X, true_k):
    """
    K-means cluster the documents in X into true_k groups, print quality
    metrics, and hand the label list to measuring_kmeans().

    vectorizer: fitted vectorizer (only get_feature_names is used here)
    X:          document-term matrix
    true_k:     number of clusters
    """
    print "Clustering in " + str(true_k) + " groups by K-means..."
    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=500, n_init=1)
    km.fit_predict(X)
    print "Measuring..."
    # NOTE(review): these metrics expect a ground-truth label vector as the
    # first argument; the module-level `documents` looks like raw texts —
    # verify it actually holds per-document class labels.
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(documents, km.labels_))
    print("Completeness: %0.3f" % metrics.completeness_score(documents, km.labels_))
    # V-measure is an entropy-based measure which explicitly measures how
    # successfully the criteria of homogeneity and completeness have been
    # satisfied.
    print("V-measure: %0.3f" % metrics.v_measure_score(documents, km.labels_))
    print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(documents, km.labels_))
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, km.labels_, sample_size=1000))
    # per-document cluster ids: 0 iff the doc is in cluster0, 1 iff in
    # cluster1 ... (list of terms)
    clusters = km.labels_.tolist()
    # print "List of terms belonging to the clusters " + str(clusters)
    print "Total de " + str(len(km.labels_)) + " documents"
    # Example to get all documents in cluster 0:
    #   cluster_0 = np.where(clusters==0)  # don't forget import numpy as np
    # cluster_0 then contains all indices of the documents in this cluster;
    # to get the actual documents you'd do:  X_cluster_0 = documents[cluster_0]
    terms = vectorizer.get_feature_names()
    #print terms
    measuring_kmeans(true_k,clusters)
def evaluate(labels_true, labels):
    """Score predicted *labels* against *labels_true*.

    Returns (homogeneity, completeness, v_measure, adjusted_rand,
    adjusted_mutual_info)."""
    scorers = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    # silhouette would additionally require the data matrix:
    #   metrics.silhouette_score(data, labels, metric='sqeuclidean')
    return tuple(score(labels_true, labels) for score in scorers)
def print_cluster(clusterTrainClass, labels, clusterTestStory): print("Homogeneity: %0.3f" % metrics.homogeneity_score(clusterTrainClass, labels)) print("Completeness: %0.3f" % metrics.completeness_score(clusterTrainClass, labels)) print("V-measure: %0.3f" % metrics.v_measure_score(clusterTrainClass, labels)) print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(clusterTrainClass, labels)) print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(clusterTrainClass, labels)) print "Silhouette Coefficient:" print metrics.silhouette_score(clusterTestStory, labels, metric='euclidean')
def cluseval(label, truth):
    """
    Score a predicted labelling against the ground truth.

    Returns [adjusted_rand, adjusted_mutual_info, homogeneity,
    completeness, v_measure].

    FIX: the homogeneity variable's identifier was corrupted in the source
    ('h**o'), which is not a valid assignment target and cannot parse; it
    is renamed to `homogeneity`.
    """
    rand = metrics.adjusted_rand_score(truth, label)
    mutual = metrics.adjusted_mutual_info_score(truth, label)
    homogeneity = metrics.homogeneity_score(truth, label)
    complete = metrics.completeness_score(truth, label)
    v = metrics.v_measure_score(truth, label)
    result = [rand, mutual, homogeneity, complete, v]
    return result
def cluster_metrics(labels_1, labels_2):
    """Print NMI, ARI, homogeneity, and completeness between two labelings."""
    report = [
        "Normalized Mutual Information: %f" % normalized_mutual_info_score(labels_1, labels_2),
        "Adjusted Rand Score: %f" % adjusted_rand_score(labels_1, labels_2),
        "Homogeneity: %f" % homogeneity_score(labels_1, labels_2),
        "Completeness: %f" % completeness_score(labels_1, labels_2),
    ]
    print("\n".join(report))
def main():
    """
    CLI entry point: spectral clustering of a precomputed similarity
    matrix (.npy).  Writes one predicted label per line to --output; when
    --true_labels is given, also prints clustering metrics against them.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(usage=__doc__,
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                     description='Perform spectral clustering.')
    parser.add_argument("--clusters", "-c", type=int,
                        help='Number of clusters.')
    parser.add_argument("--knn", "-k", type=int, default=0,
                        help='Number of nearest neighbors, 0 means all.')
    parser.add_argument("--sm", "-s",
                        help='File containing similarity matrix')
    parser.add_argument("--iterations", "-i", type=int, default=10,
                        help='Number of KMeans iterations.')
    parser.add_argument("--true_labels", "-t",
                        help='File containing the true labels.')
    parser.add_argument("--output", "-o",
                        help='Name of the file to write' +
                        ' the labels to.')
    parser.add_argument("--normalize", "-n", action='store_true',
                        help='Normalize each row so that the max value is one.')
    args = parser.parse_args()

    sm = np.load(args.sm)
    if args.normalize:
        # scale each row so its maximum becomes 1
        sm /= sm.max(axis=1)[:, np.newaxis]
    # Ensure symmetric
    sm = (sm + sm.T) / 2

    labels = []
    if args.knn > 0:
        # sparsify the affinity through a k-nearest-neighbor graph
        labels = SpectralClustering(n_clusters=args.clusters,
                                    affinity='nearest_neighbors',
                                    n_neighbors=args.knn,
                                    n_init=args.iterations).fit(sm).labels_
    else:
        # use the (symmetrized) matrix directly as the affinity
        labels = SpectralClustering(n_clusters=args.clusters,
                                    affinity='precomputed',
                                    n_init=args.iterations).fit(sm).labels_

    with open(args.output, 'w') as fout:
        for l in labels:
            fout.write(str(l) + '\n')

    # Load the true labels.
    if args.true_labels:
        true_labels = []
        with open(args.true_labels, 'r') as fin:
            for line in fin:
                true_labels.append(int(line.strip()))
        # Run the metrics.
        print("Homogeneity: %0.3f" % metrics.homogeneity_score(true_labels, labels))
        print("Completeness: %0.3f" % metrics.completeness_score(true_labels, labels))
        print("V-measure: %0.3f" % metrics.v_measure_score(true_labels, labels))
        print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(true_labels, labels))
        print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(true_labels, labels))
        # NOTE(review): sm holds similarities, but silhouette_score's default
        # metric treats rows as feature vectors — confirm this is intended.
        print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(sm, labels))
def eval_clusters(self):
    """Score the clustering against the known point labels.

    Returns a tuple (adjusted_rand_index, homogeneity, completeness).
    """
    _, truth, predicted = self.get_labels()
    return (metrics.adjusted_rand_score(truth, predicted),
            metrics.homogeneity_score(truth, predicted),
            metrics.completeness_score(truth, predicted))
def bench_k_means(estimator, name, data):
    """Fit *estimator* on *data* and print a one-line benchmark summary
    (scores use the module-level ground-truth `labels`)."""
    start = time()
    estimator.fit(data)
    elapsed = time() - start
    scores = (metrics.homogeneity_score(labels, estimator.labels_),
              metrics.completeness_score(labels, estimator.labels_),
              metrics.v_measure_score(labels, estimator.labels_),
              metrics.adjusted_rand_score(labels, estimator.labels_),
              metrics.adjusted_mutual_info_score(labels, estimator.labels_))
    print('% 9s %.2fs %i %.3f %.3f %.3f %.3f %.3f'
          % ((name, elapsed, estimator.inertia_) + scores))
def print_stats(truth, pred):
    """Print homogeneity, completeness, AMI, ARI, and purity of *pred*
    against *truth*."""
    scorers = (
        ('Homogeneity Score: ', metrics.homogeneity_score),
        ('Completeness Score: ', metrics.completeness_score),
        ('Adjusted Mutual Information Score: ', metrics.adjusted_mutual_info_score),
        ('Adjusted Rand Index Score: ', metrics.adjusted_rand_score),
        ('Purity: ', purity),
    )
    for caption, scorer in scorers:
        print(caption + str(scorer(truth, pred)))
def evaluateAllAlgorithms(self): algs = [self.labels_db,self.labels_ap] t**s =['DBASE','AP'] for i in range(2): print 'Algorithm:',t**s[i] print("\tHomogeneity: %0.3f" % metrics.homogeneity_score(self.labels_gt, algs[i])) print("\tCompleteness: %0.3f" % metrics.completeness_score(self.labels_gt, algs[i])) print("\tV-measure: %0.3f" % metrics.v_measure_score(self.labels_gt, algs[i])) print("\tAdjusted Rand Index: %0.3f"% metrics.adjusted_rand_score(self.labels_gt, algs[i])) print("\tAdjusted Mutual Information: %0.3f"% metrics.adjusted_mutual_info_score(self.labels_gt, algs[i]))
def get_cluster_metrics(X, labels, labels_true=None):
    """Compute clustering metrics for the labelling `labels`.

    Always includes the silhouette coefficient of `labels` on the
    precomputed-distance matrix `X`; when ground truth `labels_true` is
    supplied, also includes completeness and homogeneity.

    Returns a dict mapping metric name to score.
    """
    metrics_dict = dict()
    metrics_dict['Silhouette coefficient'] = metrics.silhouette_score(X, labels, metric='precomputed')
    # BUG FIX: `if labels_true:` raises ValueError for numpy arrays
    # ("truth value of an array ... is ambiguous") and would also skip a
    # valid but falsy labelling; test explicitly against None instead.
    if labels_true is not None:
        metrics_dict['Completeness score'] = metrics.completeness_score(labels_true, labels)
        metrics_dict['Homogeneity score'] = metrics.homogeneity_score(labels_true, labels)
    return metrics_dict
# parallelism instead of relying on joblib, so the `n_jobs` parameter has no # effect anymore. For more details on how to control the number of threads, # please refer to our :ref:`parallelism` notes. import scipy import numpy as np from sklearn.model_selection import train_test_split from sklearn.cluster import KMeans from sklearn.datasets import make_blobs from sklearn.metrics import completeness_score rng = np.random.RandomState(0) X, y = make_blobs(random_state=rng) X = scipy.sparse.csr_matrix(X) X_train, X_test, _, y_test = train_test_split(X, y, random_state=rng) kmeans = KMeans(algorithm='elkan').fit(X_train) print(completeness_score(kmeans.predict(X_test), y_test)) ############################################################################## # Improvements to the histogram-based Gradient Boosting estimators # ---------------------------------------------------------------- # Various improvements were made to # :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and # :class:`~sklearn.ensemble.HistGradientBoostingRegressor`. On top of the # Poisson loss mentionned above, these estimators now support :ref:`sample # weights <sw_hgbdt>`. Also, an automatic early-stopping criterion was added: # early-stopping is enabled by default when the number of samples exceeds 10k. # Finally, users can now define :ref:`monotonic constraints # <monotonic_cst_gbdt>` to constrain the predictions based on the variations of # specific features. In the following example, we construct a target that is # generally positively correlated with the first feature, with some noise. # Applying monotoinc constraints allows the prediction to capture the global
# Repeatedly fit a Gaussian-mixture EM model and print one metrics row per
# iteration (accuracy, homogeneity, completeness, V-measure, ARI, AMI,
# Calinski-Harabasz); the first iteration is also plotted.
printf("data: %d instances %d parameters\n", data_n, data_p)
#--------------------------------------------------------------------------------------------
# BUG FIX: the homogeneity column header and variable were corrupted to
# "H**O"/"h**o" -- "h**o" is not a valid identifier (assignment to an
# expression) and broke parsing.  Restored as HOMO/homo.
printf("#%-5s %-5s %-5s %-5s %-5s %-5s %-5s\n", "ACC", "HOMO", "COMPL", "VM", "ARAND", "MI", "CH-idx")
for i in range(opts["iterations"]):
    em = GaussianMixture(n_components=components, n_init=13, covariance_type="full").fit(data)
    guess = em.predict(data)
    acc = metrics.accuracy_score(labels, guess)
    # compare the true labels to those EM predicted
    homo = metrics.homogeneity_score(labels, guess)
    comp = metrics.completeness_score(labels, guess)
    vm = metrics.v_measure_score(labels, guess)
    arand = metrics.adjusted_rand_score(labels, guess)
    mi = metrics.adjusted_mutual_info_score(labels, guess, average_method="arithmetic")
    ch = metrics.calinski_harabaz_score(data, guess)
    printf(" %6.3f %6.3f %6.3f %6.3f %6.3f %6.3f %6.3f\n", acc, homo, comp, vm, arand, mi, ch)
    if i == 0:
        # just plot the first
        tokens = train_fn.split("/")
        # build file name as emax_<data-type>_<clusters>.eps
        tokens = tokens[-1].split("_")
        title = sprintf("Exp Max %s k=%d", tokens[0], components)
# Benchmark HDBSCAN vs DBSCAN on the same data: time the fits, count the
# clusters (label -1 is noise) and print the usual external/internal
# clustering metrics.
# NOTE(review): `hdb`, `hdb_t1`, `X` and `labels_true` come from earlier in
# the script -- confirm against the caller.
hdb_labels = hdb.labels_
hdb_elapsed_time = time.time() - hdb_t1
db_t1 = time.time()
db = DBSCAN(eps=0.1).fit(X)
db_labels = db.labels_
db_elapsed_time = time.time() - db_t1
# Number of clusters in labels, ignoring noise if present.
n_clusters_hdb_ = len(set(hdb_labels)) - (1 if -1 in hdb_labels else 0)
print('\n\n++ HDBSCAN Results')
print('Estimated number of clusters: %d' % n_clusters_hdb_)
print('Elapsed time to cluster: %.4f s' % hdb_elapsed_time)
print('Homogeneity: %0.3f' % metrics.homogeneity_score(labels_true, hdb_labels))
print('Completeness: %0.3f' % metrics.completeness_score(labels_true, hdb_labels))
print('V-measure: %0.3f' % metrics.v_measure_score(labels_true, hdb_labels))
print('Adjusted Rand Index: %0.3f' % metrics.adjusted_rand_score(labels_true, hdb_labels))
print('Adjusted Mutual Information: %0.3f' % metrics.adjusted_mutual_info_score(labels_true, hdb_labels))
print('Silhouette Coefficient: %0.3f' % metrics.silhouette_score(X, hdb_labels))
n_clusters_db_ = len(set(db_labels)) - (1 if -1 in db_labels else 0)
print('\n\n++ DBSCAN Results')
print('Estimated number of clusters: %d' % n_clusters_db_)
print('Elapsed time to cluster: %.4f s' % db_elapsed_time)
print('Homogeneity: %0.3f' % metrics.homogeneity_score(labels_true, db_labels))
print('Completeness: %0.3f' % metrics.completeness_score(labels_true, db_labels))
int(explained_variance * 100)))
# Cluster the LSA-reduced matrix with k-means, append a metrics row to the
# summary `table`, and print each score individually.
# NOTE(review): `true_k`, `labels`, `X` and `table` come from earlier in
# the script -- confirm.
print()
###############################################################################
# K-Means clustering
km = KMeans(n_clusters=true_k, init='k-means++', n_init=20)
print("Clustering with %s" % km)
km.fit(X)
print()
table.append([
    'k-means',
    metrics.homogeneity_score(labels, km.labels_),
    metrics.completeness_score(labels, km.labels_),
    metrics.v_measure_score(labels, km.labels_),
    metrics.adjusted_rand_score(labels, km.labels_),
    metrics.adjusted_mutual_info_score(labels, km.labels_),
    metrics.silhouette_score(X, km.labels_, metric='cosine')
])
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_))
print("Adjusted Mututal Information: %.3f" % metrics.adjusted_mutual_info_score(labels, km.labels_))
print("Silhouette Coefficient (euclidean): %0.3f" % metrics.silhouette_score(X, km.labels_, metric='euclidean'))
# # X_all=np.hstack((X_all,X_add)) # # X_add=fft_shape_analysis(n1,n2,n_fft) # X_all=np.hstack((X_all,X_add)) # X_add=lbp_analysis(n1,n2,1e3) # X_all=np.hstack((X_all,X_add)) classes=clustering_kmeans(X_all,n_clusters_) print("**************Clustering_results*********************") print('adjusted_rand_score=%0.2f' % metrics.adjusted_rand_score(classes_true,classes)) # print('normalized_mutual_info_score=%0.2f' % metrics.normalized_mutual_info_score(classes_true,classes)) print('homogeneity_score=%0.2f' % metrics.homogeneity_score(classes_true,classes)) print('completeness_score=%0.2f' % metrics.completeness_score(classes_true,classes)) print('v_measure_score=%0.2f' % metrics.v_measure_score(classes_true,classes)) print('fowlkes_mallows_score=%0.2f' % metrics.fowlkes_mallows_score(classes_true,classes)) print("\n") pca = PCA(n_components=2,svd_solver='auto') X_pca =pca.fit_transform(X_all) print("**************Clustering_results_after_pca*********************") print('adjusted_rand_score=%0.2f' % metrics.adjusted_rand_score(classes_true,classes_pca)) # print('normalized_mutual_info_score=%0.2f' % metrics.normalized_mutual_info_score(classes_true,classes_pca)) print('homogeneity_score=%0.2f' % metrics.homogeneity_score(classes_true,classes_pca)) print('completeness_score=%0.2f' % metrics.completeness_score(classes_true,classes_pca)) print('v_measure_score=%0.2f' % metrics.v_measure_score(classes_true,classes_pca)) print('fowlkes_mallows_score=%0.2f' % metrics.fowlkes_mallows_score(classes_true,classes_pca))
# step 3 - create an instance of sIB and run the actual clustering # n_init = the number of random initializations to perform # max_ter = the maximal number of iteration in each initialization # n_jobs = the maximal number of initializations to run in parallel clustering_start_t = time() n_init = 1 if speed_test_mode else 4 sib = SIB(n_clusters=n_clusters, random_state=128, n_init=n_init, n_jobs=-1, max_iter=15, verbose=True) sib.fit(vectors) clustering_end_t = time() print("Clustering time: %.3f secs." % (clustering_end_t - clustering_start_t)) # step 4 - some evaluation homogeneity = metrics.homogeneity_score(gold_labels, sib.labels_) completeness = metrics.completeness_score(gold_labels, sib.labels_) v_measure = metrics.v_measure_score(gold_labels, sib.labels_) ami = metrics.adjusted_mutual_info_score(gold_labels, sib.labels_) ari = metrics.adjusted_rand_score(gold_labels, sib.labels_) print("Homogeneity: %0.3f" % homogeneity) print("Completeness: %0.3f" % completeness) print("V-measure: %0.3f" % v_measure) print("Adjusted Mutual-Information: %.3f" % ami) print("Adjusted Rand-Index: %.3f" % ari) # save a heatmap clustering_utils.create_heatmap(gold_labels, sib.labels_, topics, 'sIB clustering heatmap', os.path.join(output_path, 'sib_heatmap')) # save a report
fmt='%.6f', newline='\n')
# Ward linkage (return value discarded here), then k-means (k=20) on the
# test matrix with timing and the usual external metrics.
# NOTE(review): `X_test` and `labels_test` come from earlier in the script.
fastcluster.linkage(X_test, method='ward', metric='euclidean')
km = KMeans(n_clusters=20, init='k-means++', max_iter=100, n_init=5,
            verbose=True, random_state=10)
t0 = time()
km.fit(X_test)
print("done in %0.3fs" % (time() - t0))
#y_test = [int(i) for i in labels]
#pred_test = [int(i) for i in km.labels_]
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_test, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels_test, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_test, km.labels_))
#spec = SpectralClustering(n_clusters=20, eigen_solver='arpack', random_state=0, n_init=10, gamma=1.0, affinity='rbf', n_neighbors=10, eigen_tol=0.0, assign_labels='kmeans', degree=3, coef0=1, n_jobs=1)
#spec_labels = spec.fit_predict(X_test)
#print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_test, spec_labels))
#print("Completeness: %0.3f" % metrics.completeness_score(labels_test, spec_labels))
#print("V-measure: %0.3f" % metrics.v_measure_score(labels_test, spec_labels))
# In[ ]: data = [] for algo in algorithms: algo.fit(X) data.append(({ 'ARI': metrics.adjusted_rand_score(y, algo.labels_), 'AMI': metrics.adjusted_mutual_info_score(y, algo.labels_, average_method='arithmetic'), 'Homogenity': metrics.homogeneity_score(y, algo.labels_), 'Completeness': metrics.completeness_score(y, algo.labels_), 'V-measure': metrics.v_measure_score(y, algo.labels_), 'Silhouette': metrics.silhouette_score(X, algo.labels_) })) results = pd.DataFrame( data=data, columns=[ 'ARI', 'AMI', 'Homogenity', 'Completeness', 'V-measure', 'Silhouette' ], index=['K-means', 'Affinity', 'Spectral', 'Agglomerative']) results # ### 实验总结
random_state=0)
# Standardize, run DBSCAN, count clusters (label -1 = noise) and print the
# standard evaluation metrics; then set up colors for plotting.
X = StandardScaler().fit_transform(X)
# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))
# Plot result
import matplotlib.pyplot as plt
# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [
    plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))
]
break
# Accumulate gold/predicted cluster ids (plus NUTS oracle solutions and
# several random-restart "dumb" k-means baselines), then print a
# LaTeX-style results row with V-measure / homogeneity / completeness for
# the prediction and the averaged baseline.
oracle2.append(NUTS2_solution)
oracle3.append(NUTS3_solution)
# if args.retrofit:
# retro_pred.append(retro_cluster_ids[c])
gold.append(solution)
pred.append(cluster_ids[c])
for i in range(KMEANS_AVG):
    dumb_pred[i][c] = dumb_cluster_ids[i][c]
# Average each metric over the KMEANS_AVG baseline restarts.
dumb_pred_v = np.array([v_measure_score(gold, dumb_pred[i, :]) for i in range(KMEANS_AVG)]).mean()
dumb_pred_h = np.array([homogeneity_score(gold, dumb_pred[i, :]) for i in range(KMEANS_AVG)]).mean()
dumb_pred_c = np.array([completeness_score(gold, dumb_pred[i, :]) for i in range(KMEANS_AVG)]).mean()
# if args.retrofit:
# retro_pred_v = v_measure_score(gold, retro_pred)
# retro_pred_h = homogeneity_score(gold, retro_pred)
# retro_pred_c = completeness_score(gold, retro_pred)
# print(retro_pred_v, retro_pred_h, retro_pred_c)
# print('clusters\tV-measure\thomogeneity\tcompleteness')
print('%s\t&\t%.2f\t&\t%.2f\t&\t%.2f\t&\t%.2f\t&\t%.2f\t&\t%.2f' % (args.clusters, v_measure_score(gold, pred), homogeneity_score(gold, pred), completeness_score(gold, pred), dumb_pred_v, dumb_pred_h, dumb_pred_c))
# print('%.2f\t&\t%.2f\t&\t%.2f\t&\t%.2f\t&\t%.2f\t&\t%.2f' % (v_measure_score(gold, oracle2), homogeneity_score(gold, oracle2), completeness_score(gold, oracle2), v_measure_score(gold, oracle3), homogeneity_score(gold, oracle3), completeness_score(gold, oracle3)))
# m.readshapefile('/Users/dirkhovy/Dropbox/working/lowlands/sociolinguistics/playground/Lameli maps/Lameli', 'de', drawbounds=True)
# # x, y, z = zip(*[(locations[city][0][1], locations[city][0][0], city_density_scaled[city]) for city in eligible_cities if locations[city][-2] == "DE"])
def wordVec():
    """Cluster the 20-newsgroups corpus with (MiniBatch)KMeans on TF-IDF or
    hashed features, report timing and clustering metrics, and print the
    top terms per cluster.

    Follows the scikit-learn document-clustering example; reads its
    switches (`use_hashing`, `use_idf`, `n_features`, `n_components`,
    `minibatch`, `verbose`) from the module-level `opts` namespace.
    """
    dataset = fetch_20newsgroups(subset='all', categories=categories,
                                 shuffle=True, random_state=42)
    print("%d documents" % len(dataset.data))
    print("%d categories" % len(dataset.target_names))
    print()
    labels = dataset.target
    print("Extracting features from the training dataset "
          "using a sparse vectorizer")
    t0 = time()
    # BUG FIX: the feature-extraction branches tested nonsense expressions
    # (sklearn.naive_bayes.check_X_y() and safe_sparse_dot() called with no
    # arguments -- a TypeError at runtime -- and
    # use_idf=sklearn.metrics.roc_curve()).  Restored the intended
    # opts-driven switches from the upstream example.
    if opts.use_hashing:
        if opts.use_idf:
            # Hashing followed by TF-IDF weighting of the hashed counts.
            hasher = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english',
                                       alternate_sign=False, norm=None)
            vectorizer = make_pipeline(hasher, TfidfTransformer())
        else:
            vectorizer = HashingVectorizer(n_features=opts.n_features,
                                           stop_words='english',
                                           alternate_sign=False, norm='l2')
    else:
        vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
                                     min_df=2, stop_words='english',
                                     use_idf=opts.use_idf)
    X = vectorizer.fit_transform(dataset.data)
    # BUG FIX: timing mixed time.clock()/time.time()/time(); use the
    # imported time() callable consistently, as the rest of the function does.
    print("done in %fs" % (time() - t0))
    print("n_samples: %d, n_features: %d" % X.shape)
    print()
    if True:
        print("Performing dimensionality reduction using LSA")
        t0 = time()
        # BUG FIX: TruncatedSVD was given an unbound-method call
        # (SGDClassifier.predict()); the intended argument is the target
        # dimensionality.
        svd = TruncatedSVD(opts.n_components)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)
        print("done in %fs" % (time() - t0))
        explained_variance = svd.explained_variance_ratio_.sum()
        print("Explained variance of the SVD step: {}%".format(
            int(explained_variance * 100)))
        print()
    if opts.minibatch:
        km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                             init_size=1000, batch_size=1000,
                             verbose=opts.verbose)
    else:
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100,
                    n_init=1, verbose=opts.verbose)
    print("Clustering sparse data with %s" % km)
    t0 = time()
    km.fit(X)
    print("done in %0.3fs" % (time() - t0))
    print()
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
    print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index: %.3f"
          % metrics.adjusted_rand_score(labels, km.labels_))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, km.labels_, sample_size=1000))
    print()
    if not opts.use_hashing:
        print("Top terms per cluster:")
        if opts.n_components:
            # Map centroids back to the original term space before ranking.
            original_space_centroids = svd.inverse_transform(km.cluster_centers_)
            order_centroids = original_space_centroids.argsort()[:, ::-1]
        else:
            order_centroids = km.cluster_centers_.argsort()[:, ::-1]
        terms = vectorizer.get_feature_names()
        for i in range(true_k):
            print("Cluster %d:" % i, end='')
            for ind in order_centroids[i, :10]:
                print(' %s' % terms[ind], end='')
            print()
def only6_NMF_NLT():
    """Cluster the full 20-newsgroups corpus with k-means (k=6) after an
    NMF reduction followed by a log non-linear transform.

    Sweeps the NMF dimensionality over 1..20 and writes the purity
    metrics per dimension to a new worksheet of an .xlsx workbook.
    """
    english_stemmer = Stemmer.Stemmer('en')

    class StemmedTfidfVectorizer(TfidfVectorizer):
        # TF-IDF vectorizer whose analyzer stems every token.
        def build_analyzer(self):
            # NOTE(review): super(TfidfVectorizer, self) starts the MRO
            # *after* TfidfVectorizer -- a common book idiom, but confirm
            # it is intentional.
            analyzer = super(TfidfVectorizer, self).build_analyzer()
            return lambda doc: english_stemmer.stemWords(analyzer(doc))
    print("Loading 20 newsgroups dataset for all categories...")
    newsgroups = fetch_20newsgroups(subset='all')
    print("%d documents" % len(newsgroups.data))
    print("%d categories" % len(newsgroups.target_names))
    print("Creating stemmed TFxIDF representation...")
    t0 = time()
    vect = StemmedTfidfVectorizer(stop_words='english')
    vectors = vect.fit_transform(newsgroups.data)  # TFxIDF representation
    print("Done in %fs" % (time() - t0))
    print("n_samples: %d, n_features: %d" % vectors.shape)
    purityMetricsNames = ['Homogeneity', 'Completeness', 'V-measure',
                          'Adjust Rand-Index',
                          'Adjusted Mutual Information Score']
    # Reducing the dimensionality with NMF NLT
    nmf_nlt_dim_bank = range(1,21)
    workbook = xlsxwriter.Workbook('part6_pt2_NMF_NLT.xlsx')
    for dims in nmf_nlt_dim_bank:
        print("Implementing NMF of dimension %d on data..." % dims)
        nmf_ = NMF(n_components=dims)  # alpha value? l1 value?
        nmf_data = nmf_.fit_transform(vectors)
        print("Done.")
        print("Implementing non-linear transform on data...")
        # small offset before the log so zero activations don't give -inf
        offset = 0.001
        nmf_data_off=np.add(nmf_data,offset)
        log_nmf_data=np.log(nmf_data_off)
        print("Done.")
        k = 6
        km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
        print("Clustering sparse data with %s" % km)
        t0 = time()
        km.fit(log_nmf_data)
        print("done in %0.3fs" % (time() - t0))
        print_results(newsgroups.target,km.labels_)
        purityMetrics = [metrics.homogeneity_score(newsgroups.target, km.labels_),
                         metrics.completeness_score(newsgroups.target, km.labels_),
                         metrics.v_measure_score(newsgroups.target, km.labels_),
                         metrics.adjusted_rand_score(newsgroups.target, km.labels_),
                         metrics.adjusted_mutual_info_score(newsgroups.target, km.labels_)]
        # Writing to .xlsx file (For Stats): one worksheet per dimension
        worksheet = workbook.add_worksheet()
        row = 0
        col = 0
        worksheet.write(row,col,'Dimension')
        worksheet.write(row,col+1,dims)
        metric_list = dict(zip(purityMetricsNames,purityMetrics))
        pprint(dict(metric_list))
        for key in metric_list.keys():
            row += 1
            worksheet.write(row,col+11,key)
            worksheet.write(row,col+12,metric_list[key])
# In[5]: clustering = AgglomerativeClustering(n_clusters = 3).fit(data) num_clusters = clustering.n_clusters_ clusterDest = clustering.labels_ takePlot(clusterDest, data, num_clusters) # In[6]: print("Completeness: %0.3f" % metrics.completeness_score(labels_true, clusterDest)) print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, clusterDest)) print("Adjusted Rand index: %0.3f" % metrics.adjusted_rand_score(labels_true, clusterDest)) print("Adjusted Mutual information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, clusterDest)) # In[7]: print("Very small distance between groups") samples = 1000 density = 0.4 centers = [[0, 1], [-1, -1], [1, -1]] n_clusters = len(centers) data, labels_true = make_blobs(n_samples=samples, centers=centers, cluster_std=density) plt.scatter(data[:,0],data[:,1], c=labels_true)
# Sweep GaussianMixture component counts on the Cancer dataset, recording
# log-likelihood, BIC/AIC, silhouette, homogeneity/completeness and
# cross-validated accuracy, then plot each curve.
Cancer_EM_train_acc = []
Cancer_EM_cv_acc = []
for i in n_components:
    print(i)
    EM.set_params(random_state=7641, n_components=i)
    # NOTE(review): the model is fit on Cancer_X but scored on
    # Cancer_X_train -- looks inconsistent; confirm whether fitting on the
    # full set is intentional.
    EM.fit(Cancer_X)
    Cancer_EM_score.append(EM.score(Cancer_X_train))
    Cancer_EM_bic.append(EM.bic(Cancer_X_train))
    Cancer_EM_aic.append(EM.aic(Cancer_X_train))
    Cancer_EM_log.append(
        silhouette_score(Cancer_X_train, EM.predict(Cancer_X_train)))
    Cancer_EM_homogeneity_score.append(
        homogeneity_score(Cancer_y_train, EM.predict(Cancer_X_train)))
    Cancer_EM_complete_score.append(
        completeness_score(Cancer_y_train, EM.predict(Cancer_X_train)))
    Cancer_scores = cross_validate(EM, Cancer_X_train, Cancer_y_train, cv=5,
                                   scoring=make_scorer(my_custom_acc, greater_is_better=True),
                                   n_jobs=-1, return_train_score=True)
    Cancer_EM_train_acc.append(np.mean(Cancer_scores['train_score']))
    Cancer_EM_cv_acc.append(np.mean(Cancer_scores['test_score']))
PlotEm(6, n_components, Cancer_EM_aic, 'AIC', 'Cancer')
PlotEm(7, n_components, Cancer_EM_bic, 'BIC', 'Cancer')
PlotEm(8, n_components, Cancer_EM_score, 'SSE', 'Cancer')
PlotEm(9, n_components, Cancer_EM_log, 'Log-Likelihood', 'Cancer')
#km = MiniBatchKMeans(n_clusters=20, init='k-means++', n_init=1, init_size=1000, batch_size=1000, verbose=opts.verbose) km = KMeans(n_clusters=20, init='k-means++', max_iter=100, n_init=1, verbose=True) t0 = time() km.fit(rdata) #km.fit(rdata) print("done in %0.3fs" % (time() - t0)) #y_test = [int(i) for i in labels] #pred_test = [int(i) for i in km.labels_] print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_)) print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_)) print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_)) ##############cluster large data######## import re from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.cluster import KMeans, MiniBatchKMeans from sklearn import metrics import numpy as np from time import time file = '/users/grad/rakib/dr.norbert/dataset/shorttext/agnews/agnews-w2vec-glove-vector-127600' data = np.loadtxt(file, dtype='float', delimiter=' ') data1 = np.delete(data, [0], axis=1) labels = data[:, 0]
cmap=matplotlib.colors.ListedColormap(colors))
# k-means (k=3), annotate the centroids on a scatter plot, then print the
# standard internal/external clustering metrics.
kmeans_model = KMeans(n_clusters=3, max_iter=10000).fit(data)
kmeans_model.labels_
centroids = kmeans_model.cluster_centers_
centroids
fig, ax = plt.subplots(figsize=(12, 8))
plt.scatter(centroids[:, 0], centroids[:, 1], c='r', s=250, marker='s')
for i in range(len(centroids)):
    # offset the annotation slightly so it doesn't overlap the marker
    plt.annotate(i, (centroids[i][0] + 7, centroids[i][1] + 7), fontsize=20)
print("Homegenity score : ", metrics.homogeneity_score(labels, kmeans_model.labels_))
print("Completeness score : ", metrics.completeness_score(labels, kmeans_model.labels_))
print("V_measure_score : ", metrics.v_measure_score(labels, kmeans_model.labels_))
print("Adjusted rand score : ", metrics.adjusted_rand_score(labels, kmeans_model.labels_))
print("Adjusted_mutual_info_score : ", metrics.adjusted_mutual_info_score(labels, kmeans_model.labels_))
print("Silhouette score : ", metrics.silhouette_score(data, kmeans_model.labels_))
colors = ['green', 'blue', 'purple']
plt.figure(figsize=(12, 8))
plt.scatter(data[:, 0], data[:, 1], c=df['labels'], s=200,
def Output_result(labels_true, labels_pred, name):
    """Print one tab-separated result row: algorithm name, time elapsed
    since the module-level `t0`, then homogeneity, completeness and NMI of
    `labels_pred` against `labels_true`."""
    elapsed = time() - t0
    hom = metrics.homogeneity_score(labels_true, labels_pred)
    com = metrics.completeness_score(labels_true, labels_pred)
    nmi = metrics.normalized_mutual_info_score(labels_true, labels_pred,
                                               average_method='arithmetic')
    print('%-30s\t%.2fs\t%.3f\t%.3f\t%.3f' % (name, elapsed, hom, com, nmi))
def show_metrics(self, truth, k_labels):
    """Print the five standard external clustering metrics for the
    labelling `k_labels` against the ground truth `truth`."""
    report = (
        ("Homogeneity: %0.3f", metrics.homogeneity_score),
        ("Completeness: %0.3f", metrics.completeness_score),
        ("V-measure: %0.3f", metrics.v_measure_score),
        ("Adjusted Rand Index: %0.3f", metrics.adjusted_rand_score),
        ("Adjusted Mutual Information: %0.3f",
         metrics.adjusted_mutual_info_score),
    )
    for fmt, scorer in report:
        print(fmt % scorer(truth, k_labels))
# Benchmark several clustering algorithms via bench_k_means, add a
# GaussianMixture row by hand, then prepare PCA-reduced data for the
# visualization that follows.
bench_k_means(MeanShift(), name='MeanShift', data=data)
bench_k_means(SpectralClustering(n_clusters=n_digits, n_init=10), name="SpectralClustering", data=data)
bench_k_means(AgglomerativeClustering(n_clusters=n_digits), name="AgglomerativeClustering", data=data)
bench_k_means(DBSCAN(), name="DBSCAN", data=data)
t0 = time()
gm = GaussianMixture()
gm.fit(data)
print('%-9s\t%.2fs\t%.3f\t%.3f\t%.3f\t%s\t%s' % ('GaussianMixture', (time() - t0),
      metrics.homogeneity_score(labels, gm.predict(data)),
      metrics.completeness_score(labels, gm.predict(data)),
      metrics.normalized_mutual_info_score(labels, gm.predict(data)),
      labels, gm.predict(data)))
# in this case the seeding of the centers is deterministic, hence we run the
# kmeans algorithm only once with n_init=1
pca = PCA(n_components=n_digits).fit(data)
print(82 * '_')
# #############################################################################
# Visualize the results on PCA-reduced data
reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
#k-means 算法基本步骤: #(1) 从 n个数据对象任意选择k个对象作为初始聚类中心(最终期望聚为k类); #(2) 根据每个聚类对象的均值(中心对象),计算每个对象与这些中心对象的距离;按最小距离重新对相应对象进行划分; #(3) 重新计算每个(有变化)聚类的均值(中心对象); #(4) 计算标准测度函数,当满足一定条件,如函数收敛时,则算法终止;如果条件不满足则回到步骤(2)。 ############################ from sklearn.cluster import KMeans kms = KMeans(n_clusters=3) # initialization 先验知道3种植物,所以设定引力中心为聚合成3类。 #kmeans = KMeans(k=3, init='random') # both parameters are wrong kms.fit(data) # actual execution c = kms.predict(data) from sklearn.metrics import completeness_score, homogeneity_score print completeness_score(t,c) #output:0.764986151449 print homogeneity_score(t,c) #output:0.751485402199 #特别注意!t中只要是3类值就行,不一定非要1,2,3 #当大部分数据点属于一个给定的类并且属于同一个群集,那么完整性得分就趋向于1。 #当所有群集都几乎只包含某个单一类的数据点时同质性得分就趋向于1. figure() subplot(211) # top figure with the real classes plot(data[t==1,0],data[t==1,2],'bo') plot(data[t==2,0],data[t==2,2],'ro') plot(data[t==3,0],data[t==3,2],'go') subplot(212) # bottom figure with classes assigned automatically plot(data[c==1,0],data[c==1,2],'bo',alpha=.5) plot(data[c==2,0],data[c==2,2],'go',alpha=.5)
# Sweep k for k-means on the scaled features, collecting SSE, silhouette
# and the external metrics per k, then plot each curve.
km_vmeasure =[]
km_ami = []
km_homogeneity = []
km_completeness = []
cluster_range = (2,11)  # k = 2..10 inclusive
for i in range(cluster_range[0],cluster_range[1]):
    km = KMeans(n_clusters=i, random_state=0).fit(X_scaled)
    preds = km.predict(X_scaled)
    # negate: KMeans.score() returns the negative inertia
    km_sse.append(-km.score(X_scaled))
    km_silhouette.append(silhouette_score(X_scaled,preds))
    km_vmeasure.append(v_measure_score(y,preds))
    km_ami.append(adjusted_mutual_info_score(y,preds))
    km_homogeneity.append(homogeneity_score(y,preds))
    km_completeness.append(completeness_score(y,preds))
    print(f"Done for cluster {i}")
# ### Plotting various cluster evaluation metrics as function of number of clusters
# In[33]:
plt.figure(figsize=(21,10))
#SSE
plt.subplot(2,3,1)
plt.plot([i for i in range(cluster_range[0],cluster_range[1])],km_sse,'b-o',linewidth=3,markersize=12)
plt.grid(True)
plt.title("SSE score vs. number of clusters",fontsize=15)
-1] == 1].shape[0]) / clus.shape[0] h**o = [] comp = [] v_mea = [] sil = [] man = [] numPoints = 8 for i in range(2, numPoints): ipca = decomposition.FastICA(n_components=6, whiten=True) X_new = ipca.fit_transform(X, y) gm = mixture.GMM(n_components=i, covariance_type='diag') gm.fit(X_new) y_pred = gm.predict(X_new) h**o.append(metrics.homogeneity_score(y, y_pred)) comp.append(metrics.completeness_score(y, y_pred)) v_mea.append(metrics.v_measure_score(y, y_pred)) sil.append( metrics.silhouette_score(projected_data, y_pred, metric='euclidean')) man.append( metrics.silhouette_score(projected_data, y_pred, metric='manhattan')) x = xrange(2, numPoints) fig = plt.figure() plt.plot(x, h**o, label='homogeneity score') plt.plot(x, comp, label='completeness score') plt.plot(x, v_mea, label='v measure score') plt.plot(x, sil, label='Silhouette Score euclidean') plt.plot(x, man, label='Silhouette Score manhattan') plt.legend(loc='upper right', shadow=True) plt.show()
# Train a K-Medoids classifier (k=2) on the normalized Census data, print
# external clustering metrics on the training labels, then load the test
# split and predict cluster assignments for it.
classifier = KMedoids(dataset_norm, list_label, 2)
classifier.traindata(200)
list_clustered = classifier.classes
print(
    "---------------- K-MEDOIDS SCORE USING DATA TRAIN ------------------------"
)
print("ARI SCORE: " + str(
    adjusted_rand_score(np.array(list_label), np.array(list_clustered))))
print("MUTUAL INFO SCORE: " + str(
    adjusted_mutual_info_score(np.array(list_label),
                               np.array(list_clustered))))
print(
    "HOMOGENEITY SCORE: " +
    str(homogeneity_score(np.array(list_label), np.array(list_clustered))))
print("COMPLETENESS SCORE: " + str(
    completeness_score(np.array(list_label), np.array(list_clustered))))
print("V MEASURE SCORE: " +
      str(v_measure_score(np.array(list_label), np.array(list_clustered))))
print("FOWLKES-MALLOWS SCORE: " + str(
    fowlkes_mallows_score(np.array(list_label), np.array(list_clustered))))
# print("SILHOUETTE SCORE: " + str(silhouette_score(np.array(dataset_norm), np.array(list_label), metric="euclidean")))
# NOTE(review): Calinski-Harabasz below is computed on the *true* labels,
# not the clustering -- confirm that is intended.
print("CALINSKI-HARABAZ SCORE: " + str(
    calinski_harabaz_score(np.array(dataset_norm), np.array(list_label))))
datatest = utils.create_list_dataset("CencusIncome.test.txt")
datatest_norm = utils.normalize_attr(datatest)
list_label_test = utils.create_list_label("CencusIncome.test.txt")
list_clustered_test = []
for instance in datatest_norm:
    list_clustered_test.append(classifier.predict(instance))
print(
clusters = [x.strip() for x in f.readlines()]
# Build true/predicted label lists from the cluster file (one comma-
# separated cluster of image names per line), print a per-camera
# contingency table, then a table of external clustering metrics.
for i, cluster in enumerate(clusters):
    for image in cluster.split(','):
        true_labels += [
            s0 for (s0, s1) in cameras if image.startswith(s1)
        ]
        pred_labels.append(i)
# Table as in reference papers
x = PrettyTable()
x.field_names = ['Model'] + list(range(1, len(clusters) + 1))
for _, camera in cameras:
    l = []
    for i, cluster in enumerate(clusters):
        l.append(0)
        for image in cluster.split(','):
            if image.startswith(camera):
                l[i] += 1
    x.add_row([camera] + l)
print(x)
#Table with erros
y = PrettyTable()
y.field_names = [
    'Error', 'ARI', 'MIBS', 'Homogeinity', 'Completeness', 'V-Measure',
    'Fowlkes-Mallows'
]
# NOTE(review): ARI is called with (pred, true) while the others use
# (true, pred); ARI is symmetric so the value is unaffected.
y.add_row(['', metrics.adjusted_rand_score(pred_labels, true_labels), metrics.mutual_info_score(true_labels, pred_labels), metrics.homogeneity_score(true_labels, pred_labels), \
    metrics.completeness_score(true_labels, pred_labels),metrics.v_measure_score(true_labels, pred_labels),metrics.fowlkes_mallows_score(true_labels, pred_labels)])
print(y)
def completeness(self, labels):
    """Completeness of the fitted model's clustering with respect to
    `labels`, returned as a plain Python float."""
    predicted = self.model.labels_
    score = metrics.completeness_score(labels, predicted)
    return float(score)
from sklearn.metrics import classification_report #print(classification_report(classifier.predict(test), t_test, target_names=['setosa', 'versicolor', 'virginica'])) from sklearn.model_selection import cross_val_score #scores = cross_val_score(classifier, data, t, cv=6) #print(scores) #from numpy import mean #print(mean(scores)) from sklearn.cluster import KMeans kmeans = KMeans(n_clusters=3, init='random')# initialization kmeans.fit(data)# actual execution c = kmeans.predict(data) from sklearn.metrics import completeness_score, homogeneity_score print(completeness_score(t, c)) print(homogeneity_score(t, c)) from pylab import subplot, plot, show, figure figure() subplot(211) # top figure with the real classes plot(data[t==1,0],data[t==1,2],'bo') plot(data[t==2,0],data[t==2,2],'ro') plot(data[t==3,0],data[t==3,2],'go') subplot(212) # bottom figure with classes assigned automatically show()
y_upper = y_lower + ct_values.shape[0]
color = cm.Accent(float(t) / n)
ax[mapping[i]].fill_betweenx(np.arange(y_lower, y_upper), 0,
                             ct_values,
                             facecolor=color,
                             edgecolor=color)
y_lower = y_upper + 20  # gap between silhouette bands
plt.show()
# Compute the other metrics for K=2
km = KMeans(n_clusters=2, max_iter=1000, random_state=1000)
Y_pred = km.fit_predict(cdf)
df_km = pd.DataFrame(Y_pred, columns=['prediction'], index=cdf.index)
kmdff = pd.concat([dff, df_km], axis=1)
# External metrics of the K=2 clustering against the diagnosis column.
print('Completeness: {}'.format(completeness_score(kmdff['diagnosis'], kmdff['prediction'])))
print('Homogeneity: {}'.format(homogeneity_score(kmdff['diagnosis'], kmdff['prediction'])))
print('Adj. Mutual info: {}'.format(adjusted_mutual_info_score(kmdff['diagnosis'], kmdff['prediction'])))
print('Adj. Rand score: {}'.format(adjusted_rand_score(kmdff['diagnosis'], kmdff['prediction'])))
# Perform a K-Means clustering with K=8
km = KMeans(n_clusters=8, max_iter=1000, random_state=1000)
Y_pred = km.fit_predict(cdf)
df_km = pd.DataFrame(Y_pred, columns=['prediction'], index=cdf.index)
kmdff = pd.concat([dff, df_km], axis=1)
# Show the result
fig, ax = plt.subplots(figsize=(18, 11))
with sns.plotting_context("notebook", font_scale=1.5):
def _print_clustering_scores(y_true, y_pred):
    """Print the label-based clustering scores for one (truth, prediction) pair."""
    print("ARI SCORE: " + str(adjusted_rand_score(y_true, y_pred)))
    print("MUTUAL INFO SCORE: " + str(adjusted_mutual_info_score(y_true, y_pred)))
    print("HOMOGENEITY SCORE: " + str(homogeneity_score(y_true, y_pred)))
    print("COMPLETENESS SCORE: " + str(completeness_score(y_true, y_pred)))
    print("V MEASURE SCORE: " + str(v_measure_score(y_true, y_pred)))
    print("FOWLKES-MALLOWS SCORE: " + str(fowlkes_mallows_score(y_true, y_pred)))
    # print("SILHOUETTE SCORE: ...")  # left disabled, as in the original


def test_using_sklearn(label_true, label_true_test, dataset, datatest):
    """Fit a 2-cluster scikit-learn KMeans on `dataset` and print clustering
    scores for both the training data and the held-out `datatest` split.

    Parameters
    ----------
    label_true, label_true_test : array-like of ground-truth labels for the
        train and test splits respectively.
    dataset, datatest : array-like feature matrices for train and test.

    Returns
    -------
    None (all results are printed).
    """
    X = numpy.array(dataset)
    kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
    cluster_train = kmeans.labels_
    cluster_test = kmeans.predict(numpy.array(datatest))

    # Evaluation for Full Training
    print(
        "\n------------------------ SCIKIT LEARN --------------------------------"
    )
    print(
        "--------------- K-MEANS SCORE USING DATA TRAIN -----------------------"
    )
    _print_clustering_scores(numpy.array(label_true), numpy.array(cluster_train))
    # NOTE(review): `calinski_harabaz_score` is the old (misspelled) sklearn
    # name, renamed `calinski_harabasz_score` in sklearn >= 0.23 — kept here
    # for compatibility with the sklearn version this file targets.
    # Also note it is evaluated against the *true* labels, mirroring the
    # original code, rather than the k-means assignments.
    print("CALINSKI-HARABAZ SCORE: " + str(
        calinski_harabaz_score(numpy.array(dataset), numpy.array(label_true))))

    # Evaluation for Split Validation
    print(
        "--------------- K-MEANS SCORE USING DATA TEST -----------------------"
    )
    _print_clustering_scores(numpy.array(label_true_test), numpy.array(cluster_test))
    print("CALINSKI-HARABAZ SCORE: " + str(
        calinski_harabaz_score(numpy.array(datatest), numpy.array(label_true_test))))
    return None
""" """ Measuring Performance of K-means Homogeneity and Completeness – If you have pre-existing class labels that you’re trying to duplicate with k-means clustering, you can use two measures: homogeneity and completeness. Homogeneity means all of the observations with the same class label are in the same cluster. Completeness means all members of the same class are in the same cluster. Scikit-Learn (Python) has an excellent write-up on these two measures. """ #We can turn those concept as scores homogeneity_score and completeness_score. Both are bounded below by 0.0 and above by 1.0 (higher is better): from sklearn import metrics labels_true = [0, 0, 0, 1, 1, 1] labels_pred = [0, 0, 1, 1, 2, 2] metrics.homogeneity_score(labels_true, labels_pred) metrics.completeness_score(labels_true, labels_pred) #Their harmonic mean called V-measure is computed by v_measure_score metrics.v_measure_score(labels_true, labels_pred) #All calculated together metrics.homogeneity_completeness_v_measure(labels_true, labels_pred) #https://scikit-learn.org/stable/modules/clustering.html#clustering-performance-evaluation #http://www.learnbymarketing.com/methods/k-means-clustering/
# Final cluster centres for each fitted model variant, keyed like `labels`.
cluster_centers = {
    'X': kmeans_X.cluster_centers_,
    'X_scaled': kmeans_X_scaled.cluster_centers_,
    'total_data': kmeans_total_data.cluster_centers_,
    'total_data_scaled': kmeans_total_data_scaled.cluster_centers_,
}

# Fill the evaluation table: one row of rounded scores per model variant.
for model_name in metrics_report.keys():
    pred = labels[model_name]          # cluster assignments for this variant
    row = metrics_report[model_name]   # mutate the report entry in place
    row['ARI'] = round(metrics.adjusted_rand_score(y, pred), 2)
    row['AMI'] = round(metrics.adjusted_mutual_info_score(y, pred), 2)
    row['homogeneity'] = round(metrics.homogeneity_score(y, pred), 2)
    row['completeness'] = round(metrics.completeness_score(y, pred), 2)
    row['v_measure'] = round(metrics.v_measure_score(y, pred), 2)
    row['silhouette'] = round(metrics.silhouette_score(X, pred), 2)
    row['accuracy'] = round(metrics.accuracy_score(y, pred) * 100, 2)

print(metrics_report)

#visualizing - clustering of X_scaled dataset
plt.scatter(X_scaled[kmeans_X_scaled.labels_ == 1, 4],
            X_scaled[kmeans_X_scaled.labels_ == 1, 8],
            s=20, c='blue', label='Cluster 1')