def test_KMeans_scores(self):
    """Check that ModelFrame clustering metrics agree with plain scikit-learn.

    The digits data is scaled both directly and through the ModelFrame
    accessor, a KMeans estimator with identical settings is fitted on each
    side, and every metric exposed by ``df.metrics`` is compared with the
    value computed from the reference estimator's ``labels_``.
    """
    digits = datasets.load_digits()
    df = pdml.ModelFrame(digits)

    # Scaling through the accessor must reproduce the plain function result.
    scaled = pp.scale(digits.data)
    df.data = df.data.pp.scale()
    self.assert_numpy_array_almost_equal(df.data.values, scaled)

    kmeans_kwargs = dict(init='k-means++', n_clusters=10, n_init=10,
                         random_state=self.random_state)
    clf1 = cluster.KMeans(**kmeans_kwargs)
    clf2 = df.cluster.KMeans(**kmeans_kwargs)
    clf1.fit(scaled)
    df.fit_predict(clf2)

    # Label-based metrics, checked in the original order (homogeneity is
    # deliberately listed twice because the original sequence checks it twice).
    label_metrics = (
        'homogeneity_score',
        'completeness_score',
        'v_measure_score',
        'adjusted_rand_score',
        'homogeneity_score',
    )
    for metric_name in label_metrics:
        expected = getattr(m, metric_name)(digits.target, clf1.labels_)
        self.assertEqual(getattr(df.metrics, metric_name)(), expected)

    # The silhouette needs the feature matrix and is compared approximately.
    silhouette_kwargs = dict(metric='euclidean', sample_size=300,
                             random_state=self.random_state)
    expected = m.silhouette_score(scaled, clf1.labels_, **silhouette_kwargs)
    result = df.metrics.silhouette_score(**silhouette_kwargs)
    self.assertAlmostEqual(result, expected)
def main():
    """Run the digits experiment: a clustering model, then ``model_SVC``.

    Every step is delegated to a helper defined elsewhere; this function only
    wires their inputs and outputs together.
    """
    digits = datasets.load_digits()

    # Optional inspection helpers, currently switched off:
    #   print_digit_data(digits), plot_training_data(digits),
    #   plot_target_data(digits), show_PCA_training(digits)

    # ---- first model: clustering ------------------------------------
    prepared = preprocess_data(digits)
    X_train, X_test, y_train, y_test = split_data_into_training_and_test(
        prepared, digits)
    cluster_model = cluser_digits(X_train)
    # show_cluster_digits(cluster_model) can be enabled for inspection.
    cluster_pred = predict_labels(cluster_model, X_test, y_test,
                                  X_train, y_train)
    show_prediction_confusion_matrix(y_test, cluster_pred)
    homogeneity_score(cluster_model, X_test, y_test, X_train, y_train,
                      cluster_pred)

    # ---- second model ------------------------------------------------
    # model_SVC returns its own split, which replaces the one used above.
    (svc_model, X_train, X_test, y_train, y_test,
     images_train, images_test) = model_SVC(digits)

    # grid_search is used to tune parameters.
    grid_search(digits)
    # NOTE(review): this passes the clustering model together with the
    # split returned by model_SVC -- confirm that mix is intended.
    apply_grid_search(cluster_model, X_test, y_test, X_train, y_train)

    svc_pred = classify_rbf(svc_model, X_test, y_test, images_test)
    check_model_performance(y_test, svc_pred)
    show_model2_results(svc_model, X_train, y_train)
def aggregate_stats(infiles, outfile):
    """Combine all the aggstats into a single file and compute summary statistics.

    Each input is a pickle holding a dict with a ``'df'`` DataFrame and a
    ``'meta'`` dict that contains a ``'neurons'`` table. For every input the
    metadata returned by ``extract_metadata`` is broadcast into columns,
    clustering scores are added per row, and the resulting frames are
    concatenated and pickled to ``outfile``.

    Args:
        infiles: iterable of paths to the per-run pickle files.
        outfile: path the concatenated DataFrame is pickled to.

    Raises:
        ValueError: from ``pandas.concat`` when every input was skipped.
    """
    res = []
    for infile in infiles:
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        # Binary mode is required for pickle data; 'with' closes the handle.
        with open(infile, 'rb') as fid:
            d = pickle.load(fid)
        print("The file is %s" % (infile,))

        assigndf = d['df']
        neurons = d['meta']['neurons']

        m = extract_metadata(infile)
        if len(m) == 0:
            # skip the non-replicated ones
            continue
        for k, v in m.items():
            assigndf[k] = v

        # Snapshot of the roles taken BEFORE missing roles are filled below.
        assigndf['true_assign_role'] = [np.array(neurons['role'])
                                        for _ in range(len(assigndf))]

        # Canonicalize each learned assignment once instead of once per metric.
        # assumes canonicalize_assignment is a pure function -- TODO confirm
        canonical = [irm.util.canonicalize_assignment(a)
                     for a in assigndf['assign']]
        true_assign = list(assigndf['true_assign'])

        def _against_truth(metric):
            """Score every row's canonical assignment against its true one."""
            return [metric(t, c) for t, c in zip(true_assign, canonical)]

        assigndf['ari'] = _against_truth(metrics.adjusted_rand_score)
        assigndf['homogeneity'] = _against_truth(metrics.homogeneity_score)
        assigndf['completeness'] = _against_truth(metrics.completeness_score)

        # Neurons without a role are relabelled 'I' before role-based scoring.
        neurons.loc[neurons['role'].isnull(), 'role'] = 'I'
        roles = neurons['role']

        def _against_roles(metric):
            """Score every row's canonical assignment against the roles."""
            return [metric(roles, c) for c in canonical]

        assigndf['role_ari'] = _against_roles(metrics.adjusted_rand_score)
        assigndf['role_homogeneity'] = _against_roles(metrics.homogeneity_score)
        assigndf['role_completeness'] = _against_roles(metrics.completeness_score)

        assigndf['type_n_true'] = [len(np.unique(t)) for t in true_assign]
        assigndf['type_n_learned'] = [len(np.unique(a))
                                      for a in assigndf['assign']]

        assigndf['auc'] = [
            metrics.roc_auc_score(truth, prob)
            for truth, prob in zip(assigndf['heldout_link_truth'],
                                   assigndf['heldout_link_predprob'])
        ]

        res.append(assigndf)

    alldf = pandas.concat(res)
    with open(outfile, 'wb') as fid:
        pickle.dump(alldf, fid, -1)
def kmeans(input_file, n_clusters, Output):
    """Cluster a CSV data set with K-means, then print, save and plot the result.

    The first loaded column is used as the reference label ``y`` and the
    following columns as features (the file's last column is not loaded).
    Scores are printed, written to ``Output + "kmeans_scores.txt"`` together
    with the per-sample assignment, and a scatter plot of the first two
    features is saved to ``Output + "kmeans.png"``.

    Args:
        input_file: path of the comma-separated input file.
        n_clusters: number of clusters to fit.
        Output: prefix (typically a directory ending in a separator) that the
            output file names are appended to.
    """
    lvltrace.lvltrace("LVLEntree dans kmeans unsupervised")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    y = data[:, 0]
    sample_size = X.shape[0]

    k_means = cluster.KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
    k_means.fit(X)
    labels = k_means.labels_
    centers = k_means.cluster_centers_

    # Each score is computed once and reused for the console and the file
    # (the silhouette in particular is expensive).
    scores = [
        ('homogeneity_score', "Homogeneity Score",
         metrics.homogeneity_score(y, labels)),
        ('completeness_score', "Completeness Score",
         metrics.completeness_score(y, labels)),
        ('v_measure_score', "V-Measure",
         metrics.v_measure_score(y, labels)),
        ('adjusted_rand_score', "The adjusted Rand index",
         metrics.adjusted_rand_score(y, labels)),
        ('adjusted_mutual_info_score', "Adjusted Mutual Information",
         metrics.adjusted_mutual_info_score(y, labels)),
        ('silhouette_score', "Silhouette Score",
         metrics.silhouette_score(X, labels, metric='euclidean',
                                  sample_size=sample_size)),
    ]

    banner = "#########################################################################################################\n"
    print(banner)
    print("K-MEANS\n")
    for short_name, _, value in scores:
        print('%s: %f' % (short_name, value))
    print('\n')
    print(banner)

    results = Output + "kmeans_scores.txt"
    with open(results, "w") as out:
        out.write("K-Means Scores\n")
        for _, title, value in scores:
            out.write("%s: %f\n" % (title, value))
        out.write("\n")
        out.write("True Value, Cluster numbers, Iteration\n")
        for n, (truth, label) in enumerate(zip(y, labels), 1):
            out.write("%f, %f, %i\n" % (truth, label, n))

    # plot the results along with the labels
    fig, ax = plt.subplots()
    im = ax.scatter(X[:, 0], X[:, 1], c=labels, marker='.')
    for center in centers:
        # The former 'w' format string was overridden by color='b' anyway.
        ax.plot(center[0], center[1], color='b', marker='x', markersize=6)
    fig.colorbar(im)
    plt.title("Number of clusters: %i" % n_clusters)
    plt.savefig(Output + "kmeans.png")
    lvltrace.lvltrace("LVLsortie dans kmeans unsupervised")
def clustering(dataset):
    """Cluster ``dataset.X`` with K-means and print quality scores and top terms.

    Args:
        dataset: object exposing ``vectorizer``, ``X``, ``n_classes`` and
            ``target`` (the reference labels).
    """
    vectorizer = dataset.vectorizer
    X = dataset.X
    true_k = dataset.n_classes
    labels = dataset.target

    km = cluster.KMeans(n_clusters=true_k, max_iter=100, n_init=1)
    print("Clustering sparse data with %s" % km)
    t0 = time()
    km.fit(X)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
    print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index: %.3f"
          % metrics.adjusted_rand_score(labels, km.labels_))
    # The silhouette judges the clustering that was found, so it must use the
    # predicted labels (the reference labels were passed here by mistake).
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, km.labels_, sample_size=1000))
    print()

    print("Top terms per cluster:")
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    # Newer scikit-learn vectorizers expose get_feature_names_out; fall back
    # to the older accessor when it is not available.
    get_terms = getattr(vectorizer, 'get_feature_names_out', None)
    if get_terms is None:
        get_terms = vectorizer.get_feature_names
    terms = get_terms()
    sizes = np.sum(km.labels_[:, np.newaxis] == np.arange(true_k), axis=0)
    for i in range(true_k):
        print("Cluster %d (%d):" % (i, sizes[i]), end='')
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind], end='')
        print()
def kmeans_setup(data):
    """Fit a KMeans estimator on ``data`` and return it.

    The initialisation is taken from PCA components when the module-level
    flag ``pca_f`` equals 1, otherwise 'k-means++' is used. When the
    module-level ``debug`` flag equals True, one line of timing and quality
    scores (against the module-level ``labels``) is printed.
    """
    if pca_f == 1:
        initializer = PCA(n_components=num_clusters).fit(data).components_
        name = 'PCA'
    else:
        initializer = 'k-means++'
        name = 'k-means++'

    t0 = time()
    estimator = KMeans(init=initializer, n_clusters=num_clusters,
                       n_init=num_init, max_iter=num_iterations)
    estimator.fit(data)

    if not (debug == True):
        return estimator

    # Elapsed time and inertia are read before the scores are computed, which
    # keeps the timing free of the scoring cost.
    row = [name, time() - t0, estimator.inertia_]
    predicted = estimator.labels_
    for scorer in (metrics.homogeneity_score,
                   metrics.completeness_score,
                   metrics.v_measure_score,
                   metrics.adjusted_rand_score,
                   metrics.adjusted_mutual_info_score):
        row.append(scorer(labels, predicted))
    row.append(metrics.silhouette_score(data, predicted, metric='euclidean',
                                        sample_size=300))
    print('% 9s %.2fs %i %.3f %.3f %.3f %.3f %.3f %.3f' % tuple(row))
    return estimator
def affin_test():
    """Run Affinity Propagation on a subsample of 'traindata.pkl' and print scores.

    The pickle is expected to hold a 3-tuple whose first two items are the
    feature matrix and the labels. Only the training part of a
    ``test_size=0.9`` split is clustered.
    """
    # NOTE: unpickling executes arbitrary code; the file must be trusted.
    # 'with' guarantees the handle is closed even if loading fails.
    with open('traindata.pkl', 'rb') as savefile:
        x_train, y_train, _ = cPickle.load(savefile)

    x_train, _, y_train, _ = cross_validation.train_test_split(
        x_train, y_train, test_size=0.9, random_state=42)
    labels_true = y_train

    x_train = StandardScaler().fit_transform(x_train)
    af = AffinityPropagation(preference=-50).fit(x_train)
    labels = af.labels_
    n_clusters_ = len(af.cluster_centers_indices_)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(x_train, labels, metric='sqeuclidean'))
def run(self):
    """Fit KMeans for every k in ``self.clusters`` and collect quality measures.

    For each k the mean distance of the samples to their nearest centre and
    the homogeneity, completeness and adjusted Rand scores against ``self.y``
    are recorded. When k equals ``self.targetcluster`` and ``self.stats`` is
    set, the labelled data is dumped to "cluster.csv" and a per-cluster
    summary is printed. The collected series are passed to ``self.plot`` when
    ``self.gen_plot`` is set.
    """
    meandist = []
    homogeneity_scores = []
    completeness_scores = []
    rand_scores = []
    # NOTE(review): nothing is ever appended here, so self.plot receives an
    # empty silhouette series -- confirm whether a score should be collected.
    silhouettes = []

    for k in self.clusters:
        model = KMeans(n_clusters=k, max_iter=5000, init='k-means++')
        labels = model.fit_predict(self.X)

        if k == self.targetcluster and self.stats:
            # Columns: features..., predicted cluster, target value.
            nd_data = np.concatenate(
                (self.X,
                 np.expand_dims(labels, axis=1),
                 np.expand_dims(self.y, axis=1)),
                axis=1)
            pd_data = pd.DataFrame(nd_data)
            pd_data.to_csv("cluster.csv", index=False, index_label=False,
                           header=False)
            print(model.cluster_centers_)
            # Summarise every fitted cluster; this used to be hard-coded to
            # three clusters regardless of k.
            for i in range(k):
                print("Cluster {}".format(i))
                members = pd_data.loc[pd_data.iloc[:, -2] == i].iloc[:, -2:]
                size = members.shape[0]
                print(size)
                if size == 0:
                    # No members: the class fractions are undefined.
                    continue
                targets = members.iloc[:, -1]
                print(float((targets == 0).sum()) / size)
                print(float((targets == 1).sum()) / size)

        nearest = np.min(cdist(self.X, model.cluster_centers_, 'euclidean'),
                         axis=1)
        meandist.append(sum(nearest) / self.X.shape[0])
        homogeneity_scores.append(metrics.homogeneity_score(self.y, labels))
        completeness_scores.append(metrics.completeness_score(self.y, labels))
        rand_scores.append(metrics.adjusted_rand_score(self.y, labels))

    if self.gen_plot:
        self.plot(meandist, homogeneity_scores, completeness_scores,
                  rand_scores, silhouettes)
def bench_k_means(estimator, name, data, target_labels, sample_size):
    """Fit a K-Means estimator and print one tab-separated line of results.

    The line holds the name, the elapsed time, the inertia, five label-based
    clustering scores, the silhouette coefficient and the train accuracy
    returned by ``compute_residuals_and_rsquared``.

    ARGS
        estimator: K-Means clustering algorithm <sklearn.cluster.KMeans>
        name: estimator name <str>
        data: array-like or sparse matrix, shape=(n_samples, n_features)
        target_labels: labels of data points <number array>
        sample_size: size of the sample to use when computing the
            Silhouette Coefficient <int>
    """
    t0 = time()
    estimator.fit(data)
    predicted = estimator.labels_
    _, _, train_accuracy = compute_residuals_and_rsquared(predicted,
                                                          target_labels)

    # The elapsed time is taken after the accuracy computation, before scoring.
    row = [name, time() - t0, estimator.inertia_]
    label_scorers = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    row.extend(scorer(target_labels, predicted) for scorer in label_scorers)
    row.append(metrics.silhouette_score(data, predicted, metric='euclidean',
                                        sample_size=sample_size))
    row.append(train_accuracy)
    print('% 9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % tuple(row))
def bench_k_means(estimator, name, data, sample_size, labels,postIds):
    """Reduce ``data`` with LSA, fit ``estimator`` and export the clusters.

    The data is converted to CSR, projected to 500 components with
    TruncatedSVD and normalised before fitting. One line of scores against
    ``labels`` is printed, then the user file is parsed and the clusters
    built from ``postIds`` are written out by the project helpers.
    ``sample_size`` is accepted but not used.
    """
    matrix = sparse.csr_matrix(data)
    t0 = time()
    print("Performing dimensionality reduction using LSA")
    # The clock is restarted here, so reported times start at the LSA step.
    t0 = time()
    reduced = TruncatedSVD(500).fit_transform(matrix)
    reduced = Normalizer(copy=False).fit_transform(reduced)
    print("done in %fs" % (time() - t0))
    print()

    estimator.fit(reduced)

    row = [name, time() - t0, estimator.inertia_]
    for scorer in (metrics.homogeneity_score,
                   metrics.completeness_score,
                   metrics.v_measure_score,
                   metrics.adjusted_rand_score,
                   metrics.adjusted_mutual_info_score):
        row.append(scorer(labels, estimator.labels_))
    print('% 9s %.2fs %i %.3f %.3f %.3f %.3f %.3f ' % tuple(row))

    print("Parsing USer File:")
    parseUserFile()
    print("extracting User File:")
    clusterDict = extractCluster(postIds, estimator.labels_)
    print("writing Cluster Data to File")
    writeCluterToFile(clusterDict)
def main(argv):
    """Cluster word vectors and write the scores and per-word labels to a file.

    The output file name is built from the input file's base name and the
    run parameters, joined with dots, and is created in the working
    directory.

    Args:
        argv: command-line arguments, forwarded to ``get_arguments``.
    """
    (file_vectors, clust_type, clusters,
     distance, cluster_param, std) = get_arguments(argv)
    fname = '.'.join(map(str, [file_vectors.split('/')[-1], clust_type,
                               clusters, distance, cluster_param, std]))
    ## better to put in EX1, EX2, .. folders

    print('clustering: %s' % (clust_type,))
    print('clusters: %s' % (clusters,))
    print('cluster_param: %s' % (cluster_param,))
    print('std: %s' % (std,))

    X, words, truth = load_data(file_vectors, True)
    X = np.array(X)

    if clust_type == 'affin':
        # NOTE(review): bool() of any non-empty string is True, so a textual
        # "False"/"0" would still enable std -- confirm what get_arguments
        # returns for this value.
        labels = affin_sclustering(X, n_clust=int(clusters), distance=distance,
                                   gamma=float(cluster_param), std=bool(std))
    else:
        labels = knn_sclustering(X, n_clust=int(clusters),
                                 k=int(cluster_param))

    # Build the whole report first so the file is only created once the
    # clustering and scoring have succeeded.
    report = [
        '\nVMeas:' + str(v_measure_score(truth, labels)),
        '\nRand:' + str(adjusted_rand_score(truth, labels)),
        '\nHomogen:' + str(homogeneity_score(truth, labels)) + '\n',
    ]
    report.extend(word + ' : ' + str(label) + '\n'
                  for word, label in zip(words, labels))

    # 'with' closes the file even if writing fails part-way.
    with open(fname, 'w') as writer:
        writer.writelines(report)
def bench_k_means(estimator, data, labels):
    """Fit ``estimator`` on ``data``, print its clustering scores and return them.

    Args:
        estimator: clustering estimator exposing ``fit`` and ``labels_``.
        data: feature matrix to cluster.
        labels: reference labels used by the supervised scores.

    Returns:
        list: ``[homogeneity, completeness, v_measure, adjusted_rand,
        adjusted_mutual_info, silhouette]``.
    """
    t0 = time()
    estimator.fit(data)
    print("time to fit: {:.5}".format(time() - t0))

    predicted = estimator.labels_
    homogenity = metrics.homogeneity_score(labels, predicted)
    completeness = metrics.completeness_score(labels, predicted)
    v_measure = metrics.v_measure_score(labels, predicted)
    print("homogenity {:.5}, completeness {:.5}, v_measure_score {:.5}".format(
        homogenity, completeness, v_measure))

    adj_rand_score = metrics.adjusted_rand_score(labels, predicted)
    print("adjusted_rand_score {:.5}".format(adj_rand_score))

    adj_mutual_info_score = metrics.adjusted_mutual_info_score(labels,
                                                               predicted)
    print("adjusted_mutual_info_score {:.5}".format(adj_mutual_info_score))

    # Computed once and reused for the printout; the score needs pairwise
    # distances and used to be evaluated a second time just for printing.
    silhouette_score = metrics.silhouette_score(data, predicted,
                                                metric='euclidean')
    print("silhouette_score {:.5}".format(silhouette_score))

    return [
        homogenity,
        completeness,
        v_measure,
        adj_rand_score,
        adj_mutual_info_score,
        silhouette_score,
    ]
def cluster(Z, K=4, algo='kmeans'):
    """Impute missing values in ``Z`` and cluster the result.

    Args:
        Z: feature table passed to ``Imputer().fit_transform``.
        K: number of clusters for the ``'kmeans'`` algorithm.
        algo: ``'dbscan'`` or ``'kmeans'``.

    Returns:
        The fitted estimator (``DBSCAN`` or ``KMeans``). ``None`` is returned
        for any other ``algo`` value.
    """
    X = Imputer().fit_transform(Z)

    if algo == 'dbscan':
        db = DBSCAN(eps=0.3, min_samples=10).fit(X)
        labels = db.labels_

        # Number of clusters in labels, ignoring noise (-1) if present.
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        print('Estimated number of clusters: %d' % n_clusters_)

        # NOTE(review): labels_true is neither a parameter nor a local; it
        # must exist at module level or the lines below raise NameError.
        print("Homogeneity: %0.3f"
              % metrics.homogeneity_score(labels_true, labels))
        print("Completeness: %0.3f"
              % metrics.completeness_score(labels_true, labels))
        print("V-measure: %0.3f"
              % metrics.v_measure_score(labels_true, labels))
        print("Adjusted Rand Index: %0.3f"
              % metrics.adjusted_rand_score(labels_true, labels))
        print("Adjusted Mutual Information: %0.3f"
              % metrics.adjusted_mutual_info_score(labels_true, labels))
        print("Silhouette Coefficient: %0.3f"
              % metrics.silhouette_score(X, labels))
        # Return the fitted model, matching what the k-means branch does
        # (this branch used to fall through and return None).
        return db

    if algo == 'kmeans':
        km = KMeans(n_clusters=K)
        km.fit(X)
        print(km.labels_)
        return km

    return None
def cluster_evaluation(D, y_true, n_clusters, eps=0.8, min_samples=10):
    """Cluster a distance matrix with Ward linkage and print quality scores.

    ``D`` is turned into a similarity matrix (``1 - D``) for fitting, while
    the silhouette is computed on ``D`` itself as a precomputed distance.
    ``eps`` and ``min_samples`` belong to a disabled DBSCAN variant and are
    not used.
    """
    labels_true = y_true

    # transform distance matrix into a similarity matrix
    S = 1 - D

    model = Ward(n_clusters=n_clusters).fit(S)
    labels = model.labels_

    # number of clusters in labels, ignoring noise (-1) if present
    noise_offset = 1 if -1 in labels else 0
    n_clusters_ = len(set(labels)) - noise_offset
    print('Number of clusters: %d' % n_clusters_)

    supervised = (
        ('Homogeneity: %0.3f', metrics.homogeneity_score),
        ('Completeness: %0.3f', metrics.completeness_score),
        ('V-meassure: %0.3f', metrics.v_measure_score),
        ('Adjusted Rand Index: %0.3f', metrics.adjusted_rand_score),
        ('Adjusted Mutual Information: %0.3f',
         metrics.adjusted_mutual_info_score),
    )
    for template, scorer in supervised:
        print(template % scorer(labels_true, labels))

    print('Silhouette Coefficient: %0.3f'
          % metrics.silhouette_score(D, labels, metric='precomputed'))
def predictAffinityPropagation(X, labels_true):
    """Fit Affinity Propagation on ``X``, print scores and plot the clusters.

    The scores compare the found labels with ``labels_true``. The plot uses
    the first two columns of ``X`` and draws a line from every sample to the
    exemplar of its cluster.
    """
    af = AffinityPropagation(preference=-50).fit(X)
    exemplar_indices = af.cluster_centers_indices_
    labels = af.labels_
    n_clusters_ = len(exemplar_indices)

    print('Estimated number of clusters: %d' % n_clusters_)
    supervised = (
        ("Homogeneity: %0.3f", metrics.homogeneity_score),
        ("Completeness: %0.3f", metrics.completeness_score),
        ("V-measure: %0.3f", metrics.v_measure_score),
        ("Adjusted Rand Index: %0.3f", metrics.adjusted_rand_score),
        ("Adjusted Mutual Information: %0.3f",
         metrics.adjusted_mutual_info_score),
    )
    for template, scorer in supervised:
        print(template % scorer(labels_true, labels))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels, metric='sqeuclidean'))

    plt.close('all')
    plt.figure(1)
    plt.clf()

    palette = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_), palette):
        in_cluster = labels == k
        exemplar = X[exemplar_indices[k]]
        center_x, center_y = exemplar[0], exemplar[1]
        plt.plot(X[in_cluster, 0], X[in_cluster, 1], col + '.')
        plt.plot(center_x, center_y, 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=14)
        # Connect every member of the cluster to its exemplar.
        for point in X[in_cluster]:
            plt.plot([center_x, point[0]], [center_y, point[1]], col)

    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()
def compare(method1, method2, fig=False):
    """Print agreement scores between the Ward clusterings of two embeddings.

    The 2-D arrays of both methods are loaded from
    ``'<species>_<method>_X_2d.npy'`` (``species`` is a module-level name).
    For 2 to 5 clusters both embeddings are clustered and the two labelings
    are compared. With ``fig`` set, the 3-cluster labelings are additionally
    plotted on the other method's embedding.
    """
    X1 = np.load('{0}_{1}_X_2d.npy'.format(species, method1))
    X2 = np.load('{0}_{1}_X_2d.npy'.format(species, method2))
    print('n_cluster\tHomo\tCompl\tNMI\tARI')

    for i in range(2, 6):
        clusterers = [
            Clustering(species, method, X, None, n_clusters=i)
            for method, X in ((method1, X1), (method2, X2))
        ]
        for clusterer in clusterers:
            clusterer.agglomerative(linkage='ward')
        label1, label2 = [clusterer.pred_labels('ward')
                          for clusterer in clusterers]

        if i == 3 and fig:
            # Each labeling is drawn on the OTHER method's coordinates.
            names = np.unique(label1)
            figName = '{0}_{1}_on_{2}'.format(species, method1, method2)
            plot2d(X2, label1, names, figName, figName)
            names = np.unique(label2)
            figName = '{0}_{1}_on_{2}'.format(species, method2, method1)
            plot2d(X1, label2, names, figName, figName)

        scores = [
            scorer(label1, label2)
            for scorer in (metrics.homogeneity_score,
                           metrics.completeness_score,
                           metrics.normalized_mutual_info_score,
                           metrics.adjusted_rand_score)
        ]
        print('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(i, *scores))
def clustering_by_kmeans(vectorizer, X, true_k):
    """Cluster ``X`` into ``true_k`` groups with K-means and report scores.

    The label-based scores are computed against the module-level
    ``documents``. The cluster index of every row is then handed to
    ``measuring_kmeans``.
    """
    print("Clustering in " + str(true_k) + " groups by K-means...")
    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=500, n_init=1)
    km.fit_predict(X)

    print("Measuring...")
    # V-measure is an entropy-based measure which explicitly measures how
    # successfully the criteria of homogeneity and completeness have been
    # satisfied.
    label_scores = (
        ("Homogeneity: %0.3f", metrics.homogeneity_score),
        ("Completeness: %0.3f", metrics.completeness_score),
        ("V-measure: %0.3f", metrics.v_measure_score),
        ("Adjusted Rand-Index: %.3f", metrics.adjusted_rand_score),
    )
    for template, scorer in label_scores:
        print(template % scorer(documents, km.labels_))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, km.labels_, sample_size=1000))

    # One entry per row of X: the index of the cluster it was assigned to.
    clusters = km.labels_.tolist()
    print("Total de " + str(len(km.labels_)) + " documents")

    # To get all rows of one cluster: np.where(np.array(clusters) == 0)
    # yields the indices, which can then be used to select the documents.

    # The feature names are fetched but not used further in this function.
    terms = vectorizer.get_feature_names()
    measuring_kmeans(true_k, clusters)
def cluster(algorithm, data, topics, make_silhouette=False):
    """Fit ``algorithm`` on ``data``, print quality scores, optionally plot.

    Args:
        algorithm: clustering estimator exposing ``fit_predict`` and
            ``labels_``.
        data: feature matrix to cluster.
        topics: reference labels for the supervised scores.
        make_silhouette: when true, save a silhouette bar plot to
            'cluster.png'.
    """
    print(str(algorithm))
    clusters = algorithm.fit_predict(data)
    labels = algorithm.labels_

    print('Homogeneity: %0.3f' % metrics.homogeneity_score(topics, labels))
    print('Completeness: %0.3f' % metrics.completeness_score(topics, labels))
    print('V-measure: %0.3f' % metrics.v_measure_score(topics, labels))
    print('Adjusted Rand index: %0.3f'
          % metrics.adjusted_rand_score(topics, labels))

    # The silhouette score is the mean of the per-sample values, so the
    # samples are computed once and reused for both the score and the plot.
    silhouettes = metrics.silhouette_samples(data, labels)
    print('Silhouette test: %0.3f' % numpy.mean(silhouettes))
    print(' ***************** ')

    num_clusters = len(set(clusters))
    print('num clusters: %d' % num_clusters)
    print('num fitted: %d' % len(clusters))

    # Make a silhouette plot if the flag is set
    if not make_silhouette:
        return

    clusters = numpy.asarray(clusters)
    # Sort by cluster, and by decreasing silhouette within each cluster.
    order = numpy.lexsort((-silhouettes, clusters))
    sorted_clusters = clusters[order]

    # Work from the label values actually present, so that non-contiguous
    # labels (for example a -1 noise label) are handled as well.
    cluster_ids = numpy.unique(clusters)
    # Row positions of each cluster in the sorted order. The comparison used
    # to be against num_clusters, which matched nothing and made numpy.max
    # fail on the resulting empty arrays.
    indices = [numpy.flatnonzero(sorted_clusters == cid)
               for cid in cluster_ids]
    ytick = [(numpy.max(ind) + numpy.min(ind)) / 2.0 for ind in indices]
    ytickLabels = ["%d" % cid for cid in cluster_ids]

    cmap = cm.jet(numpy.linspace(0, 1, num_clusters)).tolist()
    color_index = numpy.searchsorted(cluster_ids, sorted_clusters)
    clr = [cmap[i] for i in color_index]

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.barh(range(data.shape[0]), silhouettes[order], height=1.0,
            edgecolor='none', color=clr)
    ax.set_ylim(ax.get_ylim()[::-1])
    plt.yticks(ytick, ytickLabels)
    plt.xlabel('Silhouette Value')
    plt.ylabel('Cluster')
    plt.savefig('cluster.png')
def cluster(model, uids):
    """Cluster the document vectors of ``uids`` with Affinity Propagation.

    The fitted estimator is pickled to 'data/af.pick' and clustering scores
    are printed.

    Args:
        model: object whose ``docvecs`` mapping yields one vector per uid.
        uids: identifiers of the documents to cluster.
    """
    # Collect one vector per uid.
    X = [model.docvecs[uid] for uid in uids]
    # NOTE(review): the uids themselves serve as the reference labels, so
    # every distinct uid counts as its own class -- confirm this is intended.
    labels_true = uids

    # Compute Affinity Propagation
    af = AffinityPropagation(preference=-50).fit(X)
    # Pickle data is binary; 'with' also makes sure the file is closed.
    with open('data/af.pick', 'wb') as fid:
        pickle.dump(af, fid)

    labels = af.labels_
    n_clusters_ = len(af.cluster_centers_indices_)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
def bestClassify(X,Y):
    """Cluster ``X`` into two groups and score them against the labels ``Y``.

    The documents are vectorised and clustered with K-means. Every cluster is
    then renamed to the most common true label among its members, and the
    renamed predictions are compared with ``Y``.
    """
    tfidf = True
    vec = (
        TfidfVectorizer(preprocessor = identity, tokenizer = identity,
                        sublinear_tf = True)
        if tfidf else
        CountVectorizer(preprocessor = identity, tokenizer = identity)
    )
    km = KMeans(n_clusters=2, n_init=100, verbose=1)
    clusterer = Pipeline([('vec', vec), ('cls', km)])
    prediction = clusterer.fit_predict(X, Y)

    # Collect the true labels seen in each cluster.
    truths_by_cluster = {}
    for cluster_id, truth in zip(prediction, Y):
        truths_by_cluster.setdefault(cluster_id, []).append(truth)

    # Name every cluster after its majority label.
    labeldict = {
        cluster_id: Counter(truths).most_common(1)[0][0]
        for cluster_id, truths in truths_by_cluster.items()
    }
    prediction = [labeldict[cluster_id] for cluster_id in prediction]

    labels = list(labeldict.values())
    print(labels)
    print(confusion_matrix(Y, prediction, labels=labels))
    print("Homogeneity:", homogeneity_score(Y, prediction))
    print("Completeness:", completeness_score(Y, prediction))
    print("V-measure:", v_measure_score(Y, prediction))
    print("Rand-Index:", adjusted_rand_score(Y, prediction))
def get_result(km, labels):
    """Score the clustering in ``km.labels_`` against the reference ``labels``.

    Returns:
        tuple: (homogeneity, completeness, v-measure, adjusted Rand score,
        adjusted mutual information), in that order.
    """
    scorers = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    return tuple(scorer(labels, km.labels_) for scorer in scorers)
def run_clustering( clusterer, data, labels ):
    """
    Cluster: fit a pre-configured clustering algorithm on a dataset, time the
    fit, and log quality metrics against a set of ground-truth labels.

    clusterer: the clustering algorithm, from sklearn
    data: array-like dataset input
    labels: vector of ground-truth labels
    """
    # Only the fit itself is timed.
    started = time()
    clusterer.fit(data)
    runtime = time() - started

    predicted = clusterer.labels_
    report = [(" |- Execution time: %fs", runtime)]
    for template, scorer in (
            (" |- Homogeneity: %0.3f", metrics.homogeneity_score),
            (" |- Completeness: %0.3f", metrics.completeness_score),
            (" |- V-measure: %0.3f", metrics.v_measure_score),
            (" |- Adjusted Rand-Index: %.3f", metrics.adjusted_rand_score),
            (" |- Adjusted Mutual Info: %.3f",
             metrics.adjusted_mutual_info_score)):
        report.append((template, scorer(labels, predicted)))

    # All scores are computed before anything is written to the log.
    for template, value in report:
        logging.info(template % value)
def cluseval(label, truth):
    """Score a predicted labeling against the ground truth.

    Args:
        label: predicted cluster labels.
        truth: reference labels.

    Returns:
        list: ``[adjusted_rand, adjusted_mutual_info, homogeneity,
        completeness, v_measure]``.
    """
    rand = metrics.adjusted_rand_score(truth, label)
    mutual = metrics.adjusted_mutual_info_score(truth, label)
    # The variable holding this score was not a valid identifier
    # ('h**o' parses as a power expression), so the function could not load.
    homogeneity = metrics.homogeneity_score(truth, label)
    complete = metrics.completeness_score(truth, label)
    v = metrics.v_measure_score(truth, label)
    return [rand, mutual, homogeneity, complete, v]
def print_cluster(clusterTrainClass, labels, clusterTestStory):
    """Print clustering scores for ``labels``.

    The label-based scores use ``clusterTrainClass`` as the reference; the
    silhouette coefficient is computed on ``clusterTestStory`` with the
    euclidean metric and printed on its own line.
    """
    supervised = (
        ("Homogeneity: %0.3f", metrics.homogeneity_score),
        ("Completeness: %0.3f", metrics.completeness_score),
        ("V-measure: %0.3f", metrics.v_measure_score),
        ("Adjusted Rand Index: %0.3f", metrics.adjusted_rand_score),
        ("Adjusted Mutual Information: %0.3f",
         metrics.adjusted_mutual_info_score),
    )
    for template, scorer in supervised:
        print(template % scorer(clusterTrainClass, labels))

    print("Silhouette Coefficient:")
    print(metrics.silhouette_score(clusterTestStory, labels,
                                   metric='euclidean'))
def evaluate(labels_true, labels):
    """Compare a predicted labeling with the reference labeling.

    Returns:
        tuple: (homogeneity, completeness, v_measure, adjusted_rand,
        adjusted_mutual_info). A silhouette score is not included because it
        would need the data itself.
    """
    scorers = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    return tuple(scorer(labels_true, labels) for scorer in scorers)
def cluster_metrics(labels_1, labels_2):
    """Print four agreement scores between two labelings, one per line."""
    rows = (
        ("Normalized Mutual Information: %f", normalized_mutual_info_score),
        ("Adjusted Rand Score: %f", adjusted_rand_score),
        ("Homogeneity: %f", homogeneity_score),
        ("Completeness: %f", completeness_score),
    )
    lines = [template % scorer(labels_1, labels_2)
             for template, scorer in rows]
    # Emitted with a single print call, after every score has been computed.
    print("\n".join(lines))
def main():
    """Run spectral clustering on a similarity matrix given on the command line.

    The predicted labels are written to the ``--output`` file, one per line.
    When ``--true_labels`` is given, clustering scores against those labels
    are printed as well.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        usage=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Perform spectral clustering.')
    # --clusters, --sm and --output are always used below, so argparse now
    # reports a missing one instead of a later failure on a None value.
    parser.add_argument("--clusters", "-c", type=int, required=True,
                        help='Number of clusters.')
    parser.add_argument("--knn", "-k", type=int, default=0,
                        help='Number of nearest neighbors, 0 means all.')
    parser.add_argument("--sm", "-s", required=True,
                        help='File containing similarity matrix')
    parser.add_argument("--iterations", "-i", type=int, default=10,
                        help='Number of KMeans iterations.')
    parser.add_argument("--true_labels", "-t",
                        help='File containing the true labels.')
    parser.add_argument("--output", "-o", required=True,
                        help='Name of the file to write' + ' the labels to.')
    parser.add_argument("--normalize", "-n", action='store_true',
                        help='Normalize each row so that the max value is one.')
    args = parser.parse_args()

    sm = np.load(args.sm)
    if args.normalize:
        # Out-of-place division also works for integer-typed matrices, where
        # the in-place form raises a casting error.
        sm = sm / sm.max(axis=1)[:, np.newaxis]
        # Ensure symmetric
        sm = (sm + sm.T) / 2

    if args.knn > 0:
        clusterer = SpectralClustering(n_clusters=args.clusters,
                                       affinity='nearest_neighbors',
                                       n_neighbors=args.knn,
                                       n_init=args.iterations)
    else:
        clusterer = SpectralClustering(n_clusters=args.clusters,
                                       affinity='precomputed',
                                       n_init=args.iterations)
    labels = clusterer.fit(sm).labels_

    with open(args.output, 'w') as fout:
        for l in labels:
            fout.write(str(l) + '\n')

    # Load the true labels.
    if args.true_labels:
        with open(args.true_labels, 'r') as fin:
            true_labels = [int(line.strip()) for line in fin]

        # Run the metrics.
        print("Homogeneity: %0.3f"
              % metrics.homogeneity_score(true_labels, labels))
        print("Completeness: %0.3f"
              % metrics.completeness_score(true_labels, labels))
        print("V-measure: %0.3f"
              % metrics.v_measure_score(true_labels, labels))
        print("Adjusted Rand Index: %0.3f"
              % metrics.adjusted_rand_score(true_labels, labels))
        print("Adjusted Mutual Information: %0.3f"
              % metrics.adjusted_mutual_info_score(true_labels, labels))
        # NOTE(review): sm holds similarities, but it is passed here as if
        # its rows were feature vectors -- confirm this is the intent.
        print("Silhouette Coefficient: %0.3f"
              % metrics.silhouette_score(sm, labels))
def eval_clusters(self):
    """Score the predicted clustering against the true labels of the points.

    The true and predicted labels are the second and third items returned
    by ``self.get_labels()``.

    Returns:
        tuple: (adjusted Rand index, homogeneity, completeness).
    """
    _, labels_true, labels_pred = self.get_labels()
    scorers = (
        metrics.adjusted_rand_score,
        metrics.homogeneity_score,
        metrics.completeness_score,
    )
    return tuple(scorer(labels_true, labels_pred) for scorer in scorers)
def get_cluster_metrics(X, labels, labels_true=None):
    """Collect clustering quality metrics in a dict.

    Args:
        X: precomputed pairwise distance matrix (passed to the silhouette
            with ``metric='precomputed'``).
        labels: predicted cluster labels.
        labels_true: optional reference labels; when given, completeness and
            homogeneity are added.

    Returns:
        dict: metric name -> value.
    """
    metrics_dict = dict()
    metrics_dict['Silhouette coefficient'] = metrics.silhouette_score(
        X, labels, metric='precomputed')
    # Test for None explicitly: the truth value of a numpy array with more
    # than one element is ambiguous and raises ValueError.
    if labels_true is not None:
        metrics_dict['Completeness score'] = metrics.completeness_score(
            labels_true, labels)
        metrics_dict['Homogeneity score'] = metrics.homogeneity_score(
            labels_true, labels)
    return metrics_dict
def evaluateAllAlgorithms(self):
    """Print external clustering scores for each stored label set.

    Compares ``self.labels_db`` (reported as 'DBASE') and ``self.labels_ap``
    (reported as 'AP') against the ground truth in ``self.labels_gt``.
    """
    # Pair every report title with its labels so the two cannot drift apart
    # (the original indexed two parallel lists with a hard-coded range(2)).
    runs = [('DBASE', self.labels_db), ('AP', self.labels_ap)]
    for title, predicted in runs:
        # Single-argument print() produces the same text on Python 2 and 3.
        print('Algorithm: %s' % title)
        print("\tHomogeneity: %0.3f" % metrics.homogeneity_score(self.labels_gt, predicted))
        print("\tCompleteness: %0.3f" % metrics.completeness_score(self.labels_gt, predicted))
        print("\tV-measure: %0.3f" % metrics.v_measure_score(self.labels_gt, predicted))
        print("\tAdjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(self.labels_gt, predicted))
        print("\tAdjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(self.labels_gt, predicted))
# Fit a two-component Gaussian mixture on the training matrix, label the
# test matrix, then print the fitted parameters and a one-row score table.
gmmClusterer = GaussianMixture(n_components=2)
t0 = time()
# NOTE(review): the name suggests labels, but this stores whatever fit()
# returns (presumably the fitted estimator) - confirm; it is not used below.
gmmTrainedLabels = gmmClusterer.fit(Train_Matrix)
gmmTestLabels = gmmClusterer.predict(Test_Matrix)
print(82 * '*')
print("Cluster Means: ", str(gmmClusterer.means_))
print(82 * '-')
print("Cluster Covariance: ", gmmClusterer.covariances_)
print(82 * '-')
print("Precisions: ", str(gmmClusterer.precisions_))
print(82 * '-')
print('Model\t\ttime\thomo\tcompl\tv-meas\tARI \tAMI')
# The elapsed time is taken here, so it covers fit, predict and the
# parameter printing above.
print('%-9s\t%.2fs\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' % ('GMM', (time() - t0), metrics.homogeneity_score(Test_Target_Matrix, gmmTestLabels), metrics.completeness_score(Test_Target_Matrix, gmmTestLabels), metrics.v_measure_score(Test_Target_Matrix, gmmTestLabels), metrics.adjusted_rand_score(Test_Target_Matrix, gmmTestLabels), metrics.adjusted_mutual_info_score(Test_Target_Matrix, gmmTestLabels)))
# Disabled scatter plot of the test matrix:
# plt.scatter(Test_Matrix.iloc[0,:], Test_Matrix.iloc[1,:], color='black')
# # Prediction and draw the diagram
# #plt.plot(range(len(testData)), y_testDataPrediction_tuned, color='red', linewidth=1)
# #plt.legend(["predict", "true"], loc='upper right')
# plt.title('GMM Clustering')
# plt.show()
print '---' print 'true kappas {}'.format(kappas) print 'vmf-soft kappas {}'.format( vmf_soft.concentrations_[[vmf_soft_mu_0_idx, vmf_soft_mu_1_idx]]) print 'vmf-hard kappas {}'.format( vmf_hard.concentrations_[[vmf_hard_mu_0_idx, vmf_hard_mu_1_idx]]) print '---' print 'vmf-soft weights {}'.format( vmf_soft.weights_[[vmf_soft_mu_0_idx, vmf_soft_mu_1_idx]]) print 'vmf-hard weights {}'.format( vmf_hard.weights_[[vmf_hard_mu_0_idx, vmf_hard_mu_1_idx]]) print '---' print("Homogeneity: %0.3f (k-means)" % metrics.homogeneity_score(labels, km.labels_)) print("Homogeneity: %0.3f (spherical k-means)" % metrics.homogeneity_score(labels, skm.labels_)) print("Homogeneity: %0.3f (vmf-soft)" % metrics.homogeneity_score(labels, vmf_soft.labels_)) print("Homogeneity: %0.3f (vmf-hard)" % metrics.homogeneity_score(labels, vmf_hard.labels_)) print '---' print("Completeness: %0.3f (k-means)" % metrics.completeness_score(labels, km.labels_)) print("Completeness: %0.3f (spherical k-means)" % metrics.completeness_score(labels, skm.labels_)) print("Completeness: %0.3f" % metrics.completeness_score(labels, vmf_soft.labels_)) print("Completeness: %0.3f" %
# Give each cluster center the label of its nearest test sample (Euclidean
# distance), then translate every cluster id into that label and report
# classification-style scores.
# NOTE(review): nesting below is reconstructed from the statement order -
# confirm against the original layout.
class_list = []
data_test = np.array(data_test)
index = 0
print (shape(centers))
for i in range (shape(centers)[0]):
    # Sentinel "infinite" distance for the nearest-sample search.
    best_dist = 9999999
    for j in range (shape(data_test)[0]):
        dist=0
        # Accumulate the squared coordinate differences.
        for k in range(shape(centers)[1]):
            dist = dist+((centers[i][k]-data_test[j][k])**2)
        dist = (dist)**0.5
        if dist<best_dist:
            best_dist=dist
            index=j
    # Label of the closest test sample becomes this cluster's class.
    class_list.append(label_test[index])
# `pred` is the same object as `clusters`, so the loop below overwrites the
# cluster ids in place; each entry is read before it is written.
pred=clusters
for i in range (len(clusters)):
    pred[i]=class_list[clusters [i]]
print('Homogeneity score :\n', metrics.homogeneity_score(label_test, pred))
print('F1 score :\n', f1_score(label_test, pred, average=None))
# The heading says ACCURACY but the value printed is a classification report.
print ('ACCURACY :\n', metrics.classification_report(label_test, pred))
print('Confusion matrix:\n', confusion_matrix(label_test, pred))
def kmeans(principalDf, NbCluster, finalDf):
    """Cluster ``principalDf`` with k-means, score it and plot the partition.

    Each cluster is mapped to the ``finalDf['Analysis']`` value that occurs
    most often inside it; accuracy and external clustering scores are
    printed against that mapping, and the decision regions are drawn.

    Args:
        principalDf: DataFrame of points to cluster.  The plotting code
            uses its first two columns and predicts on a two-column mesh,
            so it needs exactly two feature columns.
        NbCluster: number of clusters to fit (previously ignored in favor
            of a hard-coded 3).
        finalDf: DataFrame whose 'Analysis' column holds the true labels.
    """
    model = KMeans(n_clusters=NbCluster, init='k-means++').fit(principalDf)

    KM_clustered = pd.DataFrame(principalDf.copy())
    KM_clustered.loc[:, 'Cluster'] = model.labels_  # append labels to points

    result = pd.concat([finalDf['Analysis'], KM_clustered['Cluster']], axis=1)

    print('-' * 60)
    print("Kmeans résultat")
    print('-' * 60)
    print("Shape: {}".format(result.shape))
    print(result.sample(5))

    # =========================================================================
    # Assigning a label to each cluster
    # There is no relation between a cluster number and the true label, so
    # each cluster is mapped to the label that appears most often in it.
    # These translated labels are what the scores below are computed on.
    # =========================================================================
    print('\n')
    for ClusterNum in range(NbCluster):
        in_cluster = result['Cluster'] == ClusterNum
        sizes = result[in_cluster].groupby('Analysis').size()
        # First label among those tied for the largest count.
        majority_label = sizes.index[sizes == sizes.max()].tolist()[0]
        result.loc[in_cluster, 'TransLabel'] = majority_label
        print(ClusterNum, majority_label)

    # =========================================================================
    # Check performance of the cluster-to-label classification
    # =========================================================================
    print('-' * 60)
    print('K-Means performance')
    print('-' * 60)

    Correct = (finalDf['Analysis'] == result['TransLabel']).sum()
    Accuracy = round(Correct / finalDf.shape[0], 3)
    print('Accuracy ', Accuracy)

    # =========================================================================
    # METRICS for clustering algorithms
    # =========================================================================
    for metric_name in ('homogeneity_score', 'completeness_score',
                        'v_measure_score', 'adjusted_rand_score',
                        'adjusted_mutual_info_score'):
        score = getattr(metrics, metric_name)(finalDf['Analysis'],
                                              result['TransLabel'])
        print(metric_name + ': ', round(score, 3))

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    points = principalDf.to_numpy()
    x_min, x_max = points[:, 0].min() - 1, points[:, 0].max() + 1
    y_min, y_max = points[:, 1].min() - 1, points[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Obtain labels for each point in mesh. Use last trained model.
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure(1)
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired, aspect='auto', origin='lower')

    plt.plot(points[:, 0], points[:, 1], 'k.', markersize=2)
    # Plot the centroids as a white X
    centroids = model.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169,
                linewidths=3, color='w', zorder=10)
    plt.title('K-means clustering on the digits dataset (PCA-reduced data)\n'
              'Centroids are marked with white cross')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()
from sklearn.cluster import KMeans

km3 = KMeans(n_clusters=3, init='k-means++', max_iter=100, n_init=1)
# run_line_magic replaces the deprecated get_ipython().magic(...) call;
# it runs the same `%time` line magic on the same statement.
get_ipython().run_line_magic('time', 'km3.fit(X_lsa)')

# In[11]:

# How do we know the clustering result is good or not?
# If we have labels available, we can use this to derive how coherent the clusters are.
# Homogeneity: each cluster contains only members of a single class
from sklearn import metrics

labels = subnews['Class']
print("Homogeneity for 3 clusters: %0.3f" % metrics.homogeneity_score(labels, km3.labels_))

# In[12]:

# Let's try some other K values to compare their metrics
km2 = KMeans(n_clusters=2, init='k-means++', max_iter=100, n_init=1)
get_ipython().run_line_magic('time', 'km2.fit(X_lsa)')
km4 = KMeans(n_clusters=4, init='k-means++', max_iter=100, n_init=1)
get_ipython().run_line_magic('time', 'km4.fit(X_lsa)')
km5 = KMeans(n_clusters=5, init='k-means++', max_iter=100, n_init=1)
get_ipython().run_line_magic('time', 'km5.fit(X_lsa)')

# In[13]:
# Per-model score histories, keyed by 'km' / 'gmm'; one entry per value of k.
Score = defaultdict(list)
adjMI = defaultdict(list)
S_homog = defaultdict(list)
S_adjMI = defaultdict(list)
S_vm = defaultdict(list)
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(X_scaled)
    gmm.fit(X_scaled)
    Score['km'].append(km.score(X_scaled))
    Score['gmm'].append(gmm.score(X_scaled))
    # Predict once per fitted model and reuse the labels for every metric
    # (the original recomputed predict() three times per model).
    km_labels = km.predict(X_scaled)
    gmm_labels = gmm.predict(X_scaled)
    S_homog['km'].append(metrics.homogeneity_score(labels, km_labels))
    S_homog['gmm'].append(metrics.homogeneity_score(labels, gmm_labels))
    S_adjMI['km'].append(metrics.adjusted_mutual_info_score(labels, km_labels))
    S_adjMI['gmm'].append(metrics.adjusted_mutual_info_score(labels, gmm_labels))
    S_vm['km'].append(metrics.v_measure_score(labels, km_labels))
    S_vm['gmm'].append(metrics.v_measure_score(labels, gmm_labels))

plt.figure(figsize=(9.6, 7.2))
plt.xlabel('Number of clusters')
plt.ylabel('Score value')
plt.title('Score vs. Cluster number for K-mean and Gaussian Mixture (species)')
plt.grid(True)
#plt.legend(['Train', 'Test'], loc='lower right')
# Bare inspection expressions; their values are discarded when this runs as
# a script.
df.max()
df.min()
type(predictors)
#==============================================================================
# Clustering using KMeans
#==============================================================================
# Define the estimator that carries out the k-means clustering.
KMeans_object = KMeans(init = 'k-means++', n_clusters = 17, n_init= 10)
# Run the k-means clustering.
KMeans_object.fit(predictors)
# Print the inertia.
print('The Inertia is:', KMeans_object.inertia_)
# Homogeneity score against the known outcomes.
print('Homogenity Score is:', metrics.homogeneity_score(outcomes, KMeans_object.labels_))
# Completeness score.
print('Completeness Score is:', metrics.completeness_score(outcomes, KMeans_object.labels_))
# V-measure score.
print('V-Measure Score is:', metrics.v_measure_score(outcomes, KMeans_object.labels_))
# Adjusted Rand score.
print('Adjusted Rand Score is:', metrics.adjusted_rand_score(outcomes, KMeans_object.labels_))
# Adjusted mutual information score.
print('Adjusted Mututal Info Score is:', metrics.adjusted_mutual_info_score(outcomes, KMeans_object.labels_))
# Silhouette score, computed on the full data set (no sampling).
# NOTE(review): the model was fitted on `predictors` but the silhouette is
# evaluated on `predictors2` - confirm both hold the same samples.
print('Silhouette Score is:', metrics.silhouette_score(predictors2, KMeans_object.labels_, metric='euclidean'))
def predict_and_cluster(opts,mode):
    """Vectorize the tokenized news corpus, cluster it and score the result.

    The corpus from ``jieba_tokenizer()`` is turned into a TF-IDF (or
    hashed) matrix, optionally reduced with LSA, clustered with k-means and
    scored against the hard-coded reference labels.  A weighted similarity
    graph over the articles is then built and handed to ``bestpart``.

    Args:
        opts: options object; reads ``use_hashing``, ``use_idf``,
            ``n_features`` and ``n_components``.
        mode: not used by this function.
    """
    # Reference category of each article, in corpus order.
    labels = array([0,1,2,1,1,2,2,1,2,0,0,0,1,1,2,1,1,1,1,1,1,1,1,2,1])
    true_k = np.unique(labels).shape[0]

    corpus, news = jieba_tokenizer()

    print("Extracting features from the training dataset using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        if opts.use_idf:
            # Perform an IDF normalization on the output of HashingVectorizer
            hasher = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english', non_negative=True,
                                       norm=None, binary=False)
            vectorizer = make_pipeline(hasher, TfidfTransformer())
        else:
            vectorizer = HashingVectorizer(n_features=opts.n_features,
                                           stop_words='english',
                                           non_negative=False, norm='l2',
                                           binary=False)
    else:
        vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
                                     min_df=2, stop_words='english',
                                     use_idf=opts.use_idf)
    X = vectorizer.fit_transform(corpus)

    print("done in %fs" % (time() - t0))
    # n_samples: how many articles are there
    # n_features: how many different words in all articles are there
    print("n_samples: %d, n_features: %d" % X.shape)
    print()

    if opts.n_components:
        print("Performing dimensionality reduction using LSA")
        t0 = time()
        # Vectorizer results are normalized, which makes KMeans behave as
        # spherical k-means for better results. Since LSA/SVD results are
        # not normalized, we have to redo the normalization.
        svd = TruncatedSVD(opts.n_components)
        lsa = make_pipeline(svd, Normalizer(copy=False))

        X = lsa.fit_transform(X)

        print("done in %fs" % (time() - t0))

        # A second, default-sized SVD is fitted on the already reduced data
        # to estimate the variance captured by its first component.
        svd = TruncatedSVD().fit(X)
        X_proj = svd.transform(X)
        explained_variances = np.var(X_proj, axis=0) / np.var(X, axis=0).sum()

        print("Explained variance of the SVD step: {}%".format(
            int(explained_variances[0] * 100)))

        print()

    # =================================================
    # clustering
    # if opts.minibatch:
    #     km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
    #                          init_size=1000, batch_size=1000, verbose=True)
    # else:
    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                verbose=True)  # always better

    print("Clustering sparse data with %s" % km)
    t0 = time()
    km.fit(X)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
    print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index: %.3f"
          % metrics.adjusted_rand_score(labels, km.labels_))
    # The silhouette describes the clustering that was just computed, so it
    # is evaluated on km.labels_ (the original passed the reference labels).
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, km.labels_, sample_size=None))

    print()

    if not (opts.n_components or opts.use_hashing):
        print("Top terms per cluster:")
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]
        terms = vectorizer.get_feature_names()
        for i in range(true_k):
            print("Cluster %d:" % i, end='')
            for ind in order_centroids[i, :10]:
                print(' %s' % terms[ind], end='')
            print()

    # Attach the reference category to every article.
    for i, article in enumerate(news):
        article.category = labels[i]

    from sklearn.metrics.pairwise import cosine_similarity

    # Complete graph over the articles, weighted by cosine similarity.
    FG=nx.Graph()
    for i, article in enumerate(news):
        article.similarity = cosine_similarity(X[i:i+1], X)[0]
        cs = article.similarity
        # print (cs)
        for j in range(len(news)):
            if i != j:
                FG.add_weighted_edges_from([(i,j,cs[j])])

    # for i in range(len(news)):
    #     print(news[i].number, news[i].title, news[i].time, news[i].category, news[i].url, news[i].similarity)

    bestpart(FG,labels,km.labels_)
# Subplot titles, in pairs of (data set, its KMeans++ clustering):
# original data, rotated data, unequal-variance data, unequal-size data.
titles = u'原始数据', u'KMeans++聚类', u'旋转后数据', u'旋转后KMeans++聚类',\
    u'方差不相等数据', u'方差不相等KMeans++聚类', u'数量不相等数据', u'数量不相等KMeans++聚类'

model = KMeans(n_clusters=4, init='k-means++', n_init=5)
plt.figure(figsize=(9, 10), facecolor='w')
# NOTE(review): loop nesting is reconstructed from the statement order -
# confirm against the original layout.
for i, (x, y, title) in enumerate(zip(data_list, y_list, titles), start=1):
    plt.subplot(4, 2, i)
    plt.title(title)
    if i % 2 == 1:
        # Odd panels show the given labels as-is.
        y_pred = y
    else:
        # Even panels show the k-means assignment.
        y_pred = model.fit_predict(x)
        # Bare attribute access; the value is discarded.
        model.cluster_centers_
    print i
    print 'Homogeneity:', homogeneity_score(y, y_pred)
    print 'completeness:', completeness_score(y, y_pred)
    print 'V measure:', v_measure_score(y, y_pred)
    print 'AMI:', adjusted_mutual_info_score(y, y_pred)
    print 'ARI:', adjusted_rand_score(y, y_pred)
    print 'Silhouette:', silhouette_score(x, y_pred), '\n'
    plt.scatter(x[:, 0], x[:, 1], c=y_pred, s=30, cmap=cm, edgecolors='none')
    # Per-column data range, widened by expand().
    x1_min, x2_min = np.min(x, axis=0)
    x1_max, x2_max = np.max(x, axis=0)
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
# use: https://scikit-learn.org/stable/modules/clustering.html#clustering-performance-evaluation # ADJUSTED RAND SCORE # https://scikit-learn.org/stable/modules/clustering.html#adjusted-rand-index print('--> adjusted rand score') #print('adjusted rand score on training set: {}'.format(metrics.adjusted_rand_score(y_train, y_train_pred))) #print('adjusted rand score on testing set: {}'.format(metrics.adjusted_rand_score(y_test, y_test_pred))) print('adjusted rand score: {}'.format(metrics.adjusted_rand_score(y, y_pred))) # HOMOGENEITY # https://scikit-learn.org/stable/modules/clustering.html#homogeneity-completeness-and-v-measure # homogeneity: each cluster contains only members of a single class. print('--> homogeneity: each cluster contains only members of a single class.') # print('homogeneity score on training set: {}'.format(metrics.homogeneity_score(y_train, y_train_pred))) # print('homogeneity score on testing set: {}'.format(metrics.homogeneity_score(y_test, y_test_pred))) print('homogeneity score: {}'.format(metrics.homogeneity_score(y, y_pred))) # COMPLETENESS # https://scikit-learn.org/stable/modules/clustering.html#homogeneity-completeness-and-v-measure # completeness: all members of a given class are assigned to the same cluster. 
print('--> completeness: all members of a given class are assigned to the same cluster.') # print('completeness score on training set: {}'.format(metrics.completeness_score(y_train, y_train_pred))) # print('completeness score on testing set: {}'.format(metrics.completeness_score(y_test, y_test_pred))) print('completeness score: {}'.format(metrics.completeness_score(y, y_pred))) # FOWLKES MALLOWS SCORES # https://scikit-learn.org/stable/modules/clustering.html#fowlkes-mallows-scores print('--> fowlkes mallows score: The Fowlkes-Mallows score FMI is defined as the geometric mean of the pairwise precision and recall.') # print('fowlkes mallows score on training set: {}'.format(metrics.fowlkes_mallows_score(y_train, y_train_pred))) # print('fowlkes mallows score on testing set: {}'.format(metrics.fowlkes_mallows_score(y_test, y_test_pred))) print('fowlkes mallows score: {}'.format(metrics.fowlkes_mallows_score(y, y_pred)))
# -*- coding: utf-8 -*- """ Created on Wed May 2 21:53:10 2018 meanshift with iris data @author: shifuddin """ from load_data import load_csv from sklearn.cluster import MeanShift, estimate_bandwidth from sklearn.metrics import homogeneity_score ''' Load X, y from uri ''' uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/spect/SPECTF.test' X, y = load_csv(uri, ',', 1, 45, 0, 1, True) ''' Calculate bandwidth / radius of each cluster centroid from data ''' bandwidth = estimate_bandwidth(X, quantile=.1, n_samples=100) ms = MeanShift(bandwidth=bandwidth, bin_seeding=True) ms.fit(X) labels = ms.labels_ centroids = ms.cluster_centers_ homogeneity = homogeneity_score(y.ravel(), labels)
def show_db(label_data, db, corpus, corpus_embeddings, corpus_file=None, label_file=None):
    """Print, score and plot the clusters found by a fitted DBSCAN-style model.

    Args:
        label_data: frame whose ``flag`` column holds the true labels;
            replaced by the contents of ``label_file`` when that is given.
        db: fitted model exposing ``labels_`` and ``core_sample_indices_``.
        corpus: list of sentences in the same order as ``db.labels_``;
            replaced by the ``content`` column of ``corpus_file`` when that
            is given.
        corpus_embeddings: array of sentence embeddings; the first two
            columns are plotted.
        corpus_file: optional CSV path with a ``content`` column.
        label_file: optional CSV path with a ``flag`` column.
    """
    if label_file is not None:
        # Load the ground-truth label data.
        label_data = pd.read_csv(label_file)
    labels_true = label_data.flag.to_list()
    labels = db.labels_
    # The label -1 marks noise and is not a cluster of its own.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)
    clustered_sentences = [[] for i in range(n_clusters_)]
    noise_sentences = []
    if corpus_file is not None:
        # Load the raw text.
        corpus = pd.read_csv(corpus_file).content.to_list()
    for sentence_id, cluster_id in enumerate(labels):
        if cluster_id == -1:
            # Indexing with -1 would silently file noise under the last
            # cluster (or raise when there are no clusters), so noise is
            # collected separately.
            noise_sentences.append(corpus[sentence_id])
        else:
            clustered_sentences[cluster_id].append(corpus[sentence_id])
    for i, cluster in enumerate(clustered_sentences):
        print("Cluster ", i + 1)
        print(cluster)
        print("")
    if noise_sentences:
        print("Noise")
        print(noise_sentences)
        print("")
    print('Estimated number of clusters: %d' % n_clusters_)
    print('Estimated number of noise points: %d' % n_noise_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(corpus_embeddings, labels))

    # #########################################################################
    # Plot result
    import matplotlib.pyplot as plt

    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True

    # Black removed and is used for noise instead.
    unique_labels = set(labels)
    colors = [
        plt.cm.Spectral(each)
        for each in np.linspace(0, 1, len(unique_labels))
    ]
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = [0, 0, 0, 1]

        class_member_mask = (labels == k)

        # Core samples are drawn large, the remaining members small.
        xy = corpus_embeddings[class_member_mask & core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
                 markeredgecolor='k', markersize=14)

        xy = corpus_embeddings[class_member_mask & ~core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
                 markeredgecolor='k', markersize=6)

    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()
def main():
    """Cluster the documents of the corpus directory with LSA + k-means.

    Every file in the corpus directory is read as tab-separated
    ``word<TAB>value`` lines.  A term-document matrix is built, reduced
    with SVD via ``dimensionality_reduction``, clustered with k-means and
    scored against the ground truth taken from the part of each file name
    before the first '-'.
    """
    # initialization
    retval = os.getcwd()
    corpus_path = retval + "/../vector_model_w_stem/corpus3/"
    word_set = {}
    term_list = []
    num_dimensionality = 25
    num_clusters = 5

    # get word list for each document
    for file_name in os.listdir(corpus_path):
        vector = {}
        file_path = corpus_path + file_name
        # Close the file deterministically instead of leaking the handle.
        with open(file_path) as corpus_file:
            lines = corpus_file.read().split("\n")
        for line in lines:
            fields = line.split("\t")
            word = fields[0]
            if word:
                value = float(fields[1])
                term_list.append(word)
                vector[word] = value
        word_set[file_name] = vector

    # get term list (vocabulary) based on documents
    print("------------------ Parameters Detail ---------------------")
    print("The length of total word list: " + str(len(term_list)))
    # remove duplicate
    term_list = list(set(term_list))
    print("Remove duplicate the length of vocabulary: " + str(len(term_list)))

    # generate term-document matrix and document list
    term_document_matrix = []
    doc_list = []
    ground_truth = []
    for doc_name, vector in word_set.items():
        ground_truth.append(doc_name.split("-")[0])
        # Terms absent from the document get weight 0.
        term_document_matrix.append(
            [vector.get(word_voc, 0) for word_voc in term_list])
        doc_list.append(doc_name)

    print("The number of document: " + str(len(doc_list)))
    print("The number of clusters: " + str(num_clusters))
    print("The dimensionality: " + str(num_dimensionality))

    term_document_matrix = np.array(term_document_matrix).transpose()

    # SVD
    U, Sigma, V = np.linalg.svd(term_document_matrix)

    # dimensionality reduction for each document
    doc_matrix_reduced = dimensionality_reduction(Sigma, V, num_dimensionality)

    # k-means clustering
    km = KMeans(n_clusters=num_clusters)
    km.fit(doc_matrix_reduced.transpose())

    # evaluate the quality of k-means clustering
    print("------------------ Clustering Result ---------------------")
    # List the documents grouped by cluster id.
    for label in range(num_clusters):
        for doc_name, assigned in zip(doc_list, km.labels_):
            if assigned == label:
                print(doc_name + "\t" + str(assigned))

    print("------------------- Evaluation Score ---------------------")
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(ground_truth, km.labels_))
    print("Completeness: %0.3f" % metrics.completeness_score(ground_truth, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(ground_truth, km.labels_))
    print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(ground_truth, km.labels_))

    # visualization the result of clustering
    visualization_clustering(doc_matrix_reduced, ground_truth)
    print("----------------------- All Set -------------------------")
def _print_clustering_scores(header, labels_true, labels_pred):
    """Print ``header`` followed by the largest predicted label, then five external scores."""
    print(header + str(labels_pred.max()))
    print("Homogeneity: %0.4f" %
          metrics.homogeneity_score(labels_true, labels_pred))
    print("Completeness: %0.4f" %
          metrics.completeness_score(labels_true, labels_pred))
    print("V-measure: %0.4f" %
          metrics.v_measure_score(labels_true, labels_pred))
    print("Adjusted Rand-Index: %.4f" %
          metrics.adjusted_rand_score(labels_true, labels_pred))
    print("nmi_score-whole-data: %0.4f" %
          metrics.normalized_mutual_info_score(
              labels_true, labels_pred, average_method='arithmetic'))


def ClusterByHDbScan(listtuple_pred_true_text, avgItemsInCluster_in_a_batch):
    """Re-cluster texts with HDBSCAN / DBSCAN and print how each run scores.

    Three clusterings are evaluated against the true labels: HDBSCAN on the
    TF-IDF matrix, HDBSCAN on its 2-component normalized SVD projection,
    and DBSCAN with default parameters on that projection.

    Args:
        listtuple_pred_true_text: sequence of ``(pred, true, text)`` items.
        avgItemsInCluster_in_a_batch: only printed; the minimum cluster
            size is fixed at 2 (see below).
    """
    print("\nClusterByHDbScan")
    printClusterEvaluation_list(listtuple_pred_true_text)
    print(len(listtuple_pred_true_text), avgItemsInCluster_in_a_batch)

    dic_tupple_class_predicted = groupTxtByClass(listtuple_pred_true_text, False)
    numberOfClusters_predicted = len(dic_tupple_class_predicted)

    dic_tupple_class_true = groupTxtByClass(listtuple_pred_true_text, True)
    numberOfClusters_true = len(dic_tupple_class_true)

    print("numberOfClusters_true=" + str(numberOfClusters_true) +
          ", numberOfClusters_predicted=" + str(numberOfClusters_predicted))

    train_trueLabels = [item[1] for item in listtuple_pred_true_text]
    train_data = [item[2] for item in listtuple_pred_true_text]

    vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, stop_words='english',
                                 use_idf=True, smooth_idf=True, norm='l2')
    X = vectorizer.fit_transform(train_data)

    svd = TruncatedSVD(2)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X_svd = lsa.fit_transform(X)

    # The original computed ceil(avgItemsInCluster_in_a_batch) and then
    # immediately overwrote it with 2; the effective value is kept.
    min_cluster_size_in_a_batch = 2

    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size_in_a_batch)
    clusterer.fit(X)
    X_hdbscan_labels = clusterer.labels_
    _print_clustering_scores("X-total-clusters=", train_trueLabels,
                             X_hdbscan_labels)

    clusterer_svd = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size_in_a_batch)
    clusterer_svd.fit(X_svd)
    X_svd_hdbscan_labels = clusterer_svd.labels_

    db = DBSCAN().fit(X_svd)
    X_svd_dbscan_labels = db.labels_

    _print_clustering_scores("X-svd-total-clusters=", train_trueLabels,
                             X_svd_hdbscan_labels)
    _print_clustering_scores("X-svd-dbscan-total-clusters=", train_trueLabels,
                             X_svd_dbscan_labels)
# Share of rows whose last column equals 1.
print(float(clus.loc[clus.iloc[:, -1] == 1].shape[0]) / clus.shape[0])

# Score histories, one entry per number of mixture components.
homo = []
comp = []
v_mea = []
sil = []
man = []
numPoints = 8
for i in range(2, numPoints):
    rp = SparseRandomProjection(n_components=6)
    projected_data = rp.fit_transform(X)
    gm = mixture.GMM(n_components=i, covariance_type='diag')
    gm.fit(projected_data)
    # Predict once and reuse the labels for every score below.
    y_pred = gm.predict(projected_data)
    homo.append(metrics.homogeneity_score(y, y_pred))
    comp.append(metrics.completeness_score(y, y_pred))
    v_mea.append(metrics.v_measure_score(y, y_pred))
    sil.append(
        metrics.silhouette_score(projected_data, y_pred, metric='euclidean'))
    man.append(
        metrics.silhouette_score(projected_data, y_pred, metric='manhattan'))

x = range(2, numPoints)
fig = plt.figure()
plt.plot(x, homo, label='homogeneity score')
plt.plot(x, comp, label='completeness score')
from sklearn import metrics

# Toy comparison of two label assignments.
labels_true = [0, 0, 0, 1, 1, 1]
# Alternative predictions to experiment with:
# labels_pred = [0, 0, 1, 1, 2, 2]
# labels_pred = [1, 1, 0, 0, 3, 3]
# A copy of the truth, so every score below reports a perfect match.
labels_pred = labels_true[:]
# metrics.adjusted_rand_score(labels_true, labels_pred)
# Single-argument print() behaves the same on Python 2 and Python 3.
print(metrics.adjusted_rand_score(labels_true, labels_pred))
print(metrics.adjusted_mutual_info_score(labels_true, labels_pred))
print(metrics.homogeneity_score(labels_true, labels_pred))
print(metrics.completeness_score(labels_true, labels_pred))
def analyze_k_means(estimator, name, data):
    """Fit ``estimator`` on ``data`` and print one row of benchmark results.

    The row holds the name, the fit time, the inertia, five scores against
    the module-level ``labels`` and the silhouette coefficient sampled
    with the module-level ``samples`` size.
    """
    started = time()
    estimator.fit(data)
    elapsed = time() - started  # fit time only, taken before any scoring

    inertia = estimator.inertia_
    external_scorers = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    external = tuple(scorer(labels, estimator.labels_)
                     for scorer in external_scorers)
    silhouette = metrics.silhouette_score(data, estimator.labels_,
                                          metric='euclidean',
                                          sample_size = samples)

    row = (name, elapsed, inertia) + external + (silhouette,)
    print(" %9s %.2fs %i %.3f %.3f %.3f %.3f %.3f %.3f" % row)
# Histogram-equalized copy of the training images, one image per row.
X_contrast = np.zeros(np.shape(X_train))
for i, raw_row in enumerate(X_train):
    image = raw_row.astype(np.uint8)
    X_contrast[i] = cv2.equalizeHist(image).reshape(1, NUMBER_OF_PIXELS)

# Scale both variants by the brightness range and shift them by MEAN.
X_contrast = X_contrast.astype('float32') / MAX_BRIGHTNESS - MEAN
X_train = X_train.astype('float32') / MAX_BRIGHTNESS - MEAN

# k-means with 19 clusters, as there are 19 letters left in the data
kmeans = KMeans(init="k-means++", n_clusters=19, n_init=4)

# fit on the full contrast-enhanced training set, then label it
kmeans_full = kmeans.fit(X_contrast)
labels = kmeans.predict(X_contrast)

# iterations needed on the training data
print('Number of iterations Full Kmeans train data {}'.format(
    kmeans_full.n_iter_))

# scores on the full training set
for caption, scorer in (
        ('Homogeneity Score Full Train Dataset: {}', homogeneity_score),
        ('Completeness Score Full Train Dataset: {}', completeness_score),
        ('V-score Score Full train Dataset: {}', v_measure_score)):
    print(caption.format(scorer(y_train, labels)))
def Evaluate_old(listtuple_pred_true_text, ignoreMinusOne=False): preds = [] trues = [] new_listtuple_pred_true_text = [] totalwords = 0 for pred_true_text in listtuple_pred_true_text: if str(pred_true_text[1]) == '-1' and ignoreMinusOne == True: continue preds.append(pred_true_text[0]) trues.append(pred_true_text[1]) new_listtuple_pred_true_text.append( [pred_true_text[0], pred_true_text[1], pred_true_text[2]]) totalwords += len(pred_true_text[2]) #print(pred_true_text[2], totalwords) print("evaluate total texts=" + str(len(new_listtuple_pred_true_text))) score = metrics.homogeneity_score(trues, preds) print("homogeneity_score-whole-data: %0.8f" % score) score = metrics.completeness_score(trues, preds) print("completeness_score-whole-data: %0.8f" % score) score = metrics.v_measure_score(trues, preds) print("v_measure_score-whole-data: %0.8f" % score) score = metrics.normalized_mutual_info_score(trues, preds, average_method='arithmetic') print("nmi_score-whole-data: %0.8f" % score) #score=metrics.adjusted_mutual_info_score(trues, preds) #print ("adjusted_mutual_info_score-whole-data: %0.4f" % score) #score=metrics.adjusted_rand_score(trues, preds) #print ("adjusted_rand_score-whole-data: %0.4f" % score) dic_tupple_class = groupItemsBySingleKeyIndex(new_listtuple_pred_true_text, 0) #before 0 dic_tupple_class_true = groupItemsBySingleKeyIndex( new_listtuple_pred_true_text, 1) #before 1 print("pred clusters=" + str(len(dic_tupple_class)) + ", true clusters=" + str(len(dic_tupple_class_true))) ComputePurity(dic_tupple_class) li = [ len(dic_tupple_class_true[x]) for x in dic_tupple_class_true if isinstance(dic_tupple_class_true[x], list) ] print('min', min(li), 'max', max(li), 'median', statistics.median(li), 'avg', statistics.mean(li), 'std', statistics.stdev(li), 'sum of li', sum(li)) print('avg words per text', totalwords / len(new_listtuple_pred_true_text), 'totalwords', totalwords, '#texts', len(new_listtuple_pred_true_text)) '''print("---Pred distribution")
################################################################### Kclusters = range(2,50,2) km_sil_scores = [] km_homo_scores = [] km_inertia_scores = [] km_fitness_times = [] for k in Kclusters: t1 = time.time() km = KMeans(n_clusters=k, n_init=10,random_state=100,n_jobs=-1).fit(X1) t2 = time.time() km_fitness_times.append(t2 - t1) km_sil_scores.append(silhouette_score(X1, km.labels_)) km_homo_scores.append(homogeneity_score(Y1, km.labels_)) km_inertia_scores.append(km.inertia_) em_sil_scores = [] em_homo_scores = [] em_aic_scores = [] em_bic_scores = [] em_fitness_times = [] for k in Kclusters: t1 = time.time() em = GaussianMixture(n_components=k,covariance_type='diag',n_init=1,warm_start=True,random_state=100).fit(X1) t2 = time.time() em_fitness_times.append(t2 - t1)
6: "total sulfur dioxide (mg/dm^3)", 7: "density(g/cm^3)", 8: "pH", 9: "sulphates (g/dm^3)", 10: "alcohol (vol.%)", 11: "quality" } wine_2 = wine_2.rename(columns=mapping_2) wine_2 = wine_2.drop(['quality'], axis=1) kmeans = KMeans(n_clusters=7, random_state=0).fit(wine_2) print("For 7 clusters, comparing them to the wine quality:") print("Silhouette score", metrics.silhouette_score(wine_2, kmeans.labels_)) print("Completeness score", metrics.completeness_score(wine["quality"], kmeans.labels_)) print("Homogeneity score", metrics.homogeneity_score(wine["quality"], kmeans.labels_)) #testing cluster sizes store = [] for i in range(3, 10): kmeans = KMeans(n_clusters=i, random_state=0).fit(wine_2) store.append((metrics.silhouette_score(wine_2, kmeans.labels_), i)) plt.scatter([s[1] for s in store], [s[0] for s in store]) plt.xlabel("Clusters") plt.ylabel("Silhouette score") plt.savefig("clusters.png") plt.close() #graphs showing the groupings obtained
batch_size=1000, verbose=opts.verbose) else: km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, verbose=opts.verbose) print("Clustering sparse data with %s" % km) t0 = time() km.fit(X) print("done in %0.3fs" % (time() - t0)) print() print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_)) print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_)) print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_)) print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_)) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, km.labels_, sample_size=1000)) print() if not opts.use_hashing: print("Top terms per cluster:") if opts.n_components: original_space_centroids = svd.inverse_transform(km.cluster_centers_) order_centroids = original_space_centroids.argsort()[:, ::-1]
np.sum(evidenceList[allClusters[i] == lowClusterNo[i]] == "N")) # TN allResults[i][3] = float( np.sum(evidenceList[allClusters[i] == lowClusterNo[i]] == "Y")) # FN # Evaluating cluster validation scores # Each valList element contains [Adjusted Rand, Mutual Info, Adjusted Mutual Info, Normalized Mutual Info, Homogeneity, Completeness, V Measure] valList = [] for i in range(0, len(allClusters)): valItem = [] valItem.append(metrics.adjusted_rand_score(evidenceList, allClusters[i])) valItem.append(metrics.mutual_info_score(evidenceList, allClusters[i])) valItem.append( metrics.adjusted_mutual_info_score(evidenceList, allClusters[i])) valItem.append( metrics.normalized_mutual_info_score(evidenceList, allClusters[i])) valItem.append(metrics.homogeneity_score(evidenceList, allClusters[i])) valItem.append(metrics.completeness_score(evidenceList, allClusters[i])) valItem.append(metrics.v_measure_score(evidenceList, allClusters[i])) valList.append(valItem) # Writing results statFile = open( outputLocationStat + inputFileName.split("/")[-1].split(".")[0] + "_" + algorithm + "_" + method + "_" + distance + "_" + str(noClust) + ".stat", "w") for i in range(0, len(allResults)): statFile.write("# " + allLabels[i] + "\n") tp = allResults[i][0] fp = allResults[i][1] tn = allResults[i][2] fn = allResults[i][3]
# For each k, compute the silhouette coefficient with
# silhouette_score(X_training, kmeans.labels_) and remember it, so that the
# k maximising the coefficient can be picked afterwards.
# NOTE(review): `k`, `kmeans` and `silhouette_scores` are defined above this
# excerpt -- presumably inside a per-k fitting loop; confirm the placement of
# the next two statements relative to that loop.
silhouette_coeff = silhouette_score(X_training, kmeans.labels_)
silhouette_scores[k] = silhouette_coeff

# Plot the silhouette coefficient for each k so the best k is visible.
k_value = list(range(2, 21))
# Materialise the dict views: they are not sequences on Python 3.
plt.plot(list(silhouette_scores.keys()), list(silhouette_scores.values()))
# plt.show()

# Single-entry dict {k: score} holding the highest silhouette score.
best_k = dict(
    sorted(silhouette_scores.items(), key=itemgetter(1), reverse=True)[:1])

# Read the validation data (reference cluster labels) with pandas.
df1 = pd.read_csv('testing.csv', header=None)

# Flatten the loaded values into a 1-D label vector.
labels = np.array(df1.values).reshape(1, -1)[0]

# Calculate and print the homogeneity of the k-means clustering.
# NOTE(review): `kmeans` is whatever model was fitted last above, which is
# not necessarily the one fitted with the best k -- confirm this is intended.
print("K-Means Homogeneity Score = "
      + str(metrics.homogeneity_score(labels, kmeans.labels_)))

# Run agglomerative clustering using the best k found by k-means.
# `dict.keys()` is a view on Python 3 and cannot be indexed, so take the
# first (and only) key through an iterator instead.
agg = AgglomerativeClustering(n_clusters=next(iter(best_k)), linkage='ward')
agg.fit(X_training)

# Calculate and print the homogeneity of the agglomerative clustering.
print("Agglomerative Clustering Homogeneity Score = "
      + str(metrics.homogeneity_score(labels, agg.labels_)))
# In[ ]: x = data[['0', '1']].values x.shape # In[ ]: #printing results print('labels:') # print(labelsPred) # tEnd = datetime.datetime.now() # print('Time: ' + str(tEnd - tStart)) print('Measures:') print('HS: ' + str(metrics.homogeneity_score(y, labelsPred))) print('CS: ' + str(metrics.completeness_score(y, labelsPred))) print('VM: ' + str(metrics.v_measure_score(y, labelsPred))) print('AMI: ' + str(metrics.adjusted_mutual_info_score(y, labelsPred))) print('ARI: ' + str(metrics.adjusted_rand_score(y, labelsPred))) # In[ ]: import matplotlib.pyplot as plt from itertools import cycle, islice fig = plt.figure() colors = np.array( list( islice( cycle([
# Convert X to a dense array via its toarray() method before clustering.
X = X.toarray()
# y = np.array(labels)
# Call-style print works on both Python 2 and Python 3 for a single argument.
print("Affinity Clustering...")
print(X)

##############################################################################
# Compute Affinity Propagation
af = AffinityPropagation().fit(X)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

# One exemplar index per discovered cluster.
n_clusters_ = len(cluster_centers_indices)

print('Estimated number of clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))

##############################################################################
# Plot result
import pylab as pl
from itertools import cycle
def _flat_int32(values):
    """Return ``values`` as a flat (1-D) ``numpy.int32`` array."""
    return numpy.reshape(numpy.array(values, dtype=numpy.int32), [-1])


def _add_cluster_agreement_scores(measures_result, labels_true, labels_pred):
    """Write AMI, homogeneity and completeness of a labelling into ``measures_result``.

    The three scores are stored under ``Constants.ADJUSTED_MUTUAL_INFORMATION``,
    ``Constants.HOMOGENEITY`` and ``Constants.COMPLETENESS``; existing entries
    under those keys are overwritten.
    """
    # Imported lazily so scikit-learn is only needed when these scores are
    # actually requested.
    from sklearn.metrics import (adjusted_mutual_info_score,
                                 completeness_score, homogeneity_score)

    measures_result[Constants.ADJUSTED_MUTUAL_INFORMATION] = \
        adjusted_mutual_info_score(labels_true, labels_pred)
    measures_result[Constants.HOMOGENEITY] = \
        homogeneity_score(labels_true, labels_pred)
    measures_result[Constants.COMPLETENESS] = \
        completeness_score(labels_true, labels_pred)


def calc_measures_avg(measures, n_imgs, ignore_classes, for_final_result):
    """Reduce accumulated measures to their final / per-image averaged values.

    Args:
        measures: Container keyed by ``Constants`` entries (membership tests
            and item access are used) holding the accumulated raw measures.
        n_imgs: Number of images the measures were accumulated over; used as
            the divisor for all averaged quantities.
        ignore_classes: Forwarded unchanged to ``calc_iou``.
        for_final_result: When true, the detection-AP and clustering
            evaluations are performed as well.

    Returns:
        A dict mapping ``Constants`` keys to the reduced values. Only keys
        whose inputs are present in ``measures`` appear in the result.
    """
    measures_result = {}

    # These measures can simply be summed and divided by the image count.
    averaged_keys = (
        Constants.ERRORS, Constants.IOU, Constants.BINARY_IOU, Constants.AP,
        Constants.MOTA, Constants.MOTP, Constants.AP_INTERPOLATED,
        Constants.FALSE_POSITIVES, Constants.FALSE_NEGATIVES,
        Constants.ID_SWITCHES,
    )
    for measure in averaged_keys:
        if measure in measures:
            measures_result[measure] = numpy.sum(measures[measure]) / n_imgs

    # TODO: This has to be added as IOU instead of conf matrix.
    # When a confusion matrix is present, the IOU derived from it replaces
    # any averaged IOU stored by the loop above.
    if Constants.CONFUSION_MATRIX in measures:
        measures_result[Constants.IOU] = calc_iou(measures, n_imgs,
                                                  ignore_classes)

    if Constants.CLICKS in measures:
        # Each entry is a string whose click count follows the last ':'.
        clicks = [int(x.rsplit(':', 1)[-1]) for x in measures[Constants.CLICKS]]
        measures_result[Constants.CLICKS] = float(numpy.sum(clicks)) / n_imgs

    if for_final_result and Constants.DETECTION_AP in measures:
        from object_detection.utils.object_detection_evaluation import ObjectDetectionEvaluation
        detection = measures[Constants.DETECTION_AP]
        if isinstance(detection, ObjectDetectionEvaluation):
            evaluator = detection
        else:
            # Otherwise the second-to-last element is used as the class count
            # and a fresh evaluator is filled from the stored data.
            n_classes = detection[-2]
            evaluator = ObjectDetectionEvaluation(n_classes,
                                                  matching_iou_threshold=0.5)
            evaluator.next_image_key = 0  # add a new field which we will use
            _add_aps(evaluator, detection)
        aps, mAP, _, _, _, _ = evaluator.evaluate()
        measures_result[Constants.DETECTION_APS] = aps
        measures_result[Constants.DETECTION_AP] = mAP

    if (for_final_result and Constants.CLUSTER_IDS in measures
            and Constants.ORIGINAL_LABELS in measures):
        _add_cluster_agreement_scores(
            measures_result,
            _flat_int32(measures[Constants.ORIGINAL_LABELS]),
            _flat_int32(measures[Constants.CLUSTER_IDS]))

    # Local switch to disable the (expensive) embedding clustering evaluation.
    NO_EVAL = False
    if (not NO_EVAL and for_final_result
            and Constants.ORIGINAL_LABELS in measures
            and Constants.EMBEDDING in measures):
        import time
        from sklearn.cluster import KMeans

        # NOTE(review): the embeddings are cast to int32, which truncates any
        # fractional part -- confirm this is intended for the embedding data.
        embeddings = numpy.array(measures[Constants.EMBEDDING],
                                 dtype=numpy.int32)
        embeddings = numpy.reshape(embeddings, [-1, embeddings.shape[-1]])
        labels_true = _flat_int32(measures[Constants.ORIGINAL_LABELS])

        # Previously tried cluster counts: 80, 400, 1000. A GaussianMixture
        # with full covariance was also tried in place of k-means.
        n_components = 3000

        start = time.time()
        try:
            kmeans = KMeans(n_clusters=n_components, n_jobs=-1)
        except TypeError:
            # scikit-learn >= 1.0 no longer accepts `n_jobs` for KMeans.
            kmeans = KMeans(n_clusters=n_components)
        labels_pred = kmeans.fit_predict(embeddings)
        print("km took ", time.time() - start)

        # Overwrites the scores computed from CLUSTER_IDS above, if any.
        _add_cluster_agreement_scores(measures_result, labels_true,
                                      labels_pred)

    return measures_result
shuffle(shuffleind) zipper = sorted((zip(shuffleind, reads, read_parent_id))) z, reads, read_parent_id = (list(t) for t in zip(*zipper)) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #HOMOGENEITY AND COMPLETENESS FOR FULL DATASET WITH PREVIOUSLY CALCULATED GOOD_THRESH! #Find suitable threshold print("\n\nThreshold approximation:") good_thresh = find_threshold(50, 15, 40, 1) print("\n\nFull clustering:") clus_N, cluster_sizes, read_labels = simplesim_cluster(good_thresh) print('\n\n~~~~~~~INFO~~~~~~~') hom = metrics.homogeneity_score(read_parent_id, read_labels) comp = metrics.completeness_score(read_parent_id, read_labels) print("Homogeneity: %0f" % hom) print("Completeness: %0f" % comp) homogeneity_lst.append(hom) completeness_lst.append(comp) print("\nDONE TRIPLICATES LOOP\n") cluster_eff_lst_lst.append(cluster_eff_lst) homogeneity_lst_lst.append(homogeneity_lst) completeness_lst_lst.append(completeness_lst) #PANDAS DATAFRAME FROM THIS?!
max_features=10000, min_df=2, stop_words='english', use_idf=True) matrix = vectorizer.fit_transform(dataset.data) print("n_samples: %d, n_features: %d" % matrix.shape) print() #降维 print("Performing dimensionality reduction using LSA") t0 = time() svd = TruncatedSVD(2) #维度 normalizer = Normalizer(copy=False) lsa = make_pipeline(svd, normalizer) matrix_l = lsa.fit_transform(matrix) # ############################################################################# # Do the actual clustering gmm = mixture.GaussianMixture(n_components=50, covariance_type='full') labels = gmm.fit(matrix_l).predict(matrix_l) labels_pred = labels print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_ture, labels_pred)) print("Completeness: %0.3f" % metrics.completeness_score(labels_ture, labels_pred)) print("NMI: %0.3f" % metrics.normalized_mutual_info_score( labels_ture, labels_pred, average_method='arithmetic'))
ax[1].set_title('Actual Training Labels')

# Show the plots
plt.show()

# Evaluation of Clustering Model

# Import `metrics` from `sklearn`
from sklearn import metrics

# Print out the confusion matrix with `confusion_matrix()`
print(metrics.confusion_matrix(y_test, y_pred))

from sklearn.metrics import (homogeneity_score, completeness_score,
                             v_measure_score, adjusted_rand_score,
                             adjusted_mutual_info_score, silhouette_score)

# Header row followed by one row of scores for the fitted clustering model.
print('% 9s' % 'inertia homo compl v-meas ARI AMI silhouette')
print('%i %.3f %.3f %.3f %.3f %.3f %.3f'
      % (clf.inertia_,
         homogeneity_score(y_test, y_pred),
         completeness_score(y_test, y_pred),
         v_measure_score(y_test, y_pred),
         adjusted_rand_score(y_test, y_pred),
         adjusted_mutual_info_score(y_test, y_pred),
         silhouette_score(X_test, y_pred, metric='euclidean')))

# Try out Support Vector Machines

# Import `train_test_split`. It lives in `sklearn.model_selection`; the old
# `sklearn.cross_validation` module was removed in scikit-learn 0.20 and is
# kept only as a fallback for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Split the data into training and test sets
X_train, X_test, y_train, y_test, images_train, images_test = train_test_split(
    digits.data, digits.target, digits.images, test_size=0.25, random_state=42)

# Import the `svm` model
from sklearn import svm