def test():
    data = SimData(400, 4, 15)
    cor = np.nan_to_num(np.corrcoef(data.answers, rowvar=0))  # pearson metric
    cor = np.nan_to_num(np.corrcoef(cor))
    label1 = kmeans2(cor, 6, minit='points', iter=100)[1]  # hack: number of components
    label2 = kmeans(cor, 6, True)
    xs, ys = mds(cor, euclid=True)
    plt.subplot(1, 2, 1)
    plt.title('kmeans2 ' + str(adjusted_rand_score(data.item_concept, label1)))
    plot_clustering(
        range(cor.shape[0]), xs, ys,
        labels=label1,
        shapes=data.item_concept,
    )
    plt.subplot(1, 2, 2)
    plt.title('Kmeans ' + str(adjusted_rand_score(data.item_concept, label2)))
    plot_clustering(
        range(cor.shape[0]), xs, ys,
        labels=label2,
        shapes=data.item_concept,
    )
    plt.show()
def aggregate_stats(infiles, outfile): """ Combine all the aggstats into a single file Compute summary statistics """ res = [] for infile in infiles: d = pickle.load(open(infile, 'r')) print "The file is", infile assigndf = d['df'] meta = d['meta'] neurons = meta['neurons'] m = extract_metadata(infile) if len(m) == 0: # skip the stupid non-replicated ones continue for k, v in m.iteritems(): assigndf[k] = v assigndf['true_assign_role'] = [np.array(neurons['role']) for _ in range(len(assigndf))] # compute the statistics assigndf['ari'] = assigndf.apply(lambda x : metrics.adjusted_rand_score(x['true_assign'], irm.util.canonicalize_assignment(x['assign'])), axis=1) assigndf['homogeneity'] = assigndf.apply(lambda x : metrics.homogeneity_score(x['true_assign'], irm.util.canonicalize_assignment(x['assign'])), axis=1) assigndf['completeness'] = assigndf.apply(lambda x : metrics.completeness_score(x['true_assign'], irm.util.canonicalize_assignment(x['assign'])), axis=1) # don't consider the ones where the role is "none" as these are multi-role ones neurons.ix[neurons['role'].isnull(), 'role'] = 'I' assigndf['role_ari'] = assigndf.apply(lambda x : metrics.adjusted_rand_score(neurons['role'], irm.util.canonicalize_assignment(x['assign'])), axis=1) assigndf['role_homogeneity'] = assigndf.apply(lambda x : metrics.homogeneity_score(neurons['role'], irm.util.canonicalize_assignment(x['assign'])), axis=1) assigndf['role_completeness'] = assigndf.apply(lambda x : metrics.completeness_score(neurons['role'], irm.util.canonicalize_assignment(x['assign'])), axis=1) assigndf['type_n_true'] = assigndf.apply(lambda x : len(np.unique(x['true_assign'])), axis=1) assigndf['type_n_learned'] = assigndf.apply(lambda x : len(np.unique(x['assign'])), axis=1) assigndf['auc'] = assigndf.apply(lambda x: metrics.roc_auc_score(x['heldout_link_truth'], x['heldout_link_predprob']), axis=1) #assigndf['f1'] = assigndf.apply(lambda x: metrics.f1_score(x['heldout_link_truth'], x['heldout_link_predprob']), axis=1) # # fraction of mass in top N types res.append(assigndf) alldf = pandas.concat(res) pickle.dump(alldf, open(outfile, 'w'), -1)
def acc_ari(X, lbls_true, lbls_pred, reject, strat_lbl_inds, use_strat=False):
    if use_strat:
        ari = metrics.adjusted_rand_score(lbls_true[strat_lbl_inds], lbls_pred[strat_lbl_inds])
        perc = np.int(np.float(len(strat_lbl_inds)) / np.float(lbls_true.size) * 100.0)
        desc = ('ARI (strat={0})'.format(perc), 'ARI')
    else:
        ari = metrics.adjusted_rand_score(lbls_true, lbls_pred)
        desc = ('ARI', 'ARI')
    return ari, desc
def test_affinities():
    X, y = make_blobs(n_samples=40, random_state=1, centers=[[1, 1], [-1, -1]],
                      cluster_std=0.4)
    # nearest neighbors affinity
    sp = SpectralClustering(n_clusters=2, affinity="nearest_neighbors",
                            random_state=0)
    labels = sp.fit(X).labels_
    assert_equal(adjusted_rand_score(y, labels), 1)

    sp = SpectralClustering(n_clusters=2, gamma=2, random_state=0)
    labels = sp.fit(X).labels_
    assert_equal(adjusted_rand_score(y, labels), 1)
def kmeans(input_file, n_clusters, Output): lvltrace.lvltrace("LVLEntree dans kmeans unsupervised") ncol=tools.file_col_coma(input_file) data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1)) X = data[:,1:] y = data[:,0] sample_size, n_features = X.shape k_means=cluster.KMeans(init='k-means++', n_clusters=n_clusters, n_init=10) k_means.fit(X) reduced_data = k_means.transform(X) values = k_means.cluster_centers_.squeeze() labels = k_means.labels_ k_means_cluster_centers = k_means.cluster_centers_ print "#########################################################################################################\n" #print y #print labels print "K-MEANS\n" print('homogeneity_score: %f'%metrics.homogeneity_score(y, labels)) print('completeness_score: %f'%metrics.completeness_score(y, labels)) print('v_measure_score: %f'%metrics.v_measure_score(y, labels)) print('adjusted_rand_score: %f'%metrics.adjusted_rand_score(y, labels)) print('adjusted_mutual_info_score: %f'%metrics.adjusted_mutual_info_score(y, labels)) print('silhouette_score: %f'%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size)) print('\n') print "#########################################################################################################\n" results = Output+"kmeans_scores.txt" file = open(results, "w") file.write("K-Means Scores\n") file.write("Homogeneity Score: %f\n"%metrics.homogeneity_score(y, labels)) file.write("Completeness Score: %f\n"%metrics.completeness_score(y, labels)) file.write("V-Measure: %f\n"%metrics.v_measure_score(y, labels)) file.write("The adjusted Rand index: %f\n"%metrics.adjusted_rand_score(y, labels)) file.write("Adjusted Mutual Information: %f\n"%metrics.adjusted_mutual_info_score(y, labels)) file.write("Silhouette Score: %f\n"%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size)) file.write("\n") file.write("True Value, Cluster numbers, Iteration\n") for n in xrange(len(y)): file.write("%f, %f, %i\n"%(y[n],labels[n],(n+1))) file.close() import pylab as pl from itertools import cycle # plot the results along with the labels k_means_cluster_centers = k_means.cluster_centers_ fig, ax = plt.subplots() im=ax.scatter(X[:, 0], X[:, 1], c=labels, marker='.') for k in xrange(n_clusters): my_members = labels == k cluster_center = k_means_cluster_centers[k] ax.plot(cluster_center[0], cluster_center[1], 'w', color='b', marker='x', markersize=6) fig.colorbar(im) plt.title("Number of clusters: %i"%n_clusters) save = Output + "kmeans.png" plt.savefig(save) lvltrace.lvltrace("LVLsortie dans kmeans unsupervised")
def compare_results(block, kmA, kmL):
    blockB = [[int(b == l) for b in block] for l in xrange(6)]
    for l in xrange(5):
        print "Block "+repr(l)+" results:"
        num_diff = vn.num_diff_w_perms(blockB[l], kmA)
        ari = adjusted_rand_score(blockB[l], kmA)
        print "Adjacency: num error="+repr(num_diff)+" ari="+repr(ari)
        num_diff = vn.num_diff_w_perms(blockB[l], kmL)
        ari = adjusted_rand_score(blockB[l], kmL)
        print "Laplacian: num error="+repr(num_diff)+" ari="+repr(ari)
def compute_cluster_metrics_raw(chains, cells):
    all_chains = []
    for chain_i, chain in enumerate(chains):
        sample_latent = chain['state']
        cell_assignment = np.array(sample_latent['domains']['d1']['assignment'])
        ca = irm.util.canonicalize_assignment(cell_assignment)
        cells['cluster'] = ca

        canon_true_fine = irm.util.canonicalize_assignment(cells['type_id'])
        canon_true_coarse = irm.util.canonicalize_assignment(cells['coarse'])

        ari = metrics.adjusted_rand_score(canon_true_fine, ca)
        ari_coarse = metrics.adjusted_rand_score(canon_true_coarse, ca)
        ami = metrics.adjusted_mutual_info_score(canon_true_fine, ca)
        ami_coarse = metrics.adjusted_mutual_info_score(canon_true_coarse, ca)

        jaccard = rand.compute_jaccard(canon_true_fine, ca)
        jaccard_coarse = rand.compute_jaccard(canon_true_coarse, ca)

        ss = rand.compute_similarity_stats(canon_true_fine, ca)

        # other statistics
        # cluster count
        # average variance x, y, z
        vars = cells.groupby('cluster').var()

        chain_info = {'ari': ari,
                      'ari_coarse': ari_coarse,
                      'ami': ami,
                      'ami_coarse': ami_coarse,
                      'jaccard': jaccard,
                      'jaccard_coarse': jaccard_coarse,
                      'n11': ss['n11'],
                      'vars': vars,
                      'cluster_n': len(np.unique(cells['cluster'])),
                      'chain_i': chain_i,
                      'score': chain['scores'][-1],
                      'df': cells,
                      }
        all_chains.append(chain_info)
    df = pandas.DataFrame(all_chains)
    return df
def ARI_CrossCat(Xc, Xrv, XRc, XRrv):
    ''' Adjusted Rand Index (ARI) calculation for a CrossCat clustered table

    To calculate ARI based on the CrossCat partition, each cell in the table is
    considered as an instance to be assigned to a cluster. A cluster is defined
    by both the view index AND the category index. In other words, if, and only
    if, two cells, regardless of which columns and rows they belong to, are
    lumped into the same view and category, the two cells are considered to be
    in the same cluster.

    For a table of size Nrow x Ncol
    Xc:  (1 x Ncol) array of view assignment for each column.
         Note: It is assumed that the view indices are consecutive integers
         starting from 0. Hence, the number of views is equal to the highest
         view index plus 1.
    Xrv: (Nrow x Nview) array where each row is the assignment of categories
         for the corresponding row in the data table. The i-th element in a row
         corresponds to the category assignment of the i-th view of that row.
    XRc and XRrv have the same format as Xc and Xrv respectively.

    The ARI index is calculated from the comparison of the table clustering
    defined by (XRc, XRrv) and (Xc, Xrv).
    '''
    Xrv = Xrv.T
    XRrv = XRrv.T

    # Find the highest category index of all views
    max_cat_index = numpy.max(Xrv)
    # re-assign category indices so that they have different values in
    # different views
    Xrv = Xrv + numpy.arange(0, Xrv.shape[1]) * (max_cat_index + 1)

    # similarly for the reference partition
    max_cat_index = numpy.max(XRrv)
    XRrv = XRrv + numpy.arange(0, XRrv.shape[1]) * (max_cat_index + 1)

    # Table clustering assignment for the first partition
    CellClusterAssgn = numpy.zeros((Xrv.shape[0], Xc.size))
    for icol in range(Xc.size):
        CellClusterAssgn[:, icol] = Xrv[:, Xc[icol]]
    # Flatten the table to a 1-D array compatible with the ARI function
    CellClusterAssgn = CellClusterAssgn.reshape(CellClusterAssgn.size)

    # Table clustering assignment for the second partition
    RefCellClusterAssgn = numpy.zeros((Xrv.shape[0], Xc.size))
    for icol in range(Xc.size):
        RefCellClusterAssgn[:, icol] = XRrv[:, XRc[icol]]
    # Flatten the table
    RefCellClusterAssgn = RefCellClusterAssgn.reshape(RefCellClusterAssgn.size)

    # Compare the two partitions using ARI
    ARI = metrics.adjusted_rand_score(RefCellClusterAssgn, CellClusterAssgn)
    ARI_viewonly = metrics.adjusted_rand_score(Xc, XRc)

    return ARI, ARI_viewonly
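# Illustrative sketch (not from the original module): the key step above is to
# give categories from different views disjoint integer ranges and then flatten
# the per-cell cluster ids into 1-D arrays so sklearn's ARI can compare them.
# Because ARI is invariant to relabeling, permuted but identical groupings
# score 1.0.
import numpy as np
from sklearn import metrics

cells_a = np.array([0, 0, 1, 1, 2, 2])   # per-cell cluster ids, partition A
cells_b = np.array([5, 5, 3, 3, 9, 9])   # same grouping, different ids
print(metrics.adjusted_rand_score(cells_a, cells_b))  # -> 1.0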
def results(self, algo, hasgnc = False, filename="_"): title = self.__class__.__name__ AMI_increase = [] ARI_increase = [] rounds = 1 if hasgnc: rounds = 10 print "Runing ", algo.__name__, "for", rounds, "rounds" for i in range(rounds): vd = algo(self.g, weights = [ (lambda w: max(w,0) )(w) for w in self.g.es["weight"]] ) try: vc = vd.as_clustering() except: vc = vd #in case a VertexCluster instance is returned self.write_vertex_clustering(vc, "_weighted%s" % filename) if hasgnc: for cc in range(len(vc)): for cci in vc[cc]: self.g.vs[cci]["fastgreedy_withweight"] = str(cc) vd = algo(self.g) try: vc = vd.as_clustering() except: vc = vd #in case a VertexCluster instance is returned self.write_vertex_clustering(vc, "_unweighted%s" % filename) if hasgnc: for cc in range(len(vc)): for cci in vc[cc]: self.g.vs[cci]["fastgreedy_withoutweight"] = str(cc) #self.g.write_gml("%s.gml" % title) #print "%s.gml written with attributes" % title, #print self.g.vs.attributes() if hasgnc: #print "Weighted:" #print "Adjusted Mutual Information:", ami_weight = metrics.adjusted_mutual_info_score(self.g.vs["fastgreedy_withweight"], self.g.vs["comm"]) #print "Adjusted Rand index:", ari_weight = metrics.adjusted_rand_score(self.g.vs["fastgreedy_withweight"], self.g.vs["comm"]) #print "~"*30 #print "Unweighted:" #print "Adjusted Mutual Information:", ami_unweight = metrics.adjusted_mutual_info_score(self.g.vs["fastgreedy_withoutweight"], self.g.vs["comm"]) #print "Adjusted Rand index:", ari_unweight = metrics.adjusted_rand_score(self.g.vs["fastgreedy_withoutweight"], self.g.vs["comm"]) AMI_increase.append(ami_weight - ami_unweight) ARI_increase.append(ari_weight - ari_unweight) if hasgnc: print "Adjusted Mutual Information increases by", print 1.0 * sum(AMI_increase) / len(AMI_increase) print "Adjusted Rand index increases by", print 1.0 * sum(ARI_increase) / len(ARI_increase) print "-" * 20 return AMI_increase
def tracking(self, d_start=gb.D_START_TRACKING, d_end=gb.D_END_TRACKING, path=""): print("\n --------- tracking ...") times_fsp, axes_fsp, labels_fsp = [], [], [] times_ssp, axes_ssp, labels_ssp = [], [], [] timedelta = datetime.timedelta( milliseconds=60 * 60 * 1000) # read chunk by chunk (each chunk is of 'timedelta' milliseconds) date = d_start while date < d_end: if date + timedelta >= d_end: timedelta = d_end - date times, axes, labels = self.predict_fsp(d_start=date, d_end=date + timedelta) # self.plot_colored_signals(times, axes, labels, path, figname="_FSP.png") times_fsp += times; axes_fsp += axes; labels_fsp += labels times, axes, labels = self.predict_ssp(d_start=date, d_end=date + timedelta, update=True) # self.plot_colored_signals(times, axes, labels, path, figname="_SSP.png") times_ssp += times; axes_ssp += axes; labels_ssp += labels date += timedelta # ---------------------------- if gb.ARTIFICIAL: times, values, true_labels = self.sigReaders[0].getSignal(start=d_start, end=d_end, dated=gb.DATED, get_modes=True) ari_fps = adjusted_rand_score(true_labels, labels_fsp); ari_sps = adjusted_rand_score(true_labels, labels_ssp) ami_fps = adjusted_mutual_info_score(true_labels, labels_fsp); ami_sps = adjusted_mutual_info_score(true_labels, labels_ssp) ho_fps, com_fps, vm_fps = homogeneity_completeness_v_measure(true_labels, labels_fsp); ho_sps, com_sps, vm_sps = homogeneity_completeness_v_measure(true_labels, labels_ssp) print("---------------------------------------------------") print("adjusted_rand_score \t (ari_fps, ari_sps)", (ari_fps, ari_sps)) print("adjusted_mutual_info \t (ami_fps, ami_sps)", (ami_fps, ami_sps)) print("homogeneity \t (ho_fps, ho_sps)", (ho_fps, ho_sps)) print("completeness \t (com_fps, com_sps)", (com_fps, com_sps)) print("v_measure \t (vm_fps, vm_sps)", (vm_fps, vm_sps)) #return (ari_fps, ari_sps), (ami_fps, ami_sps), (ho_fps, ho_sps), (com_fps, com_sps), (vm_fps, vm_sps) return ((ari_fps, ari_sps), (ami_fps, ami_sps), (ho_fps, ho_sps), (com_fps, com_sps), (vm_fps, vm_sps)), (times_fsp,axes_fsp,labels_fsp,times_ssp,axes_ssp,labels_ssp) else: return 0., 0.
def test_evaluate_meta_tree_result(self):
    scores = evaluate_meta_tree_result(
        self.true_events,
        self.pred_events,
        self.all_entry_ids,
        methods=[metrics.adjusted_rand_score]
    )
    assert_almost_equal(
        metrics.adjusted_rand_score([2, 2, 1, 1, 1, 1],
                                    [2, 2, 0, 1, 0, 1]),
        scores["adjusted_rand_score"]
    )
    assert_almost_equal(
        metrics.adjusted_rand_score([2, 2, 0, 0, 1, 1, 1, 1, 0, 0],
                                    [2, 2, 0, 0, 0, 1, 0, 1, 1, 0]),
        scores["adjusted_rand_score(all)"],
    )
    assert_almost_equal(0.8, scores["precision"])
    assert_almost_equal(2 / 3.0, scores["recall"])
    assert_almost_equal(8 / 11.0, scores["f1"])
def kmeans_analysis(G): block = nx.get_node_attributes(G,'block').values() xA, xL = get_embedding(G,2) cA,kmA,_ = k_means(xA,2) cB,kmL,_ = k_means(xL,2) # plt.subplot(221); plt.scatter(xA[:,0],xA[:,1],c=block) # plt.subplot(222); plt.scatter(xA[:,0],xA[:,1],c=kmA) # plt.subplot(223); plt.scatter(xL[:,0],xL[:,1],c=block) # plt.subplot(224); plt.scatter(xL[:,0],xL[:,1],c=kmL) ax = plt.subplot(121); plt.scatter(xA[:,0],xA[:,1],c=block,marker='x') ax.set_aspect('equal','datalim') lim = plt.axis() a = cA[0,:]-cA[1,:] a = np.array([1, -a[0]/a[1]]) b = np.mean(cA,axis=0) x = np.array([b+a,b-a]) plt.plot(x[:,0],x[:,1],'k--',linewidth=1) plt.axis(lim) ax = plt.subplot(122); plt.scatter(xL[:,0],xL[:,1],c=block,marker='x') ax.set_aspect('equal','datalim') lim = plt.axis() a = cB[0,:]-cB[1,:] a = np.array([1, -a[0]/a[1]]) b = np.mean(cB,axis=0) x = np.array([b+a,b-a]) plt.plot(x[:,0],x[:,1],'k--',linewidth=1) plt.axis(lim) compare_results(block,kmA,kmL) _,kmA,_ = k_means(xA,5) _,kmL,_ = k_means(xL,5) print "ALL FIVE" num_diff = vn.num_diff_w_perms(block, kmA) ari = adjusted_rand_score(block,kmA) print "Adjacency: num error="+repr(num_diff)+" ari="+repr(ari) num_diff = vn.num_diff_w_perms(block, kmL) ari = adjusted_rand_score(block,kmL) print "Laplacian: num error="+repr(num_diff)+" ari="+repr(ari)
def test_spectral_clustering(eigen_solver, assign_labels):
    S = np.array([[1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
                  [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
                  [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
                  [0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0],
                  [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
                  [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
                  [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0]])

    for mat in (S, sparse.csr_matrix(S)):
        model = SpectralClustering(random_state=0, n_clusters=2,
                                   affinity='precomputed',
                                   eigen_solver=eigen_solver,
                                   assign_labels=assign_labels
                                   ).fit(mat)
        labels = model.labels_
        if labels[0] == 0:
            labels = 1 - labels

        assert adjusted_rand_score(labels, [1, 1, 1, 0, 0, 0, 0]) == 1

        model_copy = pickle.loads(pickle.dumps(model))
        assert model_copy.n_clusters == model.n_clusters
        assert model_copy.eigen_solver == model.eigen_solver
        assert_array_equal(model_copy.labels_, model.labels_)
def check_clustering(name, Alg):
    X, y = make_blobs(n_samples=50, random_state=1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    n_samples, n_features = X.shape
    # catch deprecation and neighbors warnings
    with warnings.catch_warnings(record=True):
        alg = Alg()
    set_fast_parameters(alg)
    if hasattr(alg, "n_clusters"):
        alg.set_params(n_clusters=3)
    set_random_state(alg)
    if name == 'AffinityPropagation':
        alg.set_params(preference=-100)
        alg.set_params(max_iter=100)

    # fit
    alg.fit(X)
    # with lists
    alg.fit(X.tolist())

    assert_equal(alg.labels_.shape, (n_samples,))
    pred = alg.labels_
    assert_greater(adjusted_rand_score(pred, y), 0.4)
    # fit another time with ``fit_predict`` and compare results
    if name == 'SpectralClustering':
        # there is no way to make Spectral clustering deterministic :(
        return
    set_random_state(alg)
    with warnings.catch_warnings(record=True):
        pred2 = alg.fit_predict(X)
    assert_array_equal(pred, pred2)
def eval_clustering(labels_true, labels_guess):
    """
    Given the ground truth and our guessed clustering assignment, use the
    Adjusted Rand index to measure assignment similarity

    :return: Adjusted Rand index
    """
    return metrics.adjusted_rand_score(labels_true, labels_guess)
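# Minimal usage sketch (illustrative; assumes `eval_clustering` above is in
# scope with sklearn's `metrics` imported): ARI only looks at the grouping, not
# the particular integer labels, and can go below zero for worse-than-chance
# agreement.
print(eval_clustering([0, 0, 1, 1], [1, 1, 0, 0]))  # 1.0, identical grouping
print(eval_clustering([0, 0, 1, 1], [0, 1, 0, 1]))  # -0.5, worse than chance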
def cluster(X, label):
    labels_true = X[label]
    X = X.drop(['churn', 'appetency', 'upselling', label], axis='columns')
    name = 'AffinityPropagation'
    est = AffinityPropagation(preference=-50)
    adjustD = {}
    clusters_n = {}
    db = est.fit(X)
    labels = db.labels_
    adjustD[name] = metrics.adjusted_rand_score(labels_true, labels)
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    clusters_n[name] = n_clusters_
    print('Estimated estimator: %s' % name)
    print('Estimated number of clusters: %d' % n_clusters_)
    print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
def cluster_evaluation(D, y_true, n_clusters, eps=0.8, min_samples=10):
    # Extract Y true
    labels_true = y_true

    # transform distance matrix into a similarity matrix
    S = 1 - D

    # compute DBSCAN
    #db = DBSCAN(eps=eps, min_samples=min_samples).fit(S)
    db = Ward(n_clusters=n_clusters).fit(S)
    #core_samples = db.core_sample_indices_
    labels = db.labels_

    # number of clusters in labels, ignoring noise if present
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    print 'Number of clusters: %d' % n_clusters_
    print 'Homogeneity: %0.3f' % metrics.homogeneity_score(labels_true, labels)
    print 'Completeness: %0.3f' % metrics.completeness_score(labels_true, labels)
    print 'V-measure: %0.3f' % metrics.v_measure_score(labels_true, labels)
    print 'Adjusted Rand Index: %0.3f' % metrics.adjusted_rand_score(labels_true, labels)
    print 'Adjusted Mutual Information: %0.3f' % metrics.adjusted_mutual_info_score(labels_true, labels)
    print 'Silhouette Coefficient: %0.3f' % metrics.silhouette_score(D, labels, metric='precomputed')
def compare(method1, method2, fig=False): X1 = np.load('{0}_{1}_X_2d.npy'.format(species, method1)) X2 = np.load('{0}_{1}_X_2d.npy'.format(species, method2)) print 'n_cluster\tHomo\tCompl\tNMI\tARI' for i in range(2, 6): clust1 = Clustering(species, method1, X1, None, n_clusters=i) clust2 = Clustering(species, method2, X2, None, n_clusters=i) clust1.agglomerative(linkage='ward') clust2.agglomerative(linkage='ward') label1 = clust1.pred_labels('ward') label2 = clust2.pred_labels('ward') if i == 3 and fig: names = np.unique(label1) figName = '{0}_{1}_on_{2}'.format(species, method1, method2) plot2d(X2, label1, names, figName, figName) names = np.unique(label2) figName = '{0}_{1}_on_{2}'.format(species, method2, method1) plot2d(X1, label2, names, figName, figName) print '{0}\t{1}\t{2}\t{3}\t{4}\n'.format(i, metrics.homogeneity_score(label1, label2), metrics.completeness_score(label1, label2), metrics.normalized_mutual_info_score(label1, label2), metrics.adjusted_rand_score(label1, label2))
def cluster(): eps_set = 0.5 * np.arange(1, 7) npt_set = np.arange(1, 6) scores = [] global res res = [] for eps in eps_set: for npt in npt_set: est = DBSCAN(eps=eps, min_samples=npt) est.fit(x) ari = metrics.adjusted_rand_score(y, est.labels_) scores.append(ari) n_noise = len([ l for l in est.labels_ if l == -1]) res.append((ari, np.max(est.labels_) + 1 , n_noise)) print ari max_score = np.max(scores) max_idx = scores.index(max_score) max_eps = eps_set[max_idx / len(npt_set)] max_npt = npt_set[max_idx % len(npt_set)] print max_score, max_eps, max_npt scores = np.array(scores).reshape(len(eps_set), len(npt_set)) pl.imshow(scores, interpolation='nearest', cmap=pl.cm.spectral) pl.colorbar() pl.xticks(np.arange(len(npt_set)), npt_set) pl.yticks(np.arange(len(eps_set)), eps_set) pl.ylabel('eps') pl.xlabel('min_samples') pl.show()
def test_affinities():
    X, y = make_blobs(n_samples=40, random_state=1, centers=[[1, 1], [-1, -1]],
                      cluster_std=0.4)
    # nearest neighbors affinity
    sp = SpectralClustering(n_clusters=2, affinity='nearest_neighbors',
                            random_state=0)
    labels = sp.fit(X).labels_
    assert_equal(adjusted_rand_score(y, labels), 1)

    sp = SpectralClustering(n_clusters=2, gamma=2, random_state=0)
    labels = sp.fit(X).labels_
    assert_equal(adjusted_rand_score(y, labels), 1)

    # raise error on unknown affinity
    sp = SpectralClustering(n_clusters=2, affinity='<unknown>')
    assert_raises(ValueError, sp.fit, X)
def compare_direct_undir():
    from sklearn import metrics
    g = gt.Graph.Read_GraphML('ed_tag.graphml')
    gt.net_stat(g)
    gu = gt.Graph.Read_GraphML('ed_tag_undir.graphml')
    gt.net_stat(gu)
    com = g.community_infomap(edge_weights='weight', vertex_weights='weight')
    comu1 = gu.community_infomap(edge_weights='weight', vertex_weights='weight')
    comu2 = gu.community_infomap(edge_weights='weight', vertex_weights='weight')
    mem = com.membership
    memu1 = comu1.membership
    memu2 = comu2.membership
    print metrics.adjusted_rand_score(mem, memu1)
    print metrics.normalized_mutual_info_score(mem, memu1)
    print metrics.adjusted_rand_score(memu2, memu1)
    print metrics.normalized_mutual_info_score(memu2, memu1)
def compute_metrics(answers, predictions): aris = [] vscores = [] fscores = [] weights = [] for k in answers.keys(): idx = np.argsort(np.array(answers[k][0])) true = np.array(answers[k][1])[idx] pred = np.array(predictions[k][1]) weights.append(pred.shape[0]) if len(np.unique(true)) > 1: aris.append(adjusted_rand_score(true, pred)) vscores.append(v_measure_score(true, pred)) fscores.append(compute_fscore(true, pred)) # print '%s: ari=%f, vscore=%f, fscore=%f' % (k, aris[-1], vscores[-1], fscores[-1]) aris = np.array(aris) vscores = np.array(vscores) fscores = np.array(fscores) weights = np.array(weights) print 'number of one-sense words: %d' % (len(vscores) - len(aris)) print 'mean ari: %f' % np.mean(aris) print 'mean vscore: %f' % np.mean(vscores) print 'weighted vscore: %f' % np.sum(vscores * (weights / float(np.sum(weights)))) print 'mean fscore: %f' % np.mean(fscores) print 'weighted fscore: %f' % np.sum(fscores * (weights / float(np.sum(weights)))) return np.mean(aris),np.mean(vscores)
def test():
    global est
    est = DBSCAN(eps=1, min_samples=1)
    est.fit(x)
    print est.labels_
    ari = metrics.adjusted_rand_score(y, est.labels_)
    print ari
def check_iris(): iris = ds.load_iris() data = iris.data[:100] # data y_iris = iris.target[:100] # clusters # pred_optics = OPTICS(eps=10, min_pts=4).fit_predict(data, dbscan=True, dbscan_eps=0.75) pred_optics = OPTICS(eps=0.6, min_pts=5).fit_predict(data, xi=0.3) pl.subplot(2, 2, 1) pl.scatter(data[:, 0], data[:, 1], c=y_iris, cmap=pl.cm.RdBu, lw=0, s=30) pl.xlabel('Sepal length, reference clusters') pl.ylabel('Sepal width') pl.subplot(2, 2, 2) pl.scatter(data[:, 2], data[:, 3], c=y_iris, cmap=pl.cm.RdBu, lw=0, s=30) pl.xlabel('Petal length, reference clusters') pl.ylabel('Petal width') pl.subplot(2, 2, 3) pl.scatter(data[:, 0], data[:, 1], c=pred_optics, cmap=pl.cm.RdBu, lw=0, s=30) pl.xlabel('Sepal length, optics clusters') pl.ylabel('Sepal width') pl.subplot(2, 2, 4) pl.scatter(data[:, 2], data[:, 3], c=pred_optics, cmap=pl.cm.RdBu, lw=0, s=30) pl.xlabel('Petal length, optics clusters') pl.ylabel('Petal width') pl.show() print "Adjusted Rand index for iris is: %.2f" % smt.adjusted_rand_score(y_iris, pred_optics)
def get_ari(self):
    if self.mclustRes is None:
        print "No results yet. Use run(x)"
    if self._ari is None:
        mc_class = np.array(self.mclustRes['classification']).astype(int) - 1
        self._ari = metrics.adjusted_rand_score(mc_class, self.labels)
    return self._ari
def rand_compare(truth_file, estimate_file, topics_only=False, randomize_pov=False):
    truth_labels = read_clustering_file(truth_file, topics_only=topics_only)
    estimate_labels = read_clustering_file(estimate_file, topics_only=topics_only,
                                           randomize_pov=randomize_pov)
    return metrics.adjusted_rand_score(truth_labels, estimate_labels)
def run_clustering(clusterer, data, labels):
    """
    Cluster: Using a predefined and parameterized clustering algorithm, fit
    some dataset and perform metrics given a set of ground-truth labels.

        clusterer: the clustering algorithm, from sklearn
        data:      array-like dataset input
        labels:    vector of ground-truth labels
    """
    # Time the operation
    t0 = time()
    clusterer.fit(data)
    t1 = time()

    # Perform metrics
    runtime = (t1 - t0)
    homogeneity = metrics.homogeneity_score(labels, clusterer.labels_)
    completeness = metrics.completeness_score(labels, clusterer.labels_)
    v_measure = metrics.v_measure_score(labels, clusterer.labels_)
    adjusted_rand = metrics.adjusted_rand_score(labels, clusterer.labels_)
    adjusted_mutual = metrics.adjusted_mutual_info_score(labels, clusterer.labels_)

    # Output to logs
    logging.info(" |- Execution time: %fs" % runtime)
    logging.info(" |- Homogeneity: %0.3f" % homogeneity)
    logging.info(" |- Completeness: %0.3f" % completeness)
    logging.info(" |- V-measure: %0.3f" % v_measure)
    logging.info(" |- Adjusted Rand-Index: %.3f" % adjusted_rand)
    logging.info(" |- Adjusted Mutual Info: %.3f" % adjusted_mutual)
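# Hedged usage sketch (illustrative; `run_clustering` above is assumed to be in
# scope with its module's `time`, `metrics`, and `logging` imports): fit a
# KMeans instance on synthetic blobs and log the label-based metrics against
# the known generating labels.
import logging
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

logging.basicConfig(level=logging.INFO)
X_demo, y_demo = make_blobs(n_samples=300, centers=3, random_state=0)
run_clustering(KMeans(n_clusters=3, random_state=0), X_demo, y_demo)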
def test_spectral_clustering_with_arpack_amg_solvers(): # Test that spectral_clustering is the same for arpack and amg solver # Based on toy example from plot_segmentation_toy.py # a small two coin image x, y = np.indices((40, 40)) center1, center2 = (14, 12), (20, 25) radius1, radius2 = 8, 7 circle1 = (x - center1[0]) ** 2 + (y - center1[1]) ** 2 < radius1 ** 2 circle2 = (x - center2[0]) ** 2 + (y - center2[1]) ** 2 < radius2 ** 2 circles = circle1 | circle2 mask = circles.copy() img = circles.astype(float) graph = img_to_graph(img, mask=mask) graph.data = np.exp(-graph.data / graph.data.std()) labels_arpack = spectral_clustering( graph, n_clusters=2, eigen_solver='arpack', random_state=0) assert len(np.unique(labels_arpack)) == 2 if amg_loaded: labels_amg = spectral_clustering( graph, n_clusters=2, eigen_solver='amg', random_state=0) assert adjusted_rand_score(labels_arpack, labels_amg) == 1 else: assert_raises( ValueError, spectral_clustering, graph, n_clusters=2, eigen_solver='amg', random_state=0)
def test_KMeans_scores(self): digits = datasets.load_digits() df = pdml.ModelFrame(digits) scaled = pp.scale(digits.data) df.data = df.data.pp.scale() self.assert_numpy_array_almost_equal(df.data.values, scaled) clf1 = cluster.KMeans(init='k-means++', n_clusters=10, n_init=10, random_state=self.random_state) clf2 = df.cluster.KMeans(init='k-means++', n_clusters=10, n_init=10, random_state=self.random_state) clf1.fit(scaled) df.fit_predict(clf2) expected = m.homogeneity_score(digits.target, clf1.labels_) self.assertEqual(df.metrics.homogeneity_score(), expected) expected = m.completeness_score(digits.target, clf1.labels_) self.assertEqual(df.metrics.completeness_score(), expected) expected = m.v_measure_score(digits.target, clf1.labels_) self.assertEqual(df.metrics.v_measure_score(), expected) expected = m.adjusted_rand_score(digits.target, clf1.labels_) self.assertEqual(df.metrics.adjusted_rand_score(), expected) expected = m.homogeneity_score(digits.target, clf1.labels_) self.assertEqual(df.metrics.homogeneity_score(), expected) expected = m.silhouette_score(scaled, clf1.labels_, metric='euclidean', sample_size=300, random_state=self.random_state) result = df.metrics.silhouette_score(metric='euclidean', sample_size=300, random_state=self.random_state) self.assertAlmostEqual(result, expected)
def test_n_trials(self):
    c = PinchRatioClustering(2, similarity.KNN(9), n_trials=5)
    data, labels = self.make_blobs(150)
    c.fit(data)
    score = metrics.adjusted_rand_score(c.labels, labels)
    self.assertEqual(score, 1.0)
def test_adjusted_rand_score(self):
    result = self.df.metrics.adjusted_rand_score()
    expected = metrics.adjusted_rand_score(self.target, self.pred)
    self.assertEqual(result, expected)
def performance(): randind_kmean = [] purity_kmean = [] randind_gmm = [] purity_gmm = [] num_trials = 3 data = pd.read_csv(DATASET) X, y = preprocess(data) kf = KFold(n_splits=num_trials) kf.get_n_splits(X) i = 1 print("Running {} train and evaluate iterations".format(num_trials)) for train_index, test_index in kf.split(X): print("Iteration {} out of {}".format(i, num_trials)) X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[ test_index], y.iloc[train_index], y.iloc[test_index] split_data = [X_train, X_test, y_train, y_test] kmean, gmm, X_test, y_test = train(False, split_data) kmean_pred = kmean.predict(X_test) gmm_pred = gmm.predict(X_test) randind_kmean.append(adjusted_rand_score(y_test, kmean_pred)) randind_gmm.append(adjusted_rand_score(y_test, gmm_pred)) purity_kmean.append(purity(y_test, kmean_pred)) purity_gmm.append(purity(y_test, gmm_pred)) i += 1 randind_gmm_av = pd.DataFrame(randind_gmm).mean() purity_gmm_av = pd.DataFrame(purity_gmm).mean() randind_kmean_av = pd.DataFrame(randind_kmean).mean() purity_kmean_av = pd.DataFrame(purity_kmean).mean() print('({:d} trials)'.format(num_trials)) print('_________________K-Means Model____________________') print("The average adjusted rand score for testing data: %.2f" % randind_kmean_av.iloc[0]) print("The average purity for testing data: %.2f" % purity_kmean_av.iloc[0]) print('_________________GMM Model____________________') print("The average adjusted rand score for testing data: %.2f" % randind_gmm_av.iloc[0]) print("The average purity for testing data: %.2f" % purity_gmm_av.iloc[0]) print('******************************************************* ') with open(os.path.join(PERFORMANCE_DIR, PERFORMANCE_NAME), 'w') as outfile: headers = ['Model', 'Adjusted Rand Score', 'Purity'] kmean_results = pd.DataFrame( [['K-Means', randind_kmean_av.iloc[0], purity_kmean_av.iloc[0]]], columns=headers) gmm_results = pd.DataFrame([[ 'Guassian Mixture', randind_gmm_av.iloc[0], purity_gmm_av.iloc[0] ]], columns=headers) results = pd.DataFrame(columns=headers) results = results.append(kmean_results) results = results.append(gmm_results) print("Writing results to {}".format(outfile.name)) results.to_csv(path_or_buf=outfile, index=False)
def news_cluster_advanced():
    opt = Config()
    data = get_divided_data()
    stop_words = get_stopwords()
    X, vectorizer, svd = extract_traits(data, stop_words)
    labels = data['DividedContent']
    true_k = 31  # number of clusters
    if opt.minibatch:
        km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=2,
                             init_size=1000, batch_size=1500,
                             verbose=opt.verbose)
    else:
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=5,
                    n_jobs=-1, verbose=opt.verbose)

    print("Clustering sparse data with %s" % km)
    t0 = time.time()
    km.fit(X)
    print("done in %0.3fs" % (time.time() - t0))
    print()

    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
    print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_))
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, km.labels_, sample_size=1000))
    print()

    # use the trained clustering model to infer the topic cluster of each document
    label_prediction = km.predict(X)
    label_prediction = list(label_prediction)

    if not opt.use_hashing:
        print("Top keywords per cluster:")
        if opt.n_components:
            original_space_centroids = svd.inverse_transform(km.cluster_centers_)
            order_centroids = original_space_centroids.argsort()[:, ::-1]
        else:
            order_centroids = km.cluster_centers_.argsort()[:, ::-1]
        terms = vectorizer.get_feature_names()
        for i in range(true_k):
            print("Cluster %d " % (i + 1), end='')
            print("fraction of documents in this cluster:",
                  '%.4f%%' % (int(label_prediction.count(i)) / int(len(data['DividedContent']))))
            print("cluster keywords:")
            for ind in order_centroids[i, :80]:
                print(' %s,' % terms[ind], end='')
            print('\n------------------------------------------------------------------------------------------------')
clusters = unique(yhat)

#
# Display the 2D and 3D plots and the components
#
plot_pca_2d(x_pca, yhat)
plot_pca_3d(x_pca, yhat, elevacao, azimute)
plot_componentes(df_pca_componentes)

#
# save the model result into a dataframe/csv
#
df_algoritimos["HC"] = yhat

#
# ARI analysis
#
st.title("Similarity between the clusters")
st.header("(Adjusted Rand Score Index)")

lista = ["KMeans", "mean_s", "dbscan", "GMM", "HC"]
ari_score = {}
for mod1 in lista:
    reg = {}
    for mod2 in lista:
        ari = adjusted_rand_score(df_algoritimos[mod1].values,
                                  df_algoritimos[mod2].values)
        reg[mod2] = ari
    ari_score[mod1] = reg

df_ari_score = pd.DataFrame(ari_score)
sns.heatmap(df_ari_score, annot=True, cmap="Blues")
st.pyplot()
    plt.xticks([], [])
    plt.xlabel('')
    plt.yticks([], [])
    plt.ylabel('')
    #plt.show()
    plt.savefig('./clustering/' + name + '.png')


muestra_agrupacion(X, y, "No Clustering")

METRICAS = pd.DataFrame(columns=['Resultado'])
for i in range(2, 8):
    kmeans = KMeans(n_clusters=i, random_state=0).fit(X)
    #muestra_agrupacion(X, kmeans.labels_)
    print("kmean with k=" + str(i))
    # adjusted_rand_score takes only the true and predicted labels
    METRICAS.loc['K=' + str(i)] = metrics.adjusted_rand_score(y, kmeans.labels_)
print(METRICAS)

# feature selection
RESULTADOS_SELEC = pd.DataFrame(columns=['Accuracy', "Tiempo"])
clasif = KNeighborsClassifier(n_neighbors=1, n_jobs=-1)
selector = SelectKBest(chi2, k=20)
RESULTADOS_SELEC["No selector"] = experimento_clas(clasif, X_train, y_train,
                                                   X_test, y_test)
selector = SelectKBest(chi2, k=20)
selector.fit_transform(X, y)
X_train_sel, X_test_sel, y_train_sel, y_test_sel = train_test_split(
    X, y, test_size=0.8)
RESULTADOS_SELEC["chi2"] = experimento_clas(clasif, X_train_sel, y_train_sel,
                                            X_test_sel, y_test_sel)
def GMM(data, labels, n_components):
    gmm = mixture.GaussianMixture(n_components=n_components,
                                  covariance_type='full',
                                  random_state=0)
    gmm = gmm.fit(data)
    return metrics.adjusted_rand_score(labels, gmm.predict(data))
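# Hedged usage sketch (illustrative; the GMM helper above is assumed to be in
# scope together with sklearn's `mixture` and `metrics` modules): score a
# Gaussian mixture fit on synthetic blobs against the generating labels.
from sklearn.datasets import make_blobs

X_demo, y_demo = make_blobs(n_samples=300, centers=3, random_state=42)
print("GMM ARI on toy blobs:", GMM(X_demo, y_demo, n_components=3))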
def get_ar(y_true, y_pred):
    return metrics.adjusted_rand_score(y_true, y_pred)
print("\t n_samples %d, \t n_features %d" % (n_samples, n_features)) print(82 * '_') print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette') #clustering on raw data t0 = time.time() kmeans = KMeans(init='k-means++', n_clusters=53, n_init=10) kmeans.fit(data) print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' % ('k-means++', (time.time() - t0), kmeans.inertia_, metrics.homogeneity_score(labels, kmeans.labels_), metrics.completeness_score(labels, kmeans.labels_), metrics.v_measure_score(labels, kmeans.labels_), metrics.adjusted_rand_score(labels, kmeans.labels_), metrics.adjusted_mutual_info_score( labels, kmeans.labels_, average_method='arithmetic'), metrics.silhouette_score( data, kmeans.labels_, metric='euclidean', sample_size=sample_size))) t0 = time.time() kmeans = KMeans(init='random', n_clusters=53, n_init=10) kmeans.fit(data) print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' % ('random', (time.time() - t0), kmeans.inertia_, metrics.homogeneity_score(labels, kmeans.labels_), metrics.completeness_score(labels, kmeans.labels_), metrics.v_measure_score(labels, kmeans.labels_), metrics.adjusted_rand_score(labels, kmeans.labels_), metrics.adjusted_mutual_info_score(
def test_DBSCAN(*data):
    X, labels_true = data
    clst = cluster.DBSCAN()
    predict_labels = clst.fit_predict(X)
    print("ARI:%s" % adjusted_rand_score(labels_true, predict_labels))
    print("Core sample num:%d" % len(clst.core_sample_indices_))
h = persim.heat(originalPers, comparePers)
h_u = persim.heat(originalPers, upscalePers)

print "Computing wasserstein..."
ws = persim.wasserstein(originalPers, comparePers)
ws_u = persim.wasserstein(originalPers, upscalePers)
end = time.time()
#gh = persim.gromov_hausdorff(originalPers, comparePers)
#gh_u = persim.gromov_hausdorff(originalPers, upscalePers)

baselineCluster = np.genfromtxt(outDir + "kmeans++/reducedData.csv")

print "Calculating ARI with k-means as the baseline comparison..."
# measure (dis)similarity between clusterings
# sklearn.metrics.adjusted_rand_score(labels_true, labels_pred)
# placeholder for now, need to confirm with nick
kmeansARI = met.adjusted_rand_score(baselineCluster, baselineCluster)
# the other methods' label files are loaded before scoring;
# adjusted_rand_score expects label arrays rather than file paths
agglomWardARI = met.adjusted_rand_score(
    baselineCluster, np.genfromtxt(outDir + "agglomerativeWard/reducedData.csv"))
agglomSingleARi = met.adjusted_rand_score(
    baselineCluster, np.genfromtxt(outDir + "agglomerativeSingle/reducedData.csv"))
hdbscanARI = met.adjusted_rand_score(
    baselineCluster, np.genfromtxt(outDir + "hdbscan/reducedData.csv"))
randomARI = met.adjusted_rand_score(
    baselineCluster, np.genfromtxt(outDir + "random/reducedData.csv"))

print "Calculating Silhouette score "
# silhouette score - shows how close each point in one cluster is to points in
# neighboring clusters - used to find the optimum number of clusters
# sklearn.metrics.silhouette_score(X, labels, metric='euclidean', sample_size=None, random_state=None, **kwds)
#kmeansARI = met.silhouette_score(
#agglomWardARI = met.silhouette_score(
#agglomSingleARi = met.silhouette_score(
#hdbscanARI = met.silhouette_score(
p = 10 * np.median(S) ############################################################################## # Compute Affinity Propagation af = AffinityPropagation().fit(S, p) cluster_centers_indices = af.cluster_centers_indices_ labels = af.labels_ n_clusters_ = len(cluster_centers_indices) print 'Estimated number of clusters: %d' % n_clusters_ print "Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels) print "Completeness: %0.3f" % metrics.completeness_score(labels_true, labels) print "V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels) print "Adjusted Rand Index: %0.3f" % \ metrics.adjusted_rand_score(labels_true, labels) ############################################################################## # Plot result import pylab as pl from itertools import cycle pl.close('all') pl.figure(1) pl.clf() colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk') for k, col in zip(range(n_clusters_), colors): class_members = labels == k cluster_center = X[cluster_centers_indices[k]] pl.plot(X[class_members, 0], X[class_members, 1], col + '.')
init='k-means++', n_init=1, init_size=1000, batch_size=1000) print("Clustering sparse data with %s" % km) t0 = time() km.fit(X) print("done in %0.3fs" % (time() - t0)) print() print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_)) print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_)) print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_)) print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_)) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, km.labels_, sample_size=1000)) print() print("Top terms per cluster:") #original_space_centroids = svd.inverse_transform(km.cluster_centers_) #order_centroids = original_space_centroids.argsort()[:, ::-1] # #terms = vectorizer.get_feature_names() #for i in range(true_k): # print("Cluster %d:" % i, end='') # for ind in order_centroids[i, :10]: # print(' %s' % terms[ind], end='')
def train(row_id_str, ds_id, hdfs_feat_dir, local_out_dir, ml_opts_jstr, sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max, zipout_dir, zipcode_dir, zip_file_name, mongo_tuples, labelnameflag, fromweb, src_filename, jobname, model_data_folder): # create zip files for Spark workers ================= ================ zip_file_path = ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util') print "INFO: zip_file_path=", zip_file_path #data_folder = hdfs_feat_dir + "/" #local_out_dir = local_out_dir + "/" if not os.path.exists(local_out_dir): os.makedirs(local_out_dir) # ML model filename ==== model_fname = os.path.join(model_data_folder, row_id_str + '.pkl') print "INFO: model_data_folder=", model_data_folder # create out folders and clean up old model files ==== ml_util.ml_prepare_output_dirs(row_id_str, local_out_dir, model_data_folder, model_fname) # init Spark context ==== sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max, jobname, [zip_file_path]) # start here =================================================================== =============== t0 = time() ### load libsvm file: may or may not be PCA-ed ### libsvm_data_file = os.path.join(hdfs_feat_dir, src_filename) print "INFO: libsvm_data_file=", libsvm_data_file # feature count is a variable if PCA feature_count = 0 # samples_rdd may be from PCAed data # load sample RDD from text file # format (LabeledPoint,hash) from str2LabeledPoint_hash() samples_rdd, feature_count = zip_feature_util.get_sample_rdd( sc, libsvm_data_file, feature_count, '') # collect all data to local for processing =============== all_data = samples_rdd.collect() total_sample_count = len(all_data) # 2-D array, may be PCAed features_list = [x.features.toArray() for x, _ in all_data] # label array labels_list_all = [x.label for x, _ in all_data] # hash array hash_list_all = [x for _, x in all_data] # convert to np array features_array_reduced = np.array(features_list) hash_list_all = np.array(hash_list_all) labels_list_all = np.array(labels_list_all) true_label_array = np.array(labels_list_all, dtype=np.int8) print "INFO: total_sample_count=", total_sample_count print "INFO: features_array_reduced.shape=", features_array_reduced.shape print "INFO: labels_list_all.shape=", labels_list_all.shape print "INFO: true_label_array.shape=", true_label_array.shape t1 = time() print 'INFO: data generating time: %f' % (t1 - t0) ############################################### ########## build learning model ############### ############################################### ### parse parameters and generate the model ### (model, alg, n_clusters) = parse_para_and_get_model(ml_opts_jstr) if model is None: return labels_kmeans = None #### fit the model to training dataset #### try: model.fit(features_array_reduced) labels_kmeans = model.labels_ #'numpy.ndarray' except: print "ERROR: Error in model.fit(): ", "model=", model, ", sys.exc_info:", sys.exc_info( )[0] return #### save clf for future use #### #joblib.dump(model, model_data_folder + row_id_str+'.pkl') joblib.dump(model, model_fname) #print "**model:intercept***" #print clf.intercept_ print "INFO: model type=", type(model), " model=", model ################################################### ### generate label names (family names) ########### ### connect to database to get the column list which contains all column number of the corresponding feature#### 
################################################### if labelnameflag == 1: key = "dic_name_label" jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}' jstr_proj = '{"value":1}' # get parent dataset's data if ds_id != row_id_str: jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}' doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj) dic_list = doc['value'] label_dic = {} for i in range(0, len(dic_list)): for key in dic_list[i]: label_dic[dic_list[i][key]] = key.encode('UTF8') print "INFO: label_dic:", label_dic else: label_dic = {} label_set = set(labels_list_all) for label_value in label_set: label_dic[int(label_value)] = str(int(label_value)) print "INFO: generated label_dic:", label_dic labels_list = [] for key in sorted(label_dic): labels_list.append(label_dic[key]) print "INFO: labels_list=", labels_list #Adjusted Mutual Information between two clusterings amis = adjusted_mutual_info_score(labels_list_all, labels_kmeans) print "INFO: Adjusted_mutual_info_score=", amis #Similarity measure between two clusterings ars = adjusted_rand_score(labels_list_all, labels_kmeans) print "INFO: Adjusted_rand_score=", ars ################################################### #######plot histogram #### ################################################### plot_col_num = int(math.ceil(math.sqrt(n_clusters))) figsize = (4 * plot_col_num, 3 * int(math.ceil(n_clusters * 1.0 / plot_col_num))) print "INFO: labels_list_all.shape=", labels_list_all.shape, "labels_kmeans.shape=", labels_kmeans.shape print "INFO: labels_list_all t=", type( labels_list_all), "labels_kmeans t=", type(labels_kmeans) print "INFO: n_clusters=", n_clusters, ",label_dic=", label_dic print "INFO: plot_col_num=", plot_col_num, ",figsize=", figsize, ",local_out_dir=", local_out_dir # kmeans histogram _, p_true = ml_plot_kmeans_histogram_subfigures(labels_list_all, labels_kmeans, n_clusters, names=label_dic, plot_col_num=plot_col_num, figsize=figsize, folder=local_out_dir, rid=row_id_str) # normalized kmeans histogram _, p_true_norm = ml_plot_kmeans_histogram_subfigures( labels_list_all, labels_kmeans, n_clusters, names=label_dic, plot_col_num=plot_col_num, figsize=figsize, normalize=True, folder=local_out_dir, rid=row_id_str) ####plot "reverse" histogram with labels #### #num_bars = len(np.unique(labels_list_all)) num_bars = max(labels_list_all) + 1 figsize = (4 * plot_col_num, 3 * int(math.ceil(num_bars * 1.0 / plot_col_num))) _, p_cluster = ml_plot_kmeans_histogram_subfigures( labels_kmeans, labels_list_all, num_bars, names=label_dic, plot_col_num=plot_col_num, figsize=figsize, reverse=True, folder=local_out_dir, rid=row_id_str) #### plot dot figures #### #mtx_label = model.labels_ mtx_center = model.cluster_centers_ # dot plot for Kmeans =========== filename = os.path.join(local_out_dir, row_id_str + '_cluster.png') filename_3d = os.path.join(local_out_dir, row_id_str + '_cluster_3d.json') ml_plot_kmeans_dot_graph_save_file(features_array_reduced, labels_kmeans, mtx_center, n_clusters, figsize=(10, 7), filename=filename, title='KMeans', filename_3d=filename_3d) # dot plot for True Labels =========== filename = os.path.join(local_out_dir, row_id_str + '_cluster_tl.png') filename_3d = os.path.join(local_out_dir, row_id_str + '_cluster_3d_tl.json') ml_plot_kmeans_dot_graph_save_file(features_array_reduced, true_label_array, mtx_center, n_clusters, figsize=(10, 7), filename=filename, title='True Labels', filename_3d=filename_3d) dataset_info = { "training_fraction": 1, "class_count": n_clusters, "dataset_count": 
total_sample_count } # only update db for web request =========== if fromweb == "1": #print "database update" str_sql="UPDATE atdml_document set accuracy = '" \ +"', status = 'learned', processed_date ='"+str(datetime.datetime.now()) \ +"', total_feature_numb='"+str(feature_count) \ +"', perf_measures='{}" \ +"', dataset_info='"+json.dumps(dataset_info) \ +"' where id="+row_id_str ret = exec_sqlite.exec_sql(str_sql) print "INFO: Data update done! ret=", str(ret) else: print "INFO: accuracy = '" + str(accuracy * 100) + "%" t1 = time() print 'INFO: running time: %f' % (t1 - t0) #print 'Finished!' return 0
X, y = make_blobs(n_samples=500, n_features=2, centers=4, cluster_std=1, center_box=(-10.0, 10.0), shuffle=True, random_state=1) plot_data(X, y) kmeans_model = cluster.KMeans(n_clusters=5, random_state=1) kmeans_model.fit(X) kmeans_model.cluster_centers_ kmeans_model.labels_ #metrics when target labels are not known silhouette_avg = metrics.silhouette_score(X, kmeans_model.labels_, metric='euclidean') print(silhouette_avg) silhouette_samples = metrics.silhouette_samples(X, kmeans_model.labels_, metric='euclidean') print(silhouette_samples) ch_score = metrics.calinski_harabaz_score(X, kmeans_model.labels_) print(ch_score) #metrics when target labels are known df = pd.DataFrame({'GT': y, 'Pred': kmeans_model.labels_}) print(metrics.adjusted_rand_score(y, kmeans_model.labels_)) print(metrics.adjusted_mutual_info_score(y, kmeans_model.labels_))
label='mini-batch k-means') C_mbkm_wr = mini_batch_kmeans(X, C_init, b=b, t=t, replacement=True) plt.plot(C_mbkm_wr[:, 0], C_mbkm_wr[:, 1], 'mo', markersize=10, label='mini-batch k-means w/o rep.') # from sklearn.cluster import MiniBatchKMeans # mbkm_skl = MiniBatchKMeans(n_clusters=k, max_iter=1, max_no_improvement=None, tol=0.0, batch_size=b, init=C_init, compute_labels=False) # mbkm_skl.fit(X) # C_mbkm_skl = mbkm_skl.cluster_centers_ # plt.plot(C_mbkm_skl[:,0], C_mbkm_skl[:,1], 'co', markersize=10, label='mini-batch k-means SKL') plt.legend(numpoints=1, loc='lower right') labels_init = compute_labels(X, C_init) labels_kmeans = compute_labels(X, C_kmeans) labels_mbkm = compute_labels(X, C_mbkm) labels_mbkm_wr = compute_labels(X, C_mbkm_wr) print("Adjusted rand scores:") print("labels_kmeans, labels_init =", adjusted_rand_score(labels_kmeans, labels_init)) print("labels_kmeans, labels_mbkm =", adjusted_rand_score(labels_kmeans, labels_mbkm)) print("labels_kmeans, labels_mbkm_wr =", adjusted_rand_score(labels_kmeans, labels_mbkm_wr)) plt.show()
linkage='complete').fit(data) labels = clustering.labels_ #Kmeans (Euclidean) # clustering = KMeans(n_clusters=k, random_state=0).fit(data) #labels = clustering.labels_ #Fuzzy c-means #cntr, u, u0, dat, jm, p, fpc = cmeans(np.transpose(data), c=k, m=2, error = 0.005, maxiter=100, init=None, seed=None) #labels = np.argmax(u, axis=0) #Kmedoids #labels, sse_all, j, closest_observations_prev = Kmedoids(timeseries, k, max_iter, window_size) MI = adjusted_mutual_info_score(classes, labels) RS = adjusted_rand_score(classes, labels) HS = metrics.homogeneity_score(classes, labels) CS = metrics.completeness_score(classes, labels) FMS = metrics.fowlkes_mallows_score(classes, labels) RI = rand_index_score(classes, labels) counter_data_set = counter_data_set + 1 output_row['dataset'] = d output_row['adjusted Mutual information'] = MI output_row['adjusted Rand index'] = RS output_row['Homogeneity'] = HS output_row['Completeness'] = CS output_row['Fowlkes Mallows'] = FMS output_row['Rand index'] = RI
def gen_nets_comm(ks, data_path=None, data_file_name='data.npy', target_file_name='target.npy', metric='euclidean', community_method='community_multilevel', n_jobs=-1): if not data_path: raise ValueError('data_path not specified.') path = Path(data_path) x = np.load(path / data_file_name) y = np.load(path / target_file_name) similarity_matrix = 1 / ( 1 + pairwise_distances(x, metric=metric, n_jobs=n_jobs)) result_modularity = [] result_ari = [] result_nmi = [] connec_point = None for k in ks: M = similarity_matrix.copy() to_remove = M.shape[0] - (k + 1) # add 1 to eliminate loops for vec in M: vec[vec.argsort()[:to_remove]] = 0 g = Graph.Weighted_Adjacency(M.tolist(), mode=ADJ_UNDIRECTED, loops=False) g.vs['name'] = y # Verify in which k the network is connected if not connec_point and not g.is_connected(): connec_point = k y_pred, modularity = detect_community(g, community_method) path_save = Path(path) / 'nets' path_save.mkdir(parents=True, exist_ok=True) net_name = 'net_%s_k_%i.xnet' % (metric, k) labels_name = 'net_%s_k_%i_labels_comm.txt' % (metric, k) to_xnet(g, path_save / net_name, names=True) np.savetxt(path_save / labels_name, y_pred, fmt='%s') metrics.adjusted_rand_score(y, y_pred) result_modularity.append(modularity) result_ari.append(metrics.adjusted_rand_score(y, y_pred)) result_nmi.append(metrics.normalized_mutual_info_score(y, y_pred)) path_results = path / 'results' path_results.mkdir(parents=True, exist_ok=True) df = pd.DataFrame({ 'NMI': result_nmi, 'ARI': result_ari, 'Modularity': result_modularity }) df.to_csv(path_results / ('%s.csv' % metric)) df.index = sorted(ks) #df.index + 1 plot = df.plot(xticks=[1] + list(range(0, max(ks) + 1, 5))[1:], ylim=(0, 1), use_index=True) plot.set_xlabel('k') plot.axvline(connec_point, color='k', linestyle='--') plot.text(connec_point + 0.01, 0.98, 'connected', rotation=90) fig = plot.get_figure() fig.savefig(path_results / ('%s.pdf' % metric))