# In[ ]:
mat = np.asarray(pairwise_similarity.A)  # np.matrix is deprecated; a plain array works with SpectralClustering

# In[ ]:
temp_dict = {}

# In[ ]:
start = time.time()
for i in range(10, 20, 1):
    if i == 13:
        continue
    nombre = "C_" + str(i)
    Abstracts_Category = SpectralClustering(i, affinity='precomputed').fit_predict(mat)
    temp_dict[nombre] = Abstracts_Category
end = time.time()
#print((end-start)/60)

# In[ ]:
df_category = pd.DataFrame(temp_dict)

# In[ ]:
#df_category

# In[ ]:
df_temp = pd.concat([Abstracts, df_category], axis=1)
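# A minimal sketch (not from the original notebook) of how a matrix like
# `pairwise_similarity` could be built: affinity='precomputed' expects a
# square, symmetric, non-negative similarity matrix. All names below are
# illustrative.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = ["graph cut", "spectral methods", "graph partition"]
tfidf = TfidfVectorizer().fit_transform(docs)
affinity = cosine_similarity(tfidf)       # symmetric, values in [0, 1] for tf-idf
affinity = (affinity + affinity.T) / 2    # guard against numerical asymmetry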
y = y.reshape(-1, 16)
print(np.shape(y))

kmeans = KMeans(n_clusters=2, random_state=0).fit(y)
#print(used_data)
print("Calving count: " + str(calved_count))
print("Random count: " + str(random_count))
labels = kmeans.labels_
print("KMeans")
tabulate2Clusters(labels, calved_count)

spectral = SpectralClustering(n_clusters=2, assign_labels="kmeans").fit(y)
print("\n\nSpectral Clustering")
tabulate2Clusters(spectral.labels_, calved_count)

agg = AgglomerativeClustering(n_clusters=2, linkage="average").fit(y)
print("\n\nAgglomerative Clustering AVG")
tabulate2Clusters(agg.labels_, calved_count)

agg = AgglomerativeClustering(n_clusters=2, linkage="complete").fit(y)
print("\n\nAgglomerative Clustering COMPLETE")
tabulate2Clusters(agg.labels_, calved_count)

agg = AgglomerativeClustering(n_clusters=2, linkage="ward").fit(y)
print("\n\nAgglomerative Clustering WARD")
tabulate2Clusters(agg.labels_, calved_count)
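# `tabulate2Clusters` is not defined in this fragment. A hypothetical
# implementation for reference, assuming the first `calved_count` samples are
# the "calving" group and the remainder the "random" group:
import numpy as np

def tabulate2Clusters(labels, calved_count):
    for cluster in (0, 1):
        in_calved = int(np.sum(labels[:calved_count] == cluster))
        in_random = int(np.sum(labels[calved_count:] == cluster))
        print(f"cluster {cluster}: calved={in_calved}, random={in_random}")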
def evaluation(y_pred, cluster_method="Kmeans", num_cluster=25, n_neighbors=20, min_dist=0.0):
    if cluster_method == "Kmeans":
        embedding = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist,
                              n_components=num_cluster,
                              metric="euclidean").fit_transform(y_pred)
        kmeans = KMeans(n_clusters=num_cluster, random_state=1).fit(embedding)
        centroid = kmeans.cluster_centers_.copy()
        y_label = kmeans.labels_.copy()
        y_pseudo = np.zeros((y_pred.shape[0], num_cluster))
    elif cluster_method == "SC":
        embedding = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist,
                              n_components=num_cluster,
                              metric="euclidean").fit_transform(y_pred)
        clustering = SpectralClustering(n_clusters=num_cluster,
                                        assign_labels="discretize",
                                        random_state=0).fit(embedding)
        y_label = clustering.labels_.copy()
        # Spectral clustering has no centroids: use per-cluster means instead.
        centroid = pd.DataFrame(embedding.copy())
        centroid['label'] = y_label
        centroid = centroid.groupby('label').mean().values
        y_pseudo = np.zeros((y_pred.shape[0], num_cluster))
    else:
        embedding = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist,
                              n_components=num_cluster,
                              metric="euclidean").fit_transform(y_pred)
        gmm = GaussianMixture(n_components=num_cluster).fit(embedding)
        y_label = gmm.predict(embedding)
        centroid = pd.DataFrame(embedding.copy())
        centroid['label'] = y_label
        centroid = centroid.groupby('label').mean().values
        y_pseudo = np.zeros((y_pred.shape[0], num_cluster))

    ## t-student distribution kernel soft-assignment, alpha=1
    #for j in range(centroid.shape[0]):
    #    y_pseudo[:,j]=(np.linalg.norm(embedding-centroid[j,:],axis=1)+1)**(-1)
    #    ## cosine distance
    #    #y_pseudo[:,j]=((1-cosine_similarity(embedding,centroid[j,:].reshape(1,embedding.shape[1]))+1)**(-1))[:,0]
    #y_pseudo = pd.DataFrame(y_pseudo)
    #y_pseudo2=np.zeros((y_pred.shape[0],centroid.shape[0]))
    #for j in range(centroid.shape[0]):
    #    y_pseudo2[:,j]=y_pseudo.iloc[:,j].values/np.sum(
    #        y_pseudo[y_pseudo.columns.difference([j])].values,axis=1)
    #y_pseudo = y_pseudo2

    ## distance-based soft-assignment
    for j in range(centroid.shape[0]):
        ## euclidean distance
        y_pseudo[:, j] = 1 / np.linalg.norm(embedding - centroid[j, :], axis=1)
        ## cosine similarity
        #y_pseudo[:,j]=1/(1-cosine_similarity(embedding,centroid[j,:].reshape(1,embedding.shape[1])))[:,0]
    y_pseudo = softmax(y_pseudo, axis=1)

    ## auxiliary target distribution: square the soft assignments, normalize
    ## by per-cluster frequency, then renormalize each row
    f = np.sum(np.square(y_pseudo) / np.sum(y_pseudo, axis=0), axis=1)
    y2 = np.square(y_pseudo) / np.sum(y_pseudo, axis=0)
    au_tar = (y2.T / f).T
    return au_tar, y_label, embedding
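# A small standalone sketch (added, not from the source) of the auxiliary
# target distribution computed at the end of evaluation(), DEC-style:
# soft assignments are squared, normalized by cluster frequency, then
# renormalized per sample so each row is again a distribution.
import numpy as np
from scipy.special import softmax

q = softmax(np.random.rand(5, 3), axis=1)    # toy soft assignments, rows sum to 1
weight = np.square(q) / np.sum(q, axis=0)    # emphasize confident assignments
p = (weight.T / weight.sum(axis=1)).T        # renormalize rows
assert np.allclose(p.sum(axis=1), 1.0)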
def spectralClustering(df, k):
    spectral = SpectralClustering(n_clusters=k, random_state=0).fit(df)
    return spectral
def get_spectral_clusters(A, k):
    from sklearn.cluster import SpectralClustering
    spec = SpectralClustering(n_clusters=k, random_state=0,
                              affinity='precomputed',
                              assign_labels='discretize')
    return spec.fit_predict(A)
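# Usage sketch (assumed, not from the source): A must be a precomputed
# affinity matrix, e.g. a symmetrized k-NN connectivity graph.
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.neighbors import kneighbors_graph

X, _ = make_blobs(n_samples=60, centers=3, random_state=0)
knn = kneighbors_graph(X, n_neighbors=10, include_self=False)
A = 0.5 * (knn + knn.T).toarray()   # symmetrize into a dense affinity matrix
labels = get_spectral_clusters(A, k=3)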
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("config", help="Name of the config file.")
    args = parser.parse_args()

    if not exists(args.config):
        raise FileNotFoundError(args.config + " not found.")
    with open(args.config, "r") as config_file:
        config = yaml.load(config_file, Loader=yaml.Loader)

    patch_data_path = config['patch_data_path']
    base_neuron_act_path = config['base_neuron_act_path']
    train_activations = config['train_activations']
    val_activations = config['val_activations']
    test_activations = config['test_activations']
    output_path = config['output_path']
    cluster_types = config['cluster_types']
    n_clusters = config['n_clusters']
    n_samps = config['n_samps']
    plot_mode = config['plot_mode']
    seed = config['seed']

    if not exists(output_path):
        makedirs(output_path)

    train = pd.read_csv(join(base_neuron_act_path, train_activations))
    if plot_mode == 'train':
        plot_data = train
    elif plot_mode == 'val':
        plot_data = pd.read_csv(join(base_neuron_act_path, val_activations))
    elif plot_mode == 'test':
        plot_data = pd.read_csv(join(base_neuron_act_path, test_activations))

    cluster_dfs = {}
    for cluster_type in cluster_types:
        for n_cluster in n_clusters:
            if cluster_type == 'GMM':
                cluster_dfs[cluster_type] = plot_data
                X = train.loc[:, train.columns.str.contains('neuron')]
                mod = GaussianMixture(n_components=n_cluster,
                                      **config['GMM_kwargs']).fit(X)
                cluster_prob = mod.predict_proba(
                    cluster_dfs[cluster_type].loc[:, cluster_dfs[cluster_type].columns.str.contains('neuron')])
                cluster_dfs[cluster_type]['cluster_prob'] = cluster_prob.max(axis=1)
                cluster_dfs[cluster_type]['cluster'] = mod.predict(
                    cluster_dfs[cluster_type].loc[:, cluster_dfs[cluster_type].columns.str.contains('neuron')])
                plot_prob_dist(cluster_dfs[cluster_type], output_path,
                               cluster_type, n_cluster)
                plot_prob_cdf(cluster_dfs[cluster_type], output_path,
                              cluster_type, n_cluster)
            elif cluster_type == 'Spectral':
                cluster_dfs[cluster_type] = train.sample(n_samps, random_state=seed)
                X = cluster_dfs[cluster_type].loc[:, cluster_dfs[cluster_type].columns.str.contains('neuron')]
                mod = SpectralClustering(n_clusters=n_cluster,
                                         **config["Spectral_kwargs"]).fit(X)
                cluster_dfs[cluster_type]['cluster'] = mod.labels_

            joblib.dump(mod, join(output_path,
                                  f'{cluster_type}_{n_cluster}_clusters.mod'))
            plot_cluster_dist(cluster_dfs[cluster_type], output_path,
                              cluster_type, n_cluster)
            plot_storm_clusters(patch_data_path, output_path,
                                cluster_dfs[cluster_type], cluster_type, seed,
                                **config['plot_kwargs'])
    return
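# A hypothetical config matching the keys this script reads. Paths and kwargs
# below are placeholders (assumptions), not values from the source repository.
import yaml

example_config = {
    'patch_data_path': 'data/patches',
    'base_neuron_act_path': 'data/activations',
    'train_activations': 'train.csv',
    'val_activations': 'val.csv',
    'test_activations': 'test.csv',
    'output_path': 'results',
    'cluster_types': ['GMM', 'Spectral'],
    'n_clusters': [5, 10],
    'n_samps': 2000,
    'plot_mode': 'train',
    'seed': 0,
    'GMM_kwargs': {'covariance_type': 'full'},
    'Spectral_kwargs': {'affinity': 'nearest_neighbors'},
    'plot_kwargs': {},
}
with open('config.yml', 'w') as f:
    yaml.safe_dump(example_config, f)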
def interchromosomal_clusters(cf, k, cluster_file, algorithm='eigh-kmeans',
                              interchr_mat=None, use_builtin_ice=False,
                              use_ice=False, use_cpb=False, save_mat=False,
                              mat_save_dir='.', plot_mat=False,
                              out_file='global_clusters.out'):
    if algorithm not in ['eigh-gmix', 'eigh-kmeans', 'spec-kmeans']:
        print("error: algorithm must be either 'eigh-gmix', 'eigh-kmeans' or 'spec-kmeans'")
        return  # bail out on an unsupported algorithm
    clusters = {}
    print("[interchromosomal_clusters] k={}, cluster_file={}, out_file={}, algorithm={}".format(
        k, cluster_file, out_file, algorithm))
    # Read and parse intrachromosomal clusters.
    fc = open(cluster_file, "r")
    for line in fc:
        m = line.rstrip().split('\t')
        chr = m[0]
        clust = m[1].split(',')
        if chr not in clusters:
            clusters[chr] = []
        clusters[chr].append(np.array([int(v) for v in clust]))
    chr_names = list(clusters.keys())
    # Compute interchromosomal sums.
    if interchr_mat is None:
        print("computing interchromosomal matrix...")
        mat_shape = 0
        offset = {}
        for chr in clusters:
            offset[chr] = mat_shape
            mat_shape += len(clusters[chr])
        mat = np.zeros((mat_shape, mat_shape))
        normat = np.zeros((mat_shape, mat_shape))
        # Iterate over all interchromosomal combinations.
        for i in range(0, len(chr_names)):
            chr_i = chr_names[i]
            ploid_i = (1 if chr_i != 'chrX' else 2)
            sys.stdout.write("{} -> ".format(chr_i))
            for j in range(i + 1, len(chr_names)):
                chr_j = chr_names[j]
                sys.stdout.write("{},".format(chr_j))
                ploid_j = (1 if chr_j != 'chrX' else 2)
                hic_m = cf.matrix(balance=use_builtin_ice).fetch(chr_i, chr_j)
                if use_ice:
                    hic_m = inter_ice(hic_m, 20)
                elif use_cpb:
                    hic_m = inter_cpb(hic_m, 20)
                # Cluster combinations.
                for ki, clu_i in enumerate(clusters[chr_i]):
                    for kj, clu_j in enumerate(clusters[chr_j]):
                        sumv = np.sum(hic_m[clu_i, :][:, clu_j]) * ploid_i * ploid_j
                        mat[offset[chr_i] + ki, offset[chr_j] + kj] = mat[offset[chr_j] + kj, offset[chr_i] + ki] = sumv
                        normat[offset[chr_i] + ki, offset[chr_j] + kj] = normat[offset[chr_j] + kj, offset[chr_i] + ki] = sumv / float(len(clu_i)) / float(len(clu_j))
            # Newline.
            sys.stdout.write('\n')
        # Store matrices for future processing.
        if save_mat:
            np.save('{}/interchr_normat.npy'.format(mat_save_dir), normat)
            np.save('{}/interchr_sums.npy'.format(mat_save_dir), mat)
    else:
        print("Using user-provided interchromosomal matrix for cluster computation.")
        normat = interchr_mat
    N = normat.shape[0]
    print("computing clusters, algorithm {}...".format(algorithm))
    if algorithm == 'spec-kmeans':
        spect_clu = SpectralClustering(n_clusters=k, eigen_solver='arpack',
                                       affinity='rbf', assign_labels='kmeans',
                                       n_jobs=8)
        hic_clust = spect_clu.fit_predict(normat)
    else:
        w, v = scipy.linalg.eigh(normat, eigvals=(N - k, N - 1))
        if algorithm == 'eigh-gmix':
            gmix = mixture.GaussianMixture(n_components=k,
                                           covariance_type='full',
                                           tol=1e-4, max_iter=1000)
            gmix.fit(v)
            hic_clust = gmix.predict(v)
        elif algorithm == 'eigh-kmeans':
            km = KMeans(n_clusters=k, n_jobs=8)
            hic_clust = km.fit_predict(np.sqrt(w) * v)
            with open('{}'.format(out_file + '.weig'), 'w+') as outdata:
                for (c, ev) in zip(hic_clust, w * v):
                    outdata.write(str(c) + '\t' + '\t'.join([str(x) for x in ev[::-1]]) + '\n')
    clu_idx = np.argsort(hic_clust)
    P = np.zeros(normat.shape)
    P[np.arange(0, len(clu_idx)), clu_idx] = 1
    # Permute rows and columns.
    W_clust = np.dot(np.dot(P, normat), np.linalg.inv(P))
    if plot_mat:
        plt.matshow(W_clust, cmap=plt.cm.bwr)
    clust_cnt = [(g[0], len(list(g[1]))) for g in itertools.groupby(sorted(hic_clust))]
    # Compute cluster limits.
    cnt = np.zeros(k + 1, dtype=int)
    for i in range(k):
        cnt[i] += clust_cnt[i][1]
    cmcnt = np.cumsum(cnt)
    l = W_clust.shape[0] - 1
    if plot_mat:
        for x in cmcnt:
            plt.plot([0, l], [x, x], color='k', linestyle='-', linewidth=1)
            plt.plot([x, x], [0, l], color='k', linestyle='-', linewidth=1)
    print("writing results to {}...".format(out_file))
    with open(out_file, 'w+') as fo:
        c_id = 0
        for chr_i in clusters:
            for ki, clu_i in enumerate(clusters[chr_i]):
                fo.write("{}\t{}\t{}\t{}\n".format(chr_i, ki, hic_clust[c_id],
                                                   ','.join([str(n) for n in clu_i])))
                c_id += 1
    print("[interchromosomal_clusters] Done.")
# in this case the seeding of the centers is deterministic, hence we run the
# kmeans algorithm only once with n_init=1
pca = PCA(n_components=n_digits).fit(data)
bench_k_means(KMeans(init=pca.components_, n_clusters=n_digits, n_init=1),
              name="PCA-based", data=data)
bench_k_means(AffinityPropagation(), name="AP", data=data)
bench_k_means(MeanShift(), name="MS", data=data)
bench_k_means(AgglomerativeClustering(n_clusters=n_digits), name="AC", data=data)
# Note: AgglomerativeClustering defaults to linkage='ward', so "WD" below
# benchmarks the same model as "AC" above.
bench_k_means(AgglomerativeClustering(n_clusters=n_digits), name="WD", data=data)
bench_k_means(DBSCAN(), name="DB", data=data)
bench_k_means(SpectralClustering(n_clusters=n_digits), name="SC", data=data)

GM = GaussianMixture(n_components=10)
t0 = time()
GM.fit(data)
print('%-5s\t%.2fs\t%.3f\t%.3f\t%.3f' % (
    'GM', (time() - t0),
    metrics.normalized_mutual_info_score(labels, GM.predict(data)),
    metrics.homogeneity_score(labels, GM.predict(data)),
    metrics.completeness_score(labels, GM.predict(data))))
print(82 * '_')

# #############################################################################
# Visualize the results on PCA-reduced data
# reduced_data = PCA(n_components=2).fit_transform(data)
strsub1 = 'K' + str(K) + 'N' + str(N) + 'c' + str(c) + 'la' + str(lambda_n) + 'rd' + str(rand)  # for saving results

# simulate graph
G = SBM.SBM_simulate_fast(model_sbm1)
ln, nodeslist = get_label_list(G)

# algo1: proposed deepwalk algorithm
model_w2v = SBM.SBM_learn_deepwalk_1(G, num_paths, length_path, emb_dim,
                                     rw_filename, emb_filename, winsize)
X = model_w2v[nodeslist]
k_means = KMeans(n_clusters=K, max_iter=100, precompute_distances=False)
k_means.fit(X)
y_our = k_means.labels_
nmi_arry, ccr_arry, ars_arry = summary_res(nmi_arry, ccr_arry, ars_arry,
                                           ln, y_our, 'deep', 'N', N, rand)

# algo2: spectral clustering
A = nx.to_scipy_sparse_matrix(G)
print('start spectral clustering')
if N < 10000:
    sc = SpectralClustering(n_clusters=K, affinity='precomputed',
                            eigen_solver='arpack')
    sc.fit(A)
    y_sc = sc.labels_
else:
    y_sc = ln
nmi_arry, ccr_arry, ars_arry = summary_res(nmi_arry, ccr_arry, ars_arry,
                                           ln, y_sc, 'sc', 'N', N, rand)

# algo3: belief propagation
print('start ABP algorithm')
r = 3
m, mp, lambda1 = SBM.abp_params(model_sbm1)
y_abp = SBM.SBM_ABP(G, r, lambda1, m, mp)
nmi_arry, ccr_arry, ars_arry = summary_res(nmi_arry, ccr_arry, ars_arry,
                                           ln, y_abp, 'abp', 'N', N, rand)

import pickle
savename = 'exp3-3.pkl'
res = [nmi_arry, ccr_arry, ars_arry]
pickle.dump(res, open(savename, 'wb'), protocol=2)
def Spectral_clustering():  # 4
    global cluster
    global labels
    cluster = SpectralClustering(n_clusters=110).fit_predict(text)
    print("Spectral clustering NMI:%s" %
          (metrics.normalized_mutual_info_score(labels, cluster)))
def load_data(text_path):
    # (reconstructed def line: the excerpt began inside this function)
    data = pickle.load(open(text_path, "rb"))  # cPickle/"r" -> pickle/"rb" for Python 3
    wdict = data['wdict']
    s_l_train = data['s_l_train']
    y_l_train = data['y_l_train']
    return [wdict, s_l_train, y_l_train]

wdict, s_l_train, y_l_train = load_data(text_path)

data = pickle.load(open(data_path, 'rb'))
X = data['z']
tsne = TSNE(n_components=3)
X = tsne.fit_transform(X)

cluster_num = 8
stat = np.zeros((cluster_num, 20000), dtype='float64')
cluster_label = SpectralClustering(n_clusters=cluster_num).fit(X).labels_
print(len(cluster_label), len(s_l_train))

idict = dict()
for k in wdict.keys():
    idict[wdict[k]] = k
idict[0] = '<eos>'
idict[1] = '<unk>'

c_num = np.zeros((cluster_num,))
for i, s in enumerate(s_l_train):
    c_num[cluster_label[i]] += 1.0
    l = []
    for j in s:
        l.extend(j)
# x = pca.fit_transform(data1[:,2:])
x = TSNE(n_components=3).fit_transform(data1[:, 2:])
print(x.shape)

# Kmeans Clustering
# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans
pred = KMeans(n_clusters=4, max_iter=30000, random_state=0).fit_predict(x)
fs = f1_score(y_true, pred, average='weighted')
print("Kmeans F1 Score:", fs)

# Spectral Clustering
# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralClustering.html#sklearn.cluster.SpectralClustering
pred = SpectralClustering(n_clusters=4).fit_predict(x)
fs = f1_score(y_true, pred, average='weighted')
print("Spectral F1 Score:", fs)

# Agglomerative Clustering
# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html#sklearn.cluster.AgglomerativeClustering
pred = AgglomerativeClustering(n_clusters=4).fit_predict(x)
fs = f1_score(y_true, pred, average='weighted')
print("Agglomerative F1 Score:", fs)

# Density Based
# needs work https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html#sklearn.cluster.DBSCAN
pred = DBSCAN(eps=1, min_samples=2).fit_predict(x)
fs = f1_score(y_true, pred, average='weighted')
print("Density Based F1 Score:", fs)
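# Caveat (added note): cluster labels are an arbitrary permutation of the true
# classes, so passing them straight to f1_score understates accuracy. A sketch
# of aligning labels first via the Hungarian algorithm, assuming both label
# sets are {0, ..., k-1}:
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix

def align_labels(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    row, col = linear_sum_assignment(-cm)        # maximize matched counts
    mapping = {c: r for r, c in zip(row, col)}   # predicted id -> true id
    return np.array([mapping[p] for p in y_pred])

# fs = f1_score(y_true, align_labels(y_true, pred), average='weighted')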
ms.fit_predict(xtrain_lsa)
#cluster_centers_indices = ms.cluster_centers_indices_
labels = ms.labels_
# get clusters
n_clusters = len(set(labels))  # fixed: len(labels) counts samples, not clusters
plt.title("Mean Shift")
plt.scatter(xtrain_lsa[:, 0], xtrain_lsa[:, 1], c=labels)
plt.show()
#print("Silhouette Coefficient: %0.3f"
#      % metrics.silhouette_score(xtrain_lsa, labels, metric='sqeuclidean'))
#print("mean shift, cluster_centers_indices", n_clusters)
"""
#SKIP=True
if DO_ALL is True:
    sc = SpectralClustering(n_clusters=6)  # n_clusters
    sc.fit(xtrain_lsa)
    predict_sc = sc.fit_predict(xtrain_lsa)  # Predicted clusters
    labels = sc.labels_
    plt.title("SpectralClustering")
    plt.scatter(xtrain_lsa[:, 0], xtrain_lsa[:, 1], c=labels)
    plt.show()
    clusters = len(set(labels))  # fixed: count distinct labels
    #pd.crosstab(predict_sc, c)
    print("spectral: Silhouette Coefficient: %0.3f" %
          metrics.silhouette_score(xtrain_lsa, labels, metric='sqeuclidean'))
    #print(len(labels)," clusters")

#KMeans
### So if average number of gates from user is 3 (n_gates_users_avg = 3), n_clusters should be 4
n_c_avg_users = n_gates_users_avg + 1
n_cluster_list = [int(n_c_avg_users)]
#n_cluster_list = [int(n_c_avg_users - 1), int(n_c_avg_users), int(n_c_avg_users + 1)]

df_pixel_final_labels_hybrid_all_n_c = pd.DataFrame()
final_e_df_example_all_n_c = pd.DataFrame()

for n_clusters in n_cluster_list:
    print(n_clusters)
    clustering_SC = SpectralClustering(n_clusters=n_clusters,
                                       affinity="precomputed",
                                       assign_labels=choose_assign_labels,
                                       random_state=0).fit(hybrid_matrix)
    pixel_final_labels_hybrid = clustering_SC.labels_[:len(df_pixel_info_example["x_pixel_coor"])]
    df_pixel_final_labels_hybrid = pd.DataFrame(
        {"pixel_final_label": pixel_final_labels_hybrid})

    ### After clustering the pixels, we can go back to our events and label the events inside each pixel.
    ### First, we update our dataframe that contains pixel information with the labels we found for pixels.

    ### Update the dataframe with pixel labels
    df_pixel_info_example["pixel_final_label"] = pixel_final_labels_hybrid
A[tup[0], tup[1]] = tup[2] * 10
A[tup[1], tup[0]] = tup[2] * 10

adj_mat = [[3, 2, 2, 0, 0, 0, 0, 0, 0],
           [2, 3, 2, 0, 0, 0, 0, 0, 0],
           [2, 2, 3, 1, 0, 0, 0, 0, 0],
           [0, 0, 1, 3, 3, 3, 0, 0, 0],
           [0, 0, 0, 3, 3, 3, 0, 0, 0],
           [0, 0, 0, 3, 3, 3, 1, 0, 0],
           [0, 0, 0, 0, 0, 1, 3, 1, 1],
           [0, 0, 0, 0, 0, 0, 1, 3, 1],
           [0, 0, 0, 0, 0, 0, 1, 1, 3]]
#adj_mat = np.array(adj_mat)

sc = SpectralClustering(100, affinity='precomputed', n_init=100)
sc.fit(A)
print('spectral clustering')
print(type(sc.labels_))
#print(len(sc.labels_))
#print(sc.labels_.tolist())

node_map = {}
cluster = sc.labels_.tolist()
for i in range(len(cluster)):
    if cluster[i] in node_map:
        node_map[cluster[i]].append(i)
    else:
        node_map[cluster[i]] = [i]  # fixed: the first member of each cluster was dropped before
def spectral_cluster(vectors):
    print('spectral clustering...')
    return SpectralClustering(n_clusters=4).fit_predict(vectors)
def flatImageByClass(data, tag):
    # Reconstructed header (the excerpt began mid-function): flatten each
    # h x w image in the (h, w, n) stack into a vector labelled with `tag`.
    h, w, n = data.shape
    X, y = [], []
    for i in range(n):
        X.append(data[:, :, i].reshape(h * w))
        y.append(tag)
    return X, y

data = sio.loadmat("../ExtYaleB10.mat")
data_train, data_test = data["train"], data["test"]

X_train, y_train = [], []
for tag in range(10):
    X_add, y_add = flatImageByClass(data_train[0][tag], tag)
    X_train += X_add
    y_train += y_add
X_train, y_train = np.array(X_train), np.array(y_train)

X_test, y_test = [], []
for tag in range(10):
    X_add, y_add = flatImageByClass(data_test[0][tag], tag)
    X_test += X_add
    y_test += y_add
X_test, y_test = np.array(X_test), np.array(y_test)

for gamma in [.01, .1, 1, 10, 100]:
    clf = SpectralClustering(n_clusters=10, gamma=gamma)
    labels = clf.fit_predict(X_train)
    # Error (note: raw disagreement; cluster ids are arbitrary, so this
    # overstates the error unless the labels happen to align)
    error = 0
    for i in range(len(labels)):
        if labels[i] != y_train[i]:
            error += 1
    print("gamma = %f: Error ratio: %f" % (gamma, float(error) / len(labels)))
# For reproducibility
np.random.seed(1000)

nb_samples = 1000

if __name__ == '__main__':
    # Create dataset
    X, Y = make_moons(n_samples=nb_samples, noise=0.05)

    # Try different gammas with an RBF affinity
    # (gamma=0 degenerates to a constant affinity matrix, kept for contrast)
    Yss = []
    gammas = np.linspace(0, 12, 4)
    for gamma in gammas:
        sc = SpectralClustering(n_clusters=2, affinity='rbf', gamma=gamma)
        Yss.append(sc.fit_predict(X))

    # Show data
    fig, ax = plt.subplots(1, 4, figsize=(30, 10), sharey=True)
    for x in range(4):
        ax[x].grid()
        ax[x].set_title('Gamma = %.0f' % gammas[x])
        for i in range(nb_samples):
            c = Yss[x][i]
            if c == 0:
                ax[x].scatter(X[i, 0], X[i, 1], marker='o', color='r')
            else:
def cluster_compartments(cf, k, chrlist, eig_dim=None, contact_thr=1,
                         max_sample_size=50000, outlier_pctl=90,
                         corr_outlier_pctl=[5, 95], balance_corr_median=False,
                         rm_diags=0, coeffs=None, coeffs_gw=None, seed=None,
                         max_resampling_attempts=10, rearrange_clusters=False,
                         use_oe=True, use_builtin_ice=False, use_ice=False,
                         use_cpb=False, compute_abscore=True,
                         algorithm='eigh-kmeans', outdir='.',
                         out_allchr='clusters_all.txt'):
    if algorithm not in ['eigh-gmix', 'eigh-kmeans', 'spec-kmeans']:
        print("error: algorithm must be either 'eigh-gmix', 'eigh-kmeans' or 'spec-kmeans'")
        return
    print("[intrachromosomal_clusters] k={}, outdir={}, algorithm={}".format(k, outdir, algorithm))
    if eig_dim is None:
        eig_dim = k
    if coeffs is None:
        coeffs = {}
    clusters = {}
    sample_idx = {}
    clusters_idx = {}
    snapshot_mat = None
    snapshot_cor = None
    for chr in chrlist:
        if os.path.isfile('{}/clusters_{}.txt'.format(outdir, chr)):
            print("Warning: {} clusters ({}/clusters_{}.txt) already exist. Skipping chromosome.".format(chr, outdir, chr))
            continue
        print("[{}] balancing matrix...".format(chr))
        m = cf.matrix(balance=use_builtin_ice).fetch(chr)
        # Threshold contacts.
        m[np.where(m < contact_thr)] = 0
        print("[{}] computing O/E...".format(chr))
        if use_oe is True:
            if coeffs_gw is not None:
                m_oe = oe_apply(m, coeffs_gw).toarray()
            else:
                if chr not in coeffs:
                    coeffs[chr] = oe_coeffs_mask(cf, chr, use_builtin_ice=use_builtin_ice)[chr]
                m_oe = oe_apply(m, coeffs[chr]).toarray()
        else:
            m_oe = m
        # Remove diagonals.
        if rm_diags > 0:
            m_oe = np.triu(m_oe, k=rm_diags) + np.tril(m_oe, k=-rm_diags)
        # Get idx of high quality regions (measured in raw matrix).
        samp_idx = matrix_mask_idx(m_oe)
        sample_idx[chr] = samp_idx
        print("[{}] removing low-quality regions (matrix rows: {}, sample rows: {})...".format(chr, m.shape[0], samp_idx.shape[0]))
        # High-quality matrix size.
        l = len(samp_idx)
        ssize = min(l, max_sample_size)
        # Sample iteration (keep sampling while clustering fails).
        np.random.seed(seed)
        successful = False
        cnt = 0
        while not successful and cnt < max_resampling_attempts:
            cnt += 1
            # Get sample.
            if ssize < l:
                s = np.sort(np.random.choice(samp_idx, ssize, replace=False))
            else:
                s = np.array(samp_idx)
            m_samp = m_oe[s, :][:, s]
            # Apply CPB.
            if use_cpb:
                print("[{}] computing CPB...".format(chr))
                m_samp = cpb(m_samp, 20)
            elif use_ice:
                print("[{}] computing ICE...".format(chr))
                m_samp = ice(m_samp, 20)
            # Relax outliers.
            m_max = np.percentile(m_samp[np.where(m_samp > 0)], outlier_pctl)
            m_samp[np.where(m_samp > m_max)] = m_max
            if (~m_samp.any(axis=1)).any():
                print("[{}] sample contains empty rows (singular matrix). resampling ({})...".format(chr, cnt))
                continue
            # Compute correlation and remove diagonal.
            print("[{}] computing correlation matrix and balancing...".format(chr))
            m_cor = np.corrcoef(m_samp)
            np.fill_diagonal(m_cor, 0)
            # Increase correlation contrast (5-95 percentiles by default).
            if balance_corr_median:
                m_cor = m_cor - np.median(m_cor[np.triu_indices(ssize, 1)])
            min_cor_val = np.percentile(m_cor[np.triu_indices(ssize, 1)], corr_outlier_pctl[0])
            max_cor_val = np.percentile(m_cor[np.triu_indices(ssize, 1)], corr_outlier_pctl[1])
            m_cor[np.where(m_cor < min_cor_val)] = min_cor_val
            m_cor[np.where(m_cor > max_cor_val)] = max_cor_val
            N = m_cor.shape[0]
            eig_dim = min(N, eig_dim)
            if compute_abscore:
                print("[{}] computing a/b scores...".format(chr))
                eigv, abscore = eigs_ab_score(m_cor)
                # Write weighted eigenvectors.
                with open('{}/clusters_{}.abscore'.format(outdir, chr), 'w') as outdata:
                    for i in range(0, len(abscore)):
                        outdata.write(str(sample_idx[chr][i]) + '\t' + str(abscore[i]) + '\n')
            try:
                print("[{}] computing clusters, algorithm {}...".format(chr, algorithm))
                if algorithm == 'spec-kmeans':
                    # some chromosomes crash when using precomputed similarity matrices.
                    # however using RBF seems to give meaningful clustering.
                    spect_clu = SpectralClustering(n_clusters=k,
                                                   eigen_solver='arpack',
                                                   affinity='precomputed',
                                                   assign_labels='kmeans',
                                                   n_jobs=8)
                    hic_clust = spect_clu.fit_predict(m_cor)
                else:
                    print("[{}] computing eigh...".format(chr))
                    w, v = scipy.linalg.eigh(m_cor, eigvals=(N - eig_dim, N - 1))
                    if algorithm == 'eigh-gmix':
                        # Cluster eigenvectors using Gaussian Mixture.
                        gmix = mixture.GaussianMixture(n_components=k,
                                                       covariance_type='full',
                                                       tol=1e-4, max_iter=1000)
                        gmix.fit(v)
                        hic_clust = gmix.predict(v)
                    elif algorithm == 'eigh-kmeans':
                        # Cluster eigenvalue/eigenvector products with kmeans.
                        print("[{}] computing clusters (k-means)...".format(chr))
                        km = KMeans(n_clusters=k, n_jobs=8)
                        weig = np.sqrt(w) * v
                        hic_clust = km.fit_predict(weig)
                        # Write weighted eigenvectors.
                        with open('{}/clusters_{}.weig'.format(outdir, chr), 'w') as outdata:
                            for i in range(0, len(hic_clust)):
                                outdata.write(str(sample_idx[chr][i]) + '\t' + str(hic_clust[i]) + '\t' + '\t'.join([str(x) for x in weig[i][::-1]]) + '\n')
            except Exception as e:
                print("[{}] error while clustering (attempt {}): {}".format(chr, cnt, str(e)))
                cnt = max_resampling_attempts
                break
            successful = True
        if cnt >= max_resampling_attempts:
            print("[{}] max resampling attempts reached, skipping chromosome.".format(chr))
            continue
        # Rearrange clusters for visualization.
        # Make cluster index list.
        clu_idx = [list() for _ in range(k)]
        for i, c in enumerate(hic_clust):
            clu_idx[c].append(i)
        if not rearrange_clusters:
            # Map again to matrix indices.
            clusters_idx[chr] = [sample_idx[chr][x] for x in clu_idx]
        else:
            print("[{}] rearranging clusters by similarity...".format(chr))
            for i in range(k):
                clu_idx[i] = np.array(clu_idx[i])
            clusters[chr] = list()
            # Find most distant blocks.
            l_r = (0, 0)
            val = np.inf
            d_sum = np.zeros((k, k))
            for i in range(k):
                l_i = len(clu_idx[i])
                for j in range(i + 1, k):
                    l_j = len(clu_idx[j])
                    d_sum[i, j] = np.sum(m_cor[clu_idx[i], :][:, clu_idx[j]])
                    d = float(d_sum[i, j]) / (l_i * l_j)
                    if d < val:
                        l_r = (i, j)
                        val = d
            # Pop left and right blocks (important to do it in this order for index consistency).
            r_idx = clu_idx.pop(l_r[1])
            l_idx = clu_idx.pop(l_r[0])
            r_clusters = [r_idx.copy()]
            l_clusters = [l_idx.copy()]
            iters = len(clu_idx) // 2 + len(clu_idx) % 2
            for i in range(iters):
                # Find nearest blocks to L/R.
                len_l = len(l_idx)
                len_r = len(r_idx)
                min_d = np.inf
                max_d = -np.inf
                min_idx = 0
                max_idx = 0
                for i in range(len(clu_idx)):
                    len_block = len(clu_idx[i])
                    d_l = float(np.sum(m_cor[l_idx, :][:, clu_idx[i]])) / (len_l * len_block) - val
                    d_r = float(np.sum(m_cor[r_idx, :][:, clu_idx[i]])) / (len_r * len_block) - val
                    r = d_l / d_r
                    if r < min_d:
                        min_idx = i
                        min_d = r
                    if r >= max_d:
                        max_idx = i
                        max_d = r
                # Pop from idx and add to L/R.
                if min_idx > max_idx:
                    r_clusters.append(clu_idx[min_idx].copy())
                    l_clusters.append(clu_idx[max_idx].copy())
                    r_idx = np.append(clu_idx.pop(min_idx), r_idx)
                    l_idx = np.append(l_idx, clu_idx.pop(max_idx))
                elif min_idx < max_idx:
                    r_clusters.append(clu_idx[min_idx].copy())
                    l_clusters.append(clu_idx[max_idx].copy())
                    l_idx = np.append(l_idx, clu_idx.pop(max_idx))
                    r_idx = np.append(clu_idx.pop(min_idx), r_idx)
                else:
                    l_clusters.append(clu_idx[max_idx].copy())
                    l_idx = np.append(l_idx, clu_idx.pop(max_idx))
            # Make final index list.
            clu_idx = np.append(l_idx, r_idx)
            # Make final cluster index list.
            clusters[chr] = l_clusters + list(reversed(r_clusters))
            # Map again to matrix indices.
            clusters_idx[chr] = [sample_idx[chr][x] for x in clusters[chr]]
        # Store in disk.
        print("[{}] writing clusters to {}/clusters_{}.txt...".format(chr, outdir, chr))
        fout = open('{}/clusters_{}.txt'.format(outdir, chr), 'w+')
        for c in clusters_idx[chr]:
            fout.write("{}\t".format(chr))
            fout.write(','.join([str(i) for i in c]))
            fout.write('\n')
        fout.close()
        fall = open('{}/{}'.format(outdir, out_allchr), "a")
        for c in clusters_idx[chr]:
            fall.write("{}\t".format(chr))
            fall.write(','.join([str(i) for i in c]))
            fall.write('\n')
        fall.close()
'''
import os

# Change the current working directory
os.chdir("/Users/davidlin/Desktop/School/Master/2021_secondSem/SC/image-segmentation/")

from os import walk
from Code.lib import tools
import numpy as np
from skimage import segmentation, color
from hmmlearn import hmm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.cluster import KMeans, SpectralClustering
from time import time
import cv2
from PIL import Image

# ---------------------------------------------------------------------------------------
####################################################################
###                                                              ###
###  Part 1: Vector Quantization using K-means with different K  ###
###                                                              ###
####################################################################

# K-means for VQ
# Specify the number of colors
n1_colors = 5
n2_colors = 10
n3_colors = 64

# Load the photo and convert from BGR to RGB model
img_bgr = cv2.imread("./Images/shiba.jpeg")
def spectral_clustering(data, n_clusters):
    spec = SpectralClustering(n_clusters=n_clusters, gamma=2.)
    spec.fit(data)
    return spec.labels_
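# Usage sketch (assumed, not from the source): with the default RBF affinity,
# gamma controls how quickly similarity decays with distance; gamma=2 targets
# data on roughly unit scale, as make_moons produces.
import numpy as np
from sklearn.datasets import make_moons

X, _ = make_moons(n_samples=200, noise=0.05, random_state=0)
labels = spectral_clustering(X, n_clusters=2)
print(np.bincount(labels))   # cluster sizes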
X_id = []
y = []
labels = []  # a subset of X_id
X_IDs = []
# generate X_train using X_id
#ID_list = df.ix[:,0]
ID_bipart = df_bipart.iloc[:, 0]   # .ix is removed in modern pandas; use .iloc
data_bipart = df_bipart.iloc[:, 1:]

# need to determine how you get num_group
num_group = 3
random.seed(17)
kmeans_bipart = KMeans(n_clusters=num_group, random_state=0).fit(data_bipart)
labels_bipart = kmeans_bipart.labels_
# note: the k-means labels above are immediately overwritten by spectral clustering
labels_bipart = SpectralClustering(num_group, gamma=0.5,
                                   affinity='rbf').fit(data_bipart).labels_
# add new column group to data
df_bipart['group'] = labels_bipart

#############
# group data by their 'group'
# df_bipart = df_bipart.sort_values('group')
# divide by group i
# @i means i is a variable in group
global_y = 0
global_len = 0
global_truth = []
global_fitted = []
print("number of groups: " + str(num_group))
for i_group in range(num_group):
    df_bipart_i = df_bipart.query('group == @i_group')
    clusters = np.array(range(len(latlongs)))
else:
    if CLST_METHOD == 1:
        clObj = KMeans(n_clusters=clustersNo, random_state=seed, n_jobs=4)
        clustersObj = clObj.fit(latlongs)
        (clusters, centroids) = (clustersObj.labels_, clustersObj.cluster_centers_)
    elif CLST_METHOD == 2:
        clObj = AgglomerativeClustering(n_clusters=clustersNo,
                                        affinity='euclidean',
                                        compute_full_tree='auto',
                                        memory='mycachedir')
        clustersObj = clObj.fit(latlongs)
        clusters = clustersObj.labels_
    elif CLST_METHOD == 3:
        clObj = SpectralClustering(n_clusters=clustersNo, random_state=seed)
        clustersObj = clObj.fit(latlongs)
        clusters = clustersObj.labels_

###########################################################################
# Clustering the pointset
###########################################################################
cLatlongs = aux.appendClustersToLatlongs(latlongs, clusters)
csvPath = PATH + PLACE + "_CLS_" + str(CLST_METHOD) + "_" + namePad + ".csv"
csvPathSz = PATH + PLACE + "_CLL_" + str(CLST_METHOD) + "_" + namePad + ".csv"
clustersSizes = [sorted(clusters).count(x) for x in range(clustersNo)]
aux.exportListToCSV(csvPath, cLatlongs)
np.savetxt(csvPathSz, clustersSizes, fmt='%i', delimiter='\n')

###########################################################################
# Plotting
class main():
    # Note: statements in this class body execute once, at class-definition time.
    with open('/Users/kat/Desktop/Kaggle/Graph.csv', 'r') as csvfile1:
        graphreader = csv.reader(csvfile1, delimiter=' ', quotechar='|')
        adjgraph = np.empty((6000, 6000))
        adjgraph.fill(0)
        for row in graphreader:
            arr = row[0].split(",")
            adjgraph[int(arr[0]) - 1][int(arr[1]) - 1] = 1
            adjgraph[int(arr[1]) - 1][int(arr[0]) - 1] = 1

    # get features into matrix
    with open('/Users/kat/Desktop/reduced_1035_all_points.csv', 'r') as csvfile3:
        EF = csv.reader(csvfile3, delimiter=' ', quotechar='|')
        newEF = []
        for row in EF:
            arr = row[0].split(",")
            arr2 = np.asarray(arr)
            arr3 = arr2.astype(float)  # np.float is removed in recent NumPy
            newEF.append(arr3)

    with open('/Users/kat/Desktop/reduced_3000.csv', 'r') as csvfile3:
        EF = csv.reader(csvfile3, delimiter=' ', quotechar='|')
        adj_new = []
        for row in EF:
            arr = row[0].split(",")
            arr2 = np.asarray(arr)
            arr3 = arr2.astype(float)
            adj_new.append(arr3)

    # spectral clustering on adjacency matrix
    spectral = SpectralClustering(10, affinity="precomputed")
    new_plot = spectral.fit_predict(adj_new)  # 6000 x 1 array with cluster labels

    matching = {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}
    clusters = {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}

    # get cluster matchings for first 60 points
    with open('/Users/kat/Desktop/Kaggle/Seed.csv', 'r') as csvfile2:
        seedreader = csv.reader(csvfile2, delimiter=' ', quotechar='|')
        for row in seedreader:
            arr = row[0].split(",")
            findClust = new_plot[int(arr[0]) - 1]
            matching[int(arr[1])].append([int(arr[0]), newEF[int(arr[0]) - 1], findClust])

    for i in range(1, 6001):
        findClust = new_plot[i - 1]
        clusters[findClust].append(newEF[i - 1])

    for i in range(10):
        print("item is " + str(i))
        for item in matching[i]:
            print(item[2])

    filtered_features = []
    filtered_features_idx = []
    cluster_5_digit_6 = []
    cluster_8_digit_1 = []
    for i in range(len(new_plot)):
        if new_plot[i] == 5:
            cluster_5_digit_6.append([i + 1, red_pca[i]])
        elif new_plot[i] == 8:
            cluster_8_digit_1.append([i + 1, red_pca[i]])
        else:
            filtered_features.append(red_pca[i])
            filtered_features_idx.append(i + 1)

    cluster_centers_pca_8_clusters = []
    for i in range(10):
        newarray = []
        if i == 1 or i == 6:
            pass
        else:
            for j in range(len(matching[i])):
                newarray.append(np.asarray(matching[i][j][1]))
            newa = np.asarray(newarray)
            cluster_centers_pca_8_clusters.append(newa.mean(axis=0))
    cluster_centers_pca_8_clusters = np.asarray(cluster_centers_pca_8_clusters)

    kmeans_8 = KMeans(n_clusters=8,
                      init=cluster_centers_pca_8_clusters).fit_predict(filtered_features)

    updated_matching = {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}
    with open('/Users/kat/Desktop/Kaggle/Seed.csv', 'r') as csvfile2:
        seedreader = csv.reader(csvfile2, delimiter=' ', quotechar='|')
        for row in seedreader:
            arr = row[0].split(",")
            if int(arr[1]) == 1 or int(arr[1]) == 6:
                pass
            else:
                try:
                    idx = filtered_features_idx.index(int(arr[0]))
                    updated_matching[int(arr[1])].append(
                        [int(arr[0]), red_pca[int(arr[0]) - 1], kmeans_8[idx]])
                except ValueError:
                    pass

    finalmatches = {0: 4, 1: 2, 2: 5, 3: 9, 4: 7, 5: 3, 6: 0, 7: 8, 8: 1, 9: 6}
    adjustedcluster = {}
    for i in range(10):
        index = finalmatches[i]
        adjustedcluster[i] = clusters[index]

    cluster_centers = []
    for i in range(10):
        newa = np.asarray(adjustedcluster[i])
        cluster_centers.append(newa.mean(axis=0))

    #for i in range(10):
    #    newarray = []
    #    if i == 1 or i == 6:
    #        pass
    #    else:
    #        for j in range(len(updated_matching[i])):
    #            newarray.append(np.asarray(updated_matching[i][j][1]))
    #        newa = np.asarray(newarray)
    #        cluster_centers_kmeans.append(newa.mean(axis=0))

    digit_6_feat_pca = []
    for item in cluster_5_digit_6:
        digit_6_feat_pca.append(item[1])
    digit_1_feat_pca = []
    for item in cluster_8_digit_1:
        digit_1_feat_pca.append(item[1])
    digit_6_feat_pca = np.asarray(digit_6_feat_pca)
    digit_6_centroid = digit_6_feat_pca.mean(axis=0)
    digit_1_feat_pca = np.asarray(digit_1_feat_pca)
    digit_1_centroid = digit_1_feat_pca.mean(axis=0)
    # cluster_centers_kmeans is assumed to be initialized above this excerpt
    cluster_centers_kmeans.insert(1, digit_1_centroid)
    cluster_centers_kmeans.insert(6, digit_6_centroid)

    finalclusters = [[0 for i in range(2)] for j in range(4001)]
    finalclusters[0][0] = 'Id'
    finalclusters[0][1] = 'Label'
    for i in range(1, 4001):
        finalclusters[i][0] = 6000 + i
        newdist = []
        for j in range(10):
            newdist.append(dist.euclidean(newEF[i + 5999], cluster_centers[j]))
        label = np.argmin(newdist)
        finalclusters[i][1] = label

    with open('submission7.csv', "w") as output:
        writer = csv.writer(output, lineterminator='\n')
        writer.writerows(finalclusters)
def parse_clustering(key, content):
    """Parse the options of the clustering step.

    This function does the same as parse_preproc but works on the clustering
    options.

    Parameters
    -----------
    key : class or str, like {'KMeans', 'AP', 'MS', 'Spectral', 'Hierarchical'}
        The selected clustering algorithm. In case in which key is a `class`,
        it must contain a `fit` method.

    content : dict
        A dictionary containing parameters for each clustering class. Each
        parameter can be a list; in this case for each combination of
        parameters a different pipeline will be created.

    Returns
    -----------
    tpl : tuple
        A tuple made like ('clust_name', clust_obj, 'clustering'), where
        clust_obj implements the `fit` method.
    """
    if inspect.isclass(key):
        cl = key(**content)
        key = cl.__class__.__name__.lower()
    elif 'auto' in (content.get('n_clusters', ''), content.get('preference', '')) \
            and key.lower() != 'hierarchical':
        # Wrapper class that automatically detects the best number of clusters
        # via 10-Fold CV
        content.pop('n_clusters', '')
        content.pop('preference', '')
        kwargs = {'param_grid': [], 'n_jobs': -1,
                  'scoring': silhouette_score, 'cv': 10}
        if key.lower() == 'kmeans':
            content.setdefault('init', 'k-means++')
            content.setdefault('n_jobs', 1)
            kwargs['estimator'] = KMeans(**content)
        elif key.lower() == 'ap':
            kwargs['estimator'] = AffinityPropagation(**content)
            kwargs['affinity'] = kwargs['estimator'].affinity
        else:
            logging.error("n_clusters = 'auto' specified outside kmeans or "
                          "ap. Trying to create GridSearchCV pipeline anyway "
                          " ...")
        cl = GridSearchCV(**kwargs)
    elif 'auto' in (content.get('n_clusters', ''), content.get('preference', '')) \
            and key.lower() == 'hierarchical':
        # TODO implement this
        # from adenine.utils.extensions import AgglomerativeClustering
        cl = AgglomerativeClustering(**content)
    else:
        if key.lower() == 'kmeans':
            content.setdefault('n_jobs', -1)
            cl = KMeans(**content)
        elif key.lower() == 'ap':
            content.setdefault('preference', 1)
            cl = AffinityPropagation(**content)
        elif key.lower() == 'ms':
            cl = MeanShift(**content)
        elif key.lower() == 'spectral':
            cl = SpectralClustering(**content)
        elif key.lower() == 'hierarchical':
            cl = AgglomerativeClustering(**content)
        else:
            cl = DummyNone()
    return (key, cl, 'clustering')
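# Usage sketch (assumed): building a spectral-clustering step from a plain
# config dict, as the docstring describes. With a string key and no 'auto'
# option, the call falls through to the final branch.
name, model, step = parse_clustering('Spectral', {'n_clusters': 3})
# name == 'Spectral', step == 'clustering', model is a SpectralClustering instance
# model.fit(X)  # X: any (n_samples, n_features) array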
def run_SpectralClustering(data, clusters):
    estimator = SpectralClustering(n_clusters=clusters).fit(data)
    return estimator.labels_
    5: 'sixth'
})

x = df_segm['COMPONENT 2']
y = df_segm['COMPONENT 1']
plt.figure(figsize=(10, 8))
sns.scatterplot(x, y, hue=df_segm['Segment'],
                palette=['g', 'r', 'c', 'm', '#95a5a6', '#3498db'])
plt.title("CLUSTERS BY PCA COMPONENTS")
plt.show()

###### SPECTRAL CLUSTERING ON MFEAT-MO
from sklearn.cluster import SpectralClustering

spectral_model_rbf = SpectralClustering(n_clusters=6, affinity='rbf')
labels_rbf = spectral_model_rbf.fit_predict(s)

colours = {}
colours[0] = 'b'
colours[1] = 'y'
colours[2] = 'g'
colours[3] = 'c'
colours[4] = 'r'
colours[5] = 'm'
cvec = [colours[label] for label in labels_rbf]

# Note: each call below paints every point one fixed colour, overplotting the
# previous call; `cvec` is built but never used. Plotting once with c=cvec
# would show the actual cluster assignments.
b = plt.scatter(s[:, 0], s[:, 1], color='b')
y = plt.scatter(s[:, 0], s[:, 1], color='y')
g = plt.scatter(s[:, 0], s[:, 1], color='g')
c = plt.scatter(s[:, 0], s[:, 1], color='c')
r = plt.scatter(s[:, 0], s[:, 1], color='r')
m = plt.scatter(s[:, 0], s[:, 1], color='m')
X = lsa.fit_transform(X)
print("done in %fs" % (time() - t0))
explained_variance = svd.explained_variance_ratio_.sum()
# print("Explained variance of the SVD step: {}%".format(
#     int(explained_variance * 100)))
# print()

# #############################################################################
# Do the actual clustering
sp = SpectralClustering(n_clusters=4, affinity='nearest_neighbors')
print("Clustering sparse data with %s" % sp)
t0 = time()
sp.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

#metrics.normalized_mutual_info_score
print("Normalized: %0.3f" % metrics.normalized_mutual_info_score(labels, sp.labels_))
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, sp.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, sp.labels_))
print()
from sklearn import metrics
from sklearn.datasets import make_blobs  # sklearn.datasets.samples_generator was removed
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import matplotlib as mpl

# Load cluster data
X = np.loadtxt('data1.txt')

# Compute SpectralClustering
spc = SpectralClustering(
    n_clusters=6,
    # eigen_solver='arpack',        # {None, 'arpack', 'lobpcg', or 'amg'}
    kernel_params=None,             # dict of kernel kwargs (e.g. for chi2_kernel); '' is not valid
    assign_labels='kmeans',         # {'kmeans', 'discretize'}
    affinity="nearest_neighbors",   # 'nearest_neighbors', 'precomputed', 'rbf', or a kernel:
                                    # cosine similarity, linear, polynomial, sigmoid, RBF,
                                    # laplacian, chi-squared
).fit(X)

# Hyperparameters:
#   affinity matrix construction: distance and kernel;
#   kernel parameter (scaling factor);
#   number of clusters k;
#   clustering method.
core_samples_mask = np.zeros_like(spc.labels_, dtype=bool)
#core_samples_mask[spc.core_sample_indices_] = True  # DBSCAN leftover: SpectralClustering has no core_sample_indices_
aaa = aaa.reshape(length, length)
aaa = np.transpose(aaa)
# aaa = np.log(aaa+1)
# aaa = (aaa-aaa.min())/(aaa.max()-aaa.min())

p = count_percent(D3, D2)
p = p * aaa
D = getD(p)
L = getL(D, p)
eigvec = getEigen(L, n)
eigvec = np.real(eigvec)

clf = KMeans(n_clusters=n)
s = clf.fit(eigvec)
C = s.labels_
print('processed data using sc ARI:', metrics.adjusted_rand_score(y, C))
print('NMI:', normalized_mutual_info_score(y, C))
print('ACC:', acc(y, C))

from sklearn.cluster import SpectralClustering
sc1 = SpectralClustering(n_clusters=n, affinity='nearest_neighbors',
                         n_neighbors=KNN_for_neighbord)
sc_labels = sc1.fit_predict(x1)  # cache: fit_predict was previously run twice
print('SC KNN ARI:', metrics.adjusted_rand_score(y, sc_labels))

c = ('ARI:' + str(metrics.adjusted_rand_score(y, C)) + '\n' +
     'NMI:' + str(normalized_mutual_info_score(y, C)) + '\n')
c = c + ('ACC:' + str(acc(y, C)) + '\n' +
         'SKARI:' + str(metrics.adjusted_rand_score(y, sc_labels)))
fh = open('performanceusoskinimproved.txt', 'w', encoding='utf-8')
fh.write(c)
fh.close()