def test_plot_voronoi(self):
    """Check that plot_voronoi returns a SubplotBase for a fitted 15-cluster model."""
    model = KMeans(n_clusters=15)
    model.fit([data])

    axes = plot_voronoi(model, xlabel='x', ylabel='y')

    assert isinstance(axes, SubplotBase)
def test_plot_voronoi():
    """Check that plot_voronoi returns a Subplot for a fitted 15-cluster model."""
    model = KMeans(n_clusters=15)
    model.fit([data])

    axes = plot_voronoi(model)

    assert isinstance(axes, Subplot)
def cluster_kmeans(tica_dir, data_dir, traj_dir, n_clusters, lag_time):
    """Fit a KMeans model on the reduced data and cache it under ``tica_dir``.

    The fitted model is dumped to
    ``<tica_dir>/clusterer_<n_clusters>clusters.h5``. When that file already
    exists the function only reports it and returns without loading or
    fitting anything.

    Parameters
    ----------
    tica_dir : str
        Directory that holds (or will hold) the dumped clusterer.
    data_dir : str
        Path handed to ``verboseload`` to obtain the data to cluster.
    traj_dir, lag_time
        Not used by this function; kept so existing callers keep working.
    n_clusters : int
        Number of clusters; also embedded in the output file name.
    """
    clusterer_path = "%s/clusterer_%dclusters.h5" % (tica_dir, n_clusters)
    if os.path.exists(clusterer_path):
        print("Already clustered")
        return

    print("Clustering by KMeans")
    reduced_data = verboseload(data_dir)
    # The model is fitted on the list of sequences directly; no concatenated
    # copy of the data is needed.
    clusterer = KMeans(n_clusters=n_clusters, n_jobs=-1)
    clusterer.fit_transform(reduced_data)
    verbosedump(clusterer, clusterer_path)
def cluster_kmeans(tica_dir, data_dir, traj_dir, n_clusters, lag_time):
    """Cluster the reduced data with KMeans unless a cached model exists.

    The model file is ``<tica_dir>/clusterer_<n_clusters>clusters.h5``. If it
    is already present, only a message is printed. Otherwise the data is
    loaded with ``verboseload(data_dir)``, a KMeans model is fitted on it and
    the model is written with ``verbosedump``.

    ``traj_dir`` and ``lag_time`` are accepted for interface compatibility
    but are not used here.
    """
    clusterer_file = "%s/clusterer_%dclusters.h5" % (tica_dir, n_clusters)

    if os.path.exists(clusterer_file):
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print("Already clustered")
        return

    print("Clustering by KMeans")
    reduced_data = verboseload(data_dir)
    clusterer = KMeans(n_clusters=n_clusters, n_jobs=-1)
    clusterer.fit_transform(reduced_data)
    verbosedump(clusterer, clusterer_file)
def cluster_msm(sequences, n_states, lag_times, n_timescales=5):
    """Cluster ``sequences`` and fit one MSM per (state count, lag time) pair.

    For every ``n`` in ``n_states`` a KMeans model with ``n`` clusters is
    fitted and dumped to ``<n>n_cl.pkl``. For every ``lag_time`` in
    ``lag_times`` a MarkovStateModel is then fitted on the cluster labels and
    dumped to ``<n>n_<lag_time>lt_msm.pkl``. The timescales of all lag times
    are stacked row-wise (one row per lag time) and dumped to
    ``<n>n_timescales.pkl``.

    Parameters
    ----------
    sequences
        Data passed unchanged to ``KMeans.fit``.
    n_states : iterable of int
        Cluster counts to try.
    lag_times : iterable of int
        MSM lag times to try for each cluster count.
    n_timescales : int, optional
        Number of timescales requested from each MSM (default 5, the value
        that used to be hard-coded).

    Returns
    -------
    None
        All results are written to disk through ``io.dump``.
    """
    for n in n_states:
        states = KMeans(n_clusters=n)
        states.fit(sequences)
        io.dump(states, str(n) + 'n_cl.pkl')

        # Collect one timescale row per lag time and stack once at the end,
        # instead of seeding with a dummy zero row and deleting it afterwards.
        rows = []
        for lag_time in lag_times:
            msm = MarkovStateModel(lag_time=lag_time, verbose=False,
                                   n_timescales=n_timescales)
            msm.fit(states.labels_)
            rows.append(msm.timescales_)
            io.dump(msm, str(n) + 'n_' + str(lag_time) + 'lt_msm.pkl')

        if rows:
            ts = np.vstack(rows)
        else:
            # No lag times requested: store an empty table rather than a
            # truncated zero vector.
            ts = np.empty((0, n_timescales))
        io.dump(ts, str(n) + 'n_timescales.pkl')
def fit_protein_kmeans(yaml_file, mini=True, pca=False):
    """Fit one k-means model on the projected data of every listed protein.

    Model keyword arguments are the ``yaml_file["mdl_params"]`` entries whose
    key starts with ``cluster__`` (prefix stripped). With ``mini`` set, a
    MiniBatchKMeans is built and ``batch_size`` is forced to
    ``100 * n_clusters``; otherwise a plain KMeans is used. For each protein
    in ``yaml_file["protein_list"]`` the function enters that protein's model
    directory, loads ``pca_data.pkl`` (when ``pca`` is true) or
    ``tica_data.pkl``, and appends the trajectories in ``keynat`` key order.
    The fitted model is dumped to ``<mdl_dir>/kmeans_mdl.pkl``.

    Returns None.
    """
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    prefix = "cluster__"
    cluster_kwargs = {
        name.split(prefix)[1]: value
        for name, value in mdl_params.items()
        if name.startswith(prefix)
    }

    if mini:
        cluster_kwargs["batch_size"] = 100 * cluster_kwargs["n_clusters"]
    model_cls = MiniBatchKMeans if mini else KMeans
    kmeans_mdl = model_cls(**cluster_kwargs)

    data_file = "pca_data.pkl" if pca else "tica_data.pkl"
    data = []
    for protein in yaml_file["protein_list"]:
        with enter_protein_mdl_dir(yaml_file, protein):
            projected = verboseload(data_file)
            # gather every trajectory, in natural key order
            for traj_key in sorted(projected.keys(), key=keynat):
                data.append(projected[traj_key])

    kmeans_mdl.fit(data)
    verbosedump(kmeans_mdl, os.path.join(mdl_dir, "kmeans_mdl.pkl"))
    return
def Kmeans_score(dataset, Max_clusters):
    """Score KMeans clusterings of ``dataset`` for 2 .. ``Max_clusters - 1`` clusters.

    For each cluster count a KMeans model is fitted on ``dataset`` and five
    quality measures are recorded. All measures except the inertia are
    evaluated on the first sequence only (``dataset[0]`` / ``labels_[0]``).

    Parameters
    ----------
    dataset
        Input passed to ``KMeans.fit``; ``dataset[0]`` is used for scoring.
    Max_clusters : int
        Exclusive upper bound of the cluster counts tried. Values <= 2 yield
        empty score lists.

    Returns
    -------
    tuple of five lists
        ``(scores_in, scores_sc, scores_ch, scores_rt, scores_db)``, one
        entry per cluster count tried.
    """
    print(
        "Start to analyze the dependence of inertia on the number of clusters\n"
    )
    # Inertia: the elbow of the curve indicates a good cluster number.
    scores_in = []
    # Silhouette: s = (b - a) / max(a, b), where a is the mean distance
    # between a sample and all other points in the same class and b is the
    # mean distance between a sample and all points in the next nearest
    # cluster.
    scores_sc = []
    # Variance Ratio Criterion (Calinski-Harabasz): tightness of the clusters.
    scores_ch = []
    # Variance ratio: it inherently rises with cluster count, so one looks
    # for an "elbow" where adding another cluster adds little information,
    # as in a scree test.
    scores_rt = []
    # Davies-Bouldin: sum of the diameters of clusters i and j over the
    # distance between their centroids; values closer to zero are better.
    scores_db = []

    for n_clusters in range(2, Max_clusters):
        kmeans_model = KMeans(n_clusters=n_clusters,
                              init='k-means++',
                              n_init=10,
                              max_iter=300,
                              tol=0.001,
                              precompute_distances='auto',
                              verbose=0,
                              random_state=None,
                              copy_x=True,
                              n_jobs=1).fit(dataset)
        labels = kmeans_model.labels_

        # Assuming `metrics` is sklearn.metrics: the misspelled
        # `calinski_harabaz_score` was renamed to `calinski_harabasz_score`
        # and the old name later removed. Prefer the new name and fall back
        # to the old one so both library generations work.
        calinski_harabasz = getattr(metrics, 'calinski_harabasz_score', None)
        if calinski_harabasz is None:
            calinski_harabasz = metrics.calinski_harabaz_score

        scores_in.append(kmeans_model.inertia_)
        scores_sc.append(
            metrics.silhouette_score(dataset[0], labels[0], metric='euclidean'))
        scores_ch.append(calinski_harabasz(dataset[0], labels[0]))
        scores_rt.append(ssr_sst_ratio(dataset[0], labels[0]))
        scores_db.append(metrics.davies_bouldin_score(dataset[0], labels[0]))

    print("Done generating scores for " + str(Max_clusters) + " clusters\n")
    return scores_in, scores_sc, scores_ch, scores_rt, scores_db
def cluster_project_wrapper(proj_folder, feature_dict, n_states):
    """Return the project's cluster model and assignments, computing what is missing.

    * If ``<proj_folder>/assignments.pkl`` exists, both the model and the
      assignments are loaded from disk and returned as-is.
    * Otherwise the model is loaded from ``<proj_folder>/cluster_mdl.pkl``
      when present, or a new KMeans with ``n_states`` clusters is fitted on
      all values of ``feature_dict``.
    * Assignments are then computed per key with ``cluster_mdl.transform``,
      and both objects are dumped back to the project folder.

    Returns ``(cluster_mdl, assignments)``.
    """
    mdl_path = proj_folder + "/cluster_mdl.pkl"
    assignments_path = proj_folder + "/assignments.pkl"

    if os.path.exists(assignments_path):
        return verboseload(mdl_path), verboseload(assignments_path)

    if os.path.exists(mdl_path):
        cluster_mdl = verboseload(mdl_path)
    else:
        cluster_mdl = KMeans(n_clusters=n_states)
        cluster_mdl.fit([feature_dict[key] for key in feature_dict.keys()])

    assignments = {}
    for key in feature_dict.keys():
        # transform is given a one-element list, so each stored value is
        # whatever transform returns for that single-sequence input
        assignments[key] = cluster_mdl.transform([feature_dict[key]])

    verbosedump(cluster_mdl, mdl_path)
    verbosedump(assignments, assignments_path)
    return cluster_mdl, assignments
def cluster_features(features, clusterer, n_clusters=8):
    '''
    Fit one of the msmbuilder clusterers on a list of feature trajectories.

    Input
    features : list of arrays, length n_trajs, each of shape (n_samples, n_features)
    clusterer : str, one of 'KMeans', 'KCenters', 'KMedoids',
        'MiniBatchKMeans', 'MiniBatchKMedoids'
    n_clusters : int, number of clusters to build (default 8)

    Output
    clst : msmbuilder.cluster object, with attributes
        cluster_centers_ : (n_clusters, n_features)
        labels_ : list of arrays, each of shape (n_samples, )

    Raises
    ValueError : if `clusterer` is not one of the supported names
    '''
    supported = ('KMeans', 'KCenters', 'KMedoids',
                 'MiniBatchKMeans', 'MiniBatchKMedoids')
    # Validate first so an unknown name fails with a clear message instead of
    # an UnboundLocalError further down.
    if clusterer not in supported:
        raise ValueError("Unknown clusterer %r; expected one of %s"
                         % (clusterer, ', '.join(supported)))

    # Imported lazily, as before, so msmbuilder is only needed when called.
    from msmbuilder import cluster as msm_cluster

    clst = getattr(msm_cluster, clusterer)(n_clusters=n_clusters)
    clst.fit_transform(features)
    return clst
def cluster_project_wrapper(proj_folder, feature_dict, n_states):
    """Load or build the cluster model and per-key assignments of a project.

    When ``assignments.pkl`` already exists in ``proj_folder`` the stored
    model and assignments are returned directly. Otherwise the model comes
    from ``cluster_mdl.pkl`` if that file exists, or from fitting a fresh
    KMeans (``n_states`` clusters) on every entry of ``feature_dict``. The
    assignments are computed with ``transform``, and model and assignments
    are written back to ``proj_folder`` before being returned.
    """
    model_file = proj_folder + "/cluster_mdl.pkl"
    assignments_file = proj_folder + "/assignments.pkl"

    have_assignments = os.path.exists(assignments_file)
    if have_assignments:
        return verboseload(model_file), verboseload(assignments_file)

    if not os.path.exists(model_file):
        cluster_mdl = KMeans(n_clusters=n_states)
        cluster_mdl.fit([feature_dict[name] for name in feature_dict.keys()])
    else:
        cluster_mdl = verboseload(model_file)

    # One transform call per key, each on a single-sequence list.
    assignments = {
        name: cluster_mdl.transform([feature_dict[name]])
        for name in feature_dict.keys()
    }

    verbosedump(cluster_mdl, model_file)
    verbosedump(assignments, assignments_file)
    return cluster_mdl, assignments
def cluster():
    '''
    Perform K-means clustering on the tICA space and save an assignments
    file for each trajectory. Cluster centers are also saved to the
    `microstate_centers.txt` file.

    The function takes no arguments; it reads `n_states`, `start_traj`,
    `end_traj`, `proj_path`, `traj_name` and `out_path` from the enclosing
    scope. Only the first two columns of each loaded projection are used.

    Returns
    -------
    (cluster_centers, ev0, ev1)
        The fitted cluster centers, and the first and second projection
        columns of all trajectories, each flattened into one array.
    '''
    # Named `clusterer` so the local does not shadow this function's name.
    clusterer = KMeans(n_clusters=n_states, n_jobs=-1, verbose=0,
                       max_iter=100, tol=0.0001)
    dataset, ev0, ev1 = [], [], []

    # print(...) with a single argument works on both Python 2 and 3.
    print("Loading projected data...")
    for i in tqdm(range(start_traj, end_traj + 1)):
        a = io.loadh('%s/traj%d_%s.h5' % (proj_path, i, traj_name))['arr_0']
        a = a[:, 0:2]
        dataset.append(a)
        ev0.extend(a[:, 0])
        ev1.extend(a[:, 1])

    print("Clustering %d datapoints..." % len(ev0))
    clusterer.fit(dataset)

    # labels_ is indexed by position in `dataset`, i.e. offset from start_traj.
    for i in range(start_traj, end_traj + 1):
        np.savetxt('%s/assigns_%d.txt' % (out_path, i),
                   np.array(clusterer.labels_[i - start_traj]), fmt='%d')
    np.savetxt('%s/microstate_centers.txt' % out_path,
               np.array(clusterer.cluster_centers_))
    print("Saved microstate assignments and microstate centers at %s" % out_path)

    return clusterer.cluster_centers_, np.array(ev0), np.array(ev1)
def clustering(N_cluster_opt, dataset, traj):
    """Cluster ``dataset`` into ``N_cluster_opt`` states and group frames by state.

    A KMeans model is fitted on ``dataset``; only the labels of the first
    sequence (``labels_[0]``) are used afterwards. Two report files are
    written under ``./AlleyCat-Ca-constrained/``: one line per snapshot with
    its (1-based) cluster number, and one line per cluster with its
    population.

    Parameters
    ----------
    N_cluster_opt : int
        Number of clusters.
    dataset
        Input for ``KMeans.fit``; ``dataset[0][i]`` is the feature row of
        snapshot ``i``.
    traj
        Indexable trajectory; ``traj[i].xyz`` is stored for snapshot ``i``.

    Returns
    -------
    tuple
        ``(clusters_xyz, clusters_xyz_center, cluster_centers, clusters,
        labels)`` where ``clusters_xyz[k]`` / ``clusters[k]`` hold the
        coordinates / feature rows of the snapshots assigned to cluster
        ``k``, ``clusters_xyz_center[k]`` is their averaged coordinates, and
        ``labels`` is ``labels_[0]``.
    """
    cluster = KMeans(n_clusters=N_cluster_opt,
                     init='k-means++',
                     n_init=10,
                     max_iter=300,
                     tol=0.001,
                     precompute_distances='auto',
                     verbose=0,
                     random_state=None,
                     copy_x=True,
                     n_jobs=2).fit(dataset)
    cluster_centers = cluster.cluster_centers_
    print("center lenghth: " + str(len(cluster_centers)) + "\n")

    labels = cluster.labels_[0]
    clusters = [[] for _ in range(N_cluster_opt)]
    clusters_xyz = [[] for _ in range(N_cluster_opt)]
    clusters_xyz_center = []

    labels_path = ("./AlleyCat-Ca-constrained/Labels_for_" +
                   str(N_cluster_opt) + "_clusters.dat")
    # `with` guarantees the file is closed even if a write or lookup fails.
    with open(labels_path, 'w') as fileout_labels:
        for i in range(len(labels)):
            label = labels[i]
            fileout_labels.write("snapshot " + str(i + 1) +
                                 " corresponds to Cluster " +
                                 str(label + 1) + "\n")
            # The label is the cluster index, so append directly instead of
            # scanning every cluster number for a match.
            clusters[label].append(dataset[0][i])
            clusters_xyz[label].append(traj[i].xyz)

    population_path = ("./AlleyCat-Ca-constrained/population_for_" +
                       str(N_cluster_opt) + "_clusters.dat")
    with open(population_path, 'w') as fileout:
        for k in range(N_cluster_opt):
            # Average over the snapshots of the cluster; [0] drops the
            # leading axis that each stored `.xyz` entry carries.
            clusters_xyz_center.append(
                np.average(np.array(clusters_xyz[k]), axis=0)[0])
            message = ('The population of cluster ' + str(k) + ' is ' +
                       str(len(clusters[k])))
            fileout.write(message + '\n')
            print(message)

    return clusters_xyz, clusters_xyz_center, cluster_centers, clusters, labels
n_samples = 200
topFile = 'NarK-strip.pdb'

# Load every .npy file in the working directory (sorted by name), remember
# the file order, and store that order in 'list.npy'.
dataset = []
ls = []
for npy_name in sorted(glob.glob('*.npy')):
    dataset.append(np.array(np.load(npy_name)))
    ls.append(npy_name)
    print(npy_name)
np.save('list', ls)

# Cluster the loaded trajectories.
cluster = KMeans(n_clusters=myn_clusters)
cluster.fit(dataset)
l = cluster.labels_

# Sorted names of the stripped trajectory files.
T = sorted(glob.glob('*strip.mdcrd'))

# Write the output file, which holds the population of each cluster plus the
# trajectory name and frame number of the corresponding frames.
asFunctions.writeOPF(l, T, myn_clusters, n_samples)

# Based on the information in the output file, build the cpptraj input file.
asFunctions.CpptrajInGen_commonTop(topFile)
# Pipeline script: featurize `trj_list`, project with tICA, cluster, fit an
# MSM, and dump every intermediate object to a .pkl file in the working
# directory.

# Step 1: raw dihedral features (sincos=False); featurizer and features are
# both saved.
f = DihedralFeaturizer(sincos=False)
dump(f, "raw_featurizer.pkl")
feat = f.transform(trj_list)
dump(feat, "raw_features.pkl")

# Step 2: `f` is replaced by the featurizer loaded from ./featurizer.pkl,
# which is written straight back to featurizer.pkl; from here on `feat`
# holds the features of this loaded featurizer, not the raw ones.
f = load("./featurizer.pkl")
dump(f, "featurizer.pkl")
# Feature description is built from the first trajectory only.
df1 = pd.DataFrame(f.describe_features(trj_list[0]))
dump(df1, "feature_descriptor.pkl")
feat = f.transform(trj_list)
dump(feat, "features.pkl")

# Step 3: two-component tICA at lag time 100, without kinetic mapping.
t = tICA(lag_time=100, n_components=2, kinetic_mapping=False)
tica_feat = t.fit_transform(feat)
dump(t, "tica_mdl.pkl")
dump(tica_feat, "tica_features.pkl")

# Step 4: cluster the tICA projection and fit an MSM on the assignments.
# NOTE(review): the positional arguments are presumably n_clusters=50 and
# lag_time=100 — confirm against the KMeans / MarkovStateModel signatures.
kmeans_mdl = KMeans(50)
ass = kmeans_mdl.fit_predict(tica_feat)
msm_mdl = MarkovStateModel(100)
msm_mdl.fit(ass)
dump(kmeans_mdl, "kmeans_mdl.pkl")
dump(ass, "assignments.pkl")
dump(msm_mdl, "msm_mdl.pkl")
def calculate_tica_components(self, cluster_method, calculate_strides=False, feats=None):
    '''Compute tICA coordinates, cluster them, plot them and save center PDBs.

    Steps, in order:
      1. Obtain the feature data: from the file `feats` when given, from a
         freshly computed stride-distance file when `calculate_strides` is
         true, otherwise from `self.data`.
      2. Concatenate the generations of every clone into one trajectory.
      3. Fit/transform tICA and save the coordinates under `self.tICA_dir`.
      4. Cluster the tICA coordinates and save sequences and cluster centers.
      5. Save normalized cluster populations to
         `cluster_centers/populations.dat`.
      6. Save one hexbin plot per pair of tICA components.
      7. Remove old `.pdb` files from `cluster_centers/` and save a PDB for
         every frame that lies on a cluster center.

    Parameters
    ----------
    cluster_method : str
        NOTE(review): this argument is overwritten with 'kcenters' in the
        parameter section below, so the caller's value is currently ignored.
    calculate_strides : bool
        When true (and `feats` is None), stride distances are recomputed and
        loaded from a hard-coded path.
    feats : str or None
        Path of a .npy feature file to load instead of `self.data`.

    Returns None; all results are written to disk.
    '''
    # tICA parameters
    tica_lagtime = 10 # determine from implied timescales
    tica_components = 8 # how many tICs to compute
    n_clusters = 100 # denotes number of microstates
    n_timescales = tica_components # plot all eigenvalues --> timescales
    md_time_step = 0.02 # ns
    subsampled_time_step = 1. # ns multiplier of timescales and lagtimes in implied timescale plot
    stride = int(subsampled_time_step / md_time_step) #time step stride for sub-sampling
    equil_time = 1. # ns
    equil_steps = 1 #int(equil_time / md_time_step) time steps to be removed from start
    lagtimes = np.array([1,2,4,8,16,32,64,128,256,512,1024])
    # NOTE(review): overrides the `cluster_method` argument — confirm intent.
    cluster_method = 'kcenters' # 'kcenters/kmeans'
    all_ticas = list(itertools.permutations(range(1,tica_components+1), 2)) # all combinations
    # NOTE(review): both assignments to all_ticas here are dead; it is
    # reassigned to the full permutation list before the plotting loop.
    all_ticas = [[1,2]] # override: just show analysis for first two components
    cluster_percentage_cutoff = 5 # clusters with a relative population less than this
                                  # number will not be labeled on plot i.e. 0 : all clusters labeled
    verbose = False

    print("\nCalculating tICA components...")

    # Load in feature files THIS WILL NEED TO BE CHANGED
    if feats == None:
        if calculate_strides:
            self.calculate_stride_distances(stride, equil_steps)
            data = np.load('/home/server/git/fah-scripts/DataAnalysisScripts/stride_dist/stride_dist_%d.npy' % self.proj_num)
        else:
            data = self.data
    else:
        data = np.load(feats)

    # Build one feature trajectory per clone by concatenating its non-empty
    # generations. Data taken from self.data is sub-sampled by `stride`;
    # data from files is used as stored.
    features = []
    for run in data:
        for clone in run:
            gen_seq = []
            for gen in clone:
                if gen is not None and gen[0] is not None:
                    if calculate_strides or feats is not None:
                        gen_seq.append(gen)
                    else:
                        gen_seq.append(gen[::stride])
            if len(gen_seq) > 0:
                gen_cat = np.concatenate(gen_seq)
                if calculate_strides:
                    features.append(gen_cat)
                else:
                    # drop the first `equil_steps` frames
                    features.append(gen_cat[equil_steps:])
    features = np.asarray(features)
    print(features.shape)
    print(features[0].shape)

    tica_coordinates = tICA(lag_time=tica_lagtime, n_components=int(tica_components)).fit_transform(features)
    np.save('%s/lag_%d_coord_%d.npy' %(self.tICA_dir, tica_lagtime, tica_components), tica_coordinates)

    # Initiate and populate an array for each component
    # NOTE(review): the lists tica_1 .. tica_N are created and filled through
    # exec() and read back through eval(); this relies on exec() being able
    # to add names to the function's local namespace, which the Python
    # documentation does not guarantee — confirm on the target interpreter.
    for i in range(tica_components):
        exec('tica_' + str(i+1) + ' = []')

    for i in tqdm.tqdm(range(len(features))):
        for j in range(len(tica_coordinates[i])):
            for k in range(tica_components):
                exec('tica_' + str(k+1) + '.append(tica_coordinates[i][j][k])')

    # Perform clustering based on the cluster_method parameter.
    if cluster_method == 'kcenters':
        print("Clustering via KCenters...")
        clusters = KCenters(n_clusters)
    elif cluster_method == 'kmeans':
        print("Clustering via KMeans...")
        clusters = KMeans(n_clusters)
    else:
        sys.exit("Invalid cluster_method. Use kmeans or kcenters.")

    # Determine cluster assignment for each frame.
    sequences = clusters.fit_transform(tica_coordinates)
    np.save('%s/lag_%d_clusters_%d_sequences.npy' %(self.tICA_dir, tica_lagtime, n_clusters), sequences)
    np.save('%s/lag_%d_clusters_%d_center.npy' %(self.tICA_dir, tica_lagtime, n_clusters), clusters.cluster_centers_)

    # Determine cluster populations, normalize the counts, and save as percentages for
    # labeling if a cluster contains more than cluster_percentage_cutoff percent of the data.
    # Finally, save normalized counts.
    print("\nDetermining cluster populations...")
    if not os.path.exists('%s/cluster_centers' % self.tICA_dir):
        os.makedirs('%s/cluster_centers' % self.tICA_dir)
    counts = np.array([len(np.where(np.concatenate(sequences)==i)[0]) for i in range(n_clusters)])
    normalized_counts = counts/float(counts.sum())
    percentages = [ i*100 for i in normalized_counts ]
    # [cluster index, formatted percentage] for every cluster above the cutoff
    population_labels = [ [i,"%.2f"%percentages[i]] for i in range(len(percentages)) if percentages[i] > cluster_percentage_cutoff ]
    np.savetxt('%s/cluster_centers/populations.dat' % self.tICA_dir, normalized_counts)

    # Plot all unique combinations of tICA components
    print("\nPlotting tICA components with cluster centers...")
    all_ticas = list(itertools.permutations(range(1,tica_components+1), 2))
    for j in tqdm.tqdm(range(len(all_ticas))): # For each pair
        # permutations() yields both (a, b) and (b, a); plot each pair once
        if all_ticas[j][0] < all_ticas[j][1]:
            plt.figure(j, figsize=(20,16))
            plt.hexbin(eval("tica_"+str(all_ticas[j][0])), eval("tica_"+str(all_ticas[j][1])), bins='log')
            x_centers = [clusters.cluster_centers_[i][all_ticas[j][0]-1] for i in range(len(clusters.cluster_centers_))]
            y_centers = [clusters.cluster_centers_[i][all_ticas[j][1]-1] for i in range(len(clusters.cluster_centers_))]
            high_pop_x_centers = [ x_centers[i] for i in range(len(x_centers)) if percentages[i] > cluster_percentage_cutoff ]
            high_pop_y_centers = [ y_centers[i] for i in range(len(y_centers)) if percentages[i] > cluster_percentage_cutoff ]
            plt.plot(x_centers, y_centers, color='y', linestyle="", marker="o")
            # black star marks the very first frame of the first trajectory
            plt.plot(eval("tica_"+str(all_ticas[j][0])+'[0]'), eval("tica_"+str(all_ticas[j][1])+'[0]'), color='k', marker='*',markersize=24)
            plt.xlabel('tic'+str(all_ticas[j][0]))
            plt.ylabel('tic'+str(all_ticas[j][1]))
            plt.title(self.proj_num)
            # Add labels for high-population cluster centers
            for label, x, y in zip(population_labels, high_pop_x_centers, high_pop_y_centers):
                plt.annotate(
                    label,
                    xy = (x, y), xytext = (-15, 15),
                    textcoords = 'offset points', ha = 'right', va = 'bottom',
                    bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5),
                    arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))
            plt.savefig('%s/tica_' % (self.tICA_dir) +str(all_ticas[j][0])+'_'+str(all_ticas[j][1])+'.png')
            plt.close()

    ###########################################################################

    # Remove PDBs left over from a previous run.
    for filename in os.listdir(self.tICA_dir + '/cluster_centers'):
        if filename.endswith('.pdb'):
            os.remove(self.tICA_dir + '/cluster_centers/' + filename)

    # Write out PDBs for each cluster center
    print("Performing cluster analytics and saving center PDBs...\n")
    runs, clones, gens = data.shape[0], data.shape[1], data.shape[2]
    # x, y, z are the RUN, CLONE and results indices used to build xtc paths.
    # NOTE(review): the nesting of the counter updates (`y += 1`, `z = 0`)
    # and of the `for j` loop below is assumed to be at the per-trajectory
    # level — confirm against the original layout. Resetting y on
    # `i % gens` (rather than `i % clones`) also looks suspicious.
    x, y, z = 0, 0, 0
    for i in range(len(features)):
        if i % clones == 0 and i != 0:
            x += 1
        if i % gens == 0:
            y = 0
        n_snapshots = len(clusters.distances_[i])

        # Determine frames that are cluster centers
        cluster_indices = np.arange(n_snapshots)[ (clusters.distances_[i] < 1e-6) ]
        # Determine number of each cluster, correlates to populations.dat
        cluster_labels = sequences[i][cluster_indices]
        # Save each cluster center as a pdb
        if list(cluster_indices):
            # load center-containing xtcs to check length
            traj_cat = []
            print('x: %d, y: %d, z: %d' % (x, y, z))
            # Load results0, results1, ... until loading fails.
            # NOTE(review): the bare except ends the loop on any error, not
            # only on a missing file; `base_dir` is not defined in this
            # function and must come from the enclosing scope.
            while True:
                try:
                    traj = base_dir + 'PROJ%s/RUN%s/CLONE%s/results%s/traj_comp.xtc' % (self.proj_num, x, y, z)
                    traj_cat.append(md.load(traj, top=self.gro_file))
                    z += 1
                except:
                    break
            if len(traj_cat) > 0:
                trajectory_file = md.join(traj_cat)
                xtc_len = len(trajectory_file)
        y += 1
        z = 0
        # NOTE(review): if no xtc could be loaded above, trajectory_file and
        # xtc_len keep their values from an earlier iteration (or are
        # undefined on the first one).
        for j in range(len(cluster_indices)):
            frames = range(xtc_len)
            # map the strided frame number back to xtc frame number
            strided_frames = frames[equil_steps:][::stride]
            xtc_frame = frames.index(strided_frames[cluster_indices[j]])
            cluster_traj = trajectory_file[xtc_frame]
            cluster_traj.save_pdb('%s/cluster_centers/state_%d_%.3f.pdb'%(self.tICA_dir, cluster_labels[j],percentages[cluster_labels[j]]))
            if verbose:
                print('Successfully saved PDB for cluster: %d, (rel.pop: %.3f)'%(cluster_labels[j],percentages[cluster_labels[j]]))
                print('traj_file: %s (%d/%d)'%(trajectory_file,i,len(features)))
                print('frame: %d (%d/%d centers from this trajectory)'%(cluster_indices[j],j,len(cluster_indices)))
                print('strided: npy_frame/npy_len = %d/%d = %f'%(cluster_indices[j],n_snapshots,cluster_indices[j]/n_snapshots))
                print('re-mapped: orig_frame/xtc_len = %d/%d = %f\n'%(xtc_frame,xtc_len,xtc_frame/xtc_len))
import numpy as np
from msmbuilder.cluster import KMeans, KCenters
import mdtraj.io as io

# 100 microstates
cluster = KMeans(
    n_clusters=100,
    n_jobs=-1,
    verbose=0,
    max_iter=100,
    tol=0.0001,
)

# Load the tICA projections of series s1 (4 files), s2 (20) and s3 (20),
# keeping only the first 3 tICs of each.
dataset = []
for series, n_files in ((1, 4), (2, 20), (3, 20)):
    for i in range(n_files):
        a = io.loadh('../on_tica_l20_s%d_%d.h5' % (series, i))['arr_0']
        a = a[:, 0:3]  # using first 3 tICs
        dataset.append(a)
        # print(...) with one argument is valid on both Python 2 and 3
        print(a.shape)

# NOTE(review): the s4 loop only loads each file; the slicing/append steps of
# the other series are absent here — the snippet may be truncated. Left as-is.
for i in range(20):
    a = io.loadh('../on_tica_l20_s4_%d.h5' % i)['arr_0']
from msmbuilder.decomposition import tICA from msmbuilder.cluster import KMeans,KCenters,KMedoids from msmbuilder.msm import MarkovStateModel verbose = True tica_data=np.load('../../ticas_n_8.npy') reduced_data = [] for i in range(len(tica_data)): reduced_data.append(tica_data[i][::100,:]) if verbose: print "Clustering." kmeans = KMeans(n_clusters=1200).fit(reduced_data) Gen_fn = "Gens.npy" np.save(Gen_fn,kmeans.cluster_centers_) if verbose: print "Wrote: %s"%Gen_fn model_dir = "kmeans_model_n_1200" if not os.path.exists(model_dir): os.makedirs(model_dir) model_fn = os.path.join(model_dir,'kmeans-combined.pkl') joblib.dump(kmeans,model_fn) if verbose: print "Saved cluster model to %s"%model_fn if verbose: print "Assigning.." assignments = kmeans.predict(tica_data) if verbose:
print "%s not exists!" % tica_fn continue tica_data = np.load(tica_fn) results = [] n_clusters = [100, 200, 400, 600, 800, 1000, 1200, 1500, 2000, 2500, 3000] #n_clusters = [1200,1500,2000] #n_clusters = [3500,3500,4000,4500,5000,6000] lagtime = 50 for n in n_clusters: kmeans = KMeans(n_clusters=n, n_jobs=-1) print "Clustering data to %d clusters..." % n for fold in range(nFolds): train_data = [] test_data = [] for i in range(len(tica_data)): cv = KFold(len(tica_data[i]), n_folds=nFolds) for current_fold, (train_index, test_index) in enumerate(cv): if current_fold == fold: train_data.append(tica_data[i][train_index]) test_data.append(tica_data[i][test_index]) reduced_train_data = sub_sampling_data(train_data, stride=100) kmeans.fit(reduced_train_data) assignments_train = kmeans.predict(train_data) assignments_test = kmeans.predict(test_data) msm = MarkovStateModel(lag_time=lagtime)
def compute_tica_components():
    '''Run tICA on the saved features, cluster the result, plot it and save
    the cluster-center structures.

    The function takes no arguments; it reads its configuration from the
    enclosing scope: `project_title`, `n_clusters`, `runs`, `unbiased_runs`,
    `tica_lagtime`, `n_components`, `cluster_method`, `all_ticas`,
    `cluster_percentage_cutoff`, `stride`, `equil_steps` and
    `structure_file`.

    Steps, in order:
      1. Load the feature files of every run and remember which slice of
         them belongs to the unbiased run.
      2. Fit tICA, save the coordinates and the fitted model.
      3. Cluster the unbiased trajectories; assign the trajectories that
         precede them to the nearest of those cluster centers.
      4. Save normalized populations of the unbiased data as
         `populations.dat`.
      5. Save one plot per pair of tICA components.
      6. Save a PDB and the feature vector for every frame that lies on a
         cluster center.

    Everything is written below `<project_title>/tica_<n_clusters>/`.

    Returns
    -------
    The tICA score of the fitted model on the features.
    '''
    verbose = False
    save_pdb = True
    color_by = 'cluster'

    out_dir = '%s/tica_%d' % (project_title, n_clusters)
    clusters_dir = '%s/%s_clusters' % (out_dir, cluster_method)
    plots_dir = '%s/plots' % out_dir

    if verbose:
        print("\nCalculating tICA components...")
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    # load in feature files and determine indices of unbiased ensembles
    feature_files = []
    for i in range(runs):
        # NOTE(review): the directory prefix of this pattern was missing in
        # the original (syntax error); `project_title` is assumed, matching
        # the feature glob used for the PDB export below — confirm.
        run_files = sorted(glob.glob('%s/features/P*R%d_*npy' % (project_title, i)))
        feature_files += run_files
        if i in unbiased_runs:
            # NOTE(review): with several unbiased runs only the last one's
            # slice is kept.
            unbiased_indices = [len(feature_files) - len(run_files), len(feature_files)]
    features = [np.load(x) for x in feature_files]

    # perform tICA calculation and extract score; fit once and reuse the
    # fitted model for the projection instead of fitting a second model
    tica_components = tICA(lag_time=tica_lagtime, n_components=int(n_components)).fit(features)
    tica_coordinates = tica_components.transform(features)
    tica_score = tica_components.score(features)
    np.save('%s/tica_coords-lag_%d-comp_%d.npy' % (out_dir, tica_lagtime, n_components),
            tica_coordinates)
    np.save('%s/tica_comps-lag_%d-comp_%d.npy' % (out_dir, tica_lagtime, n_components),
            tica_components)

    # Perform clustering based on the cluster_method parameter.
    if verbose:
        print('Clustering via %s' % cluster_method)
    if cluster_method == 'kcenters':
        clusters = KCenters(n_clusters)
    elif cluster_method == 'kmeans':
        clusters = KMeans(n_clusters)
    elif cluster_method == 'kmedoids':
        clusters = KMedoids(n_clusters)
    else:
        sys.exit('Invalid cluster_method. Use kcenters/kmeans/kmedoids.')

    # Cluster unbiased data and fit biased data to these centers
    new_assignments = []
    sequences = clusters.fit_transform(tica_coordinates[unbiased_indices[0]:unbiased_indices[1]])
    for i in tqdm.tqdm_notebook(range(unbiased_indices[0])):
        tica_traj = tica_coordinates[i]
        # assign_nearest is fed floating point data: cast anything else
        if isinstance(tica_traj, np.ndarray):
            if not (tica_traj.dtype == 'float32' or tica_traj.dtype == 'float64'):
                tica_traj = tica_traj.astype('float64')
        labels, _ = msmbuilder.libdistance.assign_nearest(
            tica_traj, clusters.cluster_centers_, metric='euclidean')
        new_assignments.append(labels)
    new_assignments += sequences  # tack the unbiased assignments back on to the end.
    np.save('%s/lag_%d_clusters_%d_assignments.npy' % (out_dir, tica_lagtime, n_clusters),
            new_assignments)
    np.save('%s/lag_%d_clusters_%d_center.npy' % (out_dir, tica_lagtime, n_clusters),
            clusters.cluster_centers_)

    # Determine cluster populations, normalize the counts, and save as percentages for
    # labeling if a cluster contains more than cluster_percentage_cutoff percent of the data.
    # Finally, save normalized counts.
    if verbose:
        print("\nDetermining cluster populations...")
    if not os.path.exists(clusters_dir):
        os.mkdir(clusters_dir)
    if not os.path.exists(plots_dir):
        os.mkdir(plots_dir)
    # concatenate once instead of once per cluster
    unbiased_labels = np.concatenate(sequences)
    counts = np.array([np.count_nonzero(unbiased_labels == i) for i in range(n_clusters)])
    normalized_counts = counts / float(counts.sum())
    percentages = [i * 100 for i in normalized_counts]
    # [cluster index, formatted percentage] for every cluster above the cutoff
    population_labels = [[i, "%.2f" % percentages[i]] for i in range(len(percentages))
                         if percentages[i] > cluster_percentage_cutoff]
    np.savetxt('%s/populations.dat' % clusters_dir, normalized_counts)

    # Plot all unique combinations of tICA components
    if verbose:
        print("\nPlotting tICA components...")
    tica_coordinates = np.concatenate(tica_coordinates)
    new_assignments = np.concatenate(new_assignments)
    cluster_colors = matplotlib.cm.rainbow(np.linspace(0, 1, n_clusters))
    for j in tqdm.tqdm_notebook(range(len(all_ticas)), leave=False):  # For each pair
        # plot each unordered pair only once
        if all_ticas[j][0] < all_ticas[j][1]:
            plt.figure(j, figsize=(20, 16))
            tICx, tICy = all_ticas[j][0] - 1, all_ticas[j][1] - 1
            plt.hexbin(tica_coordinates[:, tICx], tica_coordinates[:, tICy], bins='log')
            # overlay a sub-sample of the frames, colored by cluster
            for l in tqdm.tqdm(range(len(tica_coordinates))[::stride * 2]):
                if color_by == 'cluster':
                    plt.plot(tica_coordinates[l][tICx], tica_coordinates[l][tICy],
                             color=cluster_colors[new_assignments[l]], linestyle="", marker="o")
            x_centers = [center[tICx] for center in clusters.cluster_centers_]
            y_centers = [center[tICy] for center in clusters.cluster_centers_]
            high_pop_x_centers = [x_centers[i] for i in range(len(x_centers))
                                  if percentages[i] > cluster_percentage_cutoff]
            high_pop_y_centers = [y_centers[i] for i in range(len(y_centers))
                                  if percentages[i] > cluster_percentage_cutoff]
            plt.plot(x_centers, y_centers, color='y', linestyle="", marker="o")
            # black star marks the very first frame
            plt.plot(tica_coordinates[:, tICx][0], tica_coordinates[:, tICy][0],
                     color='k', marker='*', markersize=24)
            plt.xlabel('tIC' + str(all_ticas[j][0]))
            plt.ylabel('tIC' + str(all_ticas[j][1]))
            plt.title(project_title)
            # Add labels for high-population cluster centers
            for label, x, y in zip(population_labels, high_pop_x_centers, high_pop_y_centers):
                plt.annotate(
                    label,
                    xy=(x, y), xytext=(-15, 15),
                    textcoords='offset points', ha='right', va='bottom',
                    bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                    arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
            plt.savefig('%s/tica_%d_%d.png' % (plots_dir, all_ticas[j][0], all_ticas[j][1]))
            plt.close()

    # Write out PDBs for each cluster center
    if verbose:
        print("Performing cluster analytics and saving center PDBs...\n")
    if save_pdb:
        trajectory_files, feature_files, cluster_features = [], [], []
        for run in range(runs):
            run_features = sorted(glob.glob('%s/features/*R%d_*npy' % (project_title, run)))
            # get only xtc files that correlate to cluster-center features
            trajectory_files += [re.sub('features', 'traj_data/RUN%d' % run,
                                        re.sub('npy', 'xtc', x)) for x in run_features]
            feature_files += run_features

        # NOTE(review): `clusters.distances_` and `sequences` come from the
        # fit on the unbiased slice only, while `i` runs over all trajectory
        # files — confirm the indices line up.
        for i in tqdm.tqdm_notebook(range(len(trajectory_files)), leave=False):
            n_snapshots = len(clusters.distances_[i])

            # Determine frames that are cluster centers
            cluster_indices = np.arange(n_snapshots)[(clusters.distances_[i] < 1e-6)]
            # Determine number of each cluster, correlates to populations.dat
            cluster_labels = sequences[i][cluster_indices]

            # Save each cluster center as a pdb
            if len(cluster_indices) > 0:
                # load center-containing xtcs to check length
                xtc_len = len(md.load(trajectory_files[i], top=structure_file))
                # map strided frame number back to xtc frame number
                frames = range(xtc_len)
                strided_frames = frames[equil_steps:][::stride]
                traj_features = np.load(feature_files[i])
                for j in range(len(cluster_indices)):
                    xtc_frame = frames.index(strided_frames[cluster_indices[j]])
                    cluster_traj = md.load_frame(trajectory_files[i], xtc_frame, top=structure_file)
                    cluster_features.append(traj_features[cluster_indices[j]])
                    cluster_traj.save_pdb('%s/state_%d.pdb' % (clusters_dir, cluster_labels[j]))
                    # save cluster information
                    # NOTE(review): mode 'w' truncates the file for every
                    # center, so only the last center's record survives —
                    # confirm whether an append/once-opened log was intended.
                    with open('%s/cluster.dat' % out_dir, 'w') as f:
                        f.write('\nSuccessfully saved PDB for cluster: %d, (rel.pop: %.3f)'%(
                            cluster_labels[j],percentages[cluster_labels[j]]))
                        f.write('traj_file: %s (%d/%d)'%(trajectory_files[i],i,len(features)))
                        f.write('frame: %d (%d/%d centers from this trajectory)'%(
                            cluster_indices[j],j,len(cluster_indices)))
                        f.write('strided: npy_frame/npy_len = %d/%d = %f'%(
                            cluster_indices[j],n_snapshots,cluster_indices[j]/n_snapshots))
                        f.write('re-mapped: orig_frame/xtc_len = %d/%d = %f\n'%(
                            xtc_frame,xtc_len,xtc_frame/xtc_len))

        # save features corresponding to each cluster center
        np.save('%s/cluster_features.npy' % out_dir, cluster_features)

    return tica_score