import numpy as np
import scipy.spatial.distance
from mdtraj.testing import eq
from msmbuilder import libdistance
from msmbuilder.cluster import KCenters


def test_kcenters_7():
    # are fit_predict and fit().predict() consistent?
    trj = np.random.RandomState(0).randn(30, 2)
    l1 = KCenters(n_clusters=10, random_state=0).fit([trj]).predict([trj])[0]
    l2 = KCenters(n_clusters=10, random_state=0).fit_predict([trj])[0]
    eq(l1, l2)
def test_kcenters_3():
    # test for predict() using euclidean distance
    m = KCenters(n_clusters=10)
    data = np.random.randn(100, 2)
    labels1 = m.fit_predict([data])
    labels2 = m.predict([data])
    eq(labels1[0], labels2[0])

    all_pairs = scipy.spatial.distance.cdist(data, m.cluster_centers_)
    eq(labels2[0], np.argmin(all_pairs, axis=1))
def test_kcenters_4():
    # test for predict() using a non-euclidean distance. because of the
    # way the code is structured, this takes a different path
    model = KCenters(n_clusters=10, metric='cityblock')
    data = np.random.randn(100, 2)
    labels1 = model.fit_predict([data])
    labels2 = model.predict([data])
    eq(labels1[0], labels2[0])

    all_pairs = scipy.spatial.distance.cdist(data, model.cluster_centers_,
                                             metric='cityblock')
    eq(labels2[0], np.argmin(all_pairs, axis=1))
def test_kcenters_2():
    # some data at (0,0), some data at (1,1) and some data at (0.5, 0.5)
    data = [np.zeros((10, 2)), np.ones((10, 2)), 0.5 * np.ones((10, 2))]

    m = KCenters(n_clusters=2, random_state=0)
    m.fit(data)

    # the centers should be [0,0] and [1,1] (in either order). This
    # assumes that the random state seeded the initial center at
    # either (0,0) or (1,1). A different random state could have
    # seeded the first cluster at (0.5, 0.5)
    assert np.all(m.cluster_centers_ == np.array([[0, 0], [1, 1]])) or \
        np.all(m.cluster_centers_ == np.array([[1, 1], [0, 0]]))

    # the distances should be 0 or sqrt(2)/2
    eq(np.unique(np.concatenate(m.distances_)), np.array([0, np.sqrt(2) / 2]))
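# The comments above lean on how k-centers seeding works. A minimal NumPy
# sketch of the farthest-point (Gonzalez) heuristic that k-centers is based
# on, assuming euclidean distance (illustrative, not msmbuilder's own
# implementation):
import numpy as np


def kcenters_sketch(X, k, seed=0):
    rng = np.random.RandomState(seed)
    centers = [X[rng.randint(len(X))]]        # first center is a random point
    dist = np.linalg.norm(X - centers[0], axis=1)
    while len(centers) < k:
        far = np.argmax(dist)                 # farthest point becomes the next center
        centers.append(X[far])
        dist = np.minimum(dist, np.linalg.norm(X - X[far], axis=1))
    return np.array(centers), dist            # centers, point-to-nearest-center distances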
def test_kcenters_1():
    # make sure all the shapes of the fit parameters are correct
    m = KCenters(n_clusters=3)
    m.fit([np.random.randn(23, 2), np.random.randn(10, 2)])

    assert isinstance(m.labels_, list)
    assert isinstance(m.distances_, list)
    assert len(m.labels_) == 2
    eq(m.cluster_centers_.shape, (3, 2))
    eq(m.labels_[0].shape, (23,))
    eq(m.labels_[1].shape, (10,))
    eq(m.distances_[0].shape, (23,))
    eq(m.distances_[1].shape, (10,))

    eq(m.fit_predict([np.random.randn(10, 2)])[0].shape, (10,))
    assert np.all(np.logical_not(np.isnan(m.distances_[0])))
def cluster_features(features, clusterer, n_clusters=8):
    '''Cluster a featurized dataset.

    Input
    -----
    features : list of arrays, length n_trajs, each of shape (n_samples, n_features)
    clusterer : str, one of 'KMeans', 'KCenters', 'KMedoids',
        'MiniBatchKMeans', 'MiniBatchKMedoids'
    n_clusters : int, number of clusters

    Output
    ------
    clst : msmbuilder.cluster object, with attributes
        cluster_centers_ : (n_clusters, n_features)
        labels_ : list of arrays, each of shape (n_samples,)
    '''
    if clusterer == 'KMeans':
        from msmbuilder.cluster import KMeans
        clst = KMeans(n_clusters=n_clusters)
    elif clusterer == 'KCenters':
        from msmbuilder.cluster import KCenters
        clst = KCenters(n_clusters=n_clusters)
    elif clusterer == 'KMedoids':
        from msmbuilder.cluster import KMedoids
        clst = KMedoids(n_clusters=n_clusters)
    elif clusterer == 'MiniBatchKMeans':
        from msmbuilder.cluster import MiniBatchKMeans
        clst = MiniBatchKMeans(n_clusters=n_clusters)
    elif clusterer == 'MiniBatchKMedoids':
        from msmbuilder.cluster import MiniBatchKMedoids
        clst = MiniBatchKMedoids(n_clusters=n_clusters)
    else:
        raise ValueError('unrecognized clusterer: %s' % clusterer)

    clst.fit_transform(features)
    return clst
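# A minimal usage sketch for cluster_features, with two fake random
# "trajectories" standing in for real featurized data; shapes follow the
# docstring above (illustrative only).
import numpy as np

features = [np.random.randn(50, 4), np.random.randn(30, 4)]
clst = cluster_features(features, 'KCenters', n_clusters=5)
print(clst.cluster_centers_.shape)            # (5, 4)
print([lab.shape for lab in clst.labels_])    # [(50,), (30,)]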
def test_kcenters_8():
    # float32 and float64 inputs should give identical results
    X = np.random.RandomState(1).randn(100, 2)
    X32 = X.astype(np.float32)
    X64 = X.astype(np.float64)
    m1 = KCenters(n_clusters=10, random_state=0).fit([X32])
    m2 = KCenters(n_clusters=10, random_state=0).fit([X64])

    eq(m1.cluster_centers_, m2.cluster_centers_)
    eq(m1.distances_[0], m2.distances_[0])
    eq(m1.labels_[0], m2.labels_[0])
    assert np.all(np.logical_not(np.isnan(m1.distances_[0])))
    eq(m1.predict([X32])[0], m2.predict([X64])[0])
    eq(m1.predict([X32])[0], m1.labels_[0])
    eq(float(m1.inertia_),
       libdistance.assign_nearest(X32, m1.cluster_centers_, "euclidean")[1])
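# The last assertion above relies on libdistance.assign_nearest, which returns
# a (labels, inertia) pair. A pure-NumPy/SciPy sketch of the same bookkeeping,
# assuming inertia is the summed distance of each point to its nearest center
# (consistent with the test above; the function name here is illustrative):
import numpy as np
import scipy.spatial.distance


def nearest_center_check(X, centers):
    d = scipy.spatial.distance.cdist(X, centers)   # point-to-center distances
    labels = np.argmin(d, axis=1)                  # index of nearest center
    inertia = d[np.arange(len(X)), labels].sum()   # total distance to assigned centers
    return labels, inertia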
draw_tica_projection(resultdir, tica_trajs, 'tica_12.png', 1, 2)

# sample representative conformations along tIC1
print('now we are sampling representative conformations along tIC1')
plt.figure()
sampling_along_tIC(resultdir, 'samples_tic1.png', tica_trajs, trajectory_dir,
                   traj_list_array, pdb_name, 1)
print("You can use vmd to visualize the tica-dimension-tIC1.xtc file")

# In[158]:

# step 1.1: split the conformations into hundreds of microstates by running
# KCenters on the tIC subspace.
# input: tICA projections; output: assignments indicating which microstate
# each conformation belongs to
nMicro = 100  # specified a priori
kcenters = KCenters(n_clusters=nMicro, metric='euclidean', random_state=0)
microstate_sequences = kcenters.fit(tica_trajs)
print("microstate assignments:", microstate_sequences.labels_)
plt.figure()
plot_states_on_tic_space(resultdir, 'microstate.png', tica_trajs,
                         microstate_sequences.labels_, 1, 2)

# In[159]:

# plot the microstate implied timescales, which indicate how many macrostates
# we need
plt.figure()
lag_times = range(2, 50, 2)
msm_timescales = implied_timescales(microstate_sequences.labels_, lag_times,
                                    n_timescales=10,
import sys
import mdtraj as md
import numpy as np
from itertools import combinations
from msmbuilder.featurizer import AtomPairsFeaturizer
from msmbuilder.decomposition import tICA
from msmbuilder.cluster import KCenters
from sklearn.pipeline import Pipeline
from msmbuilder.example_datasets import fetch_met_enkephalin
from matplotlib import pyplot as plt
from sklearn.externals import joblib

# Featurization
t = md.load('conf.gro')
trajs = md.load('traj0.xtc', top='conf.gro')
#Ind = t.topology.select("(backbone and protein) or name 'CB'")
#trajs1 = trajs.atom_slice(Ind)
print("Preparation done, now begin clustering...")

# Cluster
kcenters = KCenters(n_clusters=25, metric='rmsd').fit(trajs)
traj2 = kcenters.cluster_centers_
traj2.save_pdb('Gens_total.pdb')

# save the first twelve centers as individual PDBs
for i in range(12):
    traj2[i].save_pdb('Gens%d.pdb' % i)

sys.exit()
           and atom.residue == ligand]
    inds_N.append(iis)
print(inds_N)

# sequences of coordinates of ligands
sequences_all = []
for this_sim in simulations:
    if use_COM:
        this_seq = util.featurize_RawPos(inds_all, this_sim, average=True)
    else:
        this_seq = util.featurize_RawPos(inds_N, this_sim)
    sequences_all.extend(this_seq)
seqfile = ('/home/shenglan/TryMSMbuilder/output/sequences'
           + '_s' + str(LOAD_STRIDE) + '.out')
pickle.dump(sequences_all, open(seqfile, 'wb'))

KC_clustering = KCenters(n_clusters=N_CLUSTER)
KC_assignments = KC_clustering.fit_predict(sequences_all)
KC_centers = KC_clustering.cluster_centers_

KM_clustering = KMeans(n_clusters=N_CLUSTER)  # k-means, for comparison with k-centers
KM_assignments = KM_clustering.fit_predict(sequences_all)
KM_centers = KM_clustering.cluster_centers_

KC_output_file = ('/home/shenglan/TryMSMbuilder/output/KC_centers_c'
                  + str(N_CLUSTER) + '_s' + str(LOAD_STRIDE) + '.out')
KM_output_file = ('/home/shenglan/TryMSMbuilder/output/KM_centers_c'
                  + str(N_CLUSTER) + '_s' + str(LOAD_STRIDE) + '.out')
np.savetxt(KC_output_file, KC_centers, fmt='%10.4g')
np.savetxt(KM_output_file, KM_centers, fmt='%10.4g')

KC_assign_file = ('/home/shenglan/TryMSMbuilder/output/KC_assign_'
                  + str(N_CLUSTER) + '_s' + str(LOAD_STRIDE) + '.out')
KM_assign_file = ('/home/shenglan/TryMSMbuilder/output/KM_assign_'
                  + str(N_CLUSTER) + '_s' + str(LOAD_STRIDE) + '.out')
pickle.dump(KC_assignments, open(KC_assign_file, 'wb'))
        this_lig.extend(md_dist.compute_distances(this_traj, [this_atom_pair]))
    distances.append(this_lig)

dist_path = ('/home/shenglan/TryMSMbuilder/output/ten_ligands/dist_to_binding'
             + '_s' + str(LOAD_STRIDE) + '.out')
pickle.dump(distances, open(dist_path, 'wb'))

# get N positions
sequences_all = []
for this_sim in simulations:
    this_seq = util.featurize_RawPos(inds_N, [this_sim])
    sequences_all.extend(this_seq)
seq_path = ('/home/shenglan/TryMSMbuilder/output/ten_ligands/sequences'
            + '_s' + str(LOAD_STRIDE) + '.out')
pickle.dump(sequences_all, open(seq_path, 'wb'))

clustering = KCenters(n_clusters=N_CLUSTER)
geo_assign = clustering.fit_predict(sequences_all)
centers = clustering.cluster_centers_
geo_assign_path = ('/home/shenglan/TryMSMbuilder/output/ten_ligands/KC_geoassign_c'
                   + str(N_CLUSTER) + '_s' + str(LOAD_STRIDE) + '.out')
pickle.dump(geo_assign, open(geo_assign_path, 'wb'))

micro_msm = MarkovStateModel(lag_time=1, reversible_type='transpose',
                             ergodic_cutoff='off', verbose=True).fit(geo_assign)
msm_path = ('/home/shenglan/TryMSMbuilder/output/ten_ligands/KC_msm_c'
            + str(N_CLUSTER) + '_s' + str(LOAD_STRIDE) + '.out')
pickle.dump(micro_msm, open(msm_path, 'wb'))
def calculate_tica_components():
    print("Calculating tICA components...")
    in_files = glob.glob("out*npy")
    loaded_files = [np.load(filename) for filename in in_files]
    tica = tICA(lag_time=tica_lagtime,
                n_components=int(tica_components)).fit_transform(loaded_files)
    np.save('lag_%d_comp_%d.npy' % (tica_lagtime, tica_components), tica)
    tica_data = 'data_lag_%d_comp_%d' % (tica_lagtime, tica_components)
    joblib.dump(tica, tica_data)
    data = np.load('lag_%d_comp_%d.npy' % (tica_lagtime, tica_components))

    # extract the first four tICA components for each frame
    tics = [[], [], [], []]
    for i in range(len(glob.glob('out*npy'))):
        for j in range(len(data[i])):
            for k in range(4):
                tics[k].append(data[i][j][k])

    # Clustering
    if cluster_method == 'kcenters':
        print("Clustering via KCenters...")
        clusters = KCenters(n_clusters)
    elif cluster_method == 'kmeans':
        print("Clustering via KMeans...")
        clusters = KMeans(n_clusters)
    else:
        sys.exit("Invalid cluster_method. Use kmeans or kcenters.")

    sequences = clusters.fit_transform(tica)
    np.save('lag_%d_clusters_%d_sequences.npy' % (tica_lagtime, n_clusters), sequences)
    np.save('lag_%d_clusters_%d_center.npy' % (tica_lagtime, n_clusters),
            clusters.cluster_centers_)
    cluster_data = 'lag_%d_clusters_%d.pkl' % (tica_lagtime, n_clusters)
    joblib.dump(sequences, cluster_data)

    # Determining cluster populations
    print("Determining cluster populations...")
    counts = np.array([len(np.where(np.concatenate(sequences) == i)[0])
                       for i in range(n_clusters)])  # frames in each cluster
    normalized_counts = counts / float(counts.sum())
    percentages = [i * 100 for i in normalized_counts]
    np.savetxt('populations.dat', normalized_counts)

    # Plotting each pair of tICA components, with cluster centers and a
    # percentage label on each cluster
    print("Plotting tICA components with cluster centers...")
    for fig_num, (a, b) in enumerate([(1, 2), (1, 3), (2, 3)]):
        plt.figure(fig_num)
        plt.hexbin(tics[a - 1], tics[b - 1], bins='log')  # cmap=cmaps.viridis
        x_centers = [center[a - 1] for center in clusters.cluster_centers_]
        y_centers = [center[b - 1] for center in clusters.cluster_centers_]
        plt.plot(x_centers, y_centers, 'wo')
        # add the percentage contribution of each cluster
        for label, x, y in zip(["%.4f" % i for i in percentages],
                               x_centers, y_centers):
            plt.annotate(label, xy=(x, y), xytext=(-20, 20),
                         textcoords='offset points', ha='right', va='bottom',
                         bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                         arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
        plt.savefig('tica_%d_%d.png' % (a, b))

    # Determining cluster entropy (np.savetxt needs a 1-d array, so wrap the
    # scalar before saving)
    # cluster_entropy = (-1.0 * normalized_counts * np.log(normalized_counts)).sum()
    # np.savetxt('cluster_entropy.dat', [cluster_entropy])

    # Writing out PDBs for cluster centers
    print("Performing cluster analytics and saving center PDBs...\n")
    for i in range(len(glob.glob("traj*xtc"))):
        n_snapshots = len(clusters.distances_[i])
        # frames whose distance to their center is ~0 are the centers themselves
        cluster_indices = np.arange(n_snapshots)[clusters.distances_[i] < 1e-6]
        cluster_labels = sequences[i][cluster_indices]
        if cluster_indices.size != 0:
            # report only the trajectories that contain cluster centers
            for j in range(len(cluster_labels)):
                print('Cluster center', cluster_labels[j],
                      'was found in trajectory', str(i) + '.')
                print('It is found on frame', cluster_indices[j],
                      'and has a relative population of',
                      "%.4f" % percentages[cluster_labels[j]], '%.')
            xtcfile = sorted(glob.glob("traj*xtc"))[i]
            for j in range(len(cluster_indices)):
                # save a snapshot of each center
                cluster_traj = md.load_frame(xtcfile, cluster_indices[j],
                                             top='structure.gro')
                cluster_traj.save_pdb('state_%d.pdb' % (cluster_labels[j] + 1))

    # Calculating implied timescales
    print("\nCalculating Implied Timescales...")
    timescales = implied_timescales(sequences, lagtimes,
                                    n_timescales=n_timescales,
                                    msm=MarkovStateModel(verbose=False))
    implied_timescale_data = 'ipt_lag_%d_clusters_%d.pkl' % (tica_lagtime, n_clusters)
    joblib.dump(timescales, implied_timescale_data)
    numpy_timescale_data = 'lag_%d_clusters_%d_timescales.npy' % (tica_lagtime, n_clusters)
    np.savetxt('lagtimes.txt', lagtimes)
    np.save(numpy_timescale_data, timescales)

    # Plotting implied timescales (lagtimes vs. timescales)
    print("Plotting Implied Timescales...")
    plt.figure(42)
    for i in range(n_timescales):
        plt.plot(lagtimes * time_step, timescales[:, i] * time_step, 'o-')
    plt.yscale('log')
    plt.xlabel('lagtime (ns)')
    plt.ylabel('Implied timescales (ns)')
    plt.savefig('lag_%d_clusters_%d_.png' % (tica_lagtime, n_clusters))
lines = f.read()
f.close()
round_num = int(lines)

## Construct and save the dataframe
parser = NumberedRunsParser(
    traj_fmt="trj-{run}.xtc",
    top_fn="/scratch/jap12009/msm/fast/try1/frame0nw_startingAPO.pdb",
    step_ps=240,
)
meta = gather_metadata("/scratch/jap12009/msm/fast/try1/trj/trj-*.xtc", parser)
save_meta(meta)

## Set up parameters for clustering
kcen = KCenters(
    n_clusters=num_clusters,
    metric='rmsd',
)

## Try to limit RAM usage
def guestimate_stride():
    total_data = meta['nframes'].sum()
    want = kcen.n_clusters * 20
    stride = max(1, total_data // want)
    print("Since we have", total_data, "frames, we're going to stride by",
          stride, "during fitting, because this is probably adequate for",
          kcen.n_clusters, "clusters")
    return stride

## Fit
kcen.fit([traj for _, traj in itertrajs(meta, stride=guestimate_stride())])
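## After fitting on a strided subset, the full dataset still needs labels.
## A hedged sketch of that follow-up step (not part of the original script):
## it reuses itertrajs from above and the predict() API shown in the tests,
## and `full_labels` is a hypothetical name.
full_labels = [kcen.predict([traj])[0] for _, traj in itertrajs(meta)]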
# sequences of coordinates of ligand aromatic ring and Asp113
sequences_all = []
for this_sim in simulations:
    this_seq = util.featurize_RawPos(inds, this_sim)
    sequences_all.extend(this_seq)
#print(len(sequences_all))
#print(sequences_all[-1].shape)

# average position of Asp113
#res_pos_ave = np.mean(res_pos_A_1[0], axis=0)

#time_step = util.calc_time_step(times_path, stride=LOAD_STRIDE)

clustering = KCenters(n_clusters=10)
assignments = clustering.fit_predict(sequences_all)
centers = clustering.cluster_centers_
#print(len(assignments))
#print(assignments[1].shape)

msm = MarkovStateModel(lag_time=180, verbose=True).fit(assignments)
countsmat = msm.countsmat_
transmat = msm.transmat_
#print(np.sum(countsmat))

#np.savetxt('/home/shenglan/TryMSMbuilder/output/assignments.out', assignments, fmt='%3.0f')
np.savetxt('/home/shenglan/TryMSMbuilder/output/countsmat.out', countsmat, fmt='%8.4g')
np.savetxt('/home/shenglan/TryMSMbuilder/output/transmat.out', transmat, fmt='%10.4g')
show()

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~           MARKOV STATE MODEL            ~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

msmts0, msmts1 = {}, {}
lag_times = [1, 10, 20, 30, 40]
n_states = [4, 8, 16, 32, 64]

for n in n_states:
    msmts0[n] = []
    msmts1[n] = []
    for lag_time in lag_times:
        assignments = KCenters(n_clusters=n).fit_predict(sequences)
        msm = MarkovStateModel(lag_time=lag_time, verbose=False).fit(assignments)
        timescales = msm.timescales_
        msmts0[n].append(timescales[0])
        msmts1[n].append(timescales[1])
        print('n_states=%d\tlag_time=%d\ttimescales=%s' % (n, lag_time, timescales[0:2]))
    print()

figure(figsize=(14, 3))
for i, n in enumerate(n_states):
    subplot(1, len(n_states), 1 + i)
    plot(lag_times, msmts0[n])
    plot(lag_times, msmts1[n])
                          (outputdir, line))
    temp = temp[:, 0:num_tics_for_clustering]
    tica_sequences.append(temp)

TS2_ticproj_list_array = []
tica_TS2_sequences = []
for line in open("ticproj_TS2"):
    TS2_ticproj_list_array.append(line.strip())
    temp1 = numpy.loadtxt("%s/TS2_project_onto_GS_tics/%s_ticproj.txt" %
                          (outputdir, line.strip()))
    temp1 = temp1[:, 0:num_tics_for_clustering]
    tica_TS2_sequences.append(temp1)

tmp_counter = 0
kcenters = KCenters(n_clusters=nMicro)
#kcenters = KCenters(n_clusters=num_tics_for_clustering)  # Fr :)
kcenters_sequences = kcenters.fit_predict(tica_sequences)  # ground-state tICA sequences

print("plotting the microstate implied timescales into the target directory")
# plot implied timescales
lag_times = range(10, 100, 10)  # adjust as needed
n_timescales = 5  # adjust as needed
msm_timescales = implied_timescales(kcenters_sequences, lag_times,
                                    n_timescales=n_timescales,
        (fold, tica_correlation_time),
        train_data_projection, test_data_projection, 1, 2)
    plt.figure()
    draw_tica_projection_cross_validation(
        sub_resultdir,
        'Fold_%d_tica_lagtime_%d_train_data_proj_tIC13.png' %
        (fold, tica_correlation_time),
        train_data_projection, test_data_projection, 1, 3)

    for n_tics in n_tics_range:
        for n_Micro in n_Micro_range:
            print("parameters: fold-", fold,
                  ', tica_lagtime-', tica_correlation_time,
                  ', n_tics-', n_tics, ', n_Micro-', n_Micro)
            kcenters = KCenters(n_clusters=n_Micro, metric='euclidean',
                                random_state=0)
            kcenters.fit(train_data_projection)
            train_data_sequence = kcenters.predict(train_data_projection)
            test_data_sequence = kcenters.predict(test_data_projection)
            msm = MarkovStateModel(n_timescales=3, lag_time=100,
                                   reversible_type='transpose', verbose=False,
                                   sliding_window=True,
                                   ergodic_cutoff='on')  # the parameters may change
            msm.fit(train_data_sequence)
            train_score = msm.score(train_data_sequence)
            test_score = msm.score(test_data_sequence)
def test_kcenters_5():
    # euclidean and squared-euclidean metrics should produce identical
    # assignments: squaring is monotone, so it preserves which center
    # is nearest
    model1 = KCenters(n_clusters=10, random_state=0, metric='euclidean')
    model2 = KCenters(n_clusters=10, random_state=0, metric='sqeuclidean')

    data = np.random.RandomState(0).randn(100, 2)
    eq(model1.fit_predict([data])[0], model2.fit_predict([data])[0])
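# A standalone illustration of why that holds (pure NumPy/SciPy, not part of
# the test suite): the argmin over distances is unchanged by squaring.
import numpy as np
import scipy.spatial.distance

X = np.random.RandomState(0).randn(100, 2)
centers = X[:10]  # any fixed set of candidate centers

d = scipy.spatial.distance.cdist(X, centers, metric='euclidean')
d2 = scipy.spatial.distance.cdist(X, centers, metric='sqeuclidean')

# squaring distances never changes which center is closest
assert (np.argmin(d, axis=1) == np.argmin(d2, axis=1)).all()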
traj_list_array = []
for line in open("trajlist"):
    traj_list_array.append(line.strip())
print(traj_list_array)

dataset = []
for trajfile in traj_list_array:
    t = md.load(xtc_file_dir + trajfile, top='test.pdb',
                atom_indices=select_atoms)
    dataset.append(t)
print(dataset)
# ww: check whether they have been aligned w.r.t. the reference

kcenters = KCenters(n_clusters=nMicro, metric='rmsd', random_state=0)
kcenters_sequences = kcenters.fit(dataset)

out_assignment_dir = 'Microassignment/'
out_kcenters_distances_dir = 'distances/'
os.system("mkdir %s" % (out_assignment_dir))
os.system("mkdir %s" % (out_kcenters_distances_dir))

tmp_counter = 0
for ifile in traj_list_array:
    numpy.savetxt("%s/%s_assignment_.txt" % (out_assignment_dir, ifile[:-4]),
                  kcenters.labels_[tmp_counter], fmt='%d')
    numpy.savetxt("%s/%s_distances_.txt" % (out_kcenters_distances_dir, ifile[:-4]),
pp.plot(lag_times, msm_timescales[:, 0], 'o-')
pp.plot(lag_times, msm_timescales[:, 1], 'o-')
pp.plot(lag_times, msm_timescales[:, 2], 'o-')
pp.title('Discrete-time MSM Relaxation Timescales')
pp.semilogy()
pp.show()

#ctmsm_timescales = implied_timescales(kmeanslabel, lag_times, n_timescales=n_timescales, msm=ContinuousTimeMSM(verbose=False))
#X_scaled = preprocessing.normalize(npdata_filtered)
#sequences2 = list(np.transpose(np.reshape(npdata_filtered2[:, 3].astype(int), (-1, 1))))

#####  K-Centers Clustering  #####
##################################
cluster = KCenters(metric='euclidean', n_clusters=4)
#sequences = cluster.fit_transform(seq)
#for item in sequences:
#    print(item)
'''
kmeans = KMeans(n_clusters=4, random_state=0).fit_transform(npdata_filtered)
# states from kmeans
kmeanslabel = kmeans.labels_.tolist()

######## Time scale calculations
lag_times = list(range(1, 100, 2))
n_timescales = 10
msm_timescales = implied_timescales(sequences, lag_times,
                                    n_timescales=n_timescales,
                                    msm=MarkovStateModel(verbose=False))
def compute_tica_components():
    '''Load in the features, calculate a given number of tICA components
    (tica_components) given a lagtime (lag_time), and save tICA coordinates
    and eigenvector data. It then creates and populates a list for each
    desired component, clusters the data, saving normalized populations as
    populations.dat and saving each cluster center as a .pdb. tICA plots are
    created and saved, and implied timescales are calculated, saved, and
    plotted.
    '''
    verbose = False
    save_pdb = True
    color_by = 'cluster'
    if verbose:
        print("\nCalculating tICA components...")
    if not os.path.exists(project_title + '/tica_%d' % n_clusters):
        os.mkdir(project_title + '/tica_%d' % n_clusters)

    # load in feature files and determine indices of unbiased ensembles
    feature_files = []
    for i in range(runs):
        run_files = sorted(glob.glob(project_title + '/features/' + "P*R%d_*npy" % i))
        feature_files += run_files
        if i in unbiased_runs:
            unbiased_indices = [len(feature_files) - len(run_files), len(feature_files)]
    features = [np.load(x) for x in feature_files]

    # perform tICA calculation and extract score / eigenvectors
    tica_coordinates = tICA(lag_time=tica_lagtime,
                            n_components=int(n_components)).fit_transform(features)
    tica_components = tICA(lag_time=tica_lagtime,
                           n_components=int(n_components)).fit(features)
    eigenvectors = np.transpose(tica_components.eigenvectors_)
    tica_score = tica_components.score(features)
    np.save('%s/tica_%d/tica_coords-lag_%d-comp_%d.npy' % (
        project_title, n_clusters, tica_lagtime, n_components), tica_coordinates)
    np.save('%s/tica_%d/tica_comps-lag_%d-comp_%d.npy' % (
        project_title, n_clusters, tica_lagtime, n_components), tica_components)

    # Perform clustering based on the cluster_method parameter.
    if verbose:
        print('Clustering via %s' % cluster_method)
    if cluster_method == 'kcenters':
        clusters = KCenters(n_clusters)
    elif cluster_method == 'kmeans':
        clusters = KMeans(n_clusters)
    elif cluster_method == 'kmedoids':
        clusters = KMedoids(n_clusters)
    else:
        sys.exit('Invalid cluster_method. Use kcenters/kmeans/kmedoids.')

    # Cluster unbiased data and fit biased data to these centers
    new_assignments = []
    sequences = clusters.fit_transform(
        tica_coordinates[unbiased_indices[0]:unbiased_indices[1]])
    for i in tqdm.tqdm_notebook(range(unbiased_indices[0])):
        tica_traj = tica_coordinates[i]
        if isinstance(tica_traj, np.ndarray):
            if not (tica_traj.dtype == 'float32' or tica_traj.dtype == 'float64'):
                tica_traj = tica_traj.astype('float64')
        labels, inertia = msmbuilder.libdistance.assign_nearest(
            tica_traj, clusters.cluster_centers_, metric='euclidean')
        new_assignments.append(labels)
    new_assignments += sequences  # tack the unbiased assignments back on the end
    np.save('%s/tica_%d/lag_%d_clusters_%d_assignments.npy' % (
        project_title, n_clusters, tica_lagtime, n_clusters), new_assignments)
    np.save('%s/tica_%d/lag_%d_clusters_%d_center.npy' % (
        project_title, n_clusters, tica_lagtime, n_clusters),
        clusters.cluster_centers_)

    # Determine cluster populations, normalize the counts, and save as
    # percentages for labeling if a cluster contains more than
    # cluster_percentage_cutoff percent of the data. Finally, save the
    # normalized counts.
    if verbose:
        print("\nDetermining cluster populations...")
    if not os.path.exists('%s/tica_%d/%s_clusters' % (project_title, n_clusters, cluster_method)):
        os.mkdir('%s/tica_%d/%s_clusters' % (project_title, n_clusters, cluster_method))
    if not os.path.exists('%s/tica_%d/plots' % (project_title, n_clusters)):
        os.mkdir('%s/tica_%d/plots' % (project_title, n_clusters))
    counts = np.array([len(np.where(np.concatenate(sequences) == i)[0])
                       for i in range(n_clusters)])
    normalized_counts = counts / float(counts.sum())
    percentages = [i * 100 for i in normalized_counts]
    population_labels = [[i, "%.2f" % percentages[i]]
                         for i in range(len(percentages))
                         if percentages[i] > cluster_percentage_cutoff]
    np.savetxt('%s/tica_%d/%s_clusters/populations.dat'
               % (project_title, n_clusters, cluster_method), normalized_counts)

    # Plot all unique combinations of tICA components
    if verbose:
        print("\nPlotting tICA components...")
    tica_coordinates = np.concatenate(tica_coordinates)
    new_assignments = np.concatenate(new_assignments)
    cluster_colors = matplotlib.cm.rainbow(np.linspace(0, 1, n_clusters))
    for j in tqdm.tqdm_notebook(range(len(all_ticas)), leave=False):  # for each pair
        if all_ticas[j][0] < all_ticas[j][1]:
            plt.figure(j, figsize=(20, 16))
            tICx, tICy = all_ticas[j][0] - 1, all_ticas[j][1] - 1
            plt.hexbin(tica_coordinates[:, tICx], tica_coordinates[:, tICy], bins='log')
            for l in tqdm.tqdm(range(len(tica_coordinates))[::stride * 2]):
                if color_by == 'cluster':
                    plt.plot(tica_coordinates[l][tICx], tica_coordinates[l][tICy],
                             color=cluster_colors[new_assignments[l]],
                             linestyle="", marker="o")
            x_centers = [c[tICx] for c in clusters.cluster_centers_]
            y_centers = [c[tICy] for c in clusters.cluster_centers_]
            high_pop_x_centers = [x_centers[i] for i in range(len(x_centers))
                                  if percentages[i] > cluster_percentage_cutoff]
            high_pop_y_centers = [y_centers[i] for i in range(len(y_centers))
                                  if percentages[i] > cluster_percentage_cutoff]
            plt.plot(x_centers, y_centers, color='y', linestyle="", marker="o")
            plt.plot(tica_coordinates[:, tICx][0], tica_coordinates[:, tICy][0],
                     color='k', marker='*', markersize=24)
            plt.xlabel('tIC' + str(all_ticas[j][0]))
            plt.ylabel('tIC' + str(all_ticas[j][1]))
            plt.title(project_title)
            # Add labels for high-population cluster centers
            for label, x, y in zip(population_labels, high_pop_x_centers,
                                   high_pop_y_centers):
                plt.annotate(label, xy=(x, y), xytext=(-15, 15),
                             textcoords='offset points', ha='right', va='bottom',
                             bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                             arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
            plt.savefig('%s/tica_%d/plots/tica_%d_%d.png' % (
                project_title, n_clusters, all_ticas[j][0], all_ticas[j][1]))
            plt.close()

    # Write out PDBs for each cluster center
    if verbose:
        print("Performing cluster analytics and saving center PDBs...\n")
    if save_pdb:
        trajectory_files, feature_files, cluster_features = [], [], []
        for run in range(runs):
            # get only xtc files that correlate to cluster-center features
            trajectory_files += [re.sub('features', 'traj_data/RUN%d' % run,
                                        re.sub('npy', 'xtc', x))
                                 for x in sorted(glob.glob('%s/features/*R%d_*npy' % (
                                     project_title, run)))]
            feature_files += sorted(glob.glob('%s/features/*R%d_*npy' % (project_title, run)))
        for i in tqdm.tqdm_notebook(range(len(trajectory_files)), leave=False):
            n_snapshots = len(clusters.distances_[i])
            # Determine frames that are cluster centers
            cluster_indices = np.arange(n_snapshots)[clusters.distances_[i] < 1e-6]
            # Determine the label of each center; correlates to populations.dat
            cluster_labels = sequences[i][cluster_indices]
            # Save each cluster center as a pdb
            if list(cluster_indices):
                # load center-containing xtcs to check length
                xtc_len = len(md.load(trajectory_files[i], top=structure_file))
                # map strided frame numbers back to xtc frame numbers
                for j in range(len(cluster_indices)):
                    frames = range(xtc_len)
                    strided_frames = frames[equil_steps:][::stride]
                    xtc_frame = frames.index(strided_frames[cluster_indices[j]])
                    cluster_traj = md.load_frame(trajectory_files[i], xtc_frame,
                                                 top=structure_file)
                    cluster_features.append(np.load(feature_files[i])[cluster_indices[j]])
                    cluster_traj.save_pdb('%s/tica_%d/%s_clusters/state_%d.pdb'
                                          % (project_title, n_clusters,
                                             cluster_method, cluster_labels[j]))
                    # append cluster information ('a' so earlier entries survive)
                    with open('%s/tica_%d/cluster.dat' % (project_title, n_clusters), 'a') as f:
                        f.write('\nSuccessfully saved PDB for cluster: %d, (rel.pop: %.3f)' % (
                            cluster_labels[j], percentages[cluster_labels[j]]))
                        f.write('traj_file: %s (%d/%d)' % (
                            trajectory_files[i], i, len(features)))
                        f.write('frame: %d (%d/%d centers from this trajectory)' % (
                            cluster_indices[j], j, len(cluster_indices)))
                        f.write('strided: npy_frame/npy_len = %d/%d = %f' % (
                            cluster_indices[j], n_snapshots,
                            cluster_indices[j] / n_snapshots))
                        f.write('re-mapped: orig_frame/xtc_len = %d/%d = %f\n' % (
                            xtc_frame, xtc_len, xtc_frame / xtc_len))

        # save features corresponding to each cluster center
        np.save('%s/tica_%d/cluster_features.npy' % (project_title, n_clusters),
                cluster_features)

    return tica_score
def calculate_tica_components(self, cluster_method, calculate_strides=False, feats=None):
    '''Load in the features, calculate a given number of tICA components
    (tica_components) given a lagtime (lag_time), and save tICA coordinates
    and eigenvector data. It then creates and populates a list for each
    desired component, clusters the data, saving normalized populations as
    populations.dat and saving each cluster center as a .pdb. tICA plots are
    created and saved, and implied timescales are calculated, saved, and
    plotted.
    '''
    # tICA parameters
    tica_lagtime = 10        # determine from implied timescales
    tica_components = 8      # how many tICs to compute
    n_clusters = 100         # number of microstates
    n_timescales = tica_components  # plot all eigenvalues --> timescales
    md_time_step = 0.02      # ns
    subsampled_time_step = 1.  # ns; multiplies timescales and lagtimes in the implied-timescale plot
    stride = int(subsampled_time_step / md_time_step)  # time-step stride for sub-sampling
    equil_time = 1.          # ns
    equil_steps = 1          # int(equil_time / md_time_step) time steps removed from the start
    lagtimes = np.array([1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024])
    cluster_method = 'kcenters'  # 'kcenters' or 'kmeans'
    all_ticas = list(itertools.permutations(range(1, tica_components + 1), 2))  # all combinations
    all_ticas = [[1, 2]]     # override: just show analysis for the first two components
    cluster_percentage_cutoff = 5  # clusters with a relative population below this are not
                                   # labeled on the plot; 0 means all clusters are labeled
    verbose = False

    print("\nCalculating tICA components...")

    # Load in feature files   THIS WILL NEED TO BE CHANGED
    if feats is None:
        if calculate_strides:
            self.calculate_stride_distances(stride, equil_steps)
            data = np.load('/home/server/git/fah-scripts/DataAnalysisScripts/stride_dist/stride_dist_%d.npy' % self.proj_num)
        else:
            data = self.data
    else:
        data = np.load(feats)
    features = []
    for run in data:
        for clone in run:
            gen_seq = []
            for gen in clone:
                if gen is not None and gen[0] is not None:
                    if calculate_strides or feats is not None:
                        gen_seq.append(gen)
                    else:
                        gen_seq.append(gen[::stride])
            if len(gen_seq) > 0:
                gen_cat = np.concatenate(gen_seq)
                if calculate_strides:
                    features.append(gen_cat)
                else:
                    features.append(gen_cat[equil_steps:])
    features = np.asarray(features)
    print(features.shape)
    print(features[0].shape)

    tica_coordinates = tICA(lag_time=tica_lagtime,
                            n_components=int(tica_components)).fit_transform(features)
    np.save('%s/lag_%d_coord_%d.npy' % (self.tICA_dir, tica_lagtime, tica_components),
            tica_coordinates)

    # Populate one flat list per tICA component
    tics = [[] for _ in range(tica_components)]
    for i in tqdm.tqdm(range(len(features))):
        for j in range(len(tica_coordinates[i])):
            for k in range(tica_components):
                tics[k].append(tica_coordinates[i][j][k])

    # Perform clustering based on the cluster_method parameter.
    if cluster_method == 'kcenters':
        print("Clustering via KCenters...")
        clusters = KCenters(n_clusters)
    elif cluster_method == 'kmeans':
        print("Clustering via KMeans...")
        clusters = KMeans(n_clusters)
    else:
        sys.exit("Invalid cluster_method. Use kmeans or kcenters.")

    # Determine the cluster assignment of each frame.
    sequences = clusters.fit_transform(tica_coordinates)
    np.save('%s/lag_%d_clusters_%d_sequences.npy' % (self.tICA_dir, tica_lagtime, n_clusters),
            sequences)
    np.save('%s/lag_%d_clusters_%d_center.npy' % (self.tICA_dir, tica_lagtime, n_clusters),
            clusters.cluster_centers_)

    # Determine cluster populations, normalize the counts, and save as
    # percentages for labeling if a cluster contains more than
    # cluster_percentage_cutoff percent of the data. Finally, save the
    # normalized counts.
    print("\nDetermining cluster populations...")
    if not os.path.exists('%s/cluster_centers' % self.tICA_dir):
        os.makedirs('%s/cluster_centers' % self.tICA_dir)
    counts = np.array([len(np.where(np.concatenate(sequences) == i)[0])
                       for i in range(n_clusters)])
    normalized_counts = counts / float(counts.sum())
    percentages = [i * 100 for i in normalized_counts]
    population_labels = [[i, "%.2f" % percentages[i]]
                         for i in range(len(percentages))
                         if percentages[i] > cluster_percentage_cutoff]
    np.savetxt('%s/cluster_centers/populations.dat' % self.tICA_dir, normalized_counts)

    # Plot all unique combinations of tICA components
    print("\nPlotting tICA components with cluster centers...")
    all_ticas = list(itertools.permutations(range(1, tica_components + 1), 2))
    for j in tqdm.tqdm(range(len(all_ticas))):  # for each pair
        if all_ticas[j][0] < all_ticas[j][1]:
            tICx, tICy = all_ticas[j][0] - 1, all_ticas[j][1] - 1
            plt.figure(j, figsize=(20, 16))
            plt.hexbin(tics[tICx], tics[tICy], bins='log')
            x_centers = [c[tICx] for c in clusters.cluster_centers_]
            y_centers = [c[tICy] for c in clusters.cluster_centers_]
            high_pop_x_centers = [x_centers[i] for i in range(len(x_centers))
                                  if percentages[i] > cluster_percentage_cutoff]
            high_pop_y_centers = [y_centers[i] for i in range(len(y_centers))
                                  if percentages[i] > cluster_percentage_cutoff]
            plt.plot(x_centers, y_centers, color='y', linestyle="", marker="o")
            plt.plot(tics[tICx][0], tics[tICy][0], color='k', marker='*', markersize=24)
            plt.xlabel('tic' + str(all_ticas[j][0]))
            plt.ylabel('tic' + str(all_ticas[j][1]))
            plt.title(self.proj_num)
            # Add labels for high-population cluster centers
            for label, x, y in zip(population_labels, high_pop_x_centers,
                                   high_pop_y_centers):
                plt.annotate(label, xy=(x, y), xytext=(-15, 15),
                             textcoords='offset points', ha='right', va='bottom',
                             bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                             arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
            plt.savefig('%s/tica_%d_%d.png' % (self.tICA_dir,
                                               all_ticas[j][0], all_ticas[j][1]))
            plt.close()

    ###########################################################################
    for filename in os.listdir(self.tICA_dir + '/cluster_centers'):
        if filename.endswith('.pdb'):
            os.remove(self.tICA_dir + '/cluster_centers/' + filename)

    # Write out PDBs for each cluster center
    print("Performing cluster analytics and saving center PDBs...\n")
    runs, clones, gens = data.shape[0], data.shape[1], data.shape[2]
    x, y, z = 0, 0, 0
    for i in range(len(features)):
        if i % clones == 0 and i != 0:
            x += 1
        if i % gens == 0:
            y = 0
        n_snapshots = len(clusters.distances_[i])
        # Determine frames that are cluster centers
        cluster_indices = np.arange(n_snapshots)[clusters.distances_[i] < 1e-6]
        # Determine the label of each center; correlates to populations.dat
        cluster_labels = sequences[i][cluster_indices]
        # Save each cluster center as a pdb
        if list(cluster_indices):
            # load center-containing xtcs to check length
            traj_cat = []
            print('x: %d, y: %d, z: %d' % (x, y, z))
            while True:
                try:
                    traj = base_dir + 'PROJ%s/RUN%s/CLONE%s/results%s/traj_comp.xtc' % (
                        self.proj_num, x, y, z)
                    traj_cat.append(md.load(traj, top=self.gro_file))
                    z += 1
                except Exception:
                    break
            if len(traj_cat) > 0:
                trajectory_file = md.join(traj_cat)
                xtc_len = len(trajectory_file)
            y += 1
            z = 0
            for j in range(len(cluster_indices)):
                frames = range(xtc_len)
                # map the strided frame number back to the xtc frame number
                strided_frames = frames[equil_steps:][::stride]
                xtc_frame = frames.index(strided_frames[cluster_indices[j]])
                cluster_traj = trajectory_file[xtc_frame]
                cluster_traj.save_pdb('%s/cluster_centers/state_%d_%.3f.pdb' % (
                    self.tICA_dir, cluster_labels[j], percentages[cluster_labels[j]]))
                if verbose:
                    print('Successfully saved PDB for cluster: %d, (rel.pop: %.3f)'
                          % (cluster_labels[j], percentages[cluster_labels[j]]))
                    print('traj_file: %s (%d/%d)' % (trajectory_file, i, len(features)))
                    print('frame: %d (%d/%d centers from this trajectory)'
                          % (cluster_indices[j], j, len(cluster_indices)))
                    print('strided: npy_frame/npy_len = %d/%d = %f'
                          % (cluster_indices[j], n_snapshots,
                             cluster_indices[j] / n_snapshots))
                    print('re-mapped: orig_frame/xtc_len = %d/%d = %f\n'
                          % (xtc_frame, xtc_len, xtc_frame / xtc_len))
import sys
import numpy as np
from matplotlib import pyplot as plt
from msmbuilder.cluster import KCenters
from msmbuilder.dataset import dataset

"""
xyz = dataset('../xtc/*.xtc',
              topology='~/Desktop/tica-projection/Structures/Reference-PRE.pdb')
list1 = np.loadtxt('atompairs-5pairs-5helix-P')
featurizer = AtomPairsFeaturizer(pair_indices=list1)
ticadist = xyz.fit_transform_with(featurizer, 'atompairsfeaturizer/', fmt='dir-npy')
#ticadist = dataset('../atompairsfeaturizer/', mode='r', fmt='dir-npy', verbose=True)
tica_model = tICA(lag_time=400, n_components=2)
tica_model = ticadist.fit_with(tica_model)
tica_trajs = ticadist.transform_with(tica_model, 'tica/', fmt='dir-npy')
"""

tica_trajs = dataset('./tica', mode='r', fmt='dir-npy', verbose=True)
txx = np.concatenate(tica_trajs)

clusterer = KCenters(n_clusters=1000, random_state=8)
#clusterer = dataset('./cluster', mode='r', fmt='dir-npy', verbose=True)
clustered_trajs = tica_trajs.fit_transform_with(clusterer, 'cluster-test/', fmt='dir-npy')

"""
from msmbuilder.msm import MarkovStateModel, implied_timescales
data = dataset('cluster', mode='r', fmt='dir-npy', verbose=True)
lag_times = range(100, 1300, 100)
msm_timescales = implied_timescales(
    data, lag_times, n_timescales=10,
    msm=MarkovStateModel(lag_time=250, reversible_type='transpose', ergodic_cutoff='off'))
np.savetxt('msm_timescales_2.txt', msm_timescales)

#data = np.loadtxt('frame')
#data1 = np.loadtxt('frame-2211-2216')
txx = np.concatenate(tica_trajs)
plt.hexbin(txx[:, 0], txx[:, 1], bins='log', mincnt=0.1, cmap='viridis')
def test_2():
    # Test that PCA works in an msmbuilder pipeline
    p = Pipeline([('pca', PCA()), ('cluster', KCenters())])
    p.fit(trajs)