def featurize(featurizer, meta_data):
    """Featurize every trajectory listed in *meta_data* and save the results.

    Parameters
    ----------
    featurizer : msmbuilder featurizer
        Object providing ``partial_transform(traj)``.
    meta_data : pandas.DataFrame
        Metadata table with 'traj_fn' and 'top_fn' columns.

    Returns
    -------
    dict mapping metadata index -> feature trajectory.
    """
    # BUG FIX: the original body used a (possibly global) name ``meta`` while
    # the parameter is ``meta_data`` — bind it locally so the function actually
    # operates on its argument.
    meta = meta_data
    tops = preload_tops(meta)

    def feat(irow):
        # Worker: load one trajectory row and featurize it.
        i, row = irow
        traj = md.load(row['traj_fn'], top=tops[row['top_fn']])
        feat_traj = featurizer.partial_transform(traj)
        return i, feat_traj

    feature_trajs = dict(map(feat, meta.iterrows()))
    save_trajs(feature_trajs, 'ftrajs', meta)
    save_generic(featurizer, 'featurizer.pickl')
    return feature_trajs
def sample_tica_dim(dim=0, n_frames=200, meta=None, ttrajs=None):
    """Sample frames along one tICA dimension and save them as a DCD file.

    Parameters
    ----------
    dim : int
        Zero-based tICA dimension to sample along.
    n_frames : int
        Number of frames to draw.
    meta : pandas.DataFrame
        Trajectory metadata (required).
    ttrajs : dict
        tICA-transformed trajectories keyed like ``meta`` (required).

    Raises
    ------
    ValueError if ``meta`` or ``ttrajs`` is missing.
    """
    ## Load
    # FIX: use short-circuiting ``and`` with ``is not None`` instead of the
    # bitwise ``&`` applied to ``not x is None`` tests.
    if meta is not None and ttrajs is not None:
        ## Sample
        # These are apparently ordered according to tica value
        inds = sample_dimension(ttrajs, dimension=dim, n_frames=n_frames,
                                scheme='random')
        save_generic(inds, "tica-dimension-{}-inds.pickl".format(dim + 1))

        ## Get tica components, rescaled to [0, 10] for convenience
        tica_values = np.array(
            [ttrajs[traj_i][frame_i][dim] for traj_i, frame_i in inds])
        tica_values = (tica_values - tica_values.min()) / \
            (tica_values.max() - tica_values.min())
        tica_values *= 10

        ## Make trajectory
        top = preload_top(meta)
        # Use loc because sample_dimension is nice
        traj = md.join(
            md.load_frame(meta.loc[traj_i]['traj_fn'], index=frame_i, top=top)
            for traj_i, frame_i in inds)

        ## Superpose
        ## Save
        traj_fn = "tica-dimension-{}.dcd".format(dim + 1)
        backup(traj_fn)
        traj.save(traj_fn)
    else:
        raise ValueError('Specify meta data and trajectory objects')
if __name__ == '__main__':
    # Load
    meta = load_meta()
    tops = preload_tops(meta)

    # Select featurizer: raw Cartesian positions aligned to a reference.
    feature_name = 'Positions'
    reference = md.load('topology.pdb')
    featurizer = RawPositionsFeaturizer(ref_traj=reference)
    # One (row, featurizer, tops) work item per trajectory in the metadata.
    args = zip(meta.iterrows(), [featurizer] * meta.shape[0],
               [tops] * meta.shape[0])

    # Do it in parallel
    # NOTE(review): ``feat`` is not defined in this chunk — presumably a
    # module-level worker that unpacks the 3-tuples in ``args``; confirm.
    with Pool() as pool:
        feature_trajs = dict(pool.imap_unordered(feat, args))

    # Plot unscaled features, striding by 100 frames to keep the figure small.
    ftrajs = np.concatenate([fx[::100] for fx in feature_trajs.values()])
    fig, ax = plt.subplots(figsize=(15, 5))
    plot_box(ax, fxx=ftrajs, feature_name='Unscaled {}'.format(feature_name))
    fig.tight_layout()
    fig.savefig("Unscaled-{}-box.pdf".format(feature_name))

    ## Save
    save_trajs(feature_trajs, 'Unscaled-{}-ftraj'.format(feature_name), meta)
    save_generic(featurizer,
                 'Unscaled-{}-featurizer.pickl'.format(feature_name))
# Left eigenvectors of the MSM: one column per dynamical process.
# ``msm``, ``m``, ``pcca_mapping``, ``pcca``, ``ctraj_dict`` and ``meta`` are
# defined earlier in this file (not visible in this chunk).
vec = msm.left_eigenvectors_
n_states = vec.shape[
    0]  # may be less than 200 as T may be non-ergodic.
fig, axes = plt.subplots(nrows=m, sharex=True)
for i in range(m):
    for j in range(m):
        # Bars for the microstates PCCA assigns to macrostate j, colored per
        # macrostate via the legend label.
        mask = pcca_mapping == j
        axes[i].bar(np.arange(n_states)[mask], vec[mask, i],
                    label='PCCA State {}'.format(j),
                    align='center')
    axes[i].yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
    axes[i].legend()
    axes[i].set_ylabel('Cluster projection')
plt.xlabel('Cluster')
plt.savefig('figures/rmsd_msm_left_eigenvectors-pcca.png', transparent=True)

# Transforms: map each clustered trajectory through the MSM and PCCA models.
msm_traj = {}
pcca_traj = {}
for k, v in ctraj_dict.items():
    print(k)
    # 'fill' mode keeps the original trajectory length by filling states the
    # model cannot map.
    msm_traj[k] = msm.partial_transform(np.squeeze(v), mode='fill')
    pcca_traj[k] = pcca.partial_transform(np.squeeze(v), mode='fill')

save_trajs(msm_traj, 'msm-traj-200', meta)
save_generic(msm, 'msm-200.pickl')
save_trajs(pcca_traj, 'pcca-2-traj', meta)
save_generic(pcca, 'pcca-2.pickl')
please cite msmbuilder in any publications
"""
import mdtraj as md

from msmbuilder.io import load_trajs, save_generic, preload_top, backup, load_generic
from msmbuilder.io.sampling import sample_msm

## Load
meta, ttrajs = load_trajs('ttrajs')
msm = load_generic('msm.pickl')
kmeans = load_generic('kmeans.pickl')

## Sample
# Warning: make sure ttrajs and kmeans centers have
# the same number of dimensions
# Draw a 200-step random walk through MSM states, mapped back to real frames
# as (traj_i, frame_i) pairs.
inds = sample_msm(ttrajs, kmeans.cluster_centers_, msm, n_steps=200, stride=1)
save_generic(inds, "msm-traj-inds.pickl")

## Make trajectory
top = preload_top(meta)
traj = md.join(
    md.load_frame(meta.loc[traj_i]['traj_fn'], index=frame_i, top=top)
    for traj_i, frame_i in inds)

## Save
traj_fn = "msm-traj.xtc"
backup(traj_fn)  # move any previous output aside rather than clobbering it
traj.save(traj_fn)
- trajs
"""
import mdtraj as md

from msmbuilder.io import load_trajs, save_generic, preload_top, backup, load_generic
from msmbuilder.io.sampling import sample_msm

## Load
meta, ttrajs = load_trajs('ttrajs')
msm = load_generic('msm.pickl')
kmeans = load_generic('kmeans.pickl')

## Sample
# Warning: make sure ttrajs and kmeans centers have
# the same number of dimensions
# Draw a 200-step random walk through MSM states, mapped back to real frames.
inds = sample_msm(ttrajs, kmeans.cluster_centers_, msm, n_steps=200, stride=1)
save_generic(inds, "msm-traj-inds.pickl")

## Make trajectory
top = preload_top(meta)
traj = md.join(
    md.load_frame(meta.loc[traj_i]['traj_fn'], index=frame_i, top=top)
    for traj_i, frame_i in inds
)

## Save
traj_fn = "msm-traj.xtc"
backup(traj_fn)  # move any previous output aside rather than clobbering it
traj.save(traj_fn)
"""Cluster tICA results

{{header}}

Meta
----
depends:
  - ttrajs
  - meta.pandas.pickl
"""
from msmbuilder.io import load_trajs, save_trajs, save_generic
from msmbuilder.cluster import MiniBatchKMeans

## Load
meta, ttrajs = load_trajs('ttrajs')

## Fit
# Cluster on the first five tICA coordinates only.
dim = 5
kmeans = MiniBatchKMeans(n_clusters=500)
training_set = [ttraj[:, :dim] for ttraj in ttrajs.values()]
kmeans.fit(training_set)

## Transform each tICA trajectory into cluster assignments.
ktrajs = {key: kmeans.partial_transform(ttraj[:, :dim])
          for key, ttraj in ttrajs.items()}

## Save
print(kmeans.summarize())
save_trajs(ktrajs, 'ktrajs', meta)
save_generic(kmeans, 'kmeans.pickl')
# ``sample``, ``df``, ``num_clusters``, ``feature`` and ``meta`` are defined
# earlier in this file (not visible in this chunk).
sample.sort_values(by=['Prod_ID', 'Site_ID', 'Time_ps'], inplace=True)
g = sns.FacetGrid(sample, col='Prod_ID', hue='Site_ID', col_wrap=10)
g.map(plt.scatter, 'Time_ps', 'Trajectory', alpha=0.5)
g.set(ylim=(-0.5, num_clusters))
g.fig.tight_layout()
plt.savefig('figures/{}_cluster_trajectory.pdf'.format(feature))

# Plot histograms of cluster occupancy per production/site.
g = sns.FacetGrid(sample, col='Prod_ID', hue='Site_ID', col_wrap=10)
g = g.map(plt.hist, 'Trajectory', bins=range(num_clusters), histtype='step',
          lw='5')
g.fig.tight_layout()
plt.savefig('figures/{}_cluster_hist.pdf'.format(feature))

# Save dataframe
save_generic(df, 'clusters/{}_cluster_trajectory.pickl'.format(feature))

# Sampling (only plot 10 random assortments of clusters)
to_plot = np.random.choice(range(num_clusters), min(10, num_clusters),
                           replace=False)
for i in to_plot:
    num_samples = 100
    # NOTE(review): ``.ix`` was removed in pandas 1.0 — migrate to
    # ``df.loc[df['Trajectory'] == i, ['Key', 'Time_ps']]``.
    df_smp = df.ix[df['Trajectory'] == i, ['Key', 'Time_ps']].sample(num_samples)
    inds = zip(df_smp['Key'], df_smp['Time_ps'])
    # Use loc because sample_dimension is nice
    traj = md.join(
        md.load_frame(meta.loc[traj_i]['traj_fn'], index=frame_i,
                      top=meta.loc[traj_i]['top_fn'])
        for traj_i, frame_i in inds
    )
    # Original trajectories include both BT1 and BT2 so need to superpose
    # Tail of a ``traj_load(irow)`` worker whose ``def`` line is above this
    # chunk: load one metadata row's trajectory and return (index, traj).
    traj = md.load(row['traj_fn'], top=tops[row['top_fn']])
    return i, traj


traj_dict = dict(map(traj_load, meta.iterrows()))
all_trajs = [traj for traj in traj_dict.values()]
# Landmark Ward clustering on RMSD; roughly one landmark per 200 frames.
cluster = LandmarkAgglomerative(n_clusters=200,
                                n_landmarks=int(totframes / 200),
                                linkage='ward', metric='rmsd')
cluster.fit(all_trajs)
# TODO will this work?
# NOTE(review): ``clust`` is not defined in this chunk — presumably a
# module-level worker that unpacks (key, traj, cluster) tuples; confirm.
args = [(k, v, cluster) for k, v in traj_dict.items()]
with Pool() as pool:
    all_ctrajs_dict = dict(pool.imap_unordered(clust, args))
save_generic(cluster, 'cluster-200')
save_trajs(all_ctrajs_dict, 'ctraj-200', meta)

# Only trajectories longer than 1000 frames can support the longer lag times.
long_ctrajs = [np.squeeze(traj) for traj in all_ctrajs_dict.values()
               if traj.shape[0] > 1000]
all_ctrajs = [np.squeeze(traj) for traj in all_ctrajs_dict.values()]
lags = np.concatenate((np.arange(200, 1000, 200),
                       np.arange(1000, 5000, 500)))
all_msms = []
for lag in lags:
    print('Fitting lag {}'.format(lag))
    # Loop body continues past this chunk.
    if lag > 1000:
        trajs = long_ctrajs
    else:
        trajs = all_ctrajs
    # Tail of a hyper-parameter dict whose opening brace is above this chunk.
    # NOTE(review): randint(low=200, high=200) has an *empty* support (high is
    # exclusive) — presumably a larger high bound was intended; confirm.
    # NOTE(review): scipy distributions are only valid with
    # RandomizedSearchCV, not GridSearchCV as used below — yet the results
    # are saved as 'random_search.pickl'; confirm which searcher was intended.
    'cluster__n_clusters': scipy.stats.randint(low=200, high=200),
    'tica__n_components': scipy.stats.randint(low=2, high=40),
    'tica__lag_time': scipy.stats.randint(low=100, high=999)
}

# ``estimators`` and ``ftraj`` are defined earlier in this file.
pipe = Pipeline(estimators)
pipe.set_params(msm__lag_time=500)
pipe.set_params(msm__n_timescales=10)

if __name__ == "__main__":
    cvSearch = GridSearchCV(pipe, params, n_jobs=3, verbose=1, cv=5)
    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipe.steps])
    print("parameters:")
    print(params)
    t0 = time()
    cvSearch.fit(ftraj)
    print("done in %0.3fs" % (time() - t0))
    print()
    print("Best score: %0.3f" % cvSearch.best_score_)
    print("Best parameters set:")
    best_parameters = cvSearch.best_estimator_.get_params()
    for param_name in sorted(params.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    df = pd.DataFrame(cvSearch.cv_results_)
    save_generic(df, 'results/random_search.pickl')
def guestimate_stride():
    """Pick a fitting stride so roughly 10 frames per cluster are used."""
    total_data = meta['nframes'].sum()
    want = kmed.n_clusters * 10
    stride = max(1, total_data // want)
    print("Since we have", total_data, "frames, we're going to stride by",
          stride, "during fitting, because this is probably adequate for",
          kmed.n_clusters, "clusters")
    return stride


## Fit
kmed.fit([traj for _, traj in itertrajs(meta, stride=guestimate_stride())])
print(kmed.summarize())

## Save
save_generic(kmed, 'clusterer.pickl')


## Save centroids
def frame(traj_i, frame_i):
    """Load a single frame from disk.

    Note: kmedoids does 0-based, contiguous integers so we use .iloc.
    """
    row = meta.iloc[traj_i]
    return md.load_frame(row['traj_fn'], frame_i, top=row['top_fn'])


centroids = md.join((frame(ti, fi) for ti, fi in kmed.cluster_ids_),
                    check_topology=False)
centroids_fn = 'centroids.xtc'
backup(centroids_fn)
# CONSISTENCY FIX: save to the filename that was just backed up instead of
# repeating the string literal.
centroids.save(centroids_fn)
"""
import mdtraj as md

from msmbuilder.io.sampling import sample_dimension
from msmbuilder.io import load_trajs, save_generic, preload_top, backup

## Load
meta, ttrajs = load_trajs('ttrajs')

## Sample
# 200 frames drawn along the first tICA coordinate, as (traj_i, frame_i)
# pairs into the metadata table.
inds = sample_dimension(ttrajs, dimension=0, n_frames=200, scheme='random')
save_generic(inds, "tica-dimension-0-inds.pickl")

## Make trajectory
top = preload_top(meta)
# Use loc because sample_dimension is nice
traj = md.join(
    md.load_frame(meta.loc[traj_i]['traj_fn'], index=frame_i, top=top)
    for traj_i, frame_i in inds
)

## Save
traj_fn = "tica-dimension-0.xtc"
backup(traj_fn)  # move any previous output aside rather than clobbering it
traj.save(traj_fn)
- trajs
- top.pdb
"""
import mdtraj as md

from msmbuilder.featurizer import DihedralFeaturizer
from msmbuilder.io import load_meta, preload_tops, save_trajs, save_generic
from multiprocessing import Pool

## Load
meta = load_meta()
tops = preload_tops(meta)
dihed_feat = DihedralFeaturizer()


## Featurize logic
def feat(irow):
    # Worker: (index, metadata row) -> (index, dihedral feature trajectory).
    i, row = irow
    traj = md.load(row['traj_fn'], top=tops[row['top_fn']])
    feat_traj = dihed_feat.partial_transform(traj)
    return i, feat_traj


## Do it in parallel
with Pool() as pool:
    dihed_trajs = dict(pool.imap_unordered(feat, meta.iterrows()))

## Save
save_trajs(dihed_trajs, 'ftrajs', meta)
save_generic(dihed_feat, 'featurizer.pickl')
def guestimate_stride():
    """Pick a fitting stride so roughly 20 frames per cluster are used."""
    total_data = meta['nframes'].sum()
    want = kcen.n_clusters * 20
    stride = max(1, total_data // want)
    print("Since we have", total_data, "frames, we're going to stride by",
          stride, "during fitting, because this is probably adequate for",
          kcen.n_clusters, "clusters")
    return stride


## Fit
kcen.fit([traj for _, traj in itertrajs(meta, stride=guestimate_stride())])
print(kcen.summarize())

## Save
save_generic(kcen, 'clusterer' + str(round_num) + '.pickl')


## Save centroids
def frame(traj_i, frame_i):
    """Load a single frame from disk.

    Note: kmedoids does 0-based, contiguous integers so we use .iloc.
    """
    row = meta.iloc[traj_i]
    return md.load_frame(row['traj_fn'], frame_i, top=row['top_fn'])


centroids = md.join((frame(ti, fi) for ti, fi in kcen.cluster_ids_),
                    check_topology=False)
centroids_fn = 'centroids_' + str(round_num) + '.xtc'
backup(centroids_fn)
# CONSISTENCY FIX: save to the filename that was just backed up instead of
# rebuilding the same string by hand.
centroids.save(centroids_fn)
"""Save representative frames for every kmeans cluster state."""
import mdtraj as md
import os

from msmbuilder.io.sampling import sample_states
from msmbuilder.io import load_trajs, save_generic, preload_top, backup, load_generic

## Load
meta, ttrajs = load_trajs('ttrajs')
kmeans = load_generic("kmeans.pickl")

## Sample the 10 frames nearest each cluster center.
inds = sample_states(ttrajs, kmeans.cluster_centers_, k=10)
save_generic(inds, "cluster-sample-inds.pickl")

## Make trajectories, one short xtc per cluster state.
top = preload_top(meta)
out_folder = "cluster_samples"
backup(out_folder)  # move a previous output directory aside
os.mkdir(out_folder)

for state_idx, frame_ids in enumerate(inds):
    frames = [md.load_frame(meta.loc[t]['traj_fn'], index=f, top=top)
              for t, f in frame_ids]
    sampled = md.join(frames)
    sampled.save("{}/{}.xtc".format(out_folder, state_idx))
## Try to limit RAM usage
def guestimate_stride():
    """Pick a fitting stride so roughly 10 frames per cluster are used."""
    total_data = meta['nframes'].sum()
    want = kmed.n_clusters * 10
    stride = max(1, total_data // want)
    print("Since we have", total_data, "frames, we're going to stride by",
          stride, "during fitting, because this is probably adequate for",
          kmed.n_clusters, "clusters")
    return stride


## Fit
kmed.fit([traj for _, traj in itertrajs(meta, stride=guestimate_stride())])
print(kmed.summarize())

## Save
save_generic(kmed, 'clusterer.pickl')


## Save centroids
def frame(traj_i, frame_i):
    """Load a single frame from disk.

    Note: kmedoids does 0-based, contiguous integers so we use .iloc.
    """
    row = meta.iloc[traj_i]
    return md.load_frame(row['traj_fn'], frame_i, top=row['top_fn'])


centroids = md.join((frame(ti, fi) for ti, fi in kmed.cluster_ids_),
                    check_topology=False)
centroids_fn = 'centroids.xtc'
backup(centroids_fn)
# CONSISTENCY FIX: save to the filename that was just backed up instead of
# repeating the string literal.
centroids.save(centroids_fn)
from utilities import plot_box

if __name__ == '__main__':
    # Load featurized trajectories.
    meta, feature_trajs = load_trajs('ftraj')

    # Select scaler.
    # Renamed local: this object is a scaler, not a featurizer.
    scaler = RobustScaler()

    # FIX: fit once, then transform each trajectory. The original called
    # fit_transform() and discarded its return value, doing the transform
    # work twice.
    scaler.fit(feature_trajs.values())
    scaled_trajs = {}
    for k, v in feature_trajs.items():
        scaled_trajs[k] = scaler.partial_transform(v)

    # Diagnostic plot: sorted per-feature variance of a random 1000-frame
    # subsample of the scaled data.
    sample = np.concatenate([fx for fx in scaled_trajs.values()])
    sample = sample[np.random.choice(sample.shape[0], 1000, replace=False), :]
    variance = np.apply_along_axis(np.var, axis=0, arr=sample)
    order = np.argsort(variance)
    ord_var = variance[order]
    # (removed unused ``labels`` local from the original)
    ind = range(variance.shape[0])
    fig, ax = plt.subplots()
    ax.plot(ind, ord_var)
    plt.savefig('ScaledFeatureVariance.png')

    # Save scaled trajectories and the fitted scaler.
    save_trajs(scaled_trajs, 'straj', meta)
    save_generic(scaler, 'scaler.pickl')
# # TIMESCALES # # The data will be loaded with a stride of 10 frames. Each fame is 50ps, so the time per frame will be # 500ps/frame or 0.5ns/frame. # Each trajectory is 1000 frames long # Lag time will be 40 frames (20 ns) based on a visual inspection of /Misc/MSM_lag_time.ipynb features = tica_unstructured_features to_ns = 0.5 msm_lag = int(40 / to_ns) # # MODEL # for feat in tica_unstructured_features: pipe = Pipeline([(feat[0], feat[1]), ('variance_cut', VarianceThreshold()), ('scaling', RobustScaler()), ('tica', tICA(kinetic_mapping=True)), ('cluster', MiniBatchKMeans()), ('msm', MarkovStateModel(lag_time=msm_lag, verbose=False, n_timescales=2))]) # # SAVE MODEL # savedir = 'gp-m52-ei-tica-indv' save_generic(pipe, '{0}/{1}.pickl'.format(savedir, feat[0]))
with Pool() as pool: feature_trajs = dict(pool.imap_unordered(msmb_feat, args)) # # Create save objects # featurizer = dict([(x[0], x[2]) for x in feature_trajs]) # feature_trajs = dict([(x[0], x[1]) for x in feature_trajs]) selector = VarianceThreshold() selector.fit([traj for traj in feature_trajs.values()]) ftrajs = {} for k, v in feature_trajs.items(): ftrajs[k] = np.squeeze(selector.transform([v])) # SAVE save_trajs(ftrajs, 'featurized_trajectories/{}-ftraj'.format(name), meta) save_generic(feat, 'featurized_trajectories/{}-featurizer.pickl'.format(name)) # pyEMMA FEATURIZERS featurizers = [('angles', 'add_angles', angles), ('dihedrals', 'add_dihedrals', dihedrals)] for name, feat, indices in featurizers: print('Featurizing {}'.format(name)) args = zip(meta.iterrows(), [feat] * meta.shape[0], [tops] * meta.shape[0], [indices]*meta.shape[0]) # Fit features with Pool() as pool: feature_trajs = dict(pool.imap_unordered(pyemma_feat, args))
"""Reduce dimensionality with tICA

{{header}}

Meta
----
depends:
  - ftrajs
  - meta.pandas.pickl
"""
from msmbuilder.io import load_trajs, save_trajs, save_generic
from msmbuilder.decomposition import tICA

## Load
tica = tICA(n_components=5, lag_time=10, kinetic_mapping=True)
meta, ftrajs = load_trajs("ftrajs")

## Fit
tica.fit(ftrajs.values())

## Transform — project every featurized trajectory onto the tICs.
ttrajs = {key: tica.partial_transform(ftraj)
          for key, ftraj in ftrajs.items()}

## Save
save_trajs(ttrajs, "ttrajs", meta)
save_generic(tica, "tica.pickl")
# cluster print('Attempting to cluster') num_clusters = 20 cluster = LandmarkAgglomerative(n_clusters=num_clusters, n_landmarks=int(totframes / 100), linkage='ward', metric='rmsd') cluster.fit(trajs) # # print('Fitting cluster labels') # ctraj = {} # for k, v in traj_dict.items(): # v = cluster.partial_predict(v) # diff = nframes-v.shape[0] # v = np.append(v, np.zeros(diff)-1) # ctraj[k] = v # Convert to DF for plotting and sampling. # df = to_dataframe(ctraj, nframes, dt=1) print('Fitting cluster labels for MSM') ctraj = {} for k, v in traj_dict.items(): ctraj[k] = cluster.partial_predict(v) # Save dataframe save_generic(df, 'clusters/rmsd_cluster_trajectory.pickl') save_trajs(ctraj, 'ftraj', meta)
"""
import mdtraj as md

from msmbuilder.featurizer import DihedralFeaturizer
from msmbuilder.io import load_meta, preload_tops, save_trajs, save_generic
from multiprocessing import Pool

## Load
meta = load_meta()
tops = preload_tops(meta)
dihed_feat = DihedralFeaturizer()


## Featurize logic
def feat(irow):
    # Worker: (index, metadata row) -> (index, dihedral feature trajectory).
    i, row = irow
    traj = md.load(row['traj_fn'], top=tops[row['top_fn']])
    feat_traj = dihed_feat.partial_transform(traj)
    return i, feat_traj


## Do it in parallel
with Pool() as pool:
    dihed_trajs = dict(pool.imap_unordered(feat, meta.iterrows()))

## Save
save_trajs(dihed_trajs, 'ftrajs', meta)
save_generic(dihed_feat, 'featurizer.pickl')
# ``traj_load``, ``totframes`` and ``meta`` are defined earlier in this file.
traj_dict = dict(map(traj_load, meta.iterrows()))
# Only trajectories longer than 1000 frames are used for fitting.
trajs = [traj for traj in traj_dict.values() if traj.n_frames > 1000]
print(len(trajs))
num_clust = 20
cluster = LandmarkAgglomerative(n_clusters=num_clust,
                                n_landmarks=int(totframes / 100),
                                linkage='ward', metric='rmsd')
ctrajs = cluster.fit_transform(trajs)
# Commented-out alternative: label *every* trajectory, then filter:
# print('Fitting cluster labels for MSM')
# ctraj = {}
# count = 0
# for k, v in traj_dict.items():
#     print(k, count)
#     count +=1
#     ctraj[k] = cluster.partial_predict(v)
#
# ctrajs = [traj for traj in ctraj.values() if traj.shape[0] > 1000]
print('Fitting MSM')
# 4000-frame lag time MSM resolving 50 timescales.
lag = 4000
msm = MarkovStateModel(lag_time=lag, n_timescales=50)
msm.fit(ctrajs)
# save_trajs(ctraj, 'results/nclusters-{0}-ctraj'.format(num_clust), meta)
save_generic(cluster,
             'results/clusterer-nclusters-{0}.pickle'.format(num_clust))
save_generic(msm,
             'results/msm-lag-{0}-nclusters-{1}.pickl'.format(lag, num_clust))
    # Tail of a clusterer constructor whose opening (and ``clusterer = ...``)
    # is above this chunk; ``msm_lag``, ``n_clusters`` and ``trajs`` are also
    # defined earlier in this file.
    linkage='ward', metric='rmsd',
    landmark_strategy='stride',
    random_state=None, max_landmarks=None,
    ward_predictor='ward')
msm = MarkovStateModel(lag_time=msm_lag)
pipe = Pipeline([('cluster', clusterer), ('msm', msm)])

# -------------------------------------------------------------------------
# Set param search object
# -------------------------------------------------------------------------
# Log-spaced sweep of cluster counts around the base ``n_clusters``.
params = {
    'cluster__n_clusters':
        list((np.logspace(-0.5, 2, 10) * n_clusters).astype(int))
}
print(params)
cv_iter = ShuffleSplit(n_splits=10, test_size=0.5)
param_search = GridSearchCV(pipe, param_grid=params, cv=cv_iter)

# -------------------------------------------------------------------------
# Search param space and save
# -------------------------------------------------------------------------
param_search.fit(trajs)
save_generic(param_search, 'models/rmsd_model.pickl')
print('Best score of {0} was achieved with \n {1}'.format(
    param_search.best_score_, param_search.best_params_))
# TIMESCALES # # The data will be loaded with a stride of 10 frames. Each fame is 50ps, so the time per frame will be # 500ps/frame or 0.5ns/frame. # Each trajectory is 1000 frames long # Lag time will be 40 frames (20 ns) based on a visual inspection of /Misc/MSM_lag_time.ipynb features = tica_unstructured_features to_ns = 0.5 msm_lag = int(40/to_ns) # # MODEL # pipe = Pipeline([('features', FeatureSelector(features=tica_unstructured_features)), ('variance_cut', VarianceThreshold()), ('scaling', RobustScaler()), ('tica', tICA(kinetic_mapping=True)), ('cluster', MiniBatchKMeans()), ('msm', MarkovStateModel(lag_time=msm_lag, verbose=False, n_timescales=2))]) # # SAVE MODEL # savedir = 'rand-tica-all' save_generic(pipe, '{}/model.pickl'.format(savedir)) print_feature_names(features, join(savedir, 'feature_list.txt'))
    # Tail of an ``estimators`` list whose opening (the rmsd clusterer entry)
    # is above this chunk; ``cv_iter`` and ``trajs`` are also defined earlier.
    metric='rmsd')),
    ('msm', MarkovStateModel())]
params = {'cluster__n_clusters': [200]}
pipe = Pipeline(estimators)
pipe.set_params(msm__lag_time=999)
pipe.set_params(msm__n_timescales=20)

if __name__ == "__main__":
    cvSearch = GridSearchCV(pipe, params, n_jobs=1, verbose=1, cv=cv_iter)
    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipe.steps])
    print("parameters:")
    print(params)
    t0 = time()
    cvSearch.fit(trajs)
    print("done in %0.3fs" % (time() - t0))
    print()
    print("Best score: %0.3f" % cvSearch.best_score_)
    print("Best parameters set:")
    best_parameters = cvSearch.best_estimator_.get_params()
    for param_name in sorted(params.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    # Persist the full CV results table.
    df = pd.DataFrame(cvSearch.cv_results_)
    save_generic(df, 'results/lag999-ncluster200.pickl')
def _save_model(self):
    """Persist the current model to ``self.model_pkl_fname`` in pickle format."""
    target_fname = self.model_pkl_fname
    save_generic(self.model, target_fname)
# 10-fold shuffle-split CV over tICA hyper-parameters; ``X`` and
# ``feature_name`` are defined earlier in this file.
cv_iter = ShuffleSplit(n_splits=10, test_size=0.5, random_state=0)
param_grid = [{'n_components': [10, 20, 40],
               'lag_time': [1, 10, 100]}]

# CV object
model = tICA(kinetic_mapping=True)

# Do grid search
clf = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv_iter,
                   n_jobs=2)
clf.fit(X)

# Save results
results = pd.DataFrame(clf.cv_results_)
save_generic(results, '{}-grid-search-results.pickl'.format(feature_name))

# Print Results
print("Best parameters set found on development set:")
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

# Fit best estimator to data
# (``ttrajs`` is populated below this chunk.)
tica = clf.best_estimator_
ttrajs = {}
# # TIMESCALES # # The data will be loaded with a stride of 10 frames. Each fame is 50ps, so the time per frame will be # 500ps/frame or 0.5ns/frame. # Each trajectory is 1000 frames long # Lag time will be 40 frames (20 ns) based on a visual inspection of /Misc/MSM_lag_time.ipynb to_ns = 0.5 msm_lag = int(40 / to_ns) # # FEATURE INDICES # all_idx = np.load('indices_all.npy') # # OTHER PARAMETERS # ref_traj = md.load('../Data/data/trajectory-1.xtc', top='../Data/data/fs-peptide.pdb') featurizer = FeatureSelector(features=feats) pipe = Pipeline([('features', featurizer), ('variance_cut', VarianceThreshold()), ('scaling', RobustScaler()), ('cluster', MiniBatchKMeans()), ('msm', MarkovStateModel(lag_time=msm_lag, verbose=False))]) save_generic(pipe, 'model.pickl')
"""Make a microstate MSM

{{header}}
"""
from msmbuilder.io import load_trajs, save_trajs, save_generic
from msmbuilder.msm import MarkovStateModel

## Load
meta, ktrajs = load_trajs('ktrajs')

## Fit a lag-2 microstate MSM resolving ten timescales.
msm = MarkovStateModel(lag_time=2, n_timescales=10, verbose=False)
msm.fit(list(ktrajs.values()))

## Transform each clustered trajectory into MSM state labels.
microktrajs = {key: msm.partial_transform(ktraj)
               for key, ktraj in ktrajs.items()}

## Save
print(msm.summarize())
save_generic(msm, 'msm.pickl')
save_trajs(microktrajs, 'microktrajs', meta)
please cite msmbuilder in any publications
"""
import mdtraj as md
import os

from msmbuilder.io.sampling import sample_states
from msmbuilder.io import load_trajs, save_generic, preload_top, backup, load_generic

## Load
meta, ttrajs = load_trajs('ttrajs')
kmeans = load_generic("kmeans.pickl")

## Sample
# The 10 frames nearest each cluster center, as (traj_i, frame_i) pairs.
inds = sample_states(ttrajs, kmeans.cluster_centers_, k=10)
save_generic(inds, "cluster-sample-inds.pickl")

## Make trajectories
top = preload_top(meta)
out_folder = "cluster_samples"
backup(out_folder)  # move a previous output directory aside
os.mkdir(out_folder)
for state_i, state_inds in enumerate(inds):
    # One short trajectory of representative frames per cluster state.
    traj = md.join(
        md.load_frame(meta.loc[traj_i]['traj_fn'], index=frame_i, top=top)
        for traj_i, frame_i in state_inds)
    traj.save("{}/{}.xtc".format(out_folder, state_i))
"""Make a microstate MSM

msmbuilder autogenerated template version 2
created 2017-05-23T16:38:49.116944
please cite msmbuilder in any publications
"""
from msmbuilder.io import load_trajs, save_trajs, save_generic
from msmbuilder.msm import MarkovStateModel

## Load
meta, ktrajs = load_trajs('ktrajs')

## Fit a lag-2 microstate MSM resolving ten timescales.
model = MarkovStateModel(lag_time=2, n_timescales=10, verbose=False)
model.fit(list(ktrajs.values()))

## Transform each clustered trajectory into MSM state labels.
microktrajs = {}
for key in ktrajs:
    microktrajs[key] = model.partial_transform(ktrajs[key])

## Save
print(model.summarize())
save_generic(model, 'msm.pickl')
save_trajs(microktrajs, 'microktrajs', meta)
"""Reduce dimensionality with tICA

msmbuilder autogenerated template version 2
created 2017-05-23T16:38:49.125259
please cite msmbuilder in any publications
"""
from msmbuilder.io import load_trajs, save_trajs, save_generic
from msmbuilder.decomposition import tICA

## Load
reducer = tICA(n_components=5, lag_time=10, kinetic_mapping=True)
meta, ftrajs = load_trajs("ftrajs")

## Fit
reducer.fit(ftrajs.values())

## Transform — project every featurized trajectory onto the tICs.
ttrajs = {name: reducer.partial_transform(ftraj)
          for name, ftraj in ftrajs.items()}

## Save
save_trajs(ttrajs, 'ttrajs', meta)
save_generic(reducer, 'tica.pickl')
# Timestep in picoseconds, assumed shared by all trajectories in the metadata.
dt = float(meta['step_ps'].unique()[0])
df = pd.DataFrame.from_records(data=rtrajs)
df['Time_ps'] = np.arange(nframes) * dt
df = pd.melt(df, id_vars=['Time_ps'], var_name='Production_ID',
             value_name='RMSD')
# Split the tuple-valued Production_ID into its component id columns.
df[id_cols] = pd.DataFrame(df['Production_ID'].tolist())
del df['Production_ID']
# 10-frame rolling mean of RMSD per id group, joined back with an '_r' suffix.
df = df.join(df.groupby(id_cols)['RMSD'].rolling(10).mean().reset_index(
    level=[0, 1, 2, 3, 4]), rsuffix='_r')
df.drop(labels=[x + '_r' for x in id_cols], axis=1, inplace=True)
long_trajs = ['{}.1'.format(x) for x in range(1, 11)]

# Plot rolling RMSD for the long productions only.
with sns.plotting_context("notebook", font_scale=2):
    # FIX: ``.ix`` was removed in pandas 1.0 — ``.loc`` is the exact
    # equivalent for this boolean-mask row selection.
    sample = df.loc[df['Prod_ID'].isin(long_trajs), :].sample(frac=0.1,
                                                              axis=0)
    sample.sort_values(by=['Prod_ID', 'Site_ID', 'Time_ps'], inplace=True)
    g = sns.FacetGrid(sample, col='Prod_ID', hue='Site_ID', col_wrap=5)
    g.map(plt.plot, 'Time_ps', 'RMSD_r')
    # FIX: raw string avoids the invalid '\A' escape warning; the rendered
    # label text is byte-identical.
    g.set_ylabels(r"RMSD $\AA$")
    g.set_xlabels("")
    g.set_titles("")
    g.fig.subplots_adjust(wspace=0.05, hspace=0.05)
    plt.savefig('rmsd_trajectory_long.png', transparent=True)

# Save dataframe
save_generic(df, 'rmsd_trajectory.pickl')