def featurize(featurizer, meta_data):
    """Apply *featurizer* to every trajectory listed in *meta_data*.

    Parameters
    ----------
    featurizer : object with a ``partial_transform(traj)`` method
        (e.g. an msmbuilder featurizer).
    meta_data : pandas.DataFrame
        Trajectory metadata; each row must carry 'traj_fn' and 'top_fn'.

    Returns
    -------
    dict
        Maps metadata index -> feature array. Also saved to disk as
        'ftrajs' alongside the pickled featurizer.
    """
    # BUG FIX: the body previously referenced an undefined name ``meta``;
    # the parameter is called ``meta_data``, so every use raised NameError.
    tops = preload_tops(meta_data)

    def feat(irow):
        # Load one trajectory row and featurize it; keep the index so
        # the result dict lines up with the metadata rows.
        i, row = irow
        traj = md.load(row['traj_fn'], top=tops[row['top_fn']])
        feat_traj = featurizer.partial_transform(traj)
        return i, feat_traj

    feature_trajs = dict(map(feat, meta_data.iterrows()))
    save_trajs(feature_trajs, 'ftrajs', meta_data)
    save_generic(featurizer, 'featurizer.pickl')
    return feature_trajs
with Pool() as pool: feature_trajs = dict(pool.imap_unordered(msmb_feat, args)) # # Create save objects # featurizer = dict([(x[0], x[2]) for x in feature_trajs]) # feature_trajs = dict([(x[0], x[1]) for x in feature_trajs]) selector = VarianceThreshold() selector.fit([traj for traj in feature_trajs.values()]) ftrajs = {} for k, v in feature_trajs.items(): ftrajs[k] = np.squeeze(selector.transform([v])) # SAVE save_trajs(ftrajs, 'featurized_trajectories/{}-ftraj'.format(name), meta) save_generic(feat, 'featurized_trajectories/{}-featurizer.pickl'.format(name)) # pyEMMA FEATURIZERS featurizers = [('angles', 'add_angles', angles), ('dihedrals', 'add_dihedrals', dihedrals)] for name, feat, indices in featurizers: print('Featurizing {}'.format(name)) args = zip(meta.iterrows(), [feat] * meta.shape[0], [tops] * meta.shape[0], [indices]*meta.shape[0]) # Fit features with Pool() as pool: feature_trajs = dict(pool.imap_unordered(pyemma_feat, args))
"""Make a microstate MSM msmbuilder autogenerated template version 2 created 2017-05-23T16:38:49.116944 please cite msmbuilder in any publications """ from msmbuilder.io import load_trajs, save_trajs, save_generic from msmbuilder.msm import MarkovStateModel ## Load meta, ktrajs = load_trajs('ktrajs') ## Fit msm = MarkovStateModel(lag_time=2, n_timescales=10, verbose=False) msm.fit(list(ktrajs.values())) ## Transform microktrajs = {} for k, v in ktrajs.items(): microktrajs[k] = msm.partial_transform(v) ## Save print(msm.summarize()) save_generic(msm, 'msm.pickl') save_trajs(microktrajs, 'microktrajs', meta)
from utilities import plot_box

if __name__ == '__main__':
    # Load the featurized trajectories.
    meta, feature_trajs = load_trajs('ftraj')

    # Select scaler (robust to outliers).
    featurizer = RobustScaler()

    # BUG FIX: previously ``fit_transform`` was called and its result
    # discarded — a full (wasted) transform pass just to fit the scaler.
    # ``fit`` alone is sufficient; the per-trajectory transform follows.
    featurizer.fit(feature_trajs.values())
    scaled_trajs = {}
    for k, v in feature_trajs.items():
        scaled_trajs[k] = featurizer.partial_transform(v)

    # Plot the sorted per-feature variance of a random 1000-frame sample
    # so the effect of scaling can be inspected visually.
    sample = np.concatenate(list(scaled_trajs.values()))
    sample = sample[np.random.choice(sample.shape[0], 1000, replace=False), :]
    variance = np.sort(sample.var(axis=0))  # ascending
    fig, ax = plt.subplots()
    ax.plot(range(variance.shape[0]), variance)
    plt.savefig('ScaledFeatureVariance.png')

    # Save the scaled trajectories and the fitted scaler.
    save_trajs(scaled_trajs, 'straj', meta)
    save_generic(featurizer, 'scaler.pickl')
"""Check for abnormally high rmsd values to a reference structure {{header}} Meta ---- depends: - meta.pandas.pickl - trajs - top.pdb """ import mdtraj as md from msmbuilder.io import load_meta, itertrajs, save_trajs ## Load reference structure ref = md.load("top.pdb") meta = load_meta() ## Do calculation and save rmsds = {k: md.rmsd(traj, ref) for k, traj in itertrajs(meta)} save_trajs(rmsds, 'rmsds', meta)
# NOTE(review): fragment — ``msm``, ``m`` (number of macrostates),
# ``pcca_mapping``, ``pcca``, ``ctraj_dict``, ``meta`` and the plotting
# imports are defined earlier in this script.

vec = msm.left_eigenvectors_
n_states = vec.shape[0]  # may be less than 200 as T may be non-ergodic.

# One subplot row per eigenvector: bar-plot its projection onto the
# clusters, grouped/coloured by PCCA macrostate assignment.
fig, axes = plt.subplots(nrows=m, sharex=True)
for i in range(m):
    for j in range(m):
        mask = pcca_mapping == j
        axes[i].bar(np.arange(n_states)[mask], vec[mask, i],
                    label='PCCA State {}'.format(j), align='center')
    axes[i].yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
    axes[i].legend()
    axes[i].set_ylabel('Cluster projection')
plt.xlabel('Cluster')
plt.savefig('figures/rmsd_msm_left_eigenvectors-pcca.png', transparent=True)

# Transforms: relabel every cluster trajectory through the MSM and the
# PCCA lumping. mode='fill' handles states absent from the fitted model
# instead of raising — see msmbuilder partial_transform docs.
msm_traj = {}
pcca_traj = {}
for k, v in ctraj_dict.items():
    print(k)
    msm_traj[k] = msm.partial_transform(np.squeeze(v), mode='fill')
    pcca_traj[k] = pcca.partial_transform(np.squeeze(v), mode='fill')
save_trajs(msm_traj, 'msm-traj-200', meta)
save_generic(msm, 'msm-200.pickl')
save_trajs(pcca_traj, 'pcca-2-traj', meta)
save_generic(pcca, 'pcca-2.pickl')
{{header}} Meta ---- depends: - meta.pandas.pickl - trajs - top.pdb """ import mdtraj as md from msmbuilder.io import load_meta, itertrajs, save_trajs, preload_top ## Load meta = load_meta() centroids = md.load("centroids.xtc", top=preload_top(meta)) ## Kernel SIGMA = 0.3 # nm from msmbuilder.featurizer import RMSDFeaturizer import numpy as np featurizer = RMSDFeaturizer(centroids) lfeats = {} for i, traj in itertrajs(meta): lfeat = featurizer.partial_transform(traj) lfeat = np.exp(-lfeat**2 / (2 * (SIGMA**2))) lfeats[i] = lfeat save_trajs(lfeats, 'ftrajs', meta)
please cite msmbuilder in any publications """ import mdtraj as md from msmbuilder.featurizer import DihedralFeaturizer from msmbuilder.io import load_meta, preload_tops, save_trajs, save_generic from multiprocessing import Pool ## Load meta = load_meta() tops = preload_tops(meta) dihed_feat = DihedralFeaturizer() ## Featurize logic def feat(irow): i, row = irow traj = md.load(row['traj_fn'], top=tops[row['top_fn']]) feat_traj = dihed_feat.partial_transform(traj) return i, feat_traj ## Do it in parallel with Pool() as pool: dihed_trajs = dict(pool.imap_unordered(feat, meta.iterrows())) ## Save save_trajs(dihed_trajs, 'ftrajs', meta) save_generic(dihed_feat, 'featurizer.pickl')
from msmbuilder.io import load_trajs, save_trajs
import numpy as np
from multiprocessing import Pool
import matplotlib
matplotlib.use('Agg')
from matplotlib.pylab import plt
import sys
import seaborn as sns
# NOTE(review): sklearn.neighbors.kde is a deprecated import path in
# modern scikit-learn (use sklearn.neighbors) — confirm pinned version.
from sklearn.neighbors.kde import KernelDensity
from scipy.signal import argrelextrema

# Don't prune these: copy the feature trajectories through unchanged.
for feature in ['angles', 'bonds', 'contacts']:
    meta, ftraj = load_trajs(
        'featurized_trajectories/{}-ftraj'.format(feature))
    save_trajs(ftraj, 'pruned_trajectories/{}-ftraj'.format(feature), meta)

# Prune these:
for feature in ['dihedrals']:
    meta, ftraj_dict = load_trajs(
        'featurized_trajectories/{}-ftraj'.format(feature))
    ftraj = np.concatenate([traj for traj in ftraj_dict.values()])
    # Every other column starting at 0 — presumably the cosine component
    # of each (cos, sin) dihedral pair; TODO confirm against featurizer.
    cos_idx = np.arange(0, ftraj.shape[1] - 1, 2).reshape(-1, 1)
    variance = ftraj[:, cos_idx].var(axis=0).reshape(-1, 1)

    # Do KDE over the variance distribution and split the data.
    num_splits = 3
    bandwidths = np.linspace(.01, .10, num=100)
    x = np.linspace(0, .5, 1000).reshape(-1, 1)
    # NOTE(review): chunk is truncated here — the loop body over
    # bandwidths is not visible in this view.
    for bw in bandwidths:
# NOTE(review): fragment — opens with the tail of a per-row loader
# function and ends inside a lag-time scan; ``traj_load``, ``clust``,
# ``totframes``, ``meta`` and all imports live outside this chunk.
    return i, traj


traj_dict = dict(map(traj_load, meta.iterrows()))
all_trajs = [traj for traj in traj_dict.values()]

# Landmark Ward clustering on RMSD; landmarks subsample the frames so
# the pairwise-RMSD computation stays tractable.
cluster = LandmarkAgglomerative(n_clusters=200,
                                n_landmarks=int(totframes / 200),
                                linkage='ward', metric='rmsd')
cluster.fit(all_trajs)

# TODO will this work?
args = [(k, v, cluster) for k, v in traj_dict.items()]
with Pool() as pool:
    all_ctrajs_dict = dict(pool.imap_unordered(clust, args))

save_generic(cluster, 'cluster-200')
save_trajs(all_ctrajs_dict, 'ctraj-200', meta)

# Only trajectories longer than 1000 frames can support the longest lags.
long_ctrajs = [np.squeeze(traj) for traj in all_ctrajs_dict.values()
               if traj.shape[0] > 1000]
all_ctrajs = [np.squeeze(traj) for traj in all_ctrajs_dict.values()]

# Lag-time scan: coarse steps to 1000 frames, then wider steps to 5000.
lags = np.concatenate((np.arange(200, 1000, 200),
                       np.arange(1000, 5000, 500)))
all_msms = []
for lag in lags:
    print('Fitting lag {}'.format(lag))
    if lag > 1000:
        trajs = long_ctrajs
    else:
        trajs = all_ctrajs
    # NOTE(review): chunk truncated — the fit/append for this msm is
    # beyond this view.
    msm = MarkovStateModel(lag_time=int(lag), n_timescales=100)
"""Reduce dimensionality with tICA {{header}} Meta ---- depends: - ftrajs - meta.pandas.pickl """ from msmbuilder.io import load_trajs, save_trajs, save_generic from msmbuilder.decomposition import tICA ## Load tica = tICA(n_components=5, lag_time=10, kinetic_mapping=True) meta, ftrajs = load_trajs("ftrajs") ## Fit tica.fit(ftrajs.values()) ## Transform ttrajs = {} for k, v in ftrajs.items(): ttrajs[k] = tica.partial_transform(v) ## Save save_trajs(ttrajs, "ttrajs", meta) save_generic(tica, "tica.pickl")
# Featurize logic
def feat(irow):
    """Compute the two custom dihedrals (cTnI and cTnT alpha-carbon
    quadruplets) for one metadata row; returns (index, dihedral array)."""
    index, row = irow
    print('Loading traj {}'.format(row['traj_fn']))
    traj = md.load(row['traj_fn'], top=tops[row['top_fn']])
    topology = traj.topology

    def alpha_carbons(residues):
        # 1-based residue numbers -> the first CA atom index per residue.
        return [topology.select('resid {} and name CA'.format(r - 1))[0]
                for r in residues]

    ctni_atoms = alpha_carbons([292, 329, 337, 383])
    ctnt_atoms = alpha_carbons([162, 184, 185, 237])
    atom_indices = np.array([ctni_atoms, ctnt_atoms])
    return index, md.compute_dihedrals(traj, atom_indices)


if __name__ == '__main__':
    meta = load_meta()
    tops = preload_tops(meta)
    # One worker task per metadata row.
    with Pool() as pool:
        dtrajs = dict(pool.imap_unordered(feat, meta.iterrows()))
    save_trajs(dtrajs, 'dtrajs', meta)
{{header}} Meta ---- depends: - meta.pandas.pickl - trajs - top.pdb """ import mdtraj as md from msmbuilder.io import load_meta, itertrajs, save_trajs, preload_top ## Load meta = load_meta() centroids = md.load("centroids.xtc", top=preload_top(meta)) ## Kernel SIGMA = 0.3 # nm from msmbuilder.featurizer import RMSDFeaturizer import numpy as np featurizer = RMSDFeaturizer(centroids) lfeats = {} for i, traj in itertrajs(meta): lfeat = featurizer.partial_transform(traj) lfeat = np.exp(-lfeat ** 2 / (2 * (SIGMA ** 2))) lfeats[i] = lfeat save_trajs(lfeats, 'ftrajs', meta)
- trajs - top.pdb """ import mdtraj as md from msmbuilder.featurizer import DihedralFeaturizer from msmbuilder.io import load_meta, preload_tops, save_trajs, save_generic from multiprocessing import Pool ## Load meta = load_meta() tops = preload_tops(meta) dihed_feat = DihedralFeaturizer() ## Featurize logic def feat(irow): i, row = irow traj = md.load(row['traj_fn'], top=tops[row['top_fn']]) feat_traj = dihed_feat.partial_transform(traj) return i, feat_traj ## Do it in parallel with Pool() as pool: dihed_trajs = dict(pool.imap_unordered(feat, meta.iterrows())) ## Save save_trajs(dihed_trajs, 'ftrajs', meta) save_generic(dihed_feat, 'featurizer.pickl')
"""Cluster tICA results {{header}} Meta ---- depends: - ttrajs - meta.pandas.pickl """ from msmbuilder.io import load_trajs, save_trajs, save_generic from msmbuilder.cluster import MiniBatchKMeans ## Load meta, ttrajs = load_trajs('ttrajs') ## Fit dim = 5 kmeans = MiniBatchKMeans(n_clusters=500) kmeans.fit([traj[:, :dim] for traj in ttrajs.values()]) ## Transform ktrajs = {} for k, v in ttrajs.items(): ktrajs[k] = kmeans.partial_transform(v[:, :dim]) ## Save print(kmeans.summarize()) save_trajs(ktrajs, 'ktrajs', meta) save_generic(kmeans, 'kmeans.pickl')
# NOTE(review): fragment — ``model``, ``param_grid``, ``cv_iter``, ``X``,
# ``feature_name``, ``ftrajs`` and ``meta`` are defined earlier in the
# file; this chunk runs the grid search and applies the winner.
clf = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv_iter,
                   n_jobs=2)
clf.fit(X)

# Save results
results = pd.DataFrame(clf.cv_results_)
save_generic(results, '{}-grid-search-results.pickl'.format(feature_name))

# Print Results
print("Best parameters set found on development set:")
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    # Report mean CV score +/- two standard deviations per parameter set.
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

# Fit best estimator to data (best_estimator_ is the refit winner when
# GridSearchCV's default refit=True is in effect — confirm upstream).
tica = clf.best_estimator_
ttrajs = {}
for k, v in ftrajs.items():
    ttrajs[k] = tica.partial_transform(v)

# Save
save_trajs(ttrajs, '{}-ttrajs'.format(feature_name), meta)
save_generic(tica, '{}-tica.pickl'.format(feature_name))
# cluster
# NOTE(review): fragment — ``totframes``, ``trajs``, ``traj_dict``, ``meta``
# and the imports are defined earlier in the file.
print('Attempting to cluster')
num_clusters = 20
# Landmark Ward clustering on RMSD; landmarks subsample frames to keep
# the pairwise-RMSD work tractable.
cluster = LandmarkAgglomerative(n_clusters=num_clusters,
                                n_landmarks=int(totframes / 100),
                                linkage='ward', metric='rmsd')
cluster.fit(trajs)

# # print('Fitting cluster labels')
# ctraj = {}
# for k, v in traj_dict.items():
#     v = cluster.partial_predict(v)
#     diff = nframes-v.shape[0]
#     v = np.append(v, np.zeros(diff)-1)
#     ctraj[k] = v
# Convert to DF for plotting and sampling.
# df = to_dataframe(ctraj, nframes, dt=1)

print('Fitting cluster labels for MSM')
ctraj = {}
for k, v in traj_dict.items():
    ctraj[k] = cluster.partial_predict(v)

# BUG FIX: the previous version called save_generic(df, ...) here, but the
# only code building ``df`` (to_dataframe, above) is commented out, so the
# save always raised NameError. Save only the cluster-label trajectories;
# restore the df save together with the to_dataframe block if needed.
save_trajs(ctraj, 'ftraj', meta)
#!/usr/bin/env python
"""Convert an HDF5 msmbuilder dataset into per-trajectory arrays saved
with ``save_trajs`` under a target directory."""
from msmbuilder.dataset import dataset
from msmbuilder.io import save_trajs, load_meta
import argparse

parser = argparse.ArgumentParser(
    prog='dataset_h5_to_npy_dir.py',
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description='''version1''')
parser.add_argument("dataset", help="""An HDF5 dataset""", type=str)
parser.add_argument("meta", help="A metadata pickl file", type=str)
# BUG FIX: a positional argument needs nargs='?' for its default to take
# effect; previously default='trajs' was dead and the argument required.
parser.add_argument("trajs", help="The folder in which to store the trajs",
                    type=str, nargs='?', default='trajs')

if __name__ == '__main__':
    args = parser.parse_args()
    meta = load_meta(args.meta)
    ds = dataset(args.dataset)
    # Materialize the dataset into a plain dict keyed like the metadata.
    trajs = dict(ds.items())
    save_trajs(trajs, args.trajs, meta)
if __name__ == '__main__':
    # Load metadata and preloaded topologies.
    meta = load_meta()
    tops = preload_tops(meta)

    # Select featurizer: raw Cartesian positions relative to a reference.
    feature_name = 'Positions'
    reference = md.load('topology.pdb')
    featurizer = RawPositionsFeaturizer(ref_traj=reference)

    # One argument tuple per metadata row; featurizer and topologies are
    # broadcast to every row.
    n_rows = meta.shape[0]
    args = zip(meta.iterrows(), [featurizer] * n_rows, [tops] * n_rows)

    # Featurize in parallel.
    with Pool() as pool:
        feature_trajs = dict(pool.imap_unordered(feat, args))

    # Box-plot the unscaled features (every 100th frame keeps it light).
    ftrajs = np.concatenate([fx[::100] for fx in feature_trajs.values()])
    fig, ax = plt.subplots(figsize=(15, 5))
    plot_box(ax, fxx=ftrajs, feature_name='Unscaled {}'.format(feature_name))
    fig.tight_layout()
    fig.savefig("Unscaled-{}-box.pdf".format(feature_name))

    ## Save features and the featurizer.
    save_trajs(feature_trajs, 'Unscaled-{}-ftraj'.format(feature_name), meta)
    save_generic(featurizer,
                 'Unscaled-{}-featurizer.pickl'.format(feature_name))
"""Make a microstate MSM {{header}} """ from msmbuilder.io import load_trajs, save_trajs, save_generic from msmbuilder.msm import MarkovStateModel ## Load meta, ktrajs = load_trajs('ktrajs') ## Fit msm = MarkovStateModel(lag_time=2, n_timescales=10, verbose=False) msm.fit(list(ktrajs.values())) ## Transform microktrajs = {} for k, v in ktrajs.items(): microktrajs[k] = msm.partial_transform(v) ## Save print(msm.summarize()) save_generic(msm, 'msm.pickl') save_trajs(microktrajs, 'microktrajs', meta)
"""Reduce dimensionality with tICA msmbuilder autogenerated template version 2 created 2017-05-23T16:38:49.125259 please cite msmbuilder in any publications """ from msmbuilder.io import load_trajs, save_trajs, save_generic from msmbuilder.decomposition import tICA ## Load tica = tICA(n_components=5, lag_time=10, kinetic_mapping=True) meta, ftrajs = load_trajs("ftrajs") ## Fit tica.fit(ftrajs.values()) ## Transform ttrajs = {} for k, v in ftrajs.items(): ttrajs[k] = tica.partial_transform(v) ## Save save_trajs(ttrajs, 'ttrajs', meta) save_generic(tica, 'tica.pickl')
    # NOTE(review): fragment — this chunk opens inside the
    # ``alpha_carbon_number`` array literal; the opening bracket, earlier
    # entries, and the imports (md, contextlib, Pool, tops, meta,
    # AtomPairsFeaturizer, save_trajs) are outside this view.
    720, 736, 748, 767, 783, 804, 814, 825, 840, 850, 870, 889, 910, 927,
    941, 948, 969, 980, 994, 1004, 1019, 1035, 1054, 1061, 1085, 1099,
    1109, 1133, 1153, 1172, 1189, 1202, 1214, 1226, 1233, 1250, 1266,
    1290, 1302, 1324, 1335, 1349, 1373, 1395, 1416, 1432, 1444, 1455,
    1469, 1483, 1502, 1516, 1530, 1547, 1571, 1593, 1603, 1622, 1634,
    1658, 1672, 1682, 1697, 1713, 1730, 1746, 1761, 1777, 1793, 1807,
    1827, 1849, 1871, 1885, 1892, 1909, 1933, 1953, 1969, 1983, 2003,
    2022, 2036, 2053, 2074, 2086, 2102, 2126, 2138, 2153, 2167, 2174,
    2189, 2210, 2234, 2255, 2266, 2283, 2290, 2310, 2327, 2338
])
num = len(alpha_carbon_number)

# Build every unordered pair of alpha-carbon atom indices.
atompair = []
for i in range(num):
    for j in range(i + 1, num):
        atompair += [[alpha_carbon_number[i], alpha_carbon_number[j]]]

dist_feat = AtomPairsFeaturizer(pair_indices=atompair)


## Distance featurizer: one metadata row in, (index, pair distances) out.
def feat2(irow):
    i, row = irow
    traj = md.load(row['traj_fn'], top=tops[row['top_fn']])
    feat_traj = dist_feat.partial_transform(traj)
    return i, feat_traj


# contextlib.closing ensures the pool is terminated even on error.
with contextlib.closing(Pool(processes=32)) as pool:
    dist_trajs = dict(pool.imap_unordered(feat2, meta.iterrows()))
save_trajs(dist_trajs, 'alpha_carbon', meta)