Пример #1
0
def featurize(featurizer, meta_data):
    """Featurize every trajectory listed in ``meta_data`` and save the results.

    Parameters
    ----------
    featurizer
        Object exposing ``partial_transform(traj)``. It is applied to each
        loaded trajectory and pickled to ``'featurizer.pickl'`` afterwards.
    meta_data
        Metadata table providing ``iterrows()``; each row must carry the
        ``'traj_fn'`` and ``'top_fn'`` entries.

    Returns
    -------
    dict
        Maps each row index of ``meta_data`` to its featurized trajectory.
    """
    # The metadata comes from the argument, not from a module-level ``meta``.
    tops = preload_tops(meta_data)

    def feat(irow):
        i, row = irow
        traj = md.load(row['traj_fn'], top=tops[row['top_fn']])
        return i, featurizer.partial_transform(traj)

    feature_trajs = dict(map(feat, meta_data.iterrows()))

    save_trajs(feature_trajs, 'ftrajs', meta_data)
    save_generic(featurizer, 'featurizer.pickl')

    return feature_trajs
Пример #2
0
        # Run the featurization in a worker pool. imap_unordered yields the
        # results in arbitrary order as pairs, which dict() collects.
        with Pool() as pool:
            feature_trajs = dict(pool.imap_unordered(msmb_feat, args))

        # # Create save objects
        # featurizer = dict([(x[0], x[2]) for x in feature_trajs])
        # feature_trajs = dict([(x[0], x[1]) for x in feature_trajs])

        # Fit a VarianceThreshold selector (default arguments) on all feature
        # trajectories, then apply it to each trajectory in turn.
        selector = VarianceThreshold()
        selector.fit([traj for traj in feature_trajs.values()])
        ftrajs = {}
        for k, v in feature_trajs.items():
            # transform() is given a one-element list; squeeze the result.
            ftrajs[k] = np.squeeze(selector.transform([v]))

        # SAVE
        # NOTE(review): `name` and `feat` are bound outside this excerpt,
        # presumably by an enclosing loop like the one below - confirm.
        save_trajs(ftrajs, 'featurized_trajectories/{}-ftraj'.format(name), meta)
        save_generic(feat, 'featurized_trajectories/{}-featurizer.pickl'.format(name))


    # pyEMMA FEATURIZERS
    # Each entry is (output name, featurizer method name, indices).
    featurizers = [('angles', 'add_angles', angles),
                   ('dihedrals', 'add_dihedrals', dihedrals)]

    for name, feat, indices in featurizers:
        print('Featurizing {}'.format(name))
        # One argument tuple per metadata row: (row, feat, tops, indices).
        args = zip(meta.iterrows(), [feat] * meta.shape[0], [tops] * meta.shape[0],
                   [indices]*meta.shape[0])

        # Fit features
        with Pool() as pool:
            feature_trajs = dict(pool.imap_unordered(pyemma_feat, args))
Пример #3
0
"""Make a microstate MSM

msmbuilder autogenerated template version 2
created 2017-05-23T16:38:49.116944
please cite msmbuilder in any publications
"""

from msmbuilder.io import load_trajs, save_trajs, save_generic
from msmbuilder.msm import MarkovStateModel

## Load the clustered trajectories together with their metadata
meta, ktrajs = load_trajs('ktrajs')

## Fit one model on all trajectories
msm = MarkovStateModel(lag_time=2, n_timescales=10, verbose=False)
msm.fit(list(ktrajs.values()))

## Transform each trajectory with the fitted model
microktrajs = {
    key: msm.partial_transform(ktraj)
    for key, ktraj in ktrajs.items()
}

## Report, then persist the model and the transformed trajectories
print(msm.summarize())
save_generic(msm, 'msm.pickl')
save_trajs(microktrajs, 'microktrajs', meta)
Пример #4
0
from utilities import plot_box

if __name__ == '__main__':

    # Load the feature trajectories and their metadata.
    meta, feature_trajs = load_trajs('ftraj')

    # Select scaler
    featurizer = RobustScaler()

    # Fit once on every trajectory. Only the fitted state is needed here,
    # so fit() is used rather than fit_transform(), whose output would be
    # thrown away.
    featurizer.fit(list(feature_trajs.values()))
    scaled_trajs = {
        k: featurizer.partial_transform(v)
        for k, v in feature_trajs.items()
    }

    # Plot the sorted per-feature variance of a random subsample of rows.
    sample = np.concatenate(list(scaled_trajs.values()))
    # Never request more rows than exist: sampling without replacement
    # raises ValueError when fewer than 1000 rows are available.
    n_sample = min(1000, sample.shape[0])
    sample = sample[np.random.choice(sample.shape[0], n_sample, replace=False), :]
    ord_var = np.sort(sample.var(axis=0))
    fig, ax = plt.subplots()
    ax.plot(range(ord_var.shape[0]), ord_var)
    fig.savefig('ScaledFeatureVariance.png')

    # Save the scaled trajectories and the fitted scaler.
    save_trajs(scaled_trajs, 'straj', meta)
    save_generic(featurizer, 'scaler.pickl')
Пример #5
0
"""Check for abnormally high rmsd values to a reference structure

{{header}}

Meta
----
depends:
  - meta.pandas.pickl
  - trajs
  - top.pdb

"""

import mdtraj as md

from msmbuilder.io import load_meta, itertrajs, save_trajs

## Load the reference structure and the trajectory metadata
ref = md.load("top.pdb")
meta = load_meta()

## Compute the RMSD of every trajectory to the reference, then save
rmsds = {}
for key, traj in itertrajs(meta):
    rmsds[key] = md.rmsd(traj, ref)
save_trajs(rmsds, 'rmsds', meta)
Пример #6
0
        # Bar-plot the first m columns of the left eigenvectors, one subplot
        # per column, with one bar series per PCCA label j.
        vec = msm.left_eigenvectors_
        n_states = vec.shape[
            0]  # may be less than 200 as T may be non-ergodic.
        # NOTE(review): axes[i] below requires an array of axes; with
        # nrows=1 plt.subplots returns a single Axes by default - confirm
        # that m > 1 always holds.
        fig, axes = plt.subplots(nrows=m, sharex=True)
        for i in range(m):
            for j in range(m):
                # Select the states whose PCCA mapping equals j.
                mask = pcca_mapping == j
                axes[i].bar(np.arange(n_states)[mask],
                            vec[mask, i],
                            label='PCCA State {}'.format(j),
                            align='center')
            axes[i].yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
            axes[i].legend()
            axes[i].set_ylabel('Cluster projection')

        plt.xlabel('Cluster')
        plt.savefig('figures/rmsd_msm_left_eigenvectors-pcca.png',
                    transparent=True)

    # Transforms:
    # Apply the fitted MSM and PCCA models to every cluster trajectory
    # (squeezed first), using mode='fill' for both.
    msm_traj = {}
    pcca_traj = {}
    for k, v in ctraj_dict.items():
        print(k)
        msm_traj[k] = msm.partial_transform(np.squeeze(v), mode='fill')
        pcca_traj[k] = pcca.partial_transform(np.squeeze(v), mode='fill')

    # Persist the transformed trajectories and both models.
    save_trajs(msm_traj, 'msm-traj-200', meta)
    save_generic(msm, 'msm-200.pickl')
    save_trajs(pcca_traj, 'pcca-2-traj', meta)
    save_generic(pcca, 'pcca-2.pickl')
Пример #7
0
{{header}}

Meta
----
depends:
  - meta.pandas.pickl
  - trajs
  - top.pdb
"""
import mdtraj as md

from msmbuilder.io import load_meta, itertrajs, save_trajs, preload_top

## Load the metadata and the centroid structures
meta = load_meta()
centroids = md.load("centroids.xtc", top=preload_top(meta))

## Gaussian kernel applied to the RMSD features
SIGMA = 0.3  # nm
from msmbuilder.featurizer import RMSDFeaturizer
import numpy as np

featurizer = RMSDFeaturizer(centroids)
two_sigma_sq = 2 * (SIGMA**2)

lfeats = {}
for key, traj in itertrajs(meta):
    rmsd_feat = featurizer.partial_transform(traj)
    # exp(-d^2 / (2 * sigma^2)) of each RMSD feature value
    lfeats[key] = np.exp(-rmsd_feat**2 / two_sigma_sq)
save_trajs(lfeats, 'ftrajs', meta)
Пример #8
0
please cite msmbuilder in any publications


"""
import mdtraj as md

from msmbuilder.featurizer import DihedralFeaturizer
from msmbuilder.io import load_meta, preload_tops, save_trajs, save_generic
from multiprocessing import Pool

## Load
# Kept at module level on purpose: feat() reads `tops` and `dihed_feat` as
# globals, so worker processes that re-import this module need them too.
meta = load_meta()
tops = preload_tops(meta)
dihed_feat = DihedralFeaturizer()


## Featurize logic
def feat(irow):
    """Load one trajectory and compute its dihedral features.

    Parameters
    ----------
    irow : tuple
        ``(index, row)`` pair as produced by ``meta.iterrows()``; ``row``
        must carry the ``'traj_fn'`` and ``'top_fn'`` entries.

    Returns
    -------
    tuple
        ``(index, feat_traj)`` so the caller can rebuild a dict even though
        results arrive out of order.
    """
    i, row = irow
    traj = md.load(row['traj_fn'], top=tops[row['top_fn']])
    feat_traj = dihed_feat.partial_transform(traj)
    return i, feat_traj


## Do it in parallel
# The pool must only be created when run as the main program: under the
# "spawn" start method every worker re-imports this module, and an unguarded
# Pool() would then try to start workers from inside a worker.
if __name__ == '__main__':
    with Pool() as pool:
        dihed_trajs = dict(pool.imap_unordered(feat, meta.iterrows()))

    ## Save
    save_trajs(dihed_trajs, 'ftrajs', meta)
    save_generic(dihed_feat, 'featurizer.pickl')
Пример #9
0
from msmbuilder.io import load_trajs, save_trajs
import numpy as np
from multiprocessing import Pool
import matplotlib
matplotlib.use('Agg')
from matplotlib.pylab import plt
import sys
import seaborn as sns
from sklearn.neighbors.kde import KernelDensity
from scipy.signal import argrelextrema

# Don't prune these:
# They are loaded from featurized_trajectories/ and written unchanged to
# pruned_trajectories/.
for feature in ['angles', 'bonds', 'contacts']:
    meta, ftraj = load_trajs(
        'featurized_trajectories/{}-ftraj'.format(feature))
    save_trajs(ftraj, 'pruned_trajectories/{}-ftraj'.format(feature), meta)

# Prune these:

for feature in ['dihedrals']:
    meta, ftraj_dict = load_trajs(
        'featurized_trajectories/{}-ftraj'.format(feature))
    # Stack all trajectories into one array along the first axis.
    ftraj = np.concatenate([traj for traj in ftraj_dict.values()])
    # Every second column index starting at 0, as a column vector
    # (presumably the cosine components - TODO confirm against the
    # featurizer's column layout).
    cos_idx = np.arange(0, ftraj.shape[1] - 1, 2).reshape(-1, 1)
    variance = ftraj[:, cos_idx].var(axis=0).reshape(-1, 1)

    # Do KDE and split the data
    num_splits = 3
    bandwidths = np.linspace(.01, .10, num=100)
    x = np.linspace(0, .5, 1000).reshape(-1, 1)
    # NOTE(review): the body of this bandwidth loop is not part of this
    # excerpt.
    for bw in bandwidths:
            return i, traj


        # Load every trajectory listed in the metadata into a dict.
        traj_dict = dict(map(traj_load, meta.iterrows()))
        all_trajs = [traj for traj in traj_dict.values()]

        # Cluster into 200 states; the landmark count scales with
        # `totframes`, which is defined outside this excerpt.
        cluster = LandmarkAgglomerative(n_clusters=200, n_landmarks=int(totframes /200), linkage='ward', metric='rmsd')
        cluster.fit(all_trajs)
        # TODO will this work?
        # Each task carries the fitted cluster object along with its
        # trajectory, so it is sent to the workers once per task.
        args = [(k,v,cluster) for k, v in traj_dict.items()]

        with Pool() as pool:
            all_ctrajs_dict = dict(pool.imap_unordered(clust, args))

        save_generic(cluster, 'cluster-200')
        save_trajs(all_ctrajs_dict, 'ctraj-200', meta)

    # Two trajectory sets: those whose first dimension exceeds 1000, and all.
    long_ctrajs = [np.squeeze(traj) for traj in all_ctrajs_dict.values() if traj.shape[0] > 1000]
    all_ctrajs = [np.squeeze(traj) for traj in all_ctrajs_dict.values()]

    # Lag times: 200..800 in steps of 200, then 1000..4500 in steps of 500.
    lags = np.concatenate((np.arange(200, 1000, 200),np.arange(1000, 5000, 500)))
    all_msms = []

    for lag in lags:
        print('Fitting lag {}'.format(lag))
        # Lags strictly above 1000 use only the long trajectories.
        if lag > 1000:
            trajs = long_ctrajs
        else:
            trajs = all_ctrajs

        msm = MarkovStateModel(lag_time=int(lag), n_timescales=100)
Пример #11
0
"""Reduce dimensionality with tICA

{{header}}
Meta
----
depends:
  - ftrajs
  - meta.pandas.pickl
"""

from msmbuilder.io import load_trajs, save_trajs, save_generic
from msmbuilder.decomposition import tICA

## Load the feature trajectories and set up the tICA model
meta, ftrajs = load_trajs("ftrajs")
tica = tICA(n_components=5, lag_time=10, kinetic_mapping=True)

## Fit on all trajectories
tica.fit(ftrajs.values())

## Transform each trajectory with the fitted model
ttrajs = {
    key: tica.partial_transform(ftraj)
    for key, ftraj in ftrajs.items()
}

## Save the transformed trajectories and the model
save_trajs(ttrajs, "ttrajs", meta)
save_generic(tica, "tica.pickl")
Пример #12
0
# Featurize logic
def feat(irow):
    """Compute two dihedral angles over fixed CA atoms for one trajectory.

    ``irow`` is an ``(index, row)`` pair; ``row`` supplies the ``'traj_fn'``
    and ``'top_fn'`` entries. Returns ``(index, dihedrals)``.
    """
    i, row = irow
    print('Loading traj {}'.format(row['traj_fn']))
    traj = md.load(row['traj_fn'], top=tops[row['top_fn']])
    top = traj.topology

    def first_ca(res):
        # The selection is made with res - 1; the first match is used.
        return top.select('resid {} and name CA'.format(res - 1))[0]

    ctni_atoms = [first_ca(res) for res in (292, 329, 337, 383)]
    ctnt_atoms = [first_ca(res) for res in (162, 184, 185, 237)]

    # One row of four atom indices per dihedral.
    quartets = np.array([ctni_atoms, ctnt_atoms])
    return i, md.compute_dihedrals(traj, quartets)


if __name__ == '__main__':
    # feat() reads `tops` as a module global.
    # NOTE(review): it is only bound under this guard, so workers presumably
    # rely on inheriting it from the parent (fork start method) - confirm.
    meta = load_meta()
    tops = preload_tops(meta)

    # Featurize in parallel; the (index, dihedrals) pairs must be consumed
    # while the pool is still open.
    with Pool() as pool:
        results = pool.imap_unordered(feat, meta.iterrows())
        dtrajs = dict(results)
    save_trajs(dtrajs, 'dtrajs', meta)
Пример #13
0
{{header}}

Meta
----
depends:
  - meta.pandas.pickl
  - trajs
  - top.pdb
"""
import mdtraj as md

from msmbuilder.io import load_meta, itertrajs, save_trajs, preload_top

## Load the metadata and the centroid structures
meta = load_meta()
centroids = md.load("centroids.xtc", top=preload_top(meta))

## Gaussian kernel applied to the RMSD features
SIGMA = 0.3  # nm
from msmbuilder.featurizer import RMSDFeaturizer
import numpy as np

featurizer = RMSDFeaturizer(centroids)
two_sigma_sq = 2 * (SIGMA ** 2)

lfeats = {}
for key, traj in itertrajs(meta):
    rmsd_feat = featurizer.partial_transform(traj)
    # exp(-d^2 / (2 * sigma^2)) of each RMSD feature value
    lfeats[key] = np.exp(-rmsd_feat ** 2 / two_sigma_sq)
save_trajs(lfeats, 'ftrajs', meta)
Пример #14
0
  - trajs
  - top.pdb
"""
import mdtraj as md

from msmbuilder.featurizer import DihedralFeaturizer
from msmbuilder.io import load_meta, preload_tops, save_trajs, save_generic
from multiprocessing import Pool

## Load
# Kept at module level on purpose: feat() reads `tops` and `dihed_feat` as
# globals, so worker processes that re-import this module need them too.
meta = load_meta()
tops = preload_tops(meta)
dihed_feat = DihedralFeaturizer()


## Featurize logic
def feat(irow):
    """Load one trajectory and compute its dihedral features.

    Parameters
    ----------
    irow : tuple
        ``(index, row)`` pair as produced by ``meta.iterrows()``; ``row``
        must carry the ``'traj_fn'`` and ``'top_fn'`` entries.

    Returns
    -------
    tuple
        ``(index, feat_traj)`` so the caller can rebuild a dict even though
        results arrive out of order.
    """
    i, row = irow
    traj = md.load(row['traj_fn'], top=tops[row['top_fn']])
    feat_traj = dihed_feat.partial_transform(traj)
    return i, feat_traj


## Do it in parallel
# The pool must only be created when run as the main program: under the
# "spawn" start method every worker re-imports this module, and an unguarded
# Pool() would then try to start workers from inside a worker.
if __name__ == '__main__':
    with Pool() as pool:
        dihed_trajs = dict(pool.imap_unordered(feat, meta.iterrows()))

    ## Save
    save_trajs(dihed_trajs, 'ftrajs', meta)
    save_generic(dihed_feat, 'featurizer.pickl')
Пример #15
0
"""Cluster tICA results

{{header}}

Meta
----
depends:
 - ttrajs
 - meta.pandas.pickl
"""
from msmbuilder.io import load_trajs, save_trajs, save_generic
from msmbuilder.cluster import MiniBatchKMeans

## Load the tICA-transformed trajectories and their metadata
meta, ttrajs = load_trajs('ttrajs')

## Fit on the first `dim` columns of every trajectory
dim = 5
kmeans = MiniBatchKMeans(n_clusters=500)
truncated = {key: ttraj[:, :dim] for key, ttraj in ttrajs.items()}
kmeans.fit(list(truncated.values()))

## Transform each truncated trajectory with the fitted model
ktrajs = {
    key: kmeans.partial_transform(ttraj)
    for key, ttraj in truncated.items()
}

## Report, then save the transformed trajectories and the model
print(kmeans.summarize())
save_trajs(ktrajs, 'ktrajs', meta)
save_generic(kmeans, 'kmeans.pickl')
    # Grid-search the model over param_grid with the supplied CV splits.
    clf = GridSearchCV(estimator=model,
                       param_grid=param_grid,
                       cv=cv_iter,
                       n_jobs=2)
    clf.fit(X)

    # Keep the full cross-validation table on disk.
    cv_results = clf.cv_results_
    results = pd.DataFrame(cv_results)
    save_generic(results, '{}-grid-search-results.pickl'.format(feature_name))

    # Report the best parameters, then mean score (+/- two std) per setting.
    print("Best parameters set found on development set:")
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = cv_results['mean_test_score']
    stds = cv_results['std_test_score']
    for mean, std, params in zip(means, stds, cv_results['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

    # Transform every feature trajectory with the best estimator found.
    tica = clf.best_estimator_
    ttrajs = {
        key: tica.partial_transform(ftraj)
        for key, ftraj in ftrajs.items()
    }

    # Save the transformed trajectories and the chosen estimator.
    save_trajs(ttrajs, '{}-ttrajs'.format(feature_name), meta)
    save_generic(tica, '{}-tica.pickl'.format(feature_name))
Пример #17
0
# cluster
# Fit a landmark agglomerative clustering (ward linkage, rmsd metric) with
# 20 clusters; the landmark count scales with `totframes`, which is defined
# outside this excerpt.
print('Attempting to cluster')
num_clusters = 20
cluster = LandmarkAgglomerative(n_clusters=num_clusters,
                                n_landmarks=int(totframes / 100),
                                linkage='ward',
                                metric='rmsd')
cluster.fit(trajs)

#
# print('Fitting cluster labels')
# ctraj = {}
# for k, v in traj_dict.items():
#     v = cluster.partial_predict(v)
#     diff = nframes-v.shape[0]
#     v = np.append(v, np.zeros(diff)-1)
#     ctraj[k] = v

# Convert to DF for plotting and sampling.
# df = to_dataframe(ctraj, nframes, dt=1)

# Predict cluster labels for every trajectory in traj_dict.
print('Fitting cluster labels for MSM')
ctraj = {}
for k, v in traj_dict.items():
    ctraj[k] = cluster.partial_predict(v)

# Save dataframe
# NOTE(review): within this excerpt `df` is only assigned in the
# commented-out code above; unless it is defined earlier in the file this
# call raises NameError - confirm.
save_generic(df, 'clusters/rmsd_cluster_trajectory.pickl')
# NOTE(review): the cluster-label trajectories are written under the name
# 'ftraj' - confirm this is the intended output name.
save_trajs(ctraj, 'ftraj', meta)
Пример #18
0
#!/usr/bin/env python
from msmbuilder.dataset import dataset
from msmbuilder.io import save_trajs, load_meta
import argparse
# Command-line interface: two required inputs and an optional output folder.
parser = argparse.ArgumentParser(
    prog='dataset_h5_to_npy_dir.py',
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description='''version1''')

parser.add_argument("dataset", help="""An HDF5 dataset""", type=str)
parser.add_argument("meta", help="A metadata pickl file", type=str)
# nargs='?' is what makes this positional optional; without it argparse
# treats the argument as required and never applies `default`.
parser.add_argument("trajs",
                    nargs='?',
                    help="The folder in which to store the trajs",
                    type=str,
                    default='trajs')

if __name__ == '__main__':
    args = parser.parse_args()
    meta = load_meta(args.meta)
    ds = dataset(args.dataset)
    # Copy every (key, value) entry of the dataset into a plain dict.
    trajs = dict(ds.items())
    save_trajs(trajs, args.trajs, meta)
Пример #19
0
if __name__ == '__main__':

    # Load the metadata and the topologies it refers to.
    meta = load_meta()
    tops = preload_tops(meta)

    # Raw-positions featurizer built with a reference structure.
    feature_name = 'Positions'
    reference = md.load('topology.pdb')
    featurizer = RawPositionsFeaturizer(ref_traj=reference)

    # One (row, featurizer, tops) tuple per metadata row.
    n_rows = meta.shape[0]
    args = zip(meta.iterrows(), [featurizer] * n_rows, [tops] * n_rows)

    # Featurize in parallel and collect the results by key.
    with Pool() as pool:
        feature_trajs = dict(pool.imap_unordered(feat, args))

    # Box-plot the unscaled features, using every 100th row of each
    # trajectory.
    thinned = [fx[::100] for fx in feature_trajs.values()]
    ftrajs = np.concatenate(thinned)
    fig, ax = plt.subplots(figsize=(15, 5))
    plot_box(ax, fxx=ftrajs, feature_name='Unscaled {}'.format(feature_name))
    fig.tight_layout()
    fig.savefig("Unscaled-{}-box.pdf".format(feature_name))

    # Save the feature trajectories and the featurizer.
    save_trajs(feature_trajs, 'Unscaled-{}-ftraj'.format(feature_name), meta)
    save_generic(featurizer,
                 'Unscaled-{}-featurizer.pickl'.format(feature_name))
Пример #20
0
"""Make a microstate MSM

{{header}}
"""

from msmbuilder.io import load_trajs, save_trajs, save_generic
from msmbuilder.msm import MarkovStateModel

## Load the clustered trajectories together with their metadata
meta, ktrajs = load_trajs('ktrajs')

## Fit one model on all trajectories
msm = MarkovStateModel(lag_time=2, n_timescales=10, verbose=False)
msm.fit(list(ktrajs.values()))

## Transform each trajectory with the fitted model
microktrajs = {
    key: msm.partial_transform(ktraj)
    for key, ktraj in ktrajs.items()
}

## Report, then persist the model and the transformed trajectories
print(msm.summarize())
save_generic(msm, 'msm.pickl')
save_trajs(microktrajs, 'microktrajs', meta)
Пример #21
0
"""Reduce dimensionality with tICA

msmbuilder autogenerated template version 2
created 2017-05-23T16:38:49.125259
please cite msmbuilder in any publications

"""

from msmbuilder.io import load_trajs, save_trajs, save_generic
from msmbuilder.decomposition import tICA

## Load the feature trajectories and set up the tICA model
meta, ftrajs = load_trajs("ftrajs")
tica = tICA(n_components=5, lag_time=10, kinetic_mapping=True)

## Fit on all trajectories
tica.fit(ftrajs.values())

## Transform each trajectory with the fitted model
ttrajs = {
    key: tica.partial_transform(ftraj)
    for key, ftraj in ftrajs.items()
}

## Save the transformed trajectories and the model
save_trajs(ttrajs, 'ttrajs', meta)
save_generic(tica, 'tica.pickl')
Пример #22
0
    720, 736, 748, 767, 783, 804, 814, 825, 840, 850, 870, 889, 910, 927, 941,
    948, 969, 980, 994, 1004, 1019, 1035, 1054, 1061, 1085, 1099, 1109, 1133,
    1153, 1172, 1189, 1202, 1214, 1226, 1233, 1250, 1266, 1290, 1302, 1324,
    1335, 1349, 1373, 1395, 1416, 1432, 1444, 1455, 1469, 1483, 1502, 1516,
    1530, 1547, 1571, 1593, 1603, 1622, 1634, 1658, 1672, 1682, 1697, 1713,
    1730, 1746, 1761, 1777, 1793, 1807, 1827, 1849, 1871, 1885, 1892, 1909,
    1933, 1953, 1969, 1983, 2003, 2022, 2036, 2053, 2074, 2086, 2102, 2126,
    2138, 2153, 2167, 2174, 2189, 2210, 2234, 2255, 2266, 2283, 2290, 2310,
    2327, 2338
])
num = len(alpha_carbon_number)

# Every unordered pair (i < j) of the listed atom numbers, in row-major order.
atompair = [
    [alpha_carbon_number[i], alpha_carbon_number[j]]
    for i in range(num)
    for j in range(i + 1, num)
]
dist_feat = AtomPairsFeaturizer(pair_indices=atompair)  ## Distance featurizer


def feat2(irow):
    """Load one trajectory and featurize it with ``dist_feat``.

    ``irow`` is an ``(index, row)`` pair; ``row`` supplies the ``'traj_fn'``
    and ``'top_fn'`` entries. Returns ``(index, features)``.
    """
    index, row = irow
    traj_path = row['traj_fn']
    topology = tops[row['top_fn']]
    traj = md.load(traj_path, top=topology)
    return index, dist_feat.partial_transform(traj)


# Only start the pool when run as the main program: under the "spawn" start
# method every worker re-imports this module, and an unguarded Pool() would
# then try to start workers from inside a worker.
if __name__ == '__main__':
    # closing() calls pool.close() on exit; every result has already been
    # consumed by dict() at that point.
    with contextlib.closing(Pool(processes=32)) as pool:
        dist_trajs = dict(pool.imap_unordered(feat2, meta.iterrows()))

    save_trajs(dist_trajs, 'alpha_carbon', meta)