예제 #1
0
def featurize(featurizer, meta_data):
    """Apply *featurizer* to every trajectory listed in *meta_data*.

    Parameters
    ----------
    featurizer : object with a ``partial_transform(traj)`` method.
    meta_data : pandas.DataFrame of trajectory metadata with 'traj_fn'
        and 'top_fn' columns (one row per trajectory).

    Returns
    -------
    dict mapping each metadata index to its feature array. The result is
    also saved to disk ('ftrajs') together with the featurizer pickle.
    """
    # BUG FIX: the original body referenced an undefined global `meta`
    # instead of the `meta_data` parameter.
    tops = preload_tops(meta_data)

    def feat(irow):
        # irow is an (index, row) pair from DataFrame.iterrows().
        i, row = irow
        traj = md.load(row['traj_fn'], top=tops[row['top_fn']])
        feat_traj = featurizer.partial_transform(traj)
        return i, feat_traj

    feature_trajs = dict(map(feat, meta_data.iterrows()))

    save_trajs(feature_trajs, 'ftrajs', meta_data)
    save_generic(featurizer, 'featurizer.pickl')

    return feature_trajs
예제 #2
0
def sample_tica_dim(dim=0, n_frames=200, meta=None, ttrajs=None):
    """Sample frames along a single tICA dimension and save them as a trajectory.

    Parameters
    ----------
    dim : int, 0-based tICA dimension to sample along.
    n_frames : int, number of frames to draw.
    meta : pandas.DataFrame of trajectory metadata (required).
    ttrajs : dict of tICA-transformed trajectories keyed like `meta` (required).

    Raises
    ------
    ValueError
        If `meta` or `ttrajs` is not supplied.
    """
    ## Validate inputs.
    # FIX: the original test `(not meta is None) & (not ttrajs is None)` used
    # a non-short-circuiting bitwise & between booleans; rewritten as a
    # guard clause with the idiomatic `is None` checks.
    if meta is None or ttrajs is None:
        raise ValueError('Specify meta data and trajectory objects')

    ## Sample
    # These are apparently ordered according tica value
    inds = sample_dimension(ttrajs,
                            dimension=dim,
                            n_frames=n_frames,
                            scheme='random')

    save_generic(inds, "tica-dimension-{}-inds.pickl".format(dim + 1))

    ## Get tica components, rescaled to the range [0, 10]
    tica_values = np.array(
        [ttrajs[traj_i][frame_i][dim] for traj_i, frame_i in inds])
    tica_values = (tica_values - tica_values.min()) / (tica_values.max() -
                                                       tica_values.min())
    tica_values *= 10

    ## Make trajectory
    top = preload_top(meta)

    # Use loc because sample_dimension is nice
    traj = md.join(
        md.load_frame(meta.loc[traj_i]['traj_fn'], index=frame_i, top=top)
        for traj_i, frame_i in inds)

    ## Superpose

    ## Save
    traj_fn = "tica-dimension-{}.dcd".format(dim + 1)
    backup(traj_fn)
    traj.save(traj_fn)
예제 #3
0
if __name__ == '__main__':

    # Load trajectory metadata and pre-load each topology file once.
    meta = load_meta()
    tops = preload_tops(meta)

    # Select featurizer: raw Cartesian positions aligned to a reference.
    feature_name = 'Positions'
    reference = md.load('topology.pdb')
    featurizer = RawPositionsFeaturizer(ref_traj=reference)

    # One (row, featurizer, tops) argument tuple per trajectory row.
    args = zip(meta.iterrows(), [featurizer] * meta.shape[0],
               [tops] * meta.shape[0])

    # Do it in parallel
    # NOTE(review): `feat` is not defined in this file as shown — presumably
    # imported from a sibling module; confirm before running.
    with Pool() as pool:
        feature_trajs = dict(pool.imap_unordered(feat, args))

    # Plot unscaled features using every 100th frame of each trajectory.
    ftrajs = np.concatenate([fx[::100] for fx in feature_trajs.values()])
    fig, ax = plt.subplots(figsize=(15, 5))
    plot_box(ax, fxx=ftrajs, feature_name='Unscaled {}'.format(feature_name))
    fig.tight_layout()
    fig.savefig("Unscaled-{}-box.pdf".format(feature_name))

    ## Save feature trajectories and the featurizer itself
    save_trajs(feature_trajs, 'Unscaled-{}-ftraj'.format(feature_name), meta)
    save_generic(featurizer,
                 'Unscaled-{}-featurizer.pickl'.format(feature_name))
예제 #4
0
        # NOTE(review): this fragment sits inside an enclosing scope not
        # visible here; `msm`, `m`, `pcca_mapping`, `pcca`, `ctraj_dict`
        # and `meta` come from that outer context — verify before reuse.
        vec = msm.left_eigenvectors_
        n_states = vec.shape[
            0]  # may be less than 200 as T may be non-ergodic.
        # One subplot per eigenvector; bars show each microstate's component,
        # grouped/colored by its PCCA macrostate assignment.
        fig, axes = plt.subplots(nrows=m, sharex=True)
        for i in range(m):
            for j in range(m):
                mask = pcca_mapping == j
                axes[i].bar(np.arange(n_states)[mask],
                            vec[mask, i],
                            label='PCCA State {}'.format(j),
                            align='center')
            axes[i].yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
            axes[i].legend()
            axes[i].set_ylabel('Cluster projection')

        plt.xlabel('Cluster')
        plt.savefig('figures/rmsd_msm_left_eigenvectors-pcca.png',
                    transparent=True)

    # Transforms: map every cluster trajectory through the MSM and PCCA
    # models. mode='fill' — presumably pads entries for states the model
    # did not see; confirm against msmbuilder's partial_transform docs.
    msm_traj = {}
    pcca_traj = {}
    for k, v in ctraj_dict.items():
        print(k)
        msm_traj[k] = msm.partial_transform(np.squeeze(v), mode='fill')
        pcca_traj[k] = pcca.partial_transform(np.squeeze(v), mode='fill')

    save_trajs(msm_traj, 'msm-traj-200', meta)
    save_generic(msm, 'msm-200.pickl')
    save_trajs(pcca_traj, 'pcca-2-traj', meta)
    save_generic(pcca, 'pcca-2.pickl')
예제 #5
0
please cite msmbuilder in any publications


"""

import mdtraj as md

from msmbuilder.io import load_trajs, save_generic, preload_top, backup, load_generic
from msmbuilder.io.sampling import sample_msm

## Load fitted models and the tICA trajectories they were trained on
meta, ttrajs = load_trajs('ttrajs')
msm = load_generic('msm.pickl')
kmeans = load_generic('kmeans.pickl')

## Sample a 200-step random walk through the MSM
# Warning: make sure ttrajs and kmeans centers have
# the same number of dimensions
sampled_inds = sample_msm(ttrajs, kmeans.cluster_centers_, msm,
                          n_steps=200, stride=1)
save_generic(sampled_inds, "msm-traj-inds.pickl")

## Stitch the sampled (trajectory, frame) pairs into one mdtraj trajectory
top = preload_top(meta)
frames = (md.load_frame(meta.loc[ti]['traj_fn'], index=fi, top=top)
          for ti, fi in sampled_inds)
traj = md.join(frames)

## Save, moving any previous output out of the way first
traj_fn = "msm-traj.xtc"
backup(traj_fn)
traj.save(traj_fn)
예제 #6
0
  - trajs
"""

import mdtraj as md

from msmbuilder.io import load_trajs, save_generic, preload_top, backup, load_generic
from msmbuilder.io.sampling import sample_msm

## Load previously saved models and the tICA-space trajectories
meta, ttrajs = load_trajs('ttrajs')
msm = load_generic('msm.pickl')
kmeans = load_generic('kmeans.pickl')

## Sample a 200-step trajectory through MSM state space
# Warning: make sure ttrajs and kmeans centers have
# the same number of dimensions
inds = sample_msm(ttrajs, kmeans.cluster_centers_, msm, n_steps=200, stride=1)
save_generic(inds, "msm-traj-inds.pickl")

## Make trajectory: load each sampled frame and join into one trajectory
top = preload_top(meta)
traj = md.join(
    md.load_frame(meta.loc[traj_i]['traj_fn'], index=frame_i, top=top)
    for traj_i, frame_i in inds
)

## Save (back up any existing file of the same name first)
traj_fn = "msm-traj.xtc"
backup(traj_fn)
traj.save(traj_fn)
예제 #7
0
"""Cluster tICA results

{{header}}

Meta
----
depends:
 - ttrajs
 - meta.pandas.pickl
"""
from msmbuilder.io import load_trajs, save_trajs, save_generic
from msmbuilder.cluster import MiniBatchKMeans

## Load the tICA-transformed trajectories and their metadata
meta, ttrajs = load_trajs('ttrajs')

## Fit: cluster on the first `n_dims` tICA components only
n_dims = 5
kmeans = MiniBatchKMeans(n_clusters=500)
truncated = [ttraj[:, :n_dims] for ttraj in ttrajs.values()]
kmeans.fit(truncated)

## Transform each trajectory into cluster-label space, keeping keys
ktrajs = {key: kmeans.partial_transform(ttraj[:, :n_dims])
          for key, ttraj in ttrajs.items()}

## Save results
print(kmeans.summarize())
save_trajs(ktrajs, 'ktrajs', meta)
save_generic(kmeans, 'kmeans.pickl')
예제 #8
0
# NOTE(review): `sample`, `df`, `feature` and `num_clusters` are defined
# upstream of this visible chunk — verify their shapes before reuse.
# Scatter of cluster assignment over time, one panel per production run.
sample.sort_values(by=['Prod_ID', 'Site_ID', 'Time_ps'], inplace=True)
g = sns.FacetGrid(sample, col='Prod_ID',hue='Site_ID', col_wrap=10)
g.map(plt.scatter, 'Time_ps', 'Trajectory', alpha=0.5)
g.set(ylim=(-0.5,num_clusters))
g.fig.tight_layout()
plt.savefig('figures/{}_cluster_trajectory.pdf'.format(feature))

# Plot per-production histograms of cluster occupancy
g = sns.FacetGrid(sample, col='Prod_ID',hue='Site_ID', col_wrap=10)
g = g.map(plt.hist, 'Trajectory', bins=range(num_clusters), histtype='step', lw='5')
g.fig.tight_layout()
plt.savefig('figures/{}_cluster_hist.pdf'.format(feature))


# Save dataframe
save_generic(df, 'clusters/{}_cluster_trajectory.pickl'.format(feature))

# Sampling: pick at most 10 random clusters to sample frames from
to_plot = np.random.choice(range(num_clusters), min(10, num_clusters), replace=False)
# For each selected cluster, draw up to 100 member frames and join them
# into a single trajectory for inspection.
for i in to_plot:
    num_samples = 100
    # FIX: DataFrame.ix was deprecated and removed in pandas 1.0; use .loc.
    cluster_rows = df.loc[df['Trajectory'] == i, ['Key', 'Time_ps']]
    # Guard: DataFrame.sample raises if fewer rows than requested exist.
    df_smp = cluster_rows.sample(min(num_samples, len(cluster_rows)))
    inds = zip(df_smp['Key'], df_smp['Time_ps'])

    # Use loc because sample_dimension is nice
    traj = md.join(
        md.load_frame(meta.loc[traj_i]['traj_fn'], index=frame_i, top=meta.loc[traj_i]['top_fn'])
        for traj_i, frame_i in inds
    )

    # Original trajectories include both BT1 and BT2 so need to superpose
            traj = md.load(row['traj_fn'], top=tops[row['top_fn']])
            return i, traj


        traj_dict = dict(map(traj_load, meta.iterrows()))
        all_trajs = [traj for traj in traj_dict.values()]

        cluster = LandmarkAgglomerative(n_clusters=200, n_landmarks=int(totframes /200), linkage='ward', metric='rmsd')
        cluster.fit(all_trajs)
        # TODO will this work?
        args = [(k,v,cluster) for k, v in traj_dict.items()]

        with Pool() as pool:
            all_ctrajs_dict = dict(pool.imap_unordered(clust, args))

        save_generic(cluster, 'cluster-200')
        save_trajs(all_ctrajs_dict, 'ctraj-200', meta)

    long_ctrajs = [np.squeeze(traj) for traj in all_ctrajs_dict.values() if traj.shape[0] > 1000]
    all_ctrajs = [np.squeeze(traj) for traj in all_ctrajs_dict.values()]

    lags = np.concatenate((np.arange(200, 1000, 200),np.arange(1000, 5000, 500)))
    all_msms = []

    for lag in lags:
        print('Fitting lag {}'.format(lag))
        if lag > 1000:
            trajs = long_ctrajs
        else:
            trajs = all_ctrajs
예제 #10
0
    'cluster__n_clusters': scipy.stats.randint(low=200, high=200),
    'tica__n_components': scipy.stats.randint(low=2, high=40),
    'tica__lag_time': scipy.stats.randint(low=100, high=999)
}

pipe = Pipeline(estimators)
pipe.set_params(msm__lag_time=500)
pipe.set_params(msm__n_timescales=10)

if __name__ == "__main__":

    # 5-fold cross-validated grid search over the pipeline parameters,
    # three parallel workers.
    # NOTE(review): the `params` dict above draws 'cluster__n_clusters' from
    # randint(low=200, high=200), whose support is empty, and GridSearchCV
    # expects value lists rather than distributions — confirm intent.
    cvSearch = GridSearchCV(pipe, params, n_jobs=3, verbose=1, cv=5)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipe.steps])
    print("parameters:")
    print(params)
    t0 = time()
    # NOTE(review): `ftraj` is not defined in this visible chunk —
    # presumably loaded elsewhere; confirm before running.
    cvSearch.fit(ftraj)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % cvSearch.best_score_)
    print("Best parameters set:")
    best_parameters = cvSearch.best_estimator_.get_params()
    for param_name in sorted(params.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    # Persist the full cross-validation results table for later analysis.
    df = pd.DataFrame(cvSearch.cv_results_)
    save_generic(df, 'results/random_search.pickl')
예제 #11
0
def guestimate_stride():
    """Pick a stride so roughly 10 * n_clusters frames are used for fitting.

    Reads the module-level `meta` DataFrame and `kmed` clusterer.
    Returns an int stride >= 1.
    """
    total_data = meta['nframes'].sum()
    want = kmed.n_clusters * 10
    stride = max(1, total_data // want)
    print("Since we have", total_data, "frames, we're going to stride by",
          stride, "during fitting, because this is probably adequate for",
          kmed.n_clusters, "clusters")
    return stride


## Fit on strided data to keep memory usage bounded
kmed.fit([traj for _, traj in itertrajs(meta, stride=guestimate_stride())])
print(kmed.summarize())

## Save the fitted clusterer
save_generic(kmed, 'clusterer.pickl')


## Save centroids
def frame(traj_i, frame_i):
    """Load single frame `frame_i` of the trajectory in row `traj_i` of `meta`."""
    # Note: kmedoids does 0-based, contiguous integers so we use .iloc
    row = meta.iloc[traj_i]
    return md.load_frame(row['traj_fn'], frame_i, top=row['top_fn'])


# Extract each medoid frame and join into one trajectory; source
# trajectories may have differing topologies, so skip the topology check.
centroids = md.join((frame(ti, fi) for ti, fi in kmed.cluster_ids_),
                    check_topology=False)
centroids_fn = 'centroids.xtc'
backup(centroids_fn)
# FIX: save to the filename that was just backed up, instead of a second
# hard-coded literal that could silently drift out of sync.
centroids.save(centroids_fn)
예제 #12
0
"""

import mdtraj as md

from msmbuilder.io.sampling import sample_dimension
from msmbuilder.io import load_trajs, save_generic, preload_top, backup

## Load the tICA-space trajectories and metadata
meta, ttrajs = load_trajs('ttrajs')

## Sample 200 frames spread randomly along tICA dimension 0
inds = sample_dimension(ttrajs,
                        dimension=0,
                        n_frames=200, scheme='random')

save_generic(inds, "tica-dimension-0-inds.pickl")

## Make trajectory from the sampled (trajectory, frame) index pairs
top = preload_top(meta)

# Use loc because sample_dimension is nice
traj = md.join(
    md.load_frame(meta.loc[traj_i]['traj_fn'], index=frame_i, top=top)
    for traj_i, frame_i in inds
)

## Save (backing up any previous output of the same name)
traj_fn = "tica-dimension-0.xtc"
backup(traj_fn)
traj.save(traj_fn)
예제 #13
0
  - trajs
  - top.pdb
"""
import mdtraj as md

from msmbuilder.featurizer import DihedralFeaturizer
from msmbuilder.io import load_meta, preload_tops, save_trajs, save_generic
from multiprocessing import Pool

## Load metadata, pre-load topologies, and construct the dihedral featurizer
meta = load_meta()
tops = preload_tops(meta)
dihed_feat = DihedralFeaturizer()


## Featurize logic
def feat(irow):
    """Featurize one metadata row: return (index, dihedral feature array)."""
    index, meta_row = irow
    loaded = md.load(meta_row['traj_fn'], top=tops[meta_row['top_fn']])
    return index, dihed_feat.partial_transform(loaded)


## Featurize every trajectory in parallel worker processes
with Pool() as pool:
    dihed_trajs = dict(pool.imap_unordered(feat, meta.iterrows()))

## Save feature trajectories and the featurizer itself
save_trajs(dihed_trajs, 'ftrajs', meta)
save_generic(dihed_feat, 'featurizer.pickl')
예제 #14
0
def guestimate_stride():
    """Pick a stride so roughly 20 * n_clusters frames are used for fitting.

    Reads the module-level `meta` DataFrame and `kcen` clusterer.
    Returns an int stride >= 1.
    """
    total_data = meta['nframes'].sum()
    want = kcen.n_clusters * 20
    stride = max(1, total_data // want)
    print("Since we have", total_data, "frames, we're going to stride by",
          stride, "during fitting, because this is probably adequate for",
          kcen.n_clusters, "clusters")
    return stride


## Fit on strided data to keep memory usage bounded
kcen.fit([traj for _, traj in itertrajs(meta, stride=guestimate_stride())])
print(kcen.summarize())

## Save, tagged with the current sampling round number
save_generic(kcen, 'clusterer' + str(round_num) +'.pickl')


## Save centroids
def frame(traj_i, frame_i):
    """Load single frame `frame_i` of the trajectory in row `traj_i` of `meta`."""
    # Note: kmedoids does 0-based, contiguous integers so we use .iloc
    row = meta.iloc[traj_i]
    return md.load_frame(row['traj_fn'], frame_i, top=row['top_fn'])


# Join every cluster-center frame into one trajectory; topologies may
# differ across source trajectories, so skip the topology check.
centroids = md.join((frame(ti, fi) for ti, fi in kcen.cluster_ids_),
                    check_topology=False)

centroids_fn = 'centroids_' + str(round_num) + '.xtc'
backup(centroids_fn)
# FIX: reuse centroids_fn (the name just backed up) instead of rebuilding
# the same string a second time, which risks the two drifting apart.
centroids.save(centroids_fn)
예제 #15
0
import mdtraj as md
import os

from msmbuilder.io.sampling import sample_states
from msmbuilder.io import load_trajs, save_generic, preload_top, backup, load_generic

## Load tICA trajectories and the fitted k-means model
meta, ttrajs = load_trajs('ttrajs')
kmeans = load_generic("kmeans.pickl")

## Sample k=10 frames per cluster center
inds = sample_states(ttrajs,
                     kmeans.cluster_centers_,
                     k=10)

save_generic(inds, "cluster-sample-inds.pickl")

## Make one short trajectory per cluster state
top = preload_top(meta)
out_folder = "cluster_samples"
backup(out_folder)  # move any previous output directory out of the way
os.mkdir(out_folder)

for state_i, state_inds in enumerate(inds):
    traj = md.join(
        md.load_frame(meta.loc[traj_i]['traj_fn'], index=frame_i, top=top)
        for traj_i, frame_i in state_inds
    )
    traj.save("{}/{}.xtc".format(out_folder, state_i))
예제 #16
0
## Try to limit RAM usage
def guestimate_stride():
    """Pick a stride so roughly 10 * n_clusters frames are used for fitting.

    Reads the module-level `meta` DataFrame and `kmed` clusterer.
    Returns an int stride >= 1.
    """
    total_data = meta['nframes'].sum()
    want = kmed.n_clusters * 10
    stride = max(1, total_data // want)
    print("Since we have", total_data, "frames, we're going to stride by",
          stride, "during fitting, because this is probably adequate for",
          kmed.n_clusters, "clusters")
    return stride


## Fit on strided data to keep memory usage bounded
kmed.fit([traj for _, traj in itertrajs(meta, stride=guestimate_stride())])
print(kmed.summarize())

## Save the fitted clusterer
save_generic(kmed, 'clusterer.pickl')


## Save centroids
def frame(traj_i, frame_i):
    """Load single frame `frame_i` of the trajectory in row `traj_i` of `meta`."""
    # Note: kmedoids does 0-based, contiguous integers so we use .iloc
    row = meta.iloc[traj_i]
    return md.load_frame(row['traj_fn'], frame_i, top=row['top_fn'])


# Join every medoid frame into one trajectory; topologies may differ
# across source trajectories, so skip the topology check.
centroids = md.join((frame(ti, fi) for ti, fi in kmed.cluster_ids_),
                    check_topology=False)
centroids_fn = 'centroids.xtc'
backup(centroids_fn)
# FIX: save to the filename that was just backed up rather than a second
# hard-coded literal.
centroids.save(centroids_fn)
예제 #17
0
from utilities import plot_box

if __name__ == '__main__':

    # Load previously featurized trajectories.
    meta, feature_trajs = load_trajs('ftraj')

    # Select scaler (kept under the historical local name `featurizer`).
    featurizer = RobustScaler()

    # Fit on all feature trajectories, then transform each one separately
    # so the per-trajectory keys are preserved.
    featurizer.fit_transform(feature_trajs.values())
    scaled_trajs = {k: featurizer.partial_transform(v)
                    for k, v in feature_trajs.items()}

    # Diagnostic plot: sorted per-feature variance of a random subsample.
    sample = np.concatenate([fx for fx in scaled_trajs.values()])
    # FIX: guard against fewer than 1000 total frames, which previously made
    # np.random.choice(..., replace=False) raise ValueError.
    n_pick = min(1000, sample.shape[0])
    sample = sample[np.random.choice(sample.shape[0], n_pick, replace=False), :]
    variance = np.apply_along_axis(np.var, axis=0, arr=sample)
    order = np.argsort(variance)
    ord_var = variance[order]
    ind = range(variance.shape[0])
    fig, ax = plt.subplots()
    ax.plot(ind, ord_var)
    plt.savefig('ScaledFeatureVariance.png')

    # Persist the scaled trajectories and the fitted scaler.
    save_trajs(scaled_trajs, 'straj', meta)
    save_generic(featurizer, 'scaler.pickl')
#
# TIMESCALES
#
# The data will be loaded with a stride of 10 frames.  Each fame is 50ps, so the time per frame will be
# 500ps/frame or 0.5ns/frame.
# Each trajectory is 1000 frames long
# Lag time will be 40 frames (20 ns)  based on a visual inspection of /Misc/MSM_lag_time.ipynb

features = tica_unstructured_features
to_ns = 0.5  # ns per (strided) frame: 10-frame stride * 50 ps = 0.5 ns
# NOTE(review): header comment says "40 frames (20 ns)" but 40/0.5 = 80
# frames (40 ns at 0.5 ns/frame) — confirm whether 40 means frames or ns.
msm_lag = int(40 / to_ns)

#
# MODEL
#
# Build one full pipeline per feature and save it (unfitted) for later use.
for feat in tica_unstructured_features:
    pipe = Pipeline([(feat[0], feat[1]), ('variance_cut', VarianceThreshold()),
                     ('scaling', RobustScaler()),
                     ('tica', tICA(kinetic_mapping=True)),
                     ('cluster', MiniBatchKMeans()),
                     ('msm',
                      MarkovStateModel(lag_time=msm_lag,
                                       verbose=False,
                                       n_timescales=2))])
    #
    # SAVE MODEL
    #
    savedir = 'gp-m52-ei-tica-indv'
    save_generic(pipe, '{0}/{1}.pickl'.format(savedir, feat[0]))
예제 #19
0
        with Pool() as pool:
            feature_trajs = dict(pool.imap_unordered(msmb_feat, args))

        # # Create save objects
        # featurizer = dict([(x[0], x[2]) for x in feature_trajs])
        # feature_trajs = dict([(x[0], x[1]) for x in feature_trajs])

        selector = VarianceThreshold()
        selector.fit([traj for traj in feature_trajs.values()])
        ftrajs = {}
        for k, v in feature_trajs.items():
            ftrajs[k] = np.squeeze(selector.transform([v]))

        # SAVE
        save_trajs(ftrajs, 'featurized_trajectories/{}-ftraj'.format(name), meta)
        save_generic(feat, 'featurized_trajectories/{}-featurizer.pickl'.format(name))


    # pyEMMA FEATURIZERS
    featurizers = [('angles', 'add_angles', angles),
                   ('dihedrals', 'add_dihedrals', dihedrals)]

    for name, feat, indices in featurizers:
        print('Featurizing {}'.format(name))
        args = zip(meta.iterrows(), [feat] * meta.shape[0], [tops] * meta.shape[0],
                   [indices]*meta.shape[0])

        # Fit features
        with Pool() as pool:
            feature_trajs = dict(pool.imap_unordered(pyemma_feat, args))
예제 #20
0
"""Reduce dimensionality with tICA

{{header}}
Meta
----
depends:
  - ftrajs
  - meta.pandas.pickl
"""

from msmbuilder.io import load_trajs, save_trajs, save_generic
from msmbuilder.decomposition import tICA

## Load: construct the estimator, then read the featurized trajectories
tica = tICA(n_components=5, lag_time=10, kinetic_mapping=True)
meta, ftrajs = load_trajs("ftrajs")

## Fit the tICA model on every feature trajectory
tica.fit(ftrajs.values())

## Transform: project each trajectory into tICA space, preserving keys
ttrajs = {key: tica.partial_transform(ftraj)
          for key, ftraj in ftrajs.items()}

## Save
save_trajs(ttrajs, "ttrajs", meta)
save_generic(tica, "tica.pickl")
예제 #21
0
# Cluster all trajectories by RMSD using landmark-based agglomeration.
# NOTE(review): `totframes`, `trajs`, `traj_dict` and `meta` come from
# code above this visible chunk.
print('Attempting to cluster')
num_clusters = 20
cluster = LandmarkAgglomerative(n_clusters=num_clusters,
                                n_landmarks=int(totframes / 100),
                                linkage='ward',
                                metric='rmsd')
cluster.fit(trajs)

#
# print('Fitting cluster labels')
# ctraj = {}
# for k, v in traj_dict.items():
#     v = cluster.partial_predict(v)
#     diff = nframes-v.shape[0]
#     v = np.append(v, np.zeros(diff)-1)
#     ctraj[k] = v

# Convert to DF for plotting and sampling.
# df = to_dataframe(ctraj, nframes, dt=1)

print('Fitting cluster labels for MSM')
ctraj = {}
for k, v in traj_dict.items():
    ctraj[k] = cluster.partial_predict(v)

# Save dataframe
# NOTE(review): `df` is only assigned in the commented-out block above, so
# this call raises NameError unless df is defined elsewhere — confirm.
save_generic(df, 'clusters/rmsd_cluster_trajectory.pickl')
save_trajs(ctraj, 'ftraj', meta)
예제 #22
0

"""
import mdtraj as md

from msmbuilder.featurizer import DihedralFeaturizer
from msmbuilder.io import load_meta, preload_tops, save_trajs, save_generic
from multiprocessing import Pool

## Load metadata, pre-load topologies, and build the dihedral featurizer
meta = load_meta()
tops = preload_tops(meta)
dihed_feat = DihedralFeaturizer()


## Featurize logic
def feat(irow):
    """Featurize one (index, row) pair from meta.iterrows()."""
    i, row = irow
    traj = md.load(row['traj_fn'], top=tops[row['top_fn']])
    feat_traj = dihed_feat.partial_transform(traj)
    return i, feat_traj


## Featurize all trajectories in parallel worker processes
with Pool() as pool:
    dihed_trajs = dict(pool.imap_unordered(feat, meta.iterrows()))

## Save feature trajectories and the featurizer itself
save_trajs(dihed_trajs, 'ftrajs', meta)
save_generic(dihed_feat, 'featurizer.pickl')
예제 #23
0
# NOTE(review): `traj_load`, `totframes` and `meta` are defined upstream
# of this visible chunk.
# Load all trajectories, then keep only those longer than 1000 frames.
traj_dict = dict(map(traj_load, meta.iterrows()))
trajs = [traj for traj in traj_dict.values() if traj.n_frames > 1000]
print(len(trajs))
num_clust = 20
cluster = LandmarkAgglomerative(n_clusters=num_clust,
                                n_landmarks=int(totframes / 100),
                                linkage='ward',
                                metric='rmsd')
ctrajs = cluster.fit_transform(trajs)

# print('Fitting cluster labels for MSM')
# ctraj = {}
# count = 0
# for k, v in traj_dict.items():
#     print(k, count)
#     count +=1
#     ctraj[k] = cluster.partial_predict(v)
#
# ctrajs = [traj for traj in ctraj.values() if traj.shape[0] > 1000]

# Fit an MSM on the cluster-label trajectories at a 4000-frame lag.
print('Fitting MSM')
lag = 4000
msm = MarkovStateModel(lag_time=lag, n_timescales=50)
msm.fit(ctrajs)

# save_trajs(ctraj, 'results/nclusters-{0}-ctraj'.format(num_clust), meta)
save_generic(cluster,
             'results/clusterer-nclusters-{0}.pickle'.format(num_clust))
save_generic(msm,
             'results/msm-lag-{0}-nclusters-{1}.pickl'.format(lag, num_clust))
예제 #24
0
                                      linkage='ward',
                                      metric='rmsd',
                                      landmark_strategy='stride',
                                      random_state=None,
                                      max_landmarks=None,
                                      ward_predictor='ward')
    msm = MarkovStateModel(lag_time=msm_lag)
    pipe = Pipeline([('cluster', clusterer), ('msm', msm)])

    # -------------------------------------------------------------------------
    # Set param search object
    # -------------------------------------------------------------------------

    # Search n_clusters over a log-spaced grid scaled around n_clusters.
    params = {
        'cluster__n_clusters':
        list((np.logspace(-0.5, 2, 10) * n_clusters).astype(int))
    }
    print(params)
    # Ten 50/50 shuffle splits for cross-validation.
    cv_iter = ShuffleSplit(n_splits=10, test_size=0.5)
    param_search = GridSearchCV(pipe, param_grid=params, cv=cv_iter)

    # -------------------------------------------------------------------------
    # Search param space and save
    # -------------------------------------------------------------------------

    # NOTE(review): `trajs`, `pipe` and `n_clusters` come from code above
    # this visible chunk.
    param_search.fit(trajs)
    save_generic(param_search, 'models/rmsd_model.pickl')

    print('Best score of {0} was achieved with \n {1}'.format(
        param_search.best_score_, param_search.best_params_))
예제 #25
0
# TIMESCALES
#
# The data will be loaded with a stride of 10 frames.  Each fame is 50ps, so the time per frame will be
# 500ps/frame or 0.5ns/frame.
# Each trajectory is 1000 frames long
# Lag time will be 40 frames (20 ns)  based on a visual inspection of /Misc/MSM_lag_time.ipynb

features = tica_unstructured_features
to_ns = 0.5  # ns per (strided) frame: 10-frame stride * 50 ps
# NOTE(review): 40/0.5 evaluates to 80 frames, while the header above says
# "40 frames (20 ns)" — confirm whether 40 is meant in frames or ns.
msm_lag = int(40/to_ns)

#
# MODEL
#
# Full modelling pipeline; saved unfitted for a later fitting stage.
pipe = Pipeline([('features', FeatureSelector(features=tica_unstructured_features)),
                 ('variance_cut', VarianceThreshold()),
                 ('scaling', RobustScaler()),
                 ('tica', tICA(kinetic_mapping=True)),
                 ('cluster', MiniBatchKMeans()),
                 ('msm', MarkovStateModel(lag_time=msm_lag, verbose=False, n_timescales=2))])
#
# SAVE MODEL
#
savedir = 'rand-tica-all'
# Save the pipeline plus a human-readable list of the feature names used.
save_generic(pipe, '{}/model.pickl'.format(savedir))
print_feature_names(features, join(savedir, 'feature_list.txt'))




예제 #26
0
                                     metric='rmsd')),
              ('msm', MarkovStateModel())]

# Single-point "grid": fixed 200 clusters at MSM lag 999.
params = {'cluster__n_clusters': [200]}

# NOTE(review): `estimators`, `cv_iter` and `trajs` are defined above this
# visible chunk.
pipe = Pipeline(estimators)
pipe.set_params(msm__lag_time=999)
pipe.set_params(msm__n_timescales=20)

if __name__ == "__main__":

    cvSearch = GridSearchCV(pipe, params, n_jobs=1, verbose=1, cv=cv_iter)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipe.steps])
    print("parameters:")
    print(params)
    t0 = time()
    cvSearch.fit(trajs)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % cvSearch.best_score_)
    print("Best parameters set:")
    best_parameters = cvSearch.best_estimator_.get_params()
    for param_name in sorted(params.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    # Persist the full cross-validation results table.
    df = pd.DataFrame(cvSearch.cv_results_)
    save_generic(df, 'results/lag999-ncluster200.pickl')
예제 #27
0
 def _save_model(self):
     """
     Pickle self.model to disk at the path in self.model_pkl_fname.
     """
     save_generic(self.model, self.model_pkl_fname)
    cv_iter = ShuffleSplit(n_splits=10, test_size=0.5, random_state=0)
    param_grid = [{'n_components': [10, 20, 40], 'lag_time': [1, 10, 100]}]

    # CV object
    model = tICA(kinetic_mapping=True)

    # Do grid search
    clf = GridSearchCV(estimator=model,
                       param_grid=param_grid,
                       cv=cv_iter,
                       n_jobs=2)
    clf.fit(X)

    # Save results
    results = pd.DataFrame(clf.cv_results_)
    save_generic(results, '{}-grid-search-results.pickl'.format(feature_name))

    # Print Results
    print("Best parameters set found on development set:")
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

    # Fit best estimator to data
    tica = clf.best_estimator_
    ttrajs = {}
#
# TIMESCALES
#
# The data will be loaded with a stride of 10 frames.  Each fame is 50ps, so the time per frame will be
# 500ps/frame or 0.5ns/frame.
# Each trajectory is 1000 frames long
# Lag time will be 40 frames (20 ns)  based on a visual inspection of /Misc/MSM_lag_time.ipynb
to_ns = 0.5  # ns per (strided) frame: 10-frame stride * 50 ps
# NOTE(review): 40/0.5 evaluates to 80 frames, while the comment above says
# "40 frames (20 ns)" — confirm intended units.
msm_lag = int(40 / to_ns)

#
# FEATURE INDICES
#
all_idx = np.load('indices_all.npy')

#
# OTHER PARAMETERS
#
# NOTE(review): ref_traj is loaded here but not used in this visible chunk.
ref_traj = md.load('../Data/data/trajectory-1.xtc',
                   top='../Data/data/fs-peptide.pdb')

# NOTE(review): `feats` is defined outside this visible chunk — confirm.
featurizer = FeatureSelector(features=feats)

pipe = Pipeline([('features', featurizer),
                 ('variance_cut', VarianceThreshold()),
                 ('scaling', RobustScaler()), ('cluster', MiniBatchKMeans()),
                 ('msm', MarkovStateModel(lag_time=msm_lag, verbose=False))])

# Save the (unfitted) pipeline for later use.
save_generic(pipe, 'model.pickl')
예제 #30
0
"""Make a microstate MSM

{{header}}
"""

from msmbuilder.io import load_trajs, save_trajs, save_generic
from msmbuilder.msm import MarkovStateModel

## Load cluster-label trajectories
meta, ktrajs = load_trajs('ktrajs')

## Fit a microstate MSM on all label sequences
msm = MarkovStateModel(lag_time=2, n_timescales=10, verbose=False)
msm.fit(list(ktrajs.values()))

## Transform: map every trajectory onto the MSM's internal state labelling
microktrajs = {key: msm.partial_transform(ktraj)
               for key, ktraj in ktrajs.items()}

## Save the model summary, the model itself, and the transformed trajs
print(msm.summarize())
save_generic(msm, 'msm.pickl')
save_trajs(microktrajs, 'microktrajs', meta)
예제 #31
0
please cite msmbuilder in any publications


"""

import mdtraj as md
import os

from msmbuilder.io.sampling import sample_states
from msmbuilder.io import load_trajs, save_generic, preload_top, backup, load_generic

## Load tICA trajectories and the fitted k-means model
meta, ttrajs = load_trajs('ttrajs')
kmeans = load_generic("kmeans.pickl")

## Sample k=10 frames per cluster center
inds = sample_states(ttrajs, kmeans.cluster_centers_, k=10)

save_generic(inds, "cluster-sample-inds.pickl")

## Make one short trajectory per cluster state
top = preload_top(meta)
out_folder = "cluster_samples"
backup(out_folder)  # move any previous output directory out of the way
os.mkdir(out_folder)

for state_i, state_inds in enumerate(inds):
    traj = md.join(
        md.load_frame(meta.loc[traj_i]['traj_fn'], index=frame_i, top=top)
        for traj_i, frame_i in state_inds)
    traj.save("{}/{}.xtc".format(out_folder, state_i))
예제 #32
0
"""Make a microstate MSM

msmbuilder autogenerated template version 2
created 2017-05-23T16:38:49.116944
please cite msmbuilder in any publications
"""

from msmbuilder.io import load_trajs, save_trajs, save_generic
from msmbuilder.msm import MarkovStateModel

## Load cluster-label trajectories
meta, ktrajs = load_trajs('ktrajs')

## Fit a microstate MSM on all label sequences
msm = MarkovStateModel(lag_time=2, n_timescales=10, verbose=False)
msm.fit(list(ktrajs.values()))

## Transform each trajectory onto the MSM's internal state labelling
microktrajs = {}
for k, v in ktrajs.items():
    microktrajs[k] = msm.partial_transform(v)

## Save model summary, model, and transformed trajectories
print(msm.summarize())
save_generic(msm, 'msm.pickl')
save_trajs(microktrajs, 'microktrajs', meta)
예제 #33
0
"""Reduce dimensionality with tICA

msmbuilder autogenerated template version 2
created 2017-05-23T16:38:49.125259
please cite msmbuilder in any publications

"""

from msmbuilder.io import load_trajs, save_trajs, save_generic
from msmbuilder.decomposition import tICA

## Load: construct the estimator, then read the featurized trajectories
tica = tICA(n_components=5, lag_time=10, kinetic_mapping=True)
meta, ftrajs = load_trajs("ftrajs")

## Fit the tICA model on every feature trajectory
tica.fit(ftrajs.values())

## Transform each trajectory into tICA space, preserving keys
ttrajs = {}
for k, v in ftrajs.items():
    ttrajs[k] = tica.partial_transform(v)

## Save transformed trajectories and the fitted model
save_trajs(ttrajs, 'ttrajs', meta)
save_generic(tica, 'tica.pickl')
# Per-frame timestep (ps) from metadata; assumes it is uniform across rows.
dt = float(meta['step_ps'].unique()[0])
df = pd.DataFrame.from_records(data=rtrajs)
df['Time_ps'] = np.arange(nframes) * dt
# Reshape to long format: one row per (time, production) RMSD observation.
df = pd.melt(df,
             id_vars=['Time_ps'],
             var_name='Production_ID',
             value_name='RMSD')
# Split the composite Production_ID into its id_cols components.
df[id_cols] = pd.DataFrame(df['Production_ID'].tolist())
del df['Production_ID']

# 10-sample rolling mean of RMSD within each id group, joined back as RMSD_r.
# NOTE(review): reset_index(level=[0, 1, 2, 3, 4]) assumes the groupby
# produces exactly five index levels — confirm against id_cols' definition.
df = df.join(df.groupby(id_cols)['RMSD'].rolling(10).mean().reset_index(
    level=[0, 1, 2, 3, 4]),
             rsuffix='_r')
df.drop(labels=[x + '_r' for x in id_cols], axis=1, inplace=True)

# Only the ten long productions ("1.1" .. "10.1") are plotted.
long_trajs = ['{}.1'.format(x) for x in range(1, 11)]
# Plot the rolling RMSD for a 10% random subsample of those trajectories.
with sns.plotting_context("notebook", font_scale=2):
    # FIX: DataFrame.ix was deprecated and removed in pandas 1.0; use .loc.
    sample = df.loc[df['Prod_ID'].isin(long_trajs), :].sample(frac=0.1, axis=0)
    sample.sort_values(by=['Prod_ID', 'Site_ID', 'Time_ps'], inplace=True)
    g = sns.FacetGrid(sample, col='Prod_ID', hue='Site_ID', col_wrap=5)
    g.map(plt.plot, 'Time_ps', 'RMSD_r')
    # Raw string so "\A" is not parsed as a (deprecated) escape sequence;
    # the rendered label is unchanged.
    g.set_ylabels(r"RMSD $\AA$")
    g.set_xlabels("")
    g.set_titles("")
    g.fig.subplots_adjust(wspace=0.05, hspace=0.05)
    plt.savefig('rmsd_trajectory_long.png', transparent=True)

# Save dataframe
save_generic(df, 'rmsd_trajectory.pickl')