Пример #1
0
def project_and_cluster(trajfiles,
                        featurizer,
                        sparsify=False,
                        tica=True,
                        lag=100000,
                        scale=True,
                        var_cutoff=1.0,
                        ncluster=100,
                        cluster=True):
    """Featurize trajectories, project them with TICA or PCA, and optionally cluster.

    Parameters
    ----------
    trajfiles : list of str
        Trajectory files passed to ``coor.load``.
    featurizer : pyemma featurizer
        Featurizer applied while loading.
    sparsify : bool, optional
        If True, drop constant features via ``remove_constant``.
    tica : bool, optional
        If True use TICA for the projection, otherwise PCA.
    lag : int, optional
        TICA lag time (ignored for PCA).
    scale : bool, optional
        If True, scale each projected trajectory by the transform's eigenvalues.
    var_cutoff : float, optional
        Variance cutoff passed to the projection estimator.
    ncluster : int, optional
        Number of k-means clusters when clustering is enabled.
    cluster : bool, optional
        If True (default), k-means-cluster the projected data.
        NOTE: the original referenced an undefined global ``cluster`` here,
        which raised ``NameError``; it is now an explicit parameter.

    Returns
    -------
    trans_obj, Y, clustering  (clustering only when ``cluster`` is True)
    """
    X = coor.load(trajfiles, featurizer)
    if sparsify:
        X = remove_constant(X)
    # TICA and PCA expose the same get_output() API, so only the estimator differs.
    if tica:
        trans_obj = coor.tica(X, lag=lag, var_cutoff=var_cutoff)
    else:
        trans_obj = coor.pca(X, dim=-1, var_cutoff=var_cutoff)
    Y = trans_obj.get_output()
    if scale:
        # Scale each projected trajectory in place by the retained eigenvalues.
        for y in Y:
            y *= trans_obj.eigenvalues[:trans_obj.dimension()]
    if cluster:
        cl_obj = coor.cluster_kmeans(Y,
                                     k=ncluster,
                                     max_iter=3,
                                     fixed_seed=True)
        return trans_obj, Y, cl_obj
    return trans_obj, Y
Пример #2
0
def loadCoordinates(path, trajectories_basename='*traj*.pdb', topfile=''):
    """Load simulation coordinates into a PyEMMA-compatible object.

    Globs for trajectory files under *path* matching
    *trajectories_basename*, featurizes them with the topology in
    *topfile*, and returns an array with the trajectories.
    """
    featurizer = coor.featurizer(topfile)
    pattern = os.path.join(path, trajectories_basename)
    matched = glob.glob(pattern)
    return coor.load(matched, featurizer)
Пример #3
0
def run_sampling(args):
    """Resample frames of one trajectory by weight and estimate a TICA model.

    Loads CA-distance features from ``args.file``, draws 10000 frames with
    replacement using the normalized weights read from ``args.weights``,
    sanity-checks the resampled distances for unwritten (zero) entries,
    then runs TICA and writes ``output.dat`` (projected coordinates) and
    ``eigenvalues.dat``.  Progress timings are printed along the way.
    """
    topology = "Native.pdb"
    ticadim = 10
    num_sample_frames = 10000

    fn = args.file  # trajectory file name
    wn = args.weights  # weights file name

    weights = np.loadtxt(wn)
    weights = weights / np.sum(weights)  # normalize to a probability vector
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # recommended replacement for interval timing.
    time1 = time.perf_counter()
    feat = coor.featurizer(topology)
    feat.add_distances_ca()
    X1 = coor.load(fn, feat, stride=1)
    time2 = time.perf_counter()
    print("Took %f minutes to load a file" % ((time2 - time1) / 60.0))

    # Draw frame indices with replacement according to the weights.
    selected_frames = np.random.choice(np.shape(X1)[0], size=num_sample_frames, replace=True, p=weights)
    time3 = time.perf_counter()
    print("Took %f minutes to select new frames" % ((time3 - time2) / 60.0))
    # Fancy indexing replaces the original per-frame copy loop.
    sampled_frames = X1[selected_frames, :]
    time4 = time.perf_counter()
    print("Took %f minutes to load the new frames" % ((time4 - time3) / 60.0))
    # A zero distance indicates a frame that was never written out.  The
    # original re-opened and overwrote log.txt once per zero element; one
    # check and one write (with the file closed properly) is equivalent.
    if np.any(sampled_frames == 0):
        print("ERROR, distance too short, something not written")
        with open("log.txt", "w") as f:
            f.write("ERROR, distance too short, something not written")
    time5 = time.perf_counter()
    print("Took %f minutes to go through the debug check" % ((time5 - time4) / 60.0))
    tica_obj = coor.tica(sampled_frames, stride=1, lag=1, dim=ticadim)
    time6 = time.perf_counter()
    print("Took %f minutes to calculate the tica_object" % ((time6 - time5) / 60.0))
    outputs = tica_obj.get_output()[0]
    eigen = tica_obj.eigenvalues
    time7 = time.perf_counter()
    print("Took %f minutes to get the output of the tica_object" % ((time7 - time6) / 60.0))
    print("saving files")
    np.savetxt("output.dat", outputs)
    np.savetxt("eigenvalues.dat", eigen)
    print("files saved")
    time8 = time.perf_counter()
    print("Took %f minutes to write the output files" % ((time8 - time7) / 60.0))
Пример #4
0
def run_analysis(args):
    """Featurize a trajectory with pair distances, run TICA, save and plot.

    Writes raw projected coordinates and eigenvalues to
    ``<title>_output_raw.dat`` / ``<title>_eigenvalues_raw.dat`` and plots
    the eigenvalue series and the projection.
    """
    featurizer = coor.featurizer(args.topfile)
    pair_indices = tmeth.generate_pairs(args.range[0], args.range[1], args.step_size, args.cut_value)
    featurizer.add_distances(pair_indices)
    trajectory = coor.load(args.traj_file, featurizer, stride=args.stride)
    tica_object = coor.tica(trajectory, stride=1, lag=args.lag, dim=args.ticadim)
    projected = tica_object.get_output()[0]
    eigenvalues = tica_object.eigenvalues
    np.savetxt("%s_output_raw.dat" % args.title, projected)
    np.savetxt("%s_eigenvalues_raw.dat" % args.title, eigenvalues)
    scale = args.time_step * args.stride
    tmeth.plot_eigen_series(eigenvalues, args.title, time_scale=scale)
    tmeth.plot_output(projected, args.title, time_scale=scale)
def run_sampling(args):
    """Resample whole trajectories by weight and estimate a TICA model.

    Draws 10000 trajectory indices (with replacement) from the normalized
    weights in ``args.weights``, loads the corresponding ``trajN.xtc``
    files from ``args.filedir`` with pair-distance features, runs TICA,
    and writes ``output.dat``, ``eigenvalues.dat`` and
    ``selected_frames.dat``.  Progress timings are printed along the way.
    """
    topology = args.topfile
    ticadim = 10
    num_sample_frames = 10000
    tica_lag_time = 5
    fn = args.filedir  # directory containing trajN.xtc files
    wn = args.weights  # weights file name

    weights = np.loadtxt(wn)
    weights = weights / np.sum(weights)  # normalize to a probability vector
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    time1 = time.perf_counter()
    feat = coor.featurizer(topology)
    feat.add_distances(tmeth.generate_pairs(5, 288, 4, 4))
    selected_frames = np.random.choice(args.number_traj, size=num_sample_frames, replace=True, p=weights)

    selected_frames.sort()
    selected_files = ["%s/traj%d.xtc" % (fn, i) for i in selected_frames]
    time2 = time.perf_counter()
    print("Took %f minutes to select new frames" % ((time2 - time1) / 60.0))
    sampled_frames = coor.load(selected_files, feat, stride=10)

    time3 = time.perf_counter()
    print("Took %f minutes to load the new frames" % ((time3 - time2) / 60.0))

    tica_obj = coor.tica(sampled_frames, stride=1, lag=tica_lag_time, dim=ticadim)
    time4 = time.perf_counter()
    print("Took %f minutes to calculate the tica_object" % ((time4 - time3) / 60.0))
    # Concatenate the per-trajectory TICA outputs.  The original called
    # get_output() once per trajectory and grew the array with np.append
    # (quadratic copying); one call plus one concatenate is equivalent.
    outputs_list = tica_obj.get_output()
    all_outputs = np.concatenate(outputs_list[:num_sample_frames], axis=0)
    eigen = tica_obj.eigenvalues
    print("saving files")
    np.savetxt("output.dat", all_outputs)
    np.savetxt("eigenvalues.dat", eigen)
    np.savetxt("selected_frames.dat", selected_frames)
    print("files saved")
    time5 = time.perf_counter()
    print("Took %f minutes to write the output files" % ((time5 - time4) / 60.0))
Пример #6
0
import analysis_scripts.plot_package as pltpkg


if __name__ == "__main__":
    # Extract one inter-residue distance (atoms 79/492) from 50 DCD
    # trajectories and save the flattened time trace to trace_ww_2.dat.
    topology = "firstframe.pdb"
    feat = coor.featurizer(topology)

    pairs = np.array([[79, 492]])

    feat.add_distances(pairs)

    print(feat.describe())
    # File indices are zero-padded to three digits (000..049); the original
    # two loops ("00%d" for 0-9, "0%d" for 10-49) collapse into one %03d.
    files_list = ["ww_2-protein-%03d.dcd" % i for i in range(50)]

    output = coor.load(files_list, features=feat)

    print(np.shape(output))

    yvalues = np.array(output).flatten()

    print(np.shape(yvalues))

    np.savetxt("trace_ww_2.dat", yvalues)

    print(np.max(yvalues))
    print(np.min(yvalues))
Пример #7
0
import mdtraj as md
import pickle
import matplotlib.pyplot as plt
import time
## set path
# Script fragment: loads CA-distance features for a series of umbrella
# windows and prepares subsampled data for TICA scans (the scan loop below
# this fragment is truncated in this view).
start_time = time.time()
indir = '/scratch/jap12009/gamma/pg'
topfile = '/scratch/jap12009/gamma/gamma1.pdb'
save_file = 'pgTICs-ca-con-8.pkl'
## create list of trajectories and Colvar files
# One xtc per window value 0.8..15.4 in steps of 0.1 (file names carry the
# window value with one decimal place).
traj_list = [indir + "/p60win{:2.1f}.xtc".format(i) for i in np.arange(0.8,15.5,0.1)]
## define topology
f = coor.featurizer(topfile)
f.add_distances_ca()
## load trajectories and colvar files
inp = coor.load(traj_list, f)
## tica
# Discard the first 20000 frames of each trajectory (presumably
# equilibration — TODO confirm) and keep every 10th frame after that.
inp1 = [i[20000::10] for i in inp]

# Accumulators for the lag-time scan below (truncated in this view).
cumvar3 = []
timescales1 = []
timescales2 = []
timescales3 = []

print('length of inp1 is ', len(inp1))
# NOTE(review): message says "length" but this prints the first
# trajectory's shape.
print('length of inp1 is ', inp1[0].shape)

#for i in range(5000,39999,5000):
for i in [ 2000, 4000, 6000, 8000, 10000, 12000]
     k = int(i/10)
#! /usr/bin/env/ python
# Plot tica data

import mdtraj as md
import pyemma.coordinates as coor
import numpy as np
import pickle
import pyemma
import os
import pandas as pd
import pyemma.plots as pyemma_plots
import matplotlib.pyplot as plt

# NOTE(review): this variable shadows the stdlib module name `sys` if that
# module is ever imported in this script.
sys = 'tica_plots_06/fdis/fdis'  ## What tica data do you want to look at
tica_data = coor.load(
    'tica_data_05/fdis_tica_data.h5')  ## Where is that tica data?
# Concatenate the per-trajectory TICA outputs into one array.
tica_data_cat = np.concatenate(tica_data)


def kin_var_plot():
    """Plot cumulative kinetic variance vs TICA index.

    Reads the precomputed cumulative-variance array and draws horizontal
    reference lines at 0.85/0.90/0.95 to help identify how many TICA
    components to use.
    """
    # np.load accepts a path directly; the original passed an open file
    # handle that was never closed (resource leak).
    cumvar = np.load("tica_data_05/chi2_cumvar.npy")
    fig, ax = plt.subplots()
    index = range(1, len(cumvar) + 1)
    ax.plot(index, cumvar)
    ax.axhline(y=0.95, c='y')
    ax.axhline(y=0.90, c='C1')
    ax.axhline(y=0.85, c='r')
    ax.set_xlabel('Tica index', fontsize=16)
    ax.set_ylabel('Cumulative Variance (%)', fontsize=16)
Пример #9
0
import pyemma.coordinates as coor
import numpy as np
import pyemma.msm as msm
import pyemma.plots as pyemma_plots
import matplotlib.pyplot as plt

# Estimate an MSM from precomputed cluster discrete trajectories.
sys = 'fdis'
n_clusters = 100
dtrajs = coor.load(f'cluster_data_10/{sys}_{n_clusters}_cluster_dtrajs.h5')
max_lag = 15

# MSM estimation needs flat integer discrete trajectories.
dt2 = [i.astype(np.int_) for i in dtrajs]
dt3 = [i.reshape((i.shape[0])) for i in dt2]

msm1 = msm.estimate_markov_model(dt3, 5)

# Fixed "Acitve" -> "Active" typo in both status messages.
print(f'Active state percentage is {msm1.active_state_fraction}')
print(f'Active count percentage is {msm1.active_count_fraction}')
Пример #10
0
    #start = 10
    #stop = 50
    ##debugg

    for i in np.arange(start, stop, cutoff):
        for j in np.arange(i+4, stop, cutoff):
            pair.append([i, j])
        
    print np.shape(pair)

    pairs = np.array(pair)

    feat.add_distances(pairs)

    #feat.add_distances_ca()
    X1 = coor.load("traj.xtc", feat, stride=1)

    #traj = md.load("traj.xtc", top="Native.pdb")
    #X1 = md.compute_distances(traj, [[115, 192]], periodic=False)

    print np.shape(X1)
    possible_times = np.logspace(1,100,5)
    possible_times = possible_times.astype(int)
    lag_times = []
    for i in possible_times:
        if i not in lag_times:
            lag_times.append(i)

    print lag_times

    collected_eigenvalues=[]
    '''run vamp score on list of feature options
    at several different lag times
    returns scores and errors'''
    scores = [0] * len(lag_list)
    errors = [0] * len(lag_list)
    for i, lag in enumerate(lag_list):
        scores[i] = []
        errors[i] = []
        vamp_score = [0] * len(feat_option_list)
        for j in range(len(feat_option_list)):
            vamp_score[j] = score_cv(feat_option_list[j], lag=lag, dim=dim, number_of_splits=number_of_splits)
            scores[i] += [vamp_score[j].mean()]
            errors[i] += [vamp_score[j].std()]
    return scores, errors

# Load the candidate feature data sets for both chains and VAMP-score them.
# NOTE(review): `feature_list` and `run_vamp_score` are defined elsewhere
# in the original file (not visible in this fragment).
feat_optionsA = [coor.load(i) for i in feature_list['path_a']]
feat_optionsB = [coor.load(i) for i in feature_list['path_b']]
# Pair up chain-A and chain-B trajectories element-wise (list concatenation).
feat_options = [0] * len(feat_optionsA)
for i in range(len(feat_optionsA)):
    feat_options[i] = feat_optionsA[i] + feat_optionsB[i]

scores, errors = run_vamp_score(feat_options, dim=10, number_of_splits=20)

with open('vamp_scores_10dim_both_1.npy','wb') as handle:
    np.save(handle, scores)

with open('vamp_errors_10dim_both_1.npy','wb') as handle:
    np.save(handle, errors)

print(scores)
print(errors)
#cluster tica data into clusters
import pyemma.coordinates as coor
import numpy as np

# Cluster precomputed TICA data with k-means and persist the discrete
# trajectories to HDF5.
sys = 'fdis'
tica_data = coor.load('tica_data_05/fdis_tica_data.h5')

n_clusters = 100

cl = coor.cluster_kmeans(tica_data, k=n_clusters, max_iter=50)

#cl.save(f'cluster_data/{sys}_{n_clusters}_mini_cluster_object.h5', overwrite=True)

cl.write_to_hdf5(f'cluster_data_11/{sys}_{n_clusters}_cluster_dtrajs22.h5')
from util.plot_structure_util import plot_vmd_cylinder_from_inds, plot_pymol_cylinder_from_inds

# Select CA-CA distance features (per chain) whose minimum distance and
# fluctuation pass the cutoffs below.
dis_cutoff = 1.0   # keep pairs that come closer than this (nm, presumably — TODO confirm units)
std_cutoff = 0.035  # keep pairs whose distance actually fluctuates
outfile = 'filtered_distance_featurization_01/filtered_dis_ind_10_035_more'
save = True
plot = 'all'  # should be all, pymol, vmd, or none

traj_num = [f'{i:04d}' for i in range(100)] 
traj_path = '../DESRES-Trajectory_sarscov2-10880334-no-water-no-ion-glueCA/sarscov2-10880334-no-water-no-ion-glueCA/sarscov2-10880334-no-water-no-ion-glueCA-'
traj_list = [ traj_path + str(i) + '.dcd' for i in traj_num]

pdb = '../DESRES_protease_chainid.pdb'
# Chain 0: all CA-CA pairs with at least 3 residues between them.
feat = coor.featurizer(pdb)
feat.add_distances(feat.pairs(feat.select('name == CA and chainid == 0'), excluded_neighbors=3))
traj = coor.load(traj_list, feat, stride=5)
traj_cat = np.concatenate(traj)

# Chain 1: same featurization on the second chain.
feat1 = coor.featurizer(pdb)
feat1.add_distances(feat1.pairs(feat1.select('name == CA and chainid == 1'), excluded_neighbors=3))
# BUG FIX: the original loaded with `feat` (chain 0) and concatenated
# `traj`, so the chain-1 featurizer was built but never used.
traj1 = coor.load(traj_list, feat1, stride=5)
traj_cat1 = np.concatenate(traj1)

# Stack both chains' frames so min/std are taken over all of them.
traj_cat_pair = np.concatenate((traj_cat, traj_cat1), axis=0)

min_dist = traj_cat_pair.min(axis=0)
std_dist = traj_cat_pair.std(axis=0)

new_dists = np.where((min_dist < dis_cutoff) & (std_dist > std_cutoff))[0]

print('new distances:', new_dists.shape)
#cluster data into a small amount of clusters to later pull out structures
import pyemma.coordinates as coor
import numpy as np

# Coarse k-means clustering of TICA data (few clusters) so representative
# structures can be pulled out later; saves both the cluster object and
# the discrete trajectories.
sys = 'back'
tica_data = coor.load('tica_data_05/back_tica_data.h5')

n_clusters = 50

cl = coor.cluster_kmeans(tica_data, k=n_clusters, max_iter=50)

cl.save(f'{sys}_{n_clusters}_mini_cluster_object.h5', overwrite=True)

cl.write_to_hdf5(f'{sys}_{n_clusters}_cluster_dtrajs.h5')
Пример #15
0
#! /usr/bin/env/ python
# Featurize trajectories several different ways to test in next step

import mdtraj as md
import pyemma.coordinates as coor
import numpy as np
import pickle
import pyemma
import os
import pandas as pd

# Estimate TICA on combined chain-0/chain-1 chi-angle feature data, save
# the cumulative kinetic variance, and write the projected data to HDF5.
var_cutoff = 0.9  # adjust to find elbow of of cumulative kinetic variance

feature_list = pd.read_pickle('feature_list_1.pickl')

data_a = coor.load('feature_data_02/backbone_chi1_2_chain_0.h5')
data_b = coor.load('feature_data_02/backbone_chi1_2_chain_1.h5')
# List concatenation: treat both chains' trajectories as one data set.
data = data_a + data_b

# commute_map scaling is used instead of the kinetic map.
tica = coor.tica(data=data, lag=10, kinetic_map=False, commute_map=True)

with open('tica_data_05/chi2_cumvar.npy', 'wb') as handle:
    np.save(handle, tica.cumvar)

# Setting var_cutoff after estimation controls how many dimensions
# get_output()/write_to_hdf5 keep.
tica.var_cutoff = var_cutoff

print('Number of dimentions saved is: ', tica.dimension())
tica.write_to_hdf5('tica_data_05/chi2_tica_data.h5')
import pyemma.coordinates as coor

# define features to load for spacetime diffusion map analysis: heavy atom coordinates only.
# NOTE(review): this fragment relies on names defined elsewhere in the
# original file: `sys` (module import), `topology`, `allfiles1`, `nskip`,
# `n_traj`, `hidden_states`, `states`, `state_idx`.
print('define basis functions: heavy atom coordinates')
print('\n')
sys.stdout.flush()

featurizer = coor.featurizer(topology)
featurizer.add_selection(featurizer.select_Heavy())

print(featurizer.dimension())
sys.stdout.flush()

# use featurizer to read in trajectory
X1 = coor.load(allfiles1, featurizer, stride=nskip)
# concatenating trajectory chunks into one single trajectory
X1 = np.vstack(X1)

print(X1.shape)

print('trajectory loaded!')
sys.stdout.flush()

# extracting the (indices) subset of configurations from the whole trajectory that was just loaded
# Keep only state indices that fall on the stride grid, rescaled to the
# strided trajectory's frame numbering.
state = [[] for k in range(n_traj)]
for i in range(n_traj):
    for j in range(hidden_states):
        state[i].append(states[i][j][np.where(states[i][j]%int(nskip) == 0)]/int(nskip))

my_idx1 = state[0][state_idx]