def featurize_file(job_tuple):

    yaml_file, protein, feat, traj_file, stride = job_tuple
    yaml_file = load_yaml_file(yaml_file)

    if feat is None:
        feat = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])

    _check_output_folder_exists(yaml_file, protein)

    output_folder = os.path.join(yaml_file["base_dir"],
                                 protein,
                                 yaml_file["feature_dir"])

    traj_name = os.path.splitext(os.path.basename(traj_file))[0]
    output_fname = os.path.join(output_folder, traj_name+".jl")

    feat_descriptor = os.path.join(output_folder, "feature_descriptor.h5")
    try:
        trj = mdt.load(traj_file)
    except Exception:
        warnings.warn("Removing %s because of malformed trajectory" % traj_file)
        os.remove(traj_file)
        return

    features = feat.partial_transform(trj)
    verbosedump(features, output_fname)

    if not os.path.isfile(feat_descriptor) and hasattr(feat, "describe_features"):
        dih_df = pd.DataFrame(feat.describe_features(trj[0]))
        verbosedump(dih_df, feat_descriptor)

    return
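# A minimal usage sketch (the YAML file, protein name, and trajectory path are
# hypothetical; the tuple layout mirrors the unpacking at the top of
# featurize_file). Passing feat=None falls back to the default phi/psi/chi1
# DihedralFeaturizer; note that stride is carried in the tuple but not applied
# when the trajectory is loaded above.
job = ("project.yaml", "protein_A", None, "trajectories/traj_0001.xtc", 1)
featurize_file(job)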
Example #2
def test_DihedralFeaturizer_describe_features_nosincos():
    feat = DihedralFeaturizer(sincos=False)
    rnd_traj = np.random.randint(len(trajectories))
    features = feat.transform([trajectories[rnd_traj]])
    df = pd.DataFrame(feat.describe_features(trajectories[rnd_traj]))

    for f in range(25):
        f_index = np.random.choice(len(df))

        atom_inds = df.iloc[f_index].atominds
        feature_value = md.compute_dihedrals(trajectories[rnd_traj],
                                             [atom_inds])
        if feat.sincos:
            func = getattr(np, '%s' % df.iloc[f_index].otherinfo)
            feature_value = func(feature_value)

        assert (features[0][:, f_index] == feature_value.flatten()).all()
Example #4
def Get_dihedral_features_villin():
    import os
    import shutil
    import mdtraj as md
    os.chdir('/homes/anuginueni/traj_villin')
    if os.path.isdir('./diheds'):
        shutil.rmtree('./diheds')
    from msmbuilder.dataset import dataset
    t = md.load("/homes/anuginueni/traj_villin/trajectory-331.xtc",
                top='/homes/anuginueni/traj_villin/filtered.pdb', stride=5)
    xyz = dataset("/homes/anuginueni/traj_villin/*.xtc",
                  topology='/homes/anuginueni/traj_villin/filtered.pdb', stride=5)
    from msmbuilder.featurizer import DihedralFeaturizer  # for dihedrals
    featurizer = DihedralFeaturizer(types=['phi', 'psi'])  # for dihedrals
    diheds = xyz.fit_transform_with(featurizer, 'diheds/', fmt='dir-npy')  # for dihedrals
    des_feat = featurizer.describe_features(t)
    res = [sub['resids'] for sub in des_feat]
    print(str(res))
    return diheds
Example #5
def test_FeatureSelector_describe_features():
    rnd_traj = np.random.randint(len(trajectories))
    f_ca = ContactFeaturizer(scheme='CA', ignore_nonprotein=True)
    f1 = f_ca.transform([trajectories[rnd_traj]])
    df1 = pd.DataFrame(f_ca.describe_features(trajectories[rnd_traj]))

    f_dih = DihedralFeaturizer()
    f2 = f_dih.transform([trajectories[rnd_traj]])
    df2 = pd.DataFrame(f_dih.describe_features(trajectories[rnd_traj]))

    df_dict = {}
    df_dict["ca"] = df1
    df_dict["dih"] = df2

    f_comb = FeatureSelector([('ca', f_ca), ('dih', f_dih)])
    f3 = f_comb.transform([trajectories[rnd_traj]])
    df3 = pd.DataFrame(f_comb.describe_features(trajectories[rnd_traj]))
    assert len(df3) == len(df1) + len(df2)
    df4 = pd.concat([df_dict[i] for i in f_comb.feat_list])
    # let's randomly compare 40 features
    for i in np.random.choice(range(len(df3)), 40):
        for j in df3.columns:
            assert eq(df3.iloc[i][j], df4.iloc[i][j])
Example #7
def featurize_traj(job_tuple):
    # unpack the job tuple into its components
    (mutant, mutant_dir, project, proj_folder, proj_top_folder, traj_file,
     stride, save_common, allowed_residue_ind) = job_tuple

    # load the topology file matching this trajectory
    top_path = os.path.join(proj_top_folder,
                            "%s.pdb" % os.path.basename(traj_file).split("_")[0])
    top_trj = mdtraj.load(top_path)

    # set up featurizer objects
    dihedral_feat = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])

    # load the trajectory
    try:
        trj = mdtraj.load(traj_file, stride=stride)
    except Exception:
        print("Can't featurize %s" % traj_file)
        return

    # set up output file names
    traj_name = os.path.splitext(os.path.basename(traj_file))[0]
    print(traj_name)
    dihedral_output_file = os.path.join(mutant_dir, "features/dihedral_features/") + \
        str(project) + "_" + traj_name + ".h5"
    water_output_file = os.path.join(mutant_dir, "features/water_features/") + \
        str(project) + "_" + traj_name + ".h5"
    combined_output_file = os.path.join(mutant_dir, "features/combined_features/") + \
        str(project) + "_" + traj_name + ".h5"

    do_again = True
    already_done = False
    if os.path.isfile(combined_output_file):
        f = verboseload(combined_output_file)
        # the stored features are complete if the frame counts match
        if f.shape[0] == trj.n_frames:
            already_done = True

    if not already_done or do_again:
        dihedral_features = dihedral_feat.partial_transform(trj)

        # dump the full dihedral feature set
        verbosedump(dihedral_features, dihedral_output_file)

        if save_common:
            dih_df = pandas.DataFrame(dihedral_feat.describe_features(top_trj))

            # keep only features whose residues lie in the allowed set
            dih_f_ind = numpy.array([set(i).issubset(allowed_residue_ind)
                                     for i in dih_df["resid"]])
            subset_dihedral_features = dihedral_features[:, dih_f_ind]

            dihedral_output_file = os.path.join(
                mutant_dir, "features/common_basis/dihedral_features/") + \
                str(project) + "_" + traj_name + ".h5"

            # dump the common-basis subset
            verbosedump(subset_dihedral_features, dihedral_output_file)
            # save the featurizer information
            verbosedump([dih_df, allowed_residue_ind, dih_f_ind],
                        os.path.join(mutant_dir,
                                     "features/common_basis/dihedral_features/") +
                        "saved_dihed_feat.h5")
            return
    else:
        print("Skipping featurization for %s since it is already done" % traj_name)
    return
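# A minimal usage sketch for featurize_traj. Every value below is a placeholder
# (hypothetical paths, project id, and residue set); the tuple layout mirrors
# the unpacking at the top of the function.
job = ("mutant_A",                                      # mutant
       "/data/mutant_A",                                # mutant_dir
       10467,                                           # project
       "/data/mutant_A/trajectories",                   # proj_folder
       "/data/mutant_A/topologies",                     # proj_top_folder
       "/data/mutant_A/trajectories/run0_clone1.xtc",   # traj_file
       5,                                               # stride
       True,                                            # save_common
       set(range(10, 50)))                              # allowed_residue_ind
featurize_traj(job)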
Example #8
top = md.load("../top.pdb")

trj_list = [md.load(i, top=top) for i in flist]
print("Found %d trajs" % len(trj_list))

f = DihedralFeaturizer(sincos=False)
dump(f, "raw_featurizer.pkl")

feat = f.transform(trj_list)

dump(feat, "raw_features.pkl")

f = load("./featurizer.pkl")
dump(f, "featurizer.pkl")
df1 = pd.DataFrame(f.describe_features(trj_list[0]))
dump(df1, "feature_descriptor.pkl")
feat = f.transform(trj_list)

dump(feat, "features.pkl")

t = tICA(lag_time=100, n_components=2, kinetic_mapping=False)

tica_feat = t.fit_transform(feat)

dump(t, "tica_mdl.pkl")
dump(tica_feat, "tica_features.pkl")

kmeans_mdl = KMeans(50)
ass = kmeans_mdl.fit_predict(tica_feat)
msm_mdl = MarkovStateModel(100)
#msmbuilder imports
from msmbuilder.dataset import dataset
from msmbuilder.featurizer import ContactFeaturizer
from msmbuilder.featurizer import DihedralFeaturizer
from msmbuilder.decomposition import tICA
from msmbuilder.cluster import MiniBatchKMeans
from msmbuilder.msm import ContinuousTimeMSM
from msmbuilder.utils import verbosedump, verboseload
from msmbuilder.cluster import KCenters
from msmbuilder.utils import load, dump

#other imports
import os, glob, shutil
import numpy as np
import mdtraj as md
import pandas as pd
import pickle
#prettier plots

a = np.arange(1119, 1277)
top = md.load("../prot.pdb", atom_indices=a)

# Swap this for whatever you have. The code currently supports contacts, dihedrals, and angles.
feat = DihedralFeaturizer(types=['chi1', 'chi2'])

# this basically maps every feature to atom indices.
df1 = pd.DataFrame(feat.describe_features(top))
dump(df1, "feature_descriptor.pkl")
Example #10
featurizer = DihedralFeaturizer(types=['chi1', 'chi2'])
#dump(featurizer,"raw_featurizer.pkl")

#from msmbuilder.utils import load,dump
f = DihedralFeaturizer(types=['chi1', 'chi2'], sincos=False)
dump(f, "raw_featurizer.pkl")

#featurizer = DihedralFeaturizer(types=['chi1', 'chi2'], resids= 73,74,75,76,77,78,79,80,81,82,83)
diheds = featurizer.fit_transform(ds)
dump(diheds, "features.pkl")

#print(ds[0].shape)
print(diheds[0].shape)

# this basically maps every feature to atom indices.
df1 = pd.DataFrame(featurizer.describe_features(ds))
dump(df1, "feature_descriptor.pkl")

#Robust scaling
from msmbuilder.preprocessing import RobustScaler
scaler = RobustScaler()
scaled_diheds = scaler.fit_transform(diheds)

print(diheds[0].shape)
print(scaled_diheds[0].shape)

#Reducing dimension
tica_model = tICA(lag_time=1, n_components=10)
# fit and transform can be done in separate steps:
tica_model.fit(diheds)
tica_trajs = tica_model.transform(diheds)
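# Equivalently, the two steps above can be collapsed into a single call; note
# that the model here is fit on the unscaled diheds, so scaled_diheds from the
# RobustScaler step could be passed instead if the scaled features are wanted:
tica_trajs = tica_model.fit_transform(diheds)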