def featurize_file(job_tuple):
    """Featurize a single trajectory file and dump the result to disk.

    Parameters
    ----------
    job_tuple : tuple
        ``(yaml_file, protein, feat, traj_file, stride)``.  ``yaml_file`` is
        handed to ``load_yaml_file`` and the result is indexed with
        ``"base_dir"`` and ``"feature_dir"``.  ``feat`` may be ``None``, in
        which case a phi/psi/chi1 ``DihedralFeaturizer`` is used.

    Returns
    -------
    None

    Side effects
    ------------
    * Writes ``<base_dir>/<protein>/<feature_dir>/<traj_name>.jl``.
    * Writes ``feature_descriptor.h5`` in the same folder if it does not
      exist yet and the featurizer offers ``describe_features``.
    * If the trajectory cannot be loaded, a warning is issued and the
      trajectory file is deleted.
    """
    # NOTE(review): `stride` is unpacked but never used below -- confirm
    # whether the trajectory was meant to be loaded with it.
    yaml_file, protein, feat, traj_file, stride = job_tuple
    yaml_file = load_yaml_file(yaml_file)

    if feat is None:
        feat = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])

    _check_output_folder_exists(yaml_file, protein)

    output_folder = os.path.join(yaml_file["base_dir"],
                                 protein,
                                 yaml_file["feature_dir"])
    traj_name = os.path.splitext(os.path.basename(traj_file))[0]
    output_fname = os.path.join(output_folder, traj_name + ".jl")
    feat_descriptor = os.path.join(output_folder, "feature_descriptor.h5")

    try:
        trj = mdt.load(traj_file)
    except Exception:
        # Only genuine load errors may trigger the delete.  A bare `except`
        # would also catch KeyboardInterrupt/SystemExit and remove a
        # perfectly good trajectory when the run is merely interrupted.
        warnings.warn("Removing %s because of misformed trajectory" % traj_file)
        os.remove(traj_file)
        return

    features = feat.partial_transform(trj)
    verbosedump(features, output_fname)

    # The descriptor is written once per output folder; the first frame is
    # enough to describe the features.
    if not os.path.isfile(feat_descriptor) and hasattr(feat, "describe_features"):
        dih_df = pd.DataFrame(feat.describe_features(trj[0]))
        verbosedump(dih_df, feat_descriptor)

    return
def test_DihedralFeaturizer_describe_features_nosincos():
    """Check describe_features against raw dihedrals when sincos is off.

    Picks one trajectory at random, featurizes it with
    ``DihedralFeaturizer(sincos=False)``, and for 25 randomly drawn feature
    columns recomputes the dihedral from the atom indices reported by
    ``describe_features``.  Each recomputed column must equal the
    corresponding featurizer output column exactly.
    """
    feat = DihedralFeaturizer(sincos=False)

    traj_idx = np.random.randint(len(trajectories))
    traj = trajectories[traj_idx]

    features = feat.transform([traj])
    description = pd.DataFrame(feat.describe_features(traj))

    for _ in range(25):
        column = np.random.choice(len(description))
        row = description.iloc[column]

        expected = md.compute_dihedrals(traj, [row.atominds])
        if feat.sincos:
            # Apply the numpy function named in the descriptor (e.g. sin/cos).
            transform = getattr(np, '%s' % row.otherinfo)
            expected = transform(expected)

        observed = features[0][:, column]
        assert (observed == expected.flatten()).all()
def Get_dihedral_features_villin():
    """Compute phi/psi dihedral features for the villin trajectories.

    Changes the working directory to the villin trajectory folder, removes
    any existing ``./diheds`` directory, featurizes every ``*.xtc`` file
    (stride 5) with a phi/psi ``DihedralFeaturizer`` via
    ``fit_transform_with`` (target ``'diheds/'``, format ``'dir-npy'``),
    prints the ``'resids'`` entry of every described feature, and returns
    the transformed dataset.
    """
    import os
    import shutil
    import mdtraj as md

    os.chdir('/homes/anuginueni/traj_villin')

    # Drop the output of an earlier run, if there is one.
    previous_output = './diheds'
    if os.path.isdir(previous_output):
        shutil.rmtree(previous_output)

    from msmbuilder.dataset import dataset

    # One trajectory is loaded directly; it is only used to describe features.
    reference_traj = md.load(
        "/homes/anuginueni/traj_villin/trajectory-331.xtc",
        top='/homes/anuginueni/traj_villin/filtered.pdb',
        stride=5,
    )
    all_trajs = dataset(
        "/homes/anuginueni/traj_villin/*.xtc",
        topology='/homes/anuginueni/traj_villin/filtered.pdb',
        stride=5,
    )

    from msmbuilder.featurizer import DihedralFeaturizer

    dihedral_featurizer = DihedralFeaturizer(types=['phi', 'psi'])
    diheds = all_trajs.fit_transform_with(
        dihedral_featurizer, 'diheds/', fmt='dir-npy')

    residue_ids = []
    for feature_info in dihedral_featurizer.describe_features(reference_traj):
        residue_ids.append(feature_info['resids'])
    print(str(residue_ids))

    return diheds
def test_FeatureSelector_describe_features():
    """Check that FeatureSelector descriptions are its members' descriptions.

    A CA-contact featurizer and a dihedral featurizer are described on one
    randomly chosen trajectory, individually and combined through a
    ``FeatureSelector``.  The combined description must have as many rows as
    both parts together, and 40 randomly drawn rows must match, column by
    column, the per-featurizer descriptions concatenated in
    ``feat_list`` order.
    """
    traj = trajectories[np.random.randint(len(trajectories))]

    contact_feat = ContactFeaturizer(scheme='CA', ignore_nonprotein=True)
    contact_feat.transform([traj])
    contact_df = pd.DataFrame(contact_feat.describe_features(traj))

    dihedral_feat = DihedralFeaturizer()
    dihedral_feat.transform([traj])
    dihedral_df = pd.DataFrame(dihedral_feat.describe_features(traj))

    frames_by_name = {"ca": contact_df, "dih": dihedral_df}

    selector = FeatureSelector([('ca', contact_feat), ('dih', dihedral_feat)])
    selector.transform([traj])
    combined_df = pd.DataFrame(selector.describe_features(traj))

    assert len(combined_df) == len(contact_df) + len(dihedral_df)

    ordered_frames = []
    for name in selector.feat_list:
        ordered_frames.append(frames_by_name[name])
    expected_df = pd.concat(ordered_frames)

    # Spot-check 40 randomly chosen feature rows.
    for row in np.random.choice(range(len(combined_df)), 40):
        combined_row = combined_df.iloc[row]
        expected_row = expected_df.iloc[row]
        for column in combined_df.columns:
            assert eq(combined_row[column], expected_row[column])
def featurize_traj(job_tuple):
    """Compute and save phi/psi/chi1 dihedral features for one trajectory.

    Parameters
    ----------
    job_tuple : tuple
        ``(mutant, mutant_dir, project, proj_folder, proj_top_folder,
        traj_file, stride, save_common, allowed_residue_ind)``.

    Returns
    -------
    None

    Side effects
    ------------
    * Writes ``<mutant_dir>/features/dihedral_features/<project>_<traj>.h5``.
    * If ``save_common`` is true, also writes the feature columns whose
      residues are all contained in ``allowed_residue_ind`` to
      ``features/common_basis/dihedral_features/`` together with
      ``saved_dihed_feat.h5`` (descriptor, allowed residues, column mask).
    * Prints progress messages; a trajectory that fails to load is reported
      and skipped.
    """
    # Separate out the job tuple into the required pieces.
    (mutant, mutant_dir, project, proj_folder, proj_top_folder, traj_file,
     stride, save_common, allowed_residue_ind) = job_tuple

    # The topology file is named after the part of the trajectory file name
    # that precedes the first underscore.
    top_path = os.path.join(
        proj_top_folder, "%s.pdb" % os.path.basename(traj_file).split("_")[0])
    top_trj = mdtraj.load(top_path)

    dihedral_feat = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])

    try:
        trj = mdtraj.load(traj_file, stride=stride)
    except Exception:
        # Not a bare `except`: Ctrl-C / SystemExit must still stop the job.
        print("Cant featurize %s" % traj_file)
        return

    traj_name = os.path.splitext(os.path.basename(traj_file))[0]
    print(traj_name)

    # All per-trajectory outputs share the same "<project>_<traj>.h5" name.
    out_name = str(project) + "_" + traj_name + ".h5"
    dihedral_output_file = (
        os.path.join(mutant_dir, "features/dihedral_features/") + out_name)
    combined_output_file = (
        os.path.join(mutant_dir, "features/combined_features/") + out_name)

    # `do_again` forces re-featurization even if a result already exists.
    do_again = True
    already_done = False
    if os.path.isfile(combined_output_file):
        f = verboseload(combined_output_file)
        # A previous result counts as done only when it covers every frame
        # of the trajectory (the original compared with `!=`, which marked
        # mismatching results as done).
        if f.shape[0] == trj.n_frames:
            already_done = True

    if already_done and not do_again:
        print("skipping featurization for %s since its already done" % traj_name)
        return

    dihedral_features = dihedral_feat.partial_transform(trj)
    verbosedump(dihedral_features, dihedral_output_file)

    if save_common:
        dih_df = pandas.DataFrame(dihedral_feat.describe_features(top_trj))
        # NOTE(review): the column name "resid" must match what
        # describe_features emits in the installed msmbuilder -- confirm.
        dih_f_ind = numpy.array(
            [set(i).issubset(allowed_residue_ind) for i in dih_df["resid"]])
        subset_dihedral_features = dihedral_features[:, dih_f_ind]

        common_dir = os.path.join(
            mutant_dir, "features/common_basis/dihedral_features/")
        verbosedump(subset_dihedral_features, common_dir + out_name)
        # Save the featurizer information alongside the subset.
        verbosedump([dih_df, allowed_residue_ind, dih_f_ind],
                    common_dir + "saved_dihed_feat.h5")
    return
# Pipeline: load trajectories -> dihedral features -> tICA -> k-means.
# `flist` (the trajectory paths) is expected to be defined earlier.

# Load the shared topology, then every trajectory against it.
top = md.load("../top.pdb")
trj_list = [md.load(i, top=top) for i in flist]
print("Found %d trajs" % len(trj_list))

# Dihedral featurizer with sincos disabled; persist it and its output.
f = DihedralFeaturizer(sincos=False)
dump(f, "raw_featurizer.pkl")
feat = f.transform(trj_list)
dump(feat, "raw_features.pkl")

# From here on `f` is a featurizer read back from disk, replacing the raw
# one above.
# NOTE(review): it is loaded from and immediately dumped to the same file
# name ("featurizer.pkl") -- confirm this round trip is intentional.
f = load("./featurizer.pkl")
dump(f, "featurizer.pkl")

# Describe the features using the first trajectory only.
df1 = pd.DataFrame(f.describe_features(trj_list[0]))
dump(df1, "feature_descriptor.pkl")

# Re-featurize with the loaded featurizer; this overwrites `feat`.
feat = f.transform(trj_list)
dump(feat, "features.pkl")

# Project onto 2 tICA components and persist model and projection.
t = tICA(lag_time=100, n_components=2, kinetic_mapping=False)
tica_feat = t.fit_transform(feat)
dump(t, "tica_mdl.pkl")
dump(tica_feat, "tica_features.pkl")

# Cluster the tICA projection (50 is passed positionally -- presumably the
# number of clusters; confirm against the KMeans signature).
kmeans_mdl = KMeans(50)
ass = kmeans_mdl.fit_predict(tica_feat)

# The MSM is only constructed here, not fitted (100 is passed positionally
# -- presumably the lag time; confirm).
msm_mdl = MarkovStateModel(100)
#msmbuilder imports from msmbuilder.dataset import dataset from msmbuilder.featurizer import ContactFeaturizer from msmbuilder.featurizer import DihedralFeaturizer from msmbuilder.decomposition import tICA from msmbuilder.cluster import MiniBatchKMeans from msmbuilder.msm import ContinuousTimeMSM from msmbuilder.utils import verbosedump, verboseload from msmbuilder.cluster import KCenters from msmbuilder.utils import load, dump #other imports import os, glob, shutil import numpy as np import mdtraj as md import pandas as pd import pickle #prettier plots a = np.arange(1119, 1277) top = md.load("../prot.pdb", atom_indices=a) # swap this for whatever you have. The code for now supports contacts, dihedral, and angles. feat = DihedralFeaturizer(types=['chi1', 'chi2']) # this basically maps every feature to atom indices. df1 = pd.DataFrame(feat.describe_features(top)) dump(df1, "feature_descriptor.pkl")
featurizer = DihedralFeaturizer(types=['chi1', 'chi2']) #dump(featurizer,"raw_featurizer.pkl") #from msmbuilder.utils import load,dump f = DihedralFeaturizer(types=['chi1', 'chi2'], sincos=False) dump(f, "raw_featurizer.pkl") #featurizer = DihedralFeaturizer(types=['chi1', 'chi2'], resids= 73,74,75,76,77,78,79,80,81,82,83) diheds = featurizer.fit_transform(ds) dump(diheds, "features.pkl") #print(ds[0].shape) print(diheds[0].shape) # this basically maps every feature to atom indices. df1 = pd.DataFrame(featurizer.describe_features(ds)) dump(df1, "feature_descriptor.pkl") #Robust scaling from msmbuilder.preprocessing import RobustScaler scaler = RobustScaler() scaled_diheds = scaler.fit_transform(diheds) print(diheds[0].shape) print(scaled_diheds[0].shape) #Reducing dimension tica_model = tICA(lag_time=1, n_components=10) # fit and transform can be done in seperate steps: tica_model.fit(diheds) tica_trajs = tica_model.transform(diheds)