def read_and_featurize(filename, dihedrals=['chi2'], stride=10):
    """Featurize the dihedrals of selected residue-93 atoms and dump them.

    The topology is read from the first frame of ``filename`` and used to
    select the atoms whose residue has ``resSeq == 93``, is not named
    ``"POPC"`` and whose string form starts with ``"H"``.  Only those atoms
    are loaded; the requested dihedral types are computed with
    ``DihedralFeaturizer`` and the result is handed to ``verbosedump``.

    Parameters
    ----------
    filename : str
        "/"-separated path of the trajectory file.  Its last component names
        the output file, the component before it names the output
        sub-directory.
    dihedrals : list of str
        Passed to ``DihedralFeaturizer(types=...)``.
    stride : int
        Only used to label the output file name (``..._stride%d.h5``).

    Returns
    -------
    The object returned by ``DihedralFeaturizer.transform``.
    """
    top = md.load_frame(filename, 0).topology

    # Compare the residue *name*: the previous test `a.residue != "POPC"`
    # compared a Residue object with a plain string and was therefore
    # always true.
    atom_indices = [
        a.index
        for a in top.atoms
        if a.residue.resSeq == 93
        and a.residue.name != "POPC"
        and str(a.residue)[0] == "H"
    ]
    print(len(atom_indices))

    # NOTE(review): frames are loaded with a hard-coded stride of 1000 while
    # the output file is labelled with the `stride` argument -- confirm which
    # of the two is intended.
    traj = md.load(filename, stride=1000, atom_indices=atom_indices)

    featurizer = DihedralFeaturizer(types=dihedrals)
    features = featurizer.transform(traj_list=traj)

    # Derive "<root>/<condition>/<trajectory stem>_features_stride<N>.h5".
    directory = filename.split("/")
    condition = directory[len(directory) - 2]
    dcd_file = directory[len(directory) - 1]
    new_file = "%s_features_stride%d.h5" % (dcd_file.rsplit(".", 1)[0], stride)
    new_root_dir = "/scratch/users/enf/b2ar_analysis/subsampled_features"
    new_file_full = "%s/%s/%s" % (new_root_dir, condition, new_file)

    verbosedump(features, new_file_full)
    return features
def read_and_featurize(filename, dihedrals=['phi', 'psi', 'chi2'], stride=10):
    """Load a trajectory, compute its dihedral features and dump them.

    The condition label is the part of the file's base name that precedes
    the first "_" (and, within that, the first ".").  The features are
    passed to ``verbosedump``; nothing is returned.  ``stride`` is accepted
    but not used by this function.
    """
    print("reading and featurizing %s" % (filename))

    trajectory = md.load(filename)
    features = DihedralFeaturizer(types=dihedrals).transform(traj_list=trajectory)

    # Condition name comes from the file name itself, not from its directory.
    base_name = filename.split("/")[-1]
    condition = base_name.partition("_")[0].partition(".")[0]
    print("Condition %s has features of shape %s" % (condition, np.shape(features)))

    out_path = "/scratch/users/enf/b2ar_analysis/combined_features/%s_features.h5" % condition
    verbosedump(features, out_path)
def read_and_featurize(filename, dihedrals=['phi', 'psi', 'chi2'], stride=10):
    """Compute dihedral features for one trajectory file and dump them.

    The output file is named after the "condition", i.e. the leading part of
    the trajectory's base name up to the first "_" or ".".  The function
    returns nothing and does not use ``stride``.
    """
    print("reading and featurizing %s" % (filename))

    loaded = md.load(filename)
    dihedral_featurizer = DihedralFeaturizer(types=dihedrals)
    features = dihedral_featurizer.transform(traj_list=loaded)

    # Last path component -> text before the first "_" -> text before the first ".".
    path_pieces = filename.split("/")
    file_name = path_pieces[-1]
    before_underscore = file_name.split("_")[0]
    condition = before_underscore.split(".")[0]

    shape = np.shape(features)
    print("Condition %s has features of shape %s" % (condition, shape))

    destination = "/scratch/users/enf/b2ar_analysis/combined_features/%s_features.h5" % condition
    verbosedump(features, destination)
def read_and_featurize_divided(filename, dihedrals=['phi', 'psi', 'chi2'], stride=10):
    """Featurize a trajectory after dropping residues whose name starts with "HI".

    Atom indices are chosen from the topology of the first frame, the
    trajectory is loaded restricted to those atoms, and the dihedral
    features are dumped under the ``subsampled_features`` tree in a
    sub-directory named after the trajectory's parent directory.

    Returns the features produced by ``DihedralFeaturizer.transform``.
    """
    topology = md.load_frame(filename, 0).topology

    kept_atoms = []
    for atom in topology.atoms:
        if atom.residue.name[0:2] != "HI":
            kept_atoms.append(atom.index)

    trajectory = md.load(filename, atom_indices=kept_atoms)
    features = DihedralFeaturizer(types=dihedrals).transform(traj_list=trajectory)

    # "<root>/<parent directory>/<file stem>_features_stride<N>.h5"
    parts = filename.split("/")
    condition = parts[len(parts) - 2]
    stem = parts[-1].rsplit(".", 1)[0]
    out_name = "%s_features_stride%d.h5" % (stem, stride)
    out_root = "/scratch/users/enf/b2ar_analysis/subsampled_features"
    out_path = "%s/%s/%s" % (out_root, condition, out_name)

    verbosedump(features, out_path)
    return features
def read_and_featurize_divided(filename, dihedrals=['phi', 'psi', 'chi2'], stride=10):
    """Compute and dump dihedral features, skipping "HI*" residues.

    The first frame's topology decides which atoms are loaded: every atom
    whose residue name does not begin with "HI".  The resulting features are
    dumped with ``verbosedump`` and also returned.  ``stride`` only appears
    in the output file name.
    """
    first_frame_top = md.load_frame(filename, 0).topology
    selected = [
        atom.index
        for atom in first_frame_top.atoms
        if not atom.residue.name[0:2] == "HI"
    ]

    subset_traj = md.load(filename, atom_indices=selected)
    dihedral_featurizer = DihedralFeaturizer(types=dihedrals)
    features = dihedral_featurizer.transform(traj_list=subset_traj)

    # The parent directory of the file is used as the condition name.
    path_pieces = filename.split("/")
    n_pieces = len(path_pieces)
    condition = path_pieces[n_pieces - 2]
    file_stem = path_pieces[n_pieces - 1].rsplit(".", 1)[0]

    target_name = "%s_features_stride%d.h5" % (file_stem, stride)
    target_root = "/scratch/users/enf/b2ar_analysis/subsampled_features"
    target_path = "%s/%s/%s" % (target_root, condition, target_name)

    verbosedump(features, target_path)
    return features
def read_and_featurize(filename, dihedrals=['chi2'], stride=10):
    """Featurize the dihedrals of selected residue-93 atoms and dump them.

    Atoms are picked from the topology of the first frame of ``filename``:
    their residue must have ``resSeq == 93``, must not be named ``"POPC"``
    and its string form must start with ``"H"``.  The trajectory is loaded
    restricted to those atoms, featurized with ``DihedralFeaturizer`` and the
    features are passed to ``verbosedump``.

    Parameters
    ----------
    filename : str
        "/"-separated path of the trajectory file.  The last component names
        the output file; the one before it names the output sub-directory.
    dihedrals : list of str
        Passed to ``DihedralFeaturizer(types=...)``.
    stride : int
        Only used in the output file name (``..._stride%d.h5``).

    Returns
    -------
    The object returned by ``DihedralFeaturizer.transform``.
    """
    top = md.load_frame(filename, 0).topology

    # The residue *name* is what has to differ from "POPC"; comparing the
    # Residue object itself with a string (as before) is always true.
    atom_indices = [
        a.index
        for a in top.atoms
        if a.residue.resSeq == 93
        and a.residue.name != "POPC"
        and str(a.residue)[0] == "H"
    ]
    print(len(atom_indices))

    # NOTE(review): the load stride is fixed at 1000 although the output file
    # name advertises the `stride` argument -- confirm which is intended.
    traj = md.load(filename, stride=1000, atom_indices=atom_indices)

    featurizer = DihedralFeaturizer(types=dihedrals)
    features = featurizer.transform(traj_list=traj)

    # Output path: "<root>/<condition>/<trajectory stem>_features_stride<N>.h5".
    directory = filename.split("/")
    condition = directory[len(directory) - 2]
    dcd_file = directory[len(directory) - 1]
    new_file = "%s_features_stride%d.h5" % (dcd_file.rsplit(".", 1)[0], stride)
    new_root_dir = "/scratch/users/enf/b2ar_analysis/subsampled_features"
    new_file_full = "%s/%s/%s" % (new_root_dir, condition, new_file)

    verbosedump(features, new_file_full)
    return features
def test_function_featurizer():
    """FunctionFeaturizer output must match DihedralFeaturizer for phi.

    Three routes to the same feature are compared on the first cached
    alanine-dipeptide trajectory: ``compute_dihedrals`` with explicit
    ``func_args``, a wrapper around ``compute_phi`` without extra arguments,
    and ``DihedralFeaturizer(['phi'], sincos=False)``.
    """
    first_traj = AlanineDipeptide().get_cached().trajectories[0]

    # Route 1: a plain function plus keyword arguments.  The atom quadruplet
    # is the one used to compute phi for ala.
    phi_atoms = [[4, 6, 8, 14]]
    with_args = FunctionFeaturizer(compute_dihedrals,
                                   func_args={"indices": phi_atoms})
    expected = with_args.transform([first_traj])

    # Route 2: a function wrapping another function, no extra arguments.
    def phi_values(trj):
        return compute_phi(trj)[1]

    wrapped = FunctionFeaturizer(phi_values).transform([first_traj])

    # Route 3: the known-good featurizer.
    reference = DihedralFeaturizer(['phi'], sincos=False).transform([first_traj])

    np.testing.assert_array_almost_equal(expected, wrapped)
    np.testing.assert_array_almost_equal(expected, reference)
def test_get_common_features():
    """Common-feature lists must be as long as the featurizer's output width.

    For every protein in the project YAML, the number of entries returned by
    ``_get_common_features`` is compared with the number of columns of the
    first array produced by ``DihedralFeaturizer.transform`` on a randomly
    loaded trajectory of that protein.
    """
    project = load_yaml_file(os.path.join(base_dir, "mdl_dir", "project.yaml"))
    proteins = project["protein_list"]

    # Chain-0 sequence of one trajectory per protein.
    sequences = {
        name: load_random_traj(project, name).top.to_fasta(chain=0)
        for name in proteins
    }

    featurizer = DihedralFeaturizer()
    common, _unused = _get_common_features(project, featurizer, sequences, False)

    for name in project["protein_list"]:
        traj = load_random_traj(project, name)
        n_columns = featurizer.transform(traj)[0].shape[1]
        assert len(common[name]) == n_columns
def test_code_works():
    """Smoke test: fit a 4-state VonMisesHMM on the ALA2 phi/psi data.

    Nothing fancy, this mainly makes sure the code runs without erroring
    out.  Afterwards it checks that three timescales are reported and that
    at least one of them exceeds 50.
    """
    trajectories = AlanineDipeptide().get_cached().trajectories

    # NOTE(review): the second positional argument is a one-frame slice of
    # the first trajectory -- confirm the constructor still expects it.
    featurizer = DihedralFeaturizer(['phi', 'psi'], trajectories[0][0])
    sequences = featurizer.transform(trajectories)

    hmm = VonMisesHMM(n_states=4, n_init=1)
    hmm.fit(sequences)

    # The closing parenthesis used to sit after the comparison
    # (`len(hmm.timescales_ == 3)`), which took the length of the comparison
    # result instead of checking the number of timescales.
    assert len(hmm.timescales_) == 3
    assert np.any(hmm.timescales_ > 50)
def test_DihedralFeaturizer_describe_features_nosincos():
    """describe_features must agree with transform when sincos is off.

    For 25 randomly drawn feature columns of a randomly chosen trajectory,
    the dihedral is recomputed from the atom indices listed in the feature
    description and compared element-wise with the transformed column.
    """
    feat = DihedralFeaturizer(sincos=False)
    traj = trajectories[np.random.randint(len(trajectories))]

    features = feat.transform([traj])
    descriptions = pd.DataFrame(feat.describe_features(traj))

    for _ in range(25):
        column = np.random.choice(len(descriptions))
        row = descriptions.iloc[column]

        recomputed = md.compute_dihedrals(traj, [row.atominds])
        if feat.sincos:
            # Apply the numpy function named in the description.
            transform = getattr(np, '%s' % row.otherinfo)
            recomputed = transform(recomputed)

        matches = features[0][:, column] == recomputed.flatten()
        assert matches.all()
def fit_and_transform(directory, stride=5):
    """Fit (or load) a 4-component tICA model and project the reference PDB.

    If the projected-data file for this ``stride`` does not exist, feature
    files found under ``directory`` are loaded in a process pool, the tICA
    model is fitted (or loaded if its file already exists) and the projected
    features are dumped.  Otherwise both the model and the projected data are
    loaded from disk.  Finally the reference PDB is featurized on a filtered
    atom subset, projected with the model, and the first four entries of the
    projection are printed.
    """
    projected_path = "/scratch/users/enf/b2ar_analysis/phi_psi_chi_stride%d_projected.h5" % stride
    model_path = "/scratch/users/enf/b2ar_analysis/phi_psi_chi2_stride%s_tica_coords.h5" % stride
    reference_pdb = "/scratch/users/enf/b2ar_analysis/system_B.pdb"
    tica_model = tICA(n_components=4)

    if os.path.exists(projected_path):
        fit_model = verboseload(model_path)
        transformed_data = verboseload(projected_path)
    else:
        print("loading feature files")
        feature_files = get_trajectory_files(directory)
        pool = mp.Pool(mp.cpu_count())
        features = pool.map(load_features, feature_files)
        pool.terminate()

        if os.path.exists(model_path):
            print("loading tICA model")
            fit_model = verboseload(model_path)
        else:
            print("fitting data to tICA model")
            fit_model = tica_model.fit(features)
            verbosedump(fit_model, model_path)

        # Both sub-cases project the freshly loaded features and store them.
        transformed_data = fit_model.transform(features)
        verbosedump(transformed_data, projected_path)

    # Atom subset of the reference structure: protein atoms, minus a fixed
    # list of residue numbers, "HI*" residues and NMA/NME/ACE residues.
    skipped_resseqs = (341, 79, 296, 269, 178, 93)
    skipped_names = ("NMA", "NME", "ACE")
    reference_top = md.load(reference_pdb).topology
    atom_indices = []
    for atom in reference_top.atoms:
        residue = atom.residue
        if not residue.is_protein:
            continue
        if residue.resSeq in skipped_resseqs:
            continue
        if residue.name[0:2] == "HI" or residue.name in skipped_names:
            continue
        atom_indices.append(atom.index)

    active_pdb = md.load(reference_pdb, atom_indices=atom_indices)
    featurizer = DihedralFeaturizer(types=['phi', 'psi', 'chi2'])
    active_pdb_features = featurizer.transform(active_pdb)
    active_pdb_projected = fit_model.transform(active_pdb_features)
    print(active_pdb_projected[0:4])
def test_pickle():
    """Check that a fitted VonMisesHMM survives a pickle round trip.

    The model is fitted on the alanine-dipeptide phi/psi features, written to
    a temporary file with ``pickle`` and read back; the log-probability
    returned by ``predict`` must be the same before and after.
    """
    trajs = AlanineDipeptide().get_cached().trajectories
    first = trajs[0]
    # The selection result is not used further; the call is kept as in the
    # original test.
    first.topology.select('symbol C or symbol O or symbol N')

    sequences = DihedralFeaturizer(['phi', 'psi'], first[0]).transform(trajs)

    original = VonMisesHMM(n_states=4, n_init=1)
    original.fit(sequences)
    logprob_before, _hidden_before = original.predict(sequences)

    with tempfile.TemporaryFile() as handle:
        pickle.dump(original, handle)
        # Rewind so the load starts at the beginning of the file.
        handle.seek(0, 0)
        restored = pickle.load(handle)

    logprob_after, _hidden_after = restored.predict(sequences)
    assert (logprob_before == logprob_after)
def read_and_featurize(filename, dihedrals=['phi','psi','chi2'], stride=10):
    """Featurize the protein atoms of one chain and dump the dihedral features.

    The trajectory is loaded in full, reduced to the selected atoms,
    featurized with ``DihedralFeaturizer`` and the result is passed to
    ``verbosedump``.  The output directory is created when missing.

    Parameters
    ----------
    filename : str
        "/"-separated path of the trajectory file.  The last component names
        the output file; the one before it names the output sub-directory.
    dihedrals : list of str
        Passed to ``DihedralFeaturizer(types=...)``.
    stride : int
        Only used in the output file name (``..._stride%d.h5``).

    Returns
    -------
    The object returned by ``DihedralFeaturizer.transform``.
    """
    print("reading and featurizing %s" % (filename))

    traj = md.load(filename)
    # A Trajectory has no select() method: selections are evaluated on the
    # topology and applied with atom_slice().
    # NOTE(review): the original selection text was 'chain A and protein';
    # it is assumed here that "chain A" means the first chain (chainid 0) --
    # confirm against the input files.
    protein_atoms = traj.topology.select('chainid 0 and protein')
    traj = traj.atom_slice(protein_atoms)

    featurizer = DihedralFeaturizer(types=dihedrals)
    features = featurizer.transform(traj_list=traj)
    print("finished featurizing")

    # Output path: "<root>/<condition>/<trajectory stem>_features_stride<N>.h5".
    directory = filename.split("/")
    condition = directory[len(directory) - 2]
    dcd_file = directory[len(directory) - 1]
    new_file = "%s_features_stride%d.h5" % (dcd_file.rsplit(".", 1)[0], stride)
    new_root_dir = "/home/enf/b2ar_analysis/subsampled_features/"
    new_condition_dir = "%s/%s" % (new_root_dir, condition)
    if not os.path.exists(new_condition_dir):
        os.makedirs(new_condition_dir)

    new_file_full = "%s/%s/%s" % (new_root_dir, condition, new_file)
    print("saving features as %s" % new_file_full)
    verbosedump(features, new_file_full)
    return features
def read_and_featurize(filename, dihedrals=['phi', 'psi', 'chi2'], stride=10):
    """Featurize the protein atoms of one chain and dump the dihedral features.

    The whole trajectory is loaded, restricted to the selected atoms and
    featurized with ``DihedralFeaturizer``; the features are handed to
    ``verbosedump`` after the output directory has been created if needed.

    Parameters
    ----------
    filename : str
        "/"-separated path of the trajectory file.  The last component names
        the output file; the one before it names the output sub-directory.
    dihedrals : list of str
        Passed to ``DihedralFeaturizer(types=...)``.
    stride : int
        Only used in the output file name (``..._stride%d.h5``).

    Returns
    -------
    The object returned by ``DihedralFeaturizer.transform``.
    """
    print("reading and featurizing %s" % (filename))

    traj = md.load(filename)
    # Selections live on the topology; Trajectory itself has no select().
    # The matching atoms are then cut out with atom_slice().
    # NOTE(review): 'chain A' from the original selection string is assumed
    # to correspond to chainid 0 -- confirm against the input files.
    protein_atoms = traj.topology.select('chainid 0 and protein')
    traj = traj.atom_slice(protein_atoms)

    featurizer = DihedralFeaturizer(types=dihedrals)
    features = featurizer.transform(traj_list=traj)
    print("finished featurizing")

    # Output path: "<root>/<condition>/<trajectory stem>_features_stride<N>.h5".
    directory = filename.split("/")
    condition = directory[len(directory) - 2]
    dcd_file = directory[len(directory) - 1]
    new_file = "%s_features_stride%d.h5" % (dcd_file.rsplit(".", 1)[0], stride)
    new_root_dir = "/home/enf/b2ar_analysis/subsampled_features/"
    new_condition_dir = "%s/%s" % (new_root_dir, condition)
    if not os.path.exists(new_condition_dir):
        os.makedirs(new_condition_dir)

    new_file_full = "%s/%s/%s" % (new_root_dir, condition, new_file)
    print("saving features as %s" % new_file_full)
    verbosedump(features, new_file_full)
    return features
def fit_and_transform(directory, stride=5):
    """Project features onto a 4-component tICA model and print a reference.

    When no projected-data file exists for ``stride``, the feature files
    under ``directory`` are loaded through a process pool; the tICA model is
    then either fitted and dumped or loaded from its file, and the projected
    features are dumped.  When the projected-data file exists, model and
    projection are simply loaded.  In both cases the reference PDB is
    featurized on a filtered atom subset, projected, and the first four
    entries of that projection are printed.
    """
    projected_file = "/scratch/users/enf/b2ar_analysis/phi_psi_chi_stride%d_projected.h5" % stride
    model_file = "/scratch/users/enf/b2ar_analysis/phi_psi_chi2_stride%s_tica_coords.h5" % stride
    pdb_file = "/scratch/users/enf/b2ar_analysis/system_B.pdb"
    tica_model = tICA(n_components=4)

    have_projection = os.path.exists(projected_file)
    if not have_projection:
        print("loading feature files")
        feature_files = get_trajectory_files(directory)
        workers = mp.Pool(mp.cpu_count())
        features = workers.map(load_features, feature_files)
        workers.terminate()

        if not os.path.exists(model_file):
            print("fitting data to tICA model")
            fit_model = tica_model.fit(features)
            verbosedump(fit_model, model_file)
        else:
            print("loading tICA model")
            fit_model = verboseload(model_file)

        # Common tail of both sub-cases: project and store.
        transformed_data = fit_model.transform(features)
        verbosedump(transformed_data, projected_file)
    else:
        fit_model = verboseload(model_file)
        transformed_data = verboseload(projected_file)

    def _keep(atom):
        # Protein atoms only, excluding a fixed set of residue numbers,
        # residues named "HI*" and the NMA/NME/ACE residues.
        res = atom.residue
        if not res.is_protein:
            return False
        if res.resSeq in (341, 79, 296, 269, 178, 93):
            return False
        if res.name[0:2] == "HI":
            return False
        return res.name not in ("NMA", "NME", "ACE")

    full_structure = md.load(pdb_file)
    atom_indices = [atom.index for atom in full_structure.topology.atoms if _keep(atom)]

    active_pdb = md.load(pdb_file, atom_indices=atom_indices)
    featurizer = DihedralFeaturizer(types=['phi', 'psi', 'chi2'])
    active_pdb_features = featurizer.transform(active_pdb)
    active_pdb_projected = fit_model.transform(active_pdb_features)
    print(active_pdb_projected[0:4])
def test_FeatureSelector_describe_features():
    """FeatureSelector descriptions must be the concatenation of its parts.

    A contact featurizer and a dihedral featurizer are described separately
    and through a combined ``FeatureSelector`` on one randomly chosen
    trajectory.  The combined description must have as many rows as both
    parts together, and 40 randomly drawn rows must match, column by column,
    the parts concatenated in ``feat_list`` order.
    """
    traj = trajectories[np.random.randint(len(trajectories))]

    f_ca = ContactFeaturizer(scheme='CA', ignore_nonprotein=True)
    f_ca.transform([traj])
    ca_description = pd.DataFrame(f_ca.describe_features(traj))

    f_dih = DihedralFeaturizer()
    f_dih.transform([traj])
    dih_description = pd.DataFrame(f_dih.describe_features(traj))

    by_name = {"ca": ca_description, "dih": dih_description}

    f_comb = FeatureSelector([('ca', f_ca), ('dih', f_dih)])
    f_comb.transform([traj])
    combined = pd.DataFrame(f_comb.describe_features(traj))

    assert len(combined) == len(ca_description) + len(dih_description)

    expected = pd.concat([by_name[key] for key in f_comb.feat_list])

    # lets randomly compare 40 features
    for row in np.random.choice(range(len(combined)), 40):
        for column in combined.columns:
            assert eq(combined.iloc[row][column], expected.iloc[row][column])
### Featurization based on dihedral angles for the protein folding trajectories
### Required packages: mdtraj, msmbuilder, glob
### @Chuankai Zhao, [email protected]

import mdtraj as md
import glob
from msmbuilder.featurizer import DihedralFeaturizer
from msmbuilder.utils import verbosedump, verboseload

# Glob pattern of the MD trajectory files and the topology file shared by all
# of them.
trajaddress = "/home/amoffet2/msm_network_project/folding/lindorff-larsen_2011_trajs/protein_g-350K/DESRES-Trajectory_NuG2-*-protein/NuG2-*-protein/*.dcd"
top = "/home/amoffet2/msm_network_project/folding/lindorff-larsen_2011_trajs/protein_g-350K/protein_g.pdb"

# Load every matching trajectory with mdtraj, keeping every 10th frame.
files = glob.glob(trajaddress)
traj_list = [md.load(path, top=top, stride=10) for path in files]

# Compute phi, psi and chi1 dihedral features for all trajectories.
model = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])
features = model.transform(traj_list)

# Dump the features to the output file.
pkl = "/home/czhao37/2-SAXS-Adaptive_Samping/6-protein-G/features/featurized_1mio.pkl"
verbosedump(features, pkl)
if traj.endswith(".dcd"): traj_files.append("%s/%s" %(traj_dir,traj)) traj_files.sort() traj = md.load(traj_files, top = "/home/harrigan/compute/wetmsm/gpcr/des/system_mae_to_pdb/des_trajs/DESRES-Trajectory_pnas2011b-H-05-all/system.pdb", stride=10) traj = traj[0].join(traj[1:]) traj.save("/home/enf/b2ar_analysis/H-05/%s" %("combined_traj_stride10.h5")) else: ''' #print("loading h5 traj") #traj = md.load("combined_traj_stride10.h5") ''' ''' if not (os.path.isfile("phi_psi_chi2_features_vd_stride10.h5")): print("featurizing") phi_psi_chi2 = DihedralFeaturizer(types=['phi','psi','chi2']) features = phi_psi_chi2.transform(traj_list = traj) print("finished featurizing") verbosedump(features, "phi_psi_chi2_features_vd_stride10.h5") else: print("loading existing features") features = verboseload("phi_psi_chi2_features_vd_stride10.h5") features = [np.concatenate(features)] if not (os.path.isfile("reduced_phi_psi_chi_stride10.h5")): print("Fitting tICA model") tica_model = tICA(n_components=4) fitted_model = tica_model.fit(features) reduced_data = fitted_model.transform(features) verbosedump(reduced_data, "reduced_phi_psi_chi_stride10.h5") print(tica_model.summarize()) else:
import mdtraj as md
import pandas as pd
from msmbuilder.msm import MarkovStateModel
from msmbuilder.cluster import KMeans

# NOTE(review): glob, DihedralFeaturizer, tICA, dump and load are used below
# but are not imported in the lines visible here -- confirm they are imported
# elsewhere in the file.

# Collect the trajectory files matching the pattern and load each one with
# the shared topology.
flist = glob.glob("../trajectory.xtc")
top = md.load("../top.pdb")
trj_list = [md.load(i, top=top) for i in flist]
print("Found %d trajs" % len(trj_list))

# Raw dihedral features (sincos disabled); both the featurizer and its output
# are dumped.
f = DihedralFeaturizer(sincos=False)
dump(f, "raw_featurizer.pkl")
feat = f.transform(trj_list)
dump(feat, "raw_features.pkl")

# Replace the featurizer with the one stored in ./featurizer.pkl and dump it
# again under the same file name.
f = load("./featurizer.pkl")
dump(f, "featurizer.pkl")

# Describe the features of the first trajectory and dump the table.
df1 = pd.DataFrame(f.describe_features(trj_list[0]))
dump(df1, "feature_descriptor.pkl")

# Features from the loaded featurizer (overwrites the raw `feat` above).
feat = f.transform(trj_list)
dump(feat, "features.pkl")

# Fit a 2-component tICA model with lag time 100 and project the features.
t = tICA(lag_time=100, n_components=2, kinetic_mapping=False)
tica_feat = t.fit_transform(feat)
if traj.endswith(".dcd"): traj_files.append("%s/%s" %(traj_dir,traj)) traj_files.sort() traj = md.load(traj_files, top = "/home/harrigan/compute/wetmsm/gpcr/des/system_mae_to_pdb/des_trajs/DESRES-Trajectory_pnas2011b-H-05-all/system.pdb", stride=10) traj = traj[0].join(traj[1:]) traj.save("/home/enf/b2ar_analysis/H-05/%s" %("combined_traj_stride10.h5")) else: ''' #print("loading h5 traj") #traj = md.load("combined_traj_stride10.h5") ''' ''' if not (os.path.isfile("phi_psi_chi2_features_vd_stride10.h5")): print("featurizing") phi_psi_chi2 = DihedralFeaturizer(types=['phi', 'psi', 'chi2']) features = phi_psi_chi2.transform(traj_list=traj) print("finished featurizing") verbosedump(features, "phi_psi_chi2_features_vd_stride10.h5") else: print("loading existing features") features = verboseload("phi_psi_chi2_features_vd_stride10.h5") features = [np.concatenate(features)] if not (os.path.isfile("reduced_phi_psi_chi_stride10.h5")): print("Fitting tICA model") tica_model = tICA(n_components=4) fitted_model = tica_model.fit(features) reduced_data = fitted_model.transform(features) verbosedump(reduced_data, "reduced_phi_psi_chi_stride10.h5") print((tica_model.summarize())) else: