def _test_tic_sampling(yaml_file, protein_name, tic_list, n_frames, scheme):
    """Run tIC sampling for one protein and sanity-check the sampled trajectories.

    ``sample_for_all_proteins`` is invoked for ``protein_name`` and then, for
    tIC indices 0 and 1, the written ``tic<index>.xtc`` trajectory is reloaded,
    re-featurized with phi/psi/chi1 dihedrals and projected through the
    protein's tICA model.  The rounded projection must

    * not decrease from the first to the last frame along that tIC, and
    * stay within the rounded min/max of the protein's stored tICA data.

    Note that ``tic_list`` is only forwarded to the sampler; the checks below
    always look at indices 0 and 1.

    Returns
    -------
    bool
        ``True`` when every assertion passed.
    """
    sample_for_all_proteins(yaml_file, [protein_name], tic_list, n_frames,
                            scheme=scheme)
    series = ProteinSeries(yaml_file)
    protein = Protein(series, protein_name)

    for tic_index in (0, 1):
        protein_mdl_dir = os.path.join(base_dir, yaml_file["mdl_dir"],
                                       protein_name)
        sampled_traj = mdt.load(
            os.path.join(protein_mdl_dir, "tic%d.xtc" % tic_index),
            top=os.path.join(protein_mdl_dir, "prot.pdb"))
        print(sampled_traj.n_frames)

        featurizer = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])
        dihedrals = featurizer.partial_transform(sampled_traj)
        # transform() is given a one-element list, so element 0 of the
        # rounded result is the projection of the sampled trajectory.
        projected = np.round(protein.tica_mdl.transform([dihedrals]))[0]

        # the sampled frames should run from the low end to the high end
        print("Look here", projected)
        assert projected[0][tic_index] <= projected[-1][tic_index]

        # pool this tIC's values over every stored trajectory; sorting gives
        # the global min (first) and max (last) used as limits below
        pooled = [value
                  for traj_tica_data in protein.tica_data.values()
                  for value in traj_tica_data[:, tic_index]]
        limits = np.round(np.sort(pooled))

        tic_column = projected[:, tic_index]
        above_min = tic_column >= limits[0]
        below_max = tic_column <= limits[-1]
        print(tic_index)
        print(above_min)
        print(below_max)

        # every sampled frame has to lie inside the observed range
        assert above_min.all()
        assert below_max.all()

    return True
def test_dihedral_feat():
    """Check that project featurization matches a direct dihedral featurization.

    For each of the two test proteins the project-level featurizer is run
    through a 6-worker pool.  Three trajectory files are then drawn with
    ``np.random.choice`` (sampling with replacement), featurized directly
    with a phi/psi/chi1 ``DihedralFeaturizer``, and compared against the
    ``.jl`` file the wrapper wrote for the same trajectory.

    Returns
    -------
    bool
        ``True`` when every comparison passed.
    """
    print(base_dir)
    pool = Pool(6)
    yaml_file = load_yaml_file(
        os.path.join(base_dir, "mdl_dir", "project.yaml"))

    for protein in ("kinase_1", "kinase_2"):
        print(protein)
        # Looked up but not used further; kept so a protein missing from
        # the project dict still fails at this point.
        first_project = yaml_file["project_dict"][protein][0]
        featurize_project_wrapper(yaml_file, protein, feat=None, stride=1,
                                  view=pool)

        featurizer = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])
        traj_pattern = os.path.join(base_dir, protein,
                                    yaml_file["protein_dir"], "*.hdf5")
        candidates = glob.glob(traj_pattern)

        for traj_path in np.random.choice(candidates, 3):
            direct_features = featurizer.partial_transform(mdt.load(traj_path))

            traj_stem = os.path.splitext(os.path.basename(traj_path))[0]
            dumped_path = os.path.join(base_dir, protein,
                                       yaml_file["feature_dir"],
                                       traj_stem + ".jl")
            dumped_features = verboseload(dumped_path)

            assert np.allclose(direct_features, dumped_features)

    return True
def featurize_file(job_tuple):
    """Featurize one trajectory file and dump the features to the project.

    Parameters
    ----------
    job_tuple : tuple
        ``(yaml_file, protein, feat, traj_file, stride)``.  ``yaml_file`` is
        passed to ``load_yaml_file``; ``feat`` may be ``None``, in which case
        a phi/psi/chi1 ``DihedralFeaturizer`` is used.

    Returns
    -------
    None

    Notes
    -----
    * Features are written to
      ``<base_dir>/<protein>/<feature_dir>/<trajectory stem>.jl``.
    * If the trajectory cannot be loaded, a warning is issued and the
      trajectory file is deleted from disk.
    * When the featurizer has ``describe_features`` and no
      ``feature_descriptor.h5`` exists yet in the output folder, a
      descriptor table built from the first frame is dumped there too.
    """
    # NOTE(review): ``stride`` is unpacked but never applied when loading the
    # trajectory -- confirm whether it should be passed to ``mdt.load``.
    yaml_file, protein, feat, traj_file, stride = job_tuple
    yaml_file = load_yaml_file(yaml_file)

    if feat is None:
        feat = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])

    _check_output_folder_exists(yaml_file, protein)

    output_folder = os.path.join(yaml_file["base_dir"], protein,
                                 yaml_file["feature_dir"])
    traj_name = os.path.splitext(os.path.basename(traj_file))[0]
    output_fname = os.path.join(output_folder, traj_name + ".jl")
    feat_descriptor = os.path.join(output_folder, "feature_descriptor.h5")

    try:
        trj = mdt.load(traj_file)
    except Exception:
        # Deliberately broad: any load failure is treated as a malformed
        # trajectory.  It must not be a bare ``except`` though, otherwise a
        # Ctrl-C or interpreter shutdown during the load would also delete
        # a perfectly good trajectory file.
        warnings.warn("Removing %s because of misformed trajectory" % traj_file)
        os.remove(traj_file)
        return

    features = feat.partial_transform(trj)
    verbosedump(features, output_fname)

    # Only the first job to get here writes the shared feature descriptor.
    if not os.path.isfile(feat_descriptor) and hasattr(feat, "describe_features"):
        dih_df = pd.DataFrame(feat.describe_features(trj[0]))
        verbosedump(dih_df, feat_descriptor)

    return
def feat_traj(traj, save_path=None):
    """Compute dihedral, protein-water and ATP-water features for a trajectory.

    Parameters
    ----------
    traj : str
        Path of the trajectory file; it is loaded with ``mdt.load`` so that
        the solvent atoms are available.
    save_path : str, optional
        Directory into which the individual feature sets are dumped.  When
        given, files named after the trajectory are written to the
        ``dihedral``, ``water``, ``atp`` and ``combined`` sub-directories,
        which are expected to exist already.  With the default ``None``
        nothing is written.

    Returns
    -------
    combined_features
        ``np.hstack`` of the dihedral, protein-water and ATP-water features,
        in that order.

    Notes
    -----
    The dumping code used to sit after the ``return`` statement and was
    therefore never executed; it also passed a tuple to ``os.path.join``,
    which would have raised ``TypeError``.  It is now reachable only on
    request via ``save_path``.
    """
    # load again to get the waters
    trj = mdt.load(traj)
    atoms = list(trj.topology.atoms)
    polar_elements = ("oxygen", "nitrogen")

    # polar atoms of the "atp" residue plus every atom of an "MG" residue
    atp_solute = [
        atom.index for atom in atoms
        if (atom.residue.name == "atp" and atom.element.name in polar_elements)
        or atom.residue.name == "MG"
    ]
    # get the oxygen and nitrogen indices of the protein
    solute_indices = [
        atom.index for atom in atoms
        if atom.residue.is_protein and atom.element.name in polar_elements
    ]
    # get the non-hydrogen solvent indices
    solvent_indices = [
        atom.index for atom in atoms
        if atom.residue.is_water and atom.element.name != "hydrogen"
    ]

    # set up featurizers
    # NOTE(review): the trailing positional arguments (2, 0.3) are passed on
    # unchanged -- presumably number of shells and shell width; confirm
    # against the wmsm API.
    atp_feat = wmsm.SolventShellsFeaturizer(atp_solute, solvent_indices, 2, 0.3)
    water_feat = wmsm.SolventShellsFeaturizer(solute_indices, solvent_indices,
                                              2, 0.3)
    dihedral_feat = DihedralFeaturizer(["phi", "psi", "chi1"])

    # calculate features
    water_features = water_feat.partial_transform(trj)
    dihedral_features = dihedral_feat.partial_transform(trj)
    atp_features = atp_feat.partial_transform(trj)
    combined_features = np.hstack((dihedral_features, water_features,
                                   atp_features))

    # optional dump of every feature set
    if save_path is not None:
        fname = os.path.basename(traj)
        feature_sets = (
            ("dihedral", dihedral_features),
            ("water", water_features),
            ("atp", atp_features),
            ("combined", combined_features),
        )
        for subdir, features in feature_sets:
            verbosedump(features, os.path.join(save_path, subdir, fname))

    return combined_features
def featurize_traj(job_tuple):
    """Compute and dump phi/psi/chi1 dihedral features for one trajectory.

    Parameters
    ----------
    job_tuple : tuple
        ``(mutant, mutant_dir, project, proj_folder, proj_top_folder,
        traj_file, stride, save_common, allowed_residue_ind)``.  The topology
        is read from ``<proj_top_folder>/<prefix>.pdb`` where ``<prefix>`` is
        the part of the trajectory file name before the first underscore.
        ``mutant`` and ``proj_folder`` are not used here.

    Returns
    -------
    None

    Notes
    -----
    * Features go to
      ``<mutant_dir>/features/dihedral_features/<project>_<traj>.h5``.
    * With ``save_common`` set, the feature columns whose residue ids are all
      contained in ``allowed_residue_ind`` are additionally written under
      ``features/common_basis/dihedral_features/``, together with
      ``saved_dihed_feat.h5`` holding the descriptor table, the allowed
      residues and the column mask.
    * A trajectory that fails to load is reported and skipped.
    * Messages use the single-argument parenthesised ``print`` form, which
      prints identically on Python 2 and Python 3.
    """
    # separate out the job tuple into required things
    (mutant, mutant_dir, project, proj_folder, proj_top_folder, traj_file,
     stride, save_common, allowed_residue_ind) = job_tuple

    # load top file; it is needed to describe the features for the common basis
    top_path = os.path.join(
        proj_top_folder,
        "%s.pdb" % os.path.basename(traj_file).split("_")[0])
    top_trj = mdtraj.load(top_path)

    # set up featurizer objects
    dihedral_feat = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])

    # load the trajectory
    try:
        trj = mdtraj.load(traj_file, stride=stride)
    except Exception:
        # Broad on purpose (any unreadable trajectory is skipped), but not a
        # bare ``except`` so that Ctrl-C / SystemExit still propagate.
        print("Cant featurize %s" % traj_file)
        return

    # setup file names
    traj_name = os.path.splitext(os.path.basename(traj_file))[0]
    print(traj_name)
    out_name = str(project) + "_" + traj_name + ".h5"

    def _feature_path(subdir, fname):
        # ``subdir`` ends with a slash, so plain concatenation is enough.
        return os.path.join(mutant_dir, subdir) + fname

    dihedral_output_file = _feature_path("features/dihedral_features/",
                                         out_name)
    combined_output_file = _feature_path("features/combined_features/",
                                         out_name)

    # ``do_again`` forces re-featurization.  Only when it is switched off is
    # the previously saved combined file inspected; the work counts as done
    # when that file has one row per loaded frame.
    do_again = True
    already_done = False
    if not do_again and os.path.isfile(combined_output_file):
        saved = verboseload(combined_output_file)
        already_done = saved.shape[0] == trj.n_frames

    if already_done:
        print("skipping featurization for %s since its already done" % traj_name)
        return

    dihedral_features = dihedral_feat.partial_transform(trj)
    # now we can dump
    verbosedump(dihedral_features, dihedral_output_file)

    if save_common:
        common_dir = "features/common_basis/dihedral_features/"
        dih_df = pandas.DataFrame(dihedral_feat.describe_features(top_trj))
        # keep a feature only if every residue it involves is allowed
        dih_f_ind = numpy.array([set(resids).issubset(allowed_residue_ind)
                                 for resids in dih_df["resid"]])
        subset_dihedral_features = dihedral_features[:, dih_f_ind]
        # now we can dump
        verbosedump(subset_dihedral_features,
                    _feature_path(common_dir, out_name))
        # save the featurizer information.
        verbosedump([dih_df, allowed_residue_ind, dih_f_ind],
                    _feature_path(common_dir, "saved_dihed_feat.h5"))

    return
def _fit_transform(prt, trj):
    """Push a trajectory through the protein's tICA and k-means models.

    Despite the name, nothing is fitted here: ``trj`` is featurized with
    phi/psi/chi1 dihedrals, projected with ``prt.tica_mdl.transform`` and the
    projection is handed to ``prt.kmeans_mdl.transform``.

    Returns
    -------
    The output of ``prt.kmeans_mdl.transform`` for the single-trajectory
    list built from ``trj``.
    """
    featurizer = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])
    # the models operate on lists of trajectories, hence the one-element list
    tica_projection = prt.tica_mdl.transform([featurizer.partial_transform(trj)])
    return prt.kmeans_mdl.transform(tica_projection)