def delete_shell(core_mol, del_mol, cut_off, in_out='in'):
    """
    Return a copy of del_mol from which the residues lying inside
    (in_out='in') or outside (in_out='out') a cutoff distance from
    core_mol have been removed.

    A residue of del_mol counts as "inside" as soon as one of its atoms is
    reported as a nearest neighbour of core_mol within cut_off; all the
    atoms of that residue are then treated together.

    Parameters:
    -----------
    core_mol: OEMol molecule
        The reference molecules the distances are measured from
    del_mol: OEMol molecule
        The molecules to prune. The passed molecule is copied and the
        copy is edited
    cut_off: python float number
        The threshold distance in A used to mark atoms for deletion
    in_out: python string
        'in' deletes what is closer than the cutoff, 'out' deletes what
        is farther than the cutoff

    Return:
    -------
    reset_del: OEMol molecule
        Copy of del_mol without the deleted atoms and with the atom
        indexes reset

    Raises:
    -------
    ValueError: if in_out is neither 'in' nor 'out'
    """

    if in_out not in ('in', 'out'):
        raise ValueError(
            "The passed in_out parameter is not recognized: {}".format(in_out))

    # Work on a private copy of the molecule to prune
    pruned = oechem.OEMol(del_mol)

    # Deletion mask: one bit per atom index. Switch on every atom that
    # belongs to a residue found within the cutoff from the core
    delete_mask = oechem.OEBitVector(pruned.GetMaxAtomIdx())

    neighbour_search = oechem.OENearestNbrs(pruned, cut_off)

    for contact in neighbour_search.GetNbrs(core_mol):
        for residue_atom in oechem.OEGetResidueAtoms(contact.GetBgn()):
            delete_mask.SetBitOn(residue_atom.GetIdx())

    # The mask currently selects the close atoms; flip it when the far
    # ones are the ones to be removed
    if in_out == 'out':
        delete_mask.NegateBits()

    for doomed_atom in pruned.GetAtoms(oechem.OEAtomIdxSelected(delete_mask)):
        pruned.DeleteAtom(doomed_atom)

    # Copying the edited molecule resets its atom indexes, which avoids
    # possible mismatches caused by the deleted atoms
    reset_del = oechem.OEMol(pruned)

    return reset_del
def _OEFixBuiltLoopFragmentNumbers(protein):
    """
    Temporary fix (thanks to Jesper!) for the fragment numbers of built loops.

    The alpha carbons of the protein are visited in iteration order. Each
    time an alpha carbon is flagged as modeled and a fragment number has
    already been recorded, that recorded number is written on every atom of
    the alpha carbon's residue. Otherwise the fragment number of the
    alpha carbon's residue becomes the recorded one.

    Parameters:
    -----------
    protein: OEMol molecule
        The protein to fix. It is edited in place

    Return:
    -------
    None
    """

    # Only alpha carbons are inspected: this is a built-loop problem only,
    # and doing so avoids messing with the caps and the built side chains
    is_modeled = oespruce.OEIsModeledAtom()

    # -1 means that no fragment number has been recorded yet
    last_fragment = -1

    for ca_atom in protein.GetAtoms(oechem.OEIsCAlpha()):

        if not is_modeled(ca_atom) or last_fragment == -1:
            last_fragment = oechem.OEAtomGetResidue(ca_atom).GetFragmentNumber()
            continue

        # Modeled residue: propagate the recorded fragment number to all
        # of its atoms
        for residue_atom in oechem.OEGetResidueAtoms(ca_atom):
            residue = oechem.OEAtomGetResidue(residue_atom)
            residue.SetFragmentNumber(last_fragment)
            oechem.OEAtomSetResidue(residue_atom, residue)
def around(dist, ls):
    """
    Select the atoms that are not farther than a threshold distance from
    the current selection. The threshold distance is in Angstrom.

    Example of selection: mask = '5.0 around ligand'

    The molecule searched is `system`, which is not a parameter of this
    function: it is read from the surrounding scope.

    Parameters:
    -----------
    dist: number or string convertible to float
        The threshold distance in A
    ls: container of atom indexes
        The current selection. Only membership tests are done on it

    Return:
    -------
    atom_set_around: python set
        The indexes of the atoms of every residue that has at least one
        atom within dist from the selection, with the indexes already
        present in ls left out
    """

    # Bit mask over the system atom indexes flagging the current selection
    selection_mask = oechem.OEBitVector(system.GetMaxAtomIdx())

    for system_atom in system.GetAtoms():
        system_idx = system_atom.GetIdx()
        if system_idx in ls:
            selection_mask.SetBitOn(system_idx)

    # Carve the selected atoms out of the system as a separate molecule
    selected_mol = oechem.OEMol()
    oechem.OESubsetMol(selected_mol, system,
                       oechem.OEAtomIdxSelected(selection_mask))

    # Nearest neighbours search on the whole system
    neighbour_search = oechem.OENearestNbrs(system, float(dist))

    atom_set_around = set()

    for contact in neighbour_search.GetNbrs(selected_mol):
        # Take the whole residue of the neighbouring atom, but skip what
        # is already part of the selection
        for residue_atom in oechem.OEGetResidueAtoms(contact.GetBgn()):
            residue_idx = residue_atom.GetIdx()
            if residue_idx not in ls:
                atom_set_around.add(residue_idx)

    return atom_set_around
def _min_water_distances(frame, water_idx, target_idx):
    """
    Return, for each water oxygen index in water_idx, the minimum periodic
    distance (in nm) between that oxygen and the atoms listed in target_idx,
    measured on the passed single trajectory frame.
    """
    # Every (water, target) index pair, ordered water-major
    pairs = np.array(np.meshgrid(water_idx, target_idx)).T.reshape(-1, 2)

    distances = md.compute_distances(frame, pairs, periodic=True, opt=True)

    # One row per water, one column per target atom
    per_water = np.reshape(distances, (len(water_idx), len(target_idx)))

    return np.min(per_water, axis=1)


def extract_aligned_prot_lig_wat_traj(md_components, flask, trj_fn, opt, nmax=30, water_cutoff=15.0):
    """
    Extracts the aligned protein trajectory, the aligned ligand trajectory and
    the aligned water trajectory from a MD trajectory of a larger system that
    includes other components (eg water).

    The flask built from the passed md components must have the topology that
    matches the trajectory, and its xyz coordinates are the reference for the
    alignment. The alignment is done on the protein binding site atoms
    selected within a 5 A cutoff from the ligand in the first frame. Once the
    alignment is done, the protein and ligand trajectories are each placed
    into a separate OEMol, one conformer per trajectory frame.

    The water trajectory is built by selecting, for each trajectory frame, the
    nmax waters with the smallest sum of (minimum oxygen-ligand distance) and
    (minimum oxygen-binding site atom distance), among the water oxygens found
    within water_cutoff from the binding site atoms and the ligand.

    Inputs:
        md_components: MDComponents object
            The md components carrying the setup starting flask, the protein
            and the ligand.
        flask: OEMol
            The system flask. NOTE: its coordinates are overwritten in place
            with the coordinates of each aligned frame, so on return it holds
            the coordinates of the last frame.
        trj_fn: String
            The filename of the hdf5-format (.h5) MD trajectory or of the
            Gromacs .trr trajectory. For a .trr file the topology is read
            from the first .pdb file found in the same directory.
        opt: Dictionary
            Options. The 'Logger' entry is used to emit a warning when the
            requested nmax is larger than the value returned by nmax_waters.
        nmax: Integer
            max number of waters to select
        water_cutoff: Float
            The cutoff distance between the PL binding site and the waters in
            angstroms

    Outputs:
        multi_conf_protein: A multi conformer OEMol for the protein, one
            conformer per frame.
        multi_conf_ligand: A multi conformer OEMol for the ligand, one
            conformer per frame.
        multi_conf_water: A multi conformer OEMol for the waters, one
            conformer per frame.

    Raises:
        ValueError: if the trajectory file extension is neither .h5 nor .trr,
            if fewer than nmax waters are found in a frame, or if the number
            of atoms of the extracted waters is not a multiple of 3.
    """

    set_up_flask, map_dic = md_components.create_flask
    protein = md_components.get_protein
    ligand = md_components.get_ligand

    check_nmax = nmax_waters(protein, ligand, water_cutoff)

    if check_nmax < nmax:
        opt['Logger'].warn(
            "The selected number of max waters cannot fit around the protein binding site: {} vs {}"
            .format(nmax, check_nmax))

    traj_ext = os.path.splitext(trj_fn)[1]

    if traj_ext == '.h5':
        trj = md.load_hdf5(trj_fn)
    elif traj_ext == '.trr':
        # The .trr file carries no topology: use the first .pdb file found
        # next to the trajectory
        traj_dir = os.path.dirname(trj_fn)
        pdb_fn = glob.glob(os.path.join(traj_dir, '*.pdb'))[0]
        trj = md.load_trr(trj_fn, top=pdb_fn)
        # The first frame of a .trr trajectory is discarded
        trj = trj[1:]
    else:
        raise ValueError(
            "Trajectory file format {} not recognized in the trajectory {}".
            format(traj_ext, trj_fn))

    # System topology
    top_trj = trj.topology

    # Ligand and protein indexes. The map produced with the OE toolkits is
    # used because it is safer than mdtraj, which is missing the protein caps
    lig_idx = map_dic['ligand']
    prot_idx = map_dic['protein']

    # Water oxygen indexes
    water_O_idx = top_trj.select("water and element O")

    # Protein atoms used to define the binding site and for the fitting.
    # NOTE(review): the original comments call these "carbon alpha" indexes,
    # but the selection below is on the backbone carbon element and so
    # presumably also includes the carbonyl carbons; confirm whether
    # "name CA" was intended
    prot_ca_idx = top_trj.select("backbone and element C")

    # Cutoff for the selection of the binding site atoms in A
    cutoff_bs = 5.0

    # Binding site indexes. mdtraj works in nm, hence the division by 10
    ca_bs_idx = md.compute_neighbors(trj[0], cutoff_bs / 10.0, lig_idx,
                                     haystack_indices=prot_ca_idx,
                                     periodic=True)[0]

    # Binding site and ligand indexes
    ca_bs_lig_idx = np.concatenate((ca_bs_idx, lig_idx))

    # Image the protein-ligand trajectory so the complex does not jump across box boundaries
    protlig = trj[0].atom_slice(np.concatenate((prot_idx, lig_idx)))
    protligAtoms = list(protlig.topology.atoms)

    # Whatever is written on stderr during the imaging is silenced
    with open(os.devnull, 'w') as devnull, contextlib.redirect_stderr(devnull):
        trjImaged = trj.image_molecules(inplace=False,
                                        anchor_molecules=[protligAtoms],
                                        make_whole=True)

    # For each frame, the nmax (water oxygen index, metric) pairs with the
    # smallest metric
    water_max_frames = []

    for count, frame in enumerate(trjImaged):

        # Water oxygen binding site indexes (single frame, hence the [0])
        water_O_bs_idx = md.compute_neighbors(frame, water_cutoff / 10.0,
                                              ca_bs_lig_idx,
                                              haystack_indices=water_O_idx,
                                              periodic=True)[0]

        # Min distances in nm between the water oxygens and the ligand
        min_wat_O_lig_distances = _min_water_distances(frame, water_O_bs_idx, lig_idx)

        # Min distances in nm between the water oxygens and the protein
        # binding site atoms
        min_wat_O_ca_bs_distances = _min_water_distances(frame, water_O_bs_idx, ca_bs_idx)

        metrics = min_wat_O_lig_distances + min_wat_O_ca_bs_distances

        metric_distances = [(int(wat_idx), m) for wat_idx, m in zip(water_O_bs_idx, metrics)]

        water_list_sorted_max = sorted(metric_distances, key=lambda x: x[1])[:nmax]

        if len(water_list_sorted_max) != nmax:
            raise ValueError(
                "The ordered water list has the wrong size {} vs expected {} for the frame {}"
                .format(len(water_list_sorted_max), nmax, count))

        water_max_frames.append(water_list_sorted_max)

    # Put the reference mol xyz into the 1-frame topologyTraj to use as a reference in the fit
    setup_mol_array_coords = oechem.OEDoubleArray(3 * set_up_flask.GetMaxAtomIdx())
    set_up_flask.GetCoords(setup_mol_array_coords)
    setup_mol_xyzArr = np.array(setup_mol_array_coords)
    setup_mol_xyzArr.shape = (-1, 3)

    trj_reference = trjImaged[0]
    # convert from angstroms to nanometers
    trj_reference.xyz[0] = setup_mol_xyzArr / 10.0

    # Fitting
    trjImaged.superpose(trj_reference, 0, ca_bs_idx)

    # Delete Original Trajectory to save memory
    del trj

    # Molecule copies
    ligand_reference = oechem.OEMol(ligand)
    protein_reference = oechem.OEMol(protein)

    # Index arrays used to pick the ligand and protein rows of each frame
    lig_rows = np.asarray(lig_idx, dtype=int)
    prot_rows = np.asarray(prot_idx, dtype=int)

    # Create the multi conformer protein, ligand and water molecules
    for count, (frame, water_list_sorted_max) in enumerate(zip(trjImaged.xyz, water_max_frames)):

        # Extract coordinates in A
        xyz = frame * 10

        # Set flask Coordinates as the current frame for the water extraction
        flask.SetCoords(xyz.flatten())

        # TODO The following solution to extract the waters do not
        # keep the water order. Extracting each water separately and
        # concatenating them would keep the order but it seems
        # extremely inefficient

        # Mark the close water atoms and extract them. The bits set below
        # are flask atom indexes, so the mask spans the whole flask
        bv = oechem.OEBitVector(flask.GetMaxAtomIdx())

        for pair in water_list_sorted_max:

            ow = flask.GetAtom(oechem.OEHasAtomIdx(pair[0]))

            # Select the whole water molecule
            for atw in oechem.OEGetResidueAtoms(ow):
                bv.SetBitOn(atw.GetIdx())

        pred_vec = oechem.OEAtomIdxSelected(bv)
        water_nmax_reference = oechem.OEMol()
        oechem.OESubsetMol(water_nmax_reference, flask, pred_vec)

        # ligand and protein conf coordinates in A
        lig_confxyz = oechem.OEFloatArray(xyz[lig_rows].ravel())
        prot_confxyz = oechem.OEFloatArray(xyz[prot_rows].ravel())

        # Initialize the protein, ligand and water molecule topologies
        if count == 0:

            multi_conf_water = oechem.OEMol(water_nmax_reference)

            if multi_conf_water.NumAtoms() % 3 != 0:
                raise ValueError("Number of Water atoms is not multiple of 3")

            # Clean ResNumber and Chain on the multi conf water molecule
            multi_conf_water.SetTitle("Water_" + str(nmax))

            for i, at in enumerate(multi_conf_water.GetAtoms()):

                res = oechem.OEAtomGetResidue(at)
                res.SetSerialNumber(i)
                res.SetName("HOH")
                res.SetChainID("Z")
                # A new residue number, starting from 1, every 3 atoms
                res.SetResidueNumber(i // 3 + 1)
                # Commit the edited residue back on the atom, as done by
                # the other residue editing code in this file
                oechem.OEAtomSetResidue(at, res)

            ligand_reference.SetCoords(lig_confxyz)
            protein_reference.SetCoords(prot_confxyz)
            multi_conf_ligand = oechem.OEMol(ligand_reference)
            multi_conf_protein = oechem.OEMol(protein_reference)

        # Attach the conformers on the multi conformer protein, ligand and water molecules
        else:
            water_confxyz = oechem.OEFloatArray(
                water_nmax_reference.NumAtoms() * 3)
            water_nmax_reference.GetCoords(water_confxyz)

            multi_conf_water.NewConf(water_confxyz)
            multi_conf_ligand.NewConf(lig_confxyz)
            multi_conf_protein.NewConf(prot_confxyz)

    return multi_conf_protein, multi_conf_ligand, multi_conf_water