def process_water_structures(initial_pdb, main_chains, ligand):
    """
    Detect the water we have to keep (important for the simulation) for each
    main chain and return the structures holding them.

    The important water is the one closest to template residue 50 (Ile). The
    amino acid type is not guaranteed to be conserved, so the residue is
    picked by its position in the chain instead of by name; any numbering
    offset has to be taken into account if needed.

    Extra: the water must also be close to the binding site, so the water
    that is kept is the one minimising
    (distance to the residue) + (distance to the binding site).

    :param initial_pdb: The pdb (prody structure) we want to extract the
        waters from.
    :param main_chains: Container of chain ids. Only chains whose id is in it
        are processed.
    :param ligand: Atoms whose center defines the binding site. If None, the
        center of the whole ``initial_pdb`` is used instead.

    :return: A dictionary indexed by the water id ("<res. num.>:<chain id>")
        holding a copy of the prody structure of that water. It is empty when
        the structure has no water oxygens or no chain matches.
    """
    hw = prody.HierView(initial_pdb.select("protein"))
    water_structs = {}
    for chain in hw.iterChains():
        if chain.getChid() not in main_chains:
            continue

        # We cannot do a direct selection, instead we iterate. The loop stops
        # at positional index 50 (zero-based) or at the last residue of a
        # shorter chain.
        # NOTE(review): index 50 is the 51st residue by position - confirm
        # this is the intended "residue 50".
        for i, residue in enumerate(chain.iterResidues()):
            if i == 50:
                break
        residue_com = prody.calcCenter(residue)

        if ligand is None:
            ligand_com = prody.calcCenter(initial_pdb)
        else:
            ligand_com = prody.calcCenter(ligand)

        # Identify closer water
        waters = initial_pdb.select("name O and water")
        if waters is None:
            continue

        water_coords = waters.getCoords()
        distance_to_R50 = numpy.sqrt(
            ((residue_com - water_coords) ** 2).sum(axis=1))
        distance_to_BindSite = numpy.sqrt(
            ((ligand_com - water_coords) ** 2).sum(axis=1))
        distances = distance_to_R50 + distance_to_BindSite

        # argmin yields one plain index (the first one on ties), so the
        # residue number below is a scalar that "%d" can always format.
        closest = int(numpy.argmin(distances))
        water_resnum = int(waters.getResnums()[closest])
        water_chid = waters.getChids()[closest]
        water_id = "%d:%s" % (water_resnum, water_chid)

        # We use a dict in order to get rid of repeats
        selection_string = "resnum %d and chain %s" % (water_resnum,
                                                       water_chid)
        water_structs[water_id] = initial_pdb.water.select(
            selection_string).copy()
    return water_structs
def process_water_structures(initial_pdb, main_chains, ligand):
    """
    Detect the water we have to keep (important for the simulation) for each
    main chain and return the structures holding them.

    The important water is the one closest to template residue 50 (Ile). The
    amino acid type is not guaranteed to be conserved, so the residue is
    picked by its position in the chain instead of by name; any numbering
    offset has to be taken into account if needed.

    Extra: the water must also be close to the binding site, so the water
    that is kept is the one minimising
    (distance to the residue) + (distance to the binding site).

    :param initial_pdb: The pdb (prody structure) we want to extract the
        waters from.
    :param main_chains: Container of chain ids. Only chains whose id is in it
        are processed.
    :param ligand: Atoms whose center defines the binding site. If None, the
        center of the whole ``initial_pdb`` is used instead.

    :return: A dictionary indexed by the water id ("<res. num.>:<chain id>")
        holding a copy of the prody structure of that water. It is empty when
        the structure has no water oxygens or no chain matches.
    """
    hw = prody.HierView(initial_pdb.select("protein"))
    water_structs = {}
    for chain in hw.iterChains():
        if chain.getChid() not in main_chains:
            continue

        # We cannot do a direct selection, instead we iterate. The loop stops
        # at positional index 50 (zero-based) or at the last residue of a
        # shorter chain.
        # NOTE(review): index 50 is the 51st residue by position - confirm
        # this is the intended "residue 50".
        for i, residue in enumerate(chain.iterResidues()):
            if i == 50:
                break
        residue_com = prody.calcCenter(residue)

        if ligand is None:
            ligand_com = prody.calcCenter(initial_pdb)
        else:
            ligand_com = prody.calcCenter(ligand)

        # Identify closer water
        waters = initial_pdb.select("name O and water")
        if waters is None:
            continue

        water_coords = waters.getCoords()
        distance_to_R50 = numpy.sqrt(
            ((residue_com - water_coords) ** 2).sum(axis=1))
        distance_to_BindSite = numpy.sqrt(
            ((ligand_com - water_coords) ** 2).sum(axis=1))
        distances = distance_to_R50 + distance_to_BindSite

        # argmin yields one plain index (the first one on ties), so the
        # residue number below is a scalar that "%d" can always format.
        closest = int(numpy.argmin(distances))
        water_resnum = int(waters.getResnums()[closest])
        water_chid = waters.getChids()[closest]
        water_id = "%d:%s" % (water_resnum, water_chid)

        # We use a dict in order to get rid of repeats
        selection_string = "resnum %d and chain %s" % (water_resnum,
                                                       water_chid)
        water_structs[water_id] = initial_pdb.water.select(
            selection_string).copy()
    return water_structs
def _prepare_points(self):
    """Parse the complex, shift it onto the ligand center and build the grid.

    The structure at ``self.path`` is parsed into ``self.complex``. Atoms of
    residue ``WER`` are taken as the ligand; everything that is neither
    ``WER`` nor water is taken as the protein. The whole complex is then
    translated by minus the ligand center, and ``self.points`` is filled by
    ``grid_around`` using the ligand center measured after that move,
    ``self.size`` and a spacing of ``24 / (self.size - 1)``.
    """
    self.complex = parsePDB(self.path)
    whole = self.complex

    receptor_atoms = whole.select("not (resname WER or water)")
    ligand_atoms = whole.select("resname WER")

    # Translate every atom of the complex by minus the ligand center.
    shift = calcCenter(ligand_atoms.getCoords())
    moveAtoms(whole, by=-shift)

    # Measure the ligand center again on a fresh selection after the move.
    moved_ligand = self.complex.select("resname WER")
    grid_origin = calcCenter(moved_ligand.getCoords())

    self.protein.structure = receptor_atoms
    self.ligand.structure = ligand_atoms

    step = 24 / (self.size - 1)
    self.points = grid_around(grid_origin, self.size, spacing=step)
def make_query_coords(self):
    """Build every coordinate set the query can be superposed with.

    For each of the two atom-name lists in ``self.query_lig_corr`` the first
    coordinate of every named atom is collected. When the matching entry of
    ``self.query_cyclic`` is true, every cyclic rotation of that list is
    generated; otherwise the list is used as is. ``self.query_coords``
    receives one stacked array per (first ordering, second ordering) pair.

    ``self.query_distance`` is set to the largest distance between the
    center of the first atom group and the atoms of the second group (in
    their original order) plus ``self.rmsd_threshold``.
    """

    def first_coords(atom_names):
        # One coordinate per atom name, in the order the names are given.
        return [
            self.query.select('name ' + atom_name).getCoords()[0]
            for atom_name in atom_names
        ]

    def orderings(coords, is_cyclic):
        # All cyclic rotations for a cyclic group, else the single ordering.
        if not is_cyclic:
            return [coords]
        count = len(coords)
        return [[coords[pos - shift] for pos in range(count)]
                for shift in range(count)]

    q_sel1_coords = orderings(first_coords(self.query_lig_corr[0]),
                              self.query_cyclic[0])
    q_sel2_coords = orderings(first_coords(self.query_lig_corr[1]),
                              self.query_cyclic[1])

    first_group = self.query.select(
        'name ' + ' '.join(self.query_lig_corr[0]))
    com = pr.calcCenter(first_group)
    furthest = np.max(cdist([com], q_sel2_coords[0]))
    self.query_distance = furthest + self.rmsd_threshold

    self.query_coords = [
        np.vstack((first, second))
        for first in q_sel1_coords
        for second in q_sel2_coords
    ]
def orient(pdb, selection='all'):
    """Re-orient ``pdb`` in place from the atoms matched by ``selection``.

    The selected coordinates are shifted by their center and passed to the
    ``varimax`` helper (presumably a varimax rotation - confirm against its
    definition). The transformation that maps the original selected
    coordinates onto that result is then applied to the whole ``pdb``.

    :param pdb: prody structure to transform; it is modified in place.
    :param selection: prody selection string choosing the atoms that drive
        the orientation (all atoms by default).
    :return: the same ``pdb`` object that was passed in.
    """
    subset = pdb.select(selection)
    centroid = prody.calcCenter(subset)
    start_coords = subset.getCoords()
    target_coords = varimax(np.subtract(start_coords, centroid))
    prody.calcTransformation(start_coords, target_coords).apply(pdb)
    return pdb
"""
@author: victor
"""
import sys
import os
import glob
import prody
from hivprotmut.structures.pdbcuration import CurationSelections

# Usage: <script> FINAL_DB_FOLDER COM_FILE
#
# Walks FINAL_DB_FOLDER/<ligand folder>/*.pdb and writes one line per pdb to
# COM_FILE: "<ligand folder>/<pdb file name> x y z", where x, y, z is the
# center of the atoms matched by CurationSelections.HEAVY_LIGAND_SELECTION.
if __name__ == '__main__':
    final_db_folder = sys.argv[1]
    com_file = sys.argv[2]

    # The context manager closes the output file even if a pdb fails to
    # parse half way through the run.
    with open(com_file, "w") as com_handler:
        ligand_folders = os.listdir(final_db_folder)  # first level are ligands

        for path in ligand_folders:
            files = glob.glob(os.path.join(final_db_folder, path, "*.pdb"))
            for pdb_file in files:
                pdb = prody.parsePDB(pdb_file)
                txt_pdb = os.path.split(pdb_file)[1]
                ligand = pdb.select(CurationSelections.HEAVY_LIGAND_SELECTION)
                com = prody.calcCenter(ligand)
                txt_pdb_file = os.path.join(path, txt_pdb)
                com_handler.write("%s %.3f %.3f %.3f\n" % (
                    txt_pdb_file, com[0], com[1], com[2]))
def blast(pdb_path):
    """Run blastp on the binding-pocket sequence of every ligand of a pdb.

    For each entry of the pdb header's ``chemicals`` list, the residues of
    the rest of the structure that lie around the ligand atoms are gathered,
    growing the search radius (starting at 4) one unit at a time until the
    chain 'A' sequence of the gathered residues has at least 100 letters.
    That sequence is written to ``sequence.fasta`` and searched with
    ``blastp`` against ``BLASTDB``; the XML output is parsed into one row per
    hit.

    The work is done inside a fresh temporary directory. The previous working
    directory is restored before returning, also when an error is raised.
    The temporary directory itself is left on disk.

    :param pdb_path: path (or identifier) handed to ``prody.parsePDBHeader``
        and ``prody.parsePDB``. It is resolved after the working directory
        has been changed.
    :return: list of rows ``[receptor, hit_id, identity, align_len,
        query_length, midline, hit_sequence, query_sequence]`` where
        everything except the sequences/midline is converted with ``str``.
    """
    # ``reduce`` is only a builtin on Python 2; the local import keeps the
    # function working on Python 3 as well.
    from functools import reduce

    cdir = os.getcwd()
    tdir = tempfile.mkdtemp()
    os.chdir(tdir)
    try:
        receptor = os.path.basename(os.path.splitext(pdb_path)[0])
        pdbHead = prody.parsePDBHeader(pdb_path)
        pdbFile = prody.parsePDB(pdb_path)

        ligands = [[chem.chain, str(chem.resnum), chem.resname, chem.name]
                   for chem in pdbHead['chemicals']]

        blast_result = []
        for chain, resnum, resname, name in ligands:
            rec = pdbFile.select(
                'not (chain {} resnum {})'.format(chain, resnum))
            ligand = pdbFile.select(
                'chain {} resnum {}'.format(chain, resnum))

            res_coll = []
            ligCoords = ligand.getCoords()
            print('lig_size', len(ligCoords))

            sequence = ''
            i = 4
            # NOTE(review): this loop has no upper bound. If chain 'A' of the
            # receptor has fewer than 100 residues it never terminates.
            while len(sequence) < 100:
                for center in ligCoords:
                    around_atoms = rec.select(
                        'same residue as within {} of center'.format(i),
                        center=center)
                    if around_atoms is None:
                        continue
                    res_coll.append(around_atoms)
                # Union of everything gathered so far, over all radii.
                resindices = reduce(lambda x, y: x | y, res_coll)
                sequence = resindices.getHierView()['A'].getSequence()
                print('sequence', i, len(sequence), sequence)
                i += 1

            with open('sequence.fasta', 'w') as fout:
                fout.write(">receptor\n" + sequence + '\n')

            cmd = ('blastp -db {} -query sequence.fasta -outfmt 5 '
                   '-out result'.format(BLASTDB))
            cl = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
            cl.wait()

            dtree = xml.dom.minidom.parse("result")
            collection = dtree.documentElement
            hits = collection.getElementsByTagName("Hit")
            for hit in hits:
                hit_id = hit.getElementsByTagName(
                    'Hit_id')[0].childNodes[0].data
                hsps = hit.getElementsByTagName('Hit_hsps')[0]
                identity = hsps.getElementsByTagName(
                    'Hsp_identity')[0].childNodes[0].data
                align_len = hsps.getElementsByTagName(
                    'Hsp_align-len')[0].childNodes[0].data
                hseq = hsps.getElementsByTagName(
                    'Hsp_hseq')[0].childNodes[0].data
                midline = hsps.getElementsByTagName(
                    'Hsp_midline')[0].childNodes[0].data
                blast_result.append([
                    receptor, hit_id,
                    str(identity),
                    str(align_len),
                    str(len(sequence)), midline, hseq, sequence
                ])
        return blast_result
    finally:
        # Do not leave the caller stranded inside the temporary directory.
        os.chdir(cdir)
def compute_atom_distances(pdb_target, res_file, output_report, chain="L"):
    """
    Calculate atom-atom distances between ligand atoms and residue atoms.

    The residue number and the atom names (for both, ligand and residue) must
    be specified in a file ('res_file').

    :param pdb_target: input PDB file path
    :param res_file: file with instructions. Each row must have three
        whitespace-separated columns: RESNUM LIGAND_ATOMS RESIDUE_ATOMS.
        An atom column is either a single atom name or, to measure from the
        center of several atoms, a list written as [ATOM1,ATOM2,ATOMN]
        (no spaces).
    :param output_report: path of the report file to write. Nothing is
        written when it is empty/None.
    :param chain: chain id that holds the ligand.
    :return: the report as one string, one line per instruction with the
        residue number, the selected ligand atom names, the selected residue
        atom names and the distance.
    :raises SystemExit: when an instruction selects no atoms in the ligand
        or in the residue.
    """

    def _parse_atom_spec(atom_spec, owner):
        # "[A,B,C]" -> "A B C"; a plain name is used as is. Also returns the
        # message template used to echo the resulting selection.
        if "[" in atom_spec or "]" in atom_spec:
            print("Multiple atom selection for the {}".format(owner))
            names = atom_spec.strip("[").strip("]").split(",")
            return ' '.join(names), "Selected atoms: {}"
        print("Single atom selection for the {}".format(owner))
        return atom_spec, "Selected atom: {}"

    def _anchor_point(selection, atom_names):
        # A single atom is represented by its own coordinates, several atoms
        # by their center.
        if len(atom_names) <= 1:
            return selection.getCoords()[0]
        return prody.calcCenter(selection)

    # Load PDB files
    target = pdb2prody(pdb_target)
    ligand = target.select("chain {}".format(chain))
    print(ligand.getNames())

    # Reading instructions from file
    list_of_instructions = read_selecteds_from_file(res_file)

    # Select the input atoms
    report = []
    for line in list_of_instructions:
        resnum, atom_name_ref, atom_name_tar = line.split()

        ref_names, ref_message = _parse_atom_spec(atom_name_ref, "ligand")
        atom_ref_selected = ligand.select("name {}".format(ref_names))
        # The emptiness check has to come before the selection is echoed:
        # an empty selection is None and has no getNames().
        if atom_ref_selected is None:
            raise SystemExit(
                "None atoms where selected. Please, check if the selected atoms exists in the ligand in {}"
                .format(pdb_target))
        names_ref = atom_ref_selected.getNames()
        print(ref_message.format(names_ref))

        tar_names, tar_message = _parse_atom_spec(atom_name_tar, "system")
        atom_tar_selected = select_atom_given_name_type_and_num(
            target, resnum, tar_names)
        if atom_tar_selected is None:
            raise SystemExit(
                "None atoms where selected. Please, check if the selected atoms exists in the residue {} in {}"
                .format(resnum, pdb_target))
        names_tar = atom_tar_selected.getNames()
        print(tar_message.format(names_tar))

        # Whatever the number of atoms on each side, the distance is taken
        # between one point per side. Computing it from coordinates always
        # gives a plain float, which "{:6.3f}" below requires (a distance
        # between two one-atom selections computed by prody comes back as a
        # one-element array).
        point_tar = _anchor_point(atom_tar_selected, names_tar)
        point_ref = _anchor_point(atom_ref_selected, names_ref)
        distance = float(np.linalg.norm(point_tar - point_ref))

        report_line = "{:4} {:10} {:10} {:6.3f}\n".format(
            resnum, ''.join(names_ref), ''.join(names_tar), distance)
        report.append(report_line)

    report_final = ''.join(report)

    if output_report:
        with open(output_report, "w") as report_file:
            report_file.write(report_final)
    print(report_final)
    return report_final
# bundle_ligand_data: build the grid label vector for one ligand and store its
# record in self.heterodict[ResId].
#
# NOTE(review): this function is shown collapsed onto a few physical lines, so
# its original nesting cannot be recovered here. The code below is unchanged;
# only this header was added.
#
# What the visible code does:
#   * ResId is str(pick_one.getResindex()) when fake_ligand is False, otherwise
#     compare_ResId_native + '_' + str(Id_suffix).
#   * When filename is None it defaults to
#     <self.store_dir>/<ResId>/<self.PDBname>_<ResId>_ligand.pdb. The ligand is
#     written with pd.writePDB and converted with pdb_to_mol2 when OUT is set.
#   * A cubic grid is built around the midpoint of the ligand bounding box with
#     box_num = ceil(self.BOX_range / self.BOX_size) points per axis. The cell
#     of an atom is int(round(coordinate - first grid point)) per axis and the
#     flat index is z * box_num**2 + y * box_num + x.
#     NOTE(review): that cell computation presumably assumes a grid spacing of
#     1 - confirm against BOX_range / BOX_size.
#   * Ligand atoms are stored as "<atom name>_<HETERO_PART>". Protein atoms of
#     residues within 18 of the center that also match the 'inbox' macro are
#     stored as "<atom name>_<PROTEIN_PART>", only in cells that are still 0.
#   * For fake ligands 'RMSF' and 'Contact Similarity' are computed against
#     self.heterodict[compare_ResId_native]['ligand'] with self._calcRMSD and
#     self._calcQ; for real ligands 'Resname' and 'chain' are filled in.
#   * Every path returns None. The function returns early when the conversion
#     raises, when the ligand bounding box exceeds self.BOX_range, or when no
#     protein residue is found near the center.
#
# Points to review:
#   * The 'Assertion failed, ...' log message has one placeholder but is given
#     two values, so `scale` is never shown.
#   * The stored record always has 'fake_ligand': True, whatever the value of
#     the fake_ligand argument.
#   * tar_filename is built with ''.join(filename.split('.')[:-1]), which also
#     removes every other '.' of the path, and ends in '.mol' although the log
#     message mentions mol2.
#   * Both `except:` clauses are bare; the second one turns any error into
#     IOError.
#   * The `else` branch of the protein loop logs a warning both for atoms
#     outside the grid and for atoms whose cell is already taken.
#   * The `print` statements are Python 2 syntax.
def bundle_ligand_data(self, pick_one, fake_ligand=True, OUT=True, compare_ResId_native='default', Id_suffix='default', filename=None, benchmark=None): ''' :param pick_one: :param fake_ligand: :param OUT: :param compare_ResId_native: :param Id_suffix: :param filename: :param benchmark: :return: ''' PDB = self.PDBname if fake_ligand == False: ResId = str(pick_one.getResindex()) else: ResId = compare_ResId_native + '_' + str(Id_suffix) pdb_store_dir = self.store_dir other = self.receptor # Extract this ligand from protein (as input for openbabel) if filename is None: filename = pdb_store_dir + '/{1}/{0}_{1}_ligand.pdb'.format( PDB, ResId) if not os.path.isfile(filename): if not os.path.exists(pdb_store_dir + '/' + ResId): os.mkdir(pdb_store_dir + '/' + ResId) if OUT: try: pd.writePDB(filename, pick_one) tar_filename = ''.join(filename.split('.')[:-1]) tar_filename += '.mol' pdb_to_mol2(filename, tar_filename) except: print 'Unexpected Error!' logging.error('Cannot convert {} to mol2 format!'.format( filename.split('/')[-1])) return if not os.path.isfile(filename): if not os.path.exists(pdb_store_dir + '/' + ResId): os.mkdir(pdb_store_dir + '/' + ResId) naming = '{}_{}'.format(PDB, ResId) # Get coordinate of center xyz = pick_one.getCoords() middle = pd.calcCenter(pick_one) # in pi degree , the rotation of the box (if needed) rotation = [0, 0, 0] scale = max( max(xyz[:, 0]) - middle[0], middle[0] - min(xyz[:, 0]), max(xyz[:, 1]) - middle[1], middle[1] - min(xyz[:, 1]), max(xyz[:, 2]) - middle[2], middle[2] - min(xyz[:, 2])) # assert scale <= 10 if scale > self.BOX_range / 2: logging.warning( 'Warning! 
{} has a ligand out of box scale with {} atom distance to center' .format(PDB, scale)) # Now shifting the boxes: max_scale = max( max(xyz[:, 0]) - min(xyz[:, 0]), max(xyz[:, 1]) - min(xyz[:, 1]), max(xyz[:, 2]) - min(xyz[:, 2])) if max_scale > self.BOX_range: logging.error( 'Assertion failed, {} has a ligand out of box completely with scale' .format(PDB, scale)) return # Try to move to the new center temp_mid = [(max(xyz[:, 0]) + min(xyz[:, 0])) / 2, (max(xyz[:, 1]) + min(xyz[:, 1])) / 2, (max(xyz[:, 2]) + min(xyz[:, 2])) / 2] temp_mid[0] = round(temp_mid[0], 6) temp_mid[1] = round(temp_mid[1], 6) temp_mid[2] = round(temp_mid[2], 6) middle = np.array(temp_mid) print middle # print middle scale_extension = (self.BOX_range - self.BOX_size) / 2 box_num = int(np.ceil(self.BOX_range / self.BOX_size)) xx, yy, zz = np.meshgrid( np.linspace(middle[0] - scale_extension, middle[0] + scale_extension, box_num), np.linspace(middle[1] - scale_extension, middle[1] + scale_extension, box_num), np.linspace(middle[2] - scale_extension, middle[2] + scale_extension, box_num)) # print xx vector = np.c_[xx.ravel(), yy.ravel(), zz.ravel()] num_vector = [0] * len(vector) #print len(vector), box_num for atom in pick_one.iterAtoms(): x, y, z = atom.getCoords() x_pos = int(round(x - vector[0][0])) # assert 0 <= x_pos <= 19 y_pos = int(round(y - vector[0][1])) # assert 0 <= y_pos <= 19 z_pos = int(round(z - vector[0][2])) # assert 0 <= z_pos <= 19 if 0 <= x_pos < box_num and 0 <= y_pos < box_num and 0 <= z_pos < box_num: # Simply change here to fulfill the mark as 'H_1' # note (z(y(x))) follows from atuogrid map file format , otherwise the coordinate system is not correspond coorectly num_vector[z_pos * box_num * box_num + y_pos * box_num + x_pos] = atom.getName() + '_' + str(HETERO_PART) # quick,dirty way to find atoms of protein in cubic boxes pd.defSelectionMacro( 'inbox', 'abs(x-{1}) <= {0} and abs(y-{2}) <= {0} and abs(z-{3}) <= {0}'. 
format(self.BOX_size / 2, middle[0], middle[1], middle[2])) residues = other.select( 'protein and same residue as within 18 of center', center=middle) if residues is None: logging.warning('{} in {} has no atoms nearby'.format(ResId, PDB)) return # This place might have some potential problem # for ADP or ATP , they might either be part of nucleic and the ligand # This will cause a severe bug when calculating autovina score # TODO fix this issue nearby = residues.select('inbox') if nearby is not None: for atom in nearby.iterAtoms(): x, y, z = atom.getCoords() x_pos = int(round(x - vector[0][0])) # assert 0 <= x_pos <= 19 y_pos = int(round(y - vector[0][1])) # assert 0 <= y_pos <= 19 z_pos = int(round(z - vector[0][2])) # assert 0 <= z_pos <= 19 temp = z_pos * box_num * box_num + y_pos * box_num + x_pos if 0 <= x_pos < box_num and 0 <= y_pos < box_num and 0 <= z_pos < box_num and num_vector[ temp] == 0: # Simply change here to fulfill the mark as 'C_2' num_vector[temp] = atom.getName() + '_' + str(PROTEIN_PART) else: # num_vector[temp] += '|'+atom.getName() + '_' + str(PROTEIN_PART) print atom.getName() logging.warning('Coorinate {} {} {} found at {}'.format( x_pos, y_pos, z_pos, self.PDBname)) # Save into the dict for future locating # naming = '{}_{}'.format(PDB, ResId) # Do autogrid mapgeneration: # ligand_filename = os.path.join(temp_pdb_PREFIX, PDB + '/' + naming + '_ligand.pdb') # receptor_filename = os.path.join(temp_pdb_PREFIX, PDB + '/' + naming + '_receptor.pdb') # complex_filename = os.path.join(temp_pdb_PREFIX, PDB + '/' + naming + '_complex.pdb') # fake_ligand_filename = os.path.join(temp_pdb_PREFIX, 'fake-ligand.pdb') self.heterodict[ResId] = { 'raw_vector': num_vector, 'center': middle, 'rotation': rotation, 'naming': '{}_{}'.format(PDB, ResId), 'chain': 'NA', 'filename': filename, 'id': ResId, 'Resname': 'NA', 'ligand': pick_one, 'protein': residues, 'vina_score': 'NA', 'original_one': True, 'file_generated': False, 'fake_ligand': True, 'RMSF': 0, 
'Contact Similarity': 1, 'gridmap_protein': 'NA', 'gridmap_ligand': 'NA', 'gridmap_complex': 'NA' } if fake_ligand == True: try: dist = self._calcRMSD( self.heterodict[compare_ResId_native]['ligand'], pick_one, benchmark=benchmark) print dist self.heterodict[ResId]['RMSF'] = dist except: print 'oops' raise IOError self.heterodict[ResId]['Contact Similarity'] = self._calcQ( self.heterodict[compare_ResId_native]['ligand'], pick_one, benchmark=benchmark) else: self.heterodict[ResId]['Resname'] = pick_one.getResname() self.heterodict[ResId]['chain'] = pick_one.getChid()
def process_residue_into_vector(self,
                                prody_ligand,
                                prody_residue,
                                distance_cutoff_polar=3.5,
                                distance_cutoff_greasy=4):
    """
    Describe the closest residue-ligand contact as two vectors.

    The closest pair of atoms between the residue and the ligand is found
    with ``minimum_contact_distance``. The contact is rejected when the
    residue is one of the polar residue types (DEHKNQRSTY) and the distance
    is above ``distance_cutoff_polar``, or when the distance is above
    ``distance_cutoff_greasy`` for any residue type.

    For an accepted contact two attributes are set:

    * ``self.contact_vector``: vector from the ligand center to the
      contacting residue atom.
    * ``self.categorical_array``: 0/1 flags, in this order:

      0.  contacting residue atom is a side chain atom (0 for C, CA, N, O)
      1.  contacting ligand atom name starts with 'C'
      2.  residue side chain has a hydrogen bond donor/acceptor (DEHKNQRSTY)
      3.  contacting residue atom name starts with 'O' or 'N'
      4.  hydrophobic, aliphatic residue (AILVC)
      5.  hydrophobic, aromatic residue (FYW)
      6.  polar residue (NCQMST)
      7.  charged, acidic residue (DE)
      8.  charged, basic residue (HKR)
      9.  glycine
      10. proline
      11. contacting atom is the backbone carbonyl oxygen (O)
      12. contacting atom is the backbone amino nitrogen (N)
      13. contacting atom is backbone C or CA

      The residue classes overlap (e.g. TYR, CYS), so more than one of the
      flags 4-8 can be set at once.

    :param prody_ligand: prody atoms of the ligand (fragment).
    :param prody_residue: prody atoms of a single residue.
    :param distance_cutoff_polar: largest accepted contact distance for the
        polar residue types.
    :param distance_cutoff_greasy: largest accepted contact distance for any
        residue.
    :return: None when the contact is rejected, True otherwise.
    """
    closest_distance, residue_index, ligand_index = minimum_contact_distance(
        prody_residue, prody_ligand, return_indices=True)

    polar_residues = [
        'ASP', 'GLU', 'HIS', 'LYS', 'ASN', 'GLN', 'ARG', 'SER', 'THR', 'TYR'
    ]

    # Guard clauses: bail out as soon as the contact is known to be too far.
    residue_is_polar = prody_residue.getResnames()[0] in polar_residues
    if residue_is_polar and closest_distance > distance_cutoff_polar:
        return None
    if closest_distance > distance_cutoff_greasy:
        return None

    residue_contact_atom = prody_residue.copy().select(
        'index {}'.format(residue_index))
    ligand_contact_atom = prody_ligand.copy().select(
        'index {}'.format(ligand_index))

    residue_atom_name = residue_contact_atom.getNames()[0]
    ligand_atom_name = ligand_contact_atom.getNames()[0]

    # Vector from fragment centroid to closest residue atom
    contact_vector = (residue_contact_atom.getCoords() -
                      prody.calcCenter(prody_ligand))[0]

    contact_resname = residue_contact_atom.getResnames()[0]

    def flag(condition):
        # Every entry of the categorical vector is a plain 0/1 integer.
        return 1 if condition else 0

    def resname_in(*residue_names):
        return flag(contact_resname in residue_names)

    def atom_name_in(*atom_names):
        return flag(residue_atom_name in atom_names)

    # todo: UPDATE so that only one of the residue classes below can hold
    # value of 1 at any given time
    categorical_vector = [
        # Residue contact type: backbone (0) or side chain (1)
        flag(residue_atom_name not in ['C', 'CA', 'N', 'O']),
        # Ligand contact type: carbon (1) or anything else (0)
        flag(ligand_atom_name[0] in ['C']),
        # Side chain has hydrogen bond donor/acceptor (DEHKNQRSTY)
        flag(contact_resname in polar_residues),
        # Polar atom on residue is contacting ligand
        flag(residue_atom_name[0] in ['O', 'N']),
        # Hydrophobic, aliphatic (AILVC)
        resname_in('ALA', 'ILE', 'LEU', 'VAL', 'CYS'),
        # Hydrophobic, aromatic (FWY)
        resname_in('PHE', 'TYR', 'TRP'),
        # Polar (NCQMST)
        resname_in('ASN', 'CYS', 'GLN', 'MET', 'SER', 'THR'),
        # Charged, acidic (DE)
        resname_in('ASP', 'GLU'),
        # Charged, basic (HKR)
        resname_in('HIS', 'LYS', 'ARG'),
        # Glycine
        resname_in('GLY'),
        # Proline
        resname_in('PRO'),
        # Backbone carbonyl
        atom_name_in('O'),
        # Backbone amino
        atom_name_in('N'),
        # Backbone C / CA
        atom_name_in('C', 'CA'),
    ]

    self.categorical_array = np.asanyarray(categorical_vector)
    self.contact_vector = contact_vector

    return True
"""
Created on 1/9/2014

@author: victor
"""
import sys
import os
import glob
import prody
from hivprotmut.structures.pdbcuration import CurationSelections

# Usage: <script> FINAL_DB_FOLDER COM_FILE
#
# Walks FINAL_DB_FOLDER/<ligand folder>/*.pdb and writes one line per pdb to
# COM_FILE: "<ligand folder>/<pdb file name> x y z", where x, y, z is the
# center of the atoms matched by CurationSelections.HEAVY_LIGAND_SELECTION.
if __name__ == '__main__':
    final_db_folder = sys.argv[1]
    com_file = sys.argv[2]

    # The context manager closes the output file even if a pdb fails to
    # parse half way through the run.
    with open(com_file, "w") as com_handler:
        ligand_folders = os.listdir(final_db_folder)  # first level are ligands

        for path in ligand_folders:
            files = glob.glob(os.path.join(final_db_folder, path, "*.pdb"))
            for pdb_file in files:
                pdb = prody.parsePDB(pdb_file)
                txt_pdb = os.path.split(pdb_file)[1]
                ligand = pdb.select(CurationSelections.HEAVY_LIGAND_SELECTION)
                com = prody.calcCenter(ligand)
                txt_pdb_file = os.path.join(path, txt_pdb)
                com_handler.write("%s %.3f %.3f %.3f\n" % (
                    txt_pdb_file, com[0], com[1], com[2]))
def __init__(self, parsed_pdb, comb):
    """Take the next candidate iFG from ``parsed_pdb`` and set up its record.

    The selection is popped from ``parsed_pdb.possible_ifgs``. Identity
    attributes (residue indices, names, numbers, atom names, chain id,
    center) are computed from it right away; every contact, hydrogen bond,
    solvent accessibility and density attribute is only initialised here
    (None or an empty container).

    :param parsed_pdb: object providing ``possible_ifgs`` and ``prody_pdb``.
    :param comb: object providing ``ifg_count`` and ``ifg_seq_str``. When
        ``ifg_seq_str`` is 'element' the iFG itself is used as its fragment
        and the sequence is left empty.
    """
    self.sele = parsed_pdb.possible_ifgs.pop()

    # One entry per distinct residue of the selection.
    unique_resindices, first_atom_positions = np.unique(
        self.sele.getResindices(), return_index=True)
    self.resindex, self._ind = unique_resindices, first_atom_positions
    self.resname = self.sele.getResnames()[self._ind]
    self.resnum = self.sele.getResnums()[self._ind]

    # Keyed by residue name: a later residue with the same name replaces an
    # earlier one.
    names_by_resname = {}
    for residue_name, residue_index in zip(self.resname, self.resindex):
        residue_atoms = self.sele.select('resindex ' + str(residue_index))
        names_by_resname[residue_name] = residue_atoms.getNames()
    self.atom_names = names_by_resname

    self.chid = np.unique(self.sele.getChids())[0]
    self.center_coords = pr.calcCenter(self.sele)
    self.vdm_count = 1
    self.count = comb.ifg_count

    # Solvent accessible surface area
    self.sasa = self.residue_sasa = self.dssp_sasa = None
    self.sasa_3A_probe = self.sasa_4A_probe = self.sasa_5A_probe = None

    # Contacts
    self.contact_atoms_all = self.contact_atoms_protein = None
    self.contact_resnums = self.contact_chids = None
    self.contact_resindices = self.contact_segments = None
    self.contact_atoms_water = self.contact_atoms_ligand = None
    self.contact_atoms_metal = None
    self.contact_info_water = []
    self.contact_info_ligand = []
    self.contact_info_metal = []
    self.contact_info_protein = []
    self.contact_dict = collections.defaultdict(set)
    self.contact_pair_dict = collections.defaultdict(list)
    self.probe_hbonds = []

    # Geometry / burial
    self.rotamer = None
    self.min_hull_dist_ifg = self.min_hull_dist_cb_ca = None
    self.cbeta_density = None
    self.heavy_atom_density_5A = self.heavy_atom_density_10A = None

    # Fragment around the iFG: one residue number before and after it on the
    # same chain, unless the iFG is a bare element.
    ifg_is_element = comb.ifg_seq_str == 'element'
    if ifg_is_element:
        self.ifg_frag = self.sele
    else:
        fragment_selection = (
            'segment A and chain ' + self.chid + ' and resnum `' +
            str(np.min(self.resnum) - 1) + 'to' +
            str(np.max(self.resnum) + 1) + '`')
        self.ifg_frag = parsed_pdb.prody_pdb.select(fragment_selection)
    self.frag_length = len(np.unique(self.ifg_frag.getResindices()))
    if ifg_is_element:
        self.sequence = ''
    else:
        self.sequence = ''.join(
            one_letter_code[residue_name] for residue_name in self.resname)

    self.sec_struct_dssp = self.sec_struct_phi_psi = None

    # Per-partner contact summaries
    self.contact_number_water = self.per_res_contact_number_water = None
    self.contact_atom_names_water = self.contact_resnames_water = None
    self.contact_resnums_water = None
    self.contact_number_ligand = self.per_res_contact_number_ligand = None
    self.contact_atom_names_ligand = self.contact_resnames_ligand = None
    self.contact_resnums_ligand = None
    self.contact_number_metal = self.per_res_contact_number_metal = None
    self.contact_atom_names_metal = self.contact_resnames_metal = None
    self.contact_resnums_metal = None

    # Hydrogen bonds with the protein
    self.hbond_atom_names = []
    self.hbond_resnames = []
    self.hbond_resnums = []
    self.hbond_angle = []
    self.hbond_dist_acc_hyd = []
    self.hbond_dist_heavy = []

    # Hydrogen bonds with water
    self.hbond_atom_names_water = []
    self.hbond_number_water = []
    self.hbond_resnames_water = []
    self.hbond_resnums_water = []
    self.hbond_angle_water = []
    self.hbond_dist_acc_hyd_water = []
    self.hbond_dist_heavy_water = []

    # Hydrogen bonds with ligands
    self.hbond_number_ligand = []
    self.hbond_atom_names_ligand = []
    self.hbond_resnames_ligand = []
    self.hbond_resnums_ligand = []
    self.hbond_angle_ligand = []
    self.hbond_dist_acc_hyd_ligand = []
    self.hbond_dist_heavy_ligand = []

    # C-alpha hydrogen bonds
    self.ca_hbond_atom_names = []
    self.ca_hbond_resnames = []
    self.ca_hbond_resnums = []
    self.ca_hbond_angle = []
    self.ca_hbond_dist_acc_hyd = []
    self.ca_hbond_dist_heavy = []

    self.bb_cb_atom_ind = self.get_bb_cb_atom_indices(parsed_pdb)
def _pocket_element_symbol(line, two_letter_codes):
    """Read the element from the last three characters of a PDB record."""
    tail = line[-3:].strip()
    if tail in two_letter_codes:
        return tail
    return tail[0]


def _pocket_atoms_in_sphere(structure, sphere_center, radius, residue_names,
                            element_of):
    """Return the protein atom tuples of ``structure`` inside one sphere."""
    center_x, center_y, center_z = sphere_center
    radius_squared = radius**2
    selected = []
    for line in structure:
        if line[0:6].strip() not in ("ATOM", "HETATM"):
            continue
        if line[17:20].strip() not in residue_names:
            continue
        x1 = math.pow((float(line[30:38]) - center_x), 2)
        y1 = math.pow((float(line[38:46]) - center_y), 2)
        z1 = math.pow((float(line[46:54]) - center_z), 2)
        if (x1 + y1 + z1) <= radius_squared:
            selected.append(
                (line[17:20].strip(), line[12:16].strip(),
                 line[30:38].strip(), line[38:46].strip(),
                 line[46:54].strip(), element_of(line)))
    return selected


def binding_pocket_selection(pose_store, p, ligand_name, selection_radius,
                             center):
    '''
    Select the protein atoms that surround the ligand.

    The ligand atoms are the records whose residue name field equals
    ``ligand_name``. Protein atoms are the ATOM/HETATM records of the 20
    standard amino acids that lie within ``selection_radius`` of a sphere
    center chosen by ``center``:

    * 'mass': the mass-weighted center of the non-water hetero atoms of
      ``p``, computed with Prody.
    * 'geometric': the midpoint between the ligand atom with the lowest x
      coordinate and the one with the highest x coordinate.
    * 'double': two spheres, centered on those two extreme ligand atoms. An
      atom inside both spheres is listed twice.

    Any other value of ``center`` selects no protein atoms.

    NOTE(review): ligand atoms are gathered from every pose of
    ``pose_store``, while protein atoms are taken from the last pose only.
    This follows a single-pose reading of the original (collapsed) code -
    confirm if several poses are ever passed.

    :param pose_store: mapping of pose id to the PDB text of that pose.
    :param p: prody structure, only used when ``center`` is 'mass'.
    :param ligand_name: three-character residue name of the ligand.
    :param selection_radius: radius of the selection sphere(s).
    :param center: 'mass', 'geometric' or 'double'.
    :return: ``(binding_pocket, ligand)``, two lists of tuples
        ``(residue name, atom name, x, y, z, element)`` whose fields are the
        stripped text of the corresponding PDB columns.
    :raises ValueError: for 'double' and 'geometric' when no ligand atom was
        found.
    '''
    amino = [
        'CYS', 'ASP', 'SER', 'GLN', 'LYS', 'ILE', 'PRO', 'THR', 'PHE', 'ASN',
        'GLY', 'HIS', 'LEU', 'ARG', 'TRP', 'ALA', 'VAL', 'GLU', 'TYR', 'MET'
    ]
    two_let_atom_code = ['Br', 'FE']
    coord = []  # flat list: x0, y0, z0, x1, y1, z1, ...
    binding_pocket = []
    ligand = []
    structure = []

    for pose in pose_store:
        structure = pose_store[pose].split('\n')
        for line in structure:
            if line[17:20] == ligand_name:
                coord.extend((float(line[30:38]), float(line[38:46]),
                              float(line[46:54])))
                ligand.append(
                    (line[17:20].strip(), line[12:16].strip(),
                     line[30:38].strip(), line[38:46].strip(),
                     line[46:54].strip(), line[-2:].strip()))
    x_coord = coord[0::3]

    def extreme_x_atoms():
        # Coordinates of the ligand atoms with the lowest and highest x. The
        # search runs over the x values only, so a y or z value that happens
        # to equal the extreme x can not be picked up by mistake.
        if not x_coord:
            raise ValueError(
                "no atoms of ligand {!r} found in the poses".format(
                    ligand_name))
        lowest = 3 * x_coord.index(min(x_coord))
        highest = 3 * x_coord.index(max(x_coord))
        return tuple(coord[lowest:lowest + 3]), tuple(
            coord[highest:highest + 3])

    def element_from_tail(line):
        return _pocket_element_symbol(line, two_let_atom_code)

    def report_counts():
        print("Number of protein atoms selected: {}".format(
            len(binding_pocket)))
        print('Number of ligand atoms selected: {}'.format(len(ligand)))
        print('Total number of atoms selected: {}'.format(
            len(binding_pocket) + len(ligand)))

    if center == 'double':
        print('\nDouble center of ligand selected')
        left_center, right_center = extreme_x_atoms()
        print('\n')
        print("Left sphere center coordinates: ", left_center[0],
              left_center[1], left_center[2])
        print("Right sphere center coordinates: ", right_center[0],
              right_center[1], right_center[2])
        print("Spheres radii: ", selection_radius)
        # The left sphere reads the element from the last two characters of
        # the record, the right one from the last three (as the original).
        binding_pocket.extend(
            _pocket_atoms_in_sphere(structure, left_center, selection_radius,
                                    amino, lambda line: line[-2:].strip()))
        binding_pocket.extend(
            _pocket_atoms_in_sphere(structure, right_center, selection_radius,
                                    amino, element_from_tail))
        report_counts()

    elif center == 'geometric':
        print("\nGeometric ligand center selected")
        low_atom, high_atom = extreme_x_atoms()
        sphere_center = tuple(
            (high + low) / 2 for high, low in zip(high_atom, low_atom))
        print('\n')
        print("Sphere center coordinates: ", sphere_center[0],
              sphere_center[1], sphere_center[2])
        print("Sphere radius: ", selection_radius)
        binding_pocket.extend(
            _pocket_atoms_in_sphere(structure, sphere_center, selection_radius,
                                    amino, element_from_tail))
        report_counts()

    elif center == 'mass':
        print('\nLigand mass center selected')
        ligand_selection = p.select('not water and hetero')
        weights = ligand_selection.getMasses()
        mass_center = calcCenter(ligand_selection, weights)
        sphere_center = (mass_center[0], mass_center[1], mass_center[2])
        print("\nSphere center coordinates: {}, {}, {}".format(
            sphere_center[0], sphere_center[1], sphere_center[2]))
        print("Sphere radius: {}".format(selection_radius))
        binding_pocket.extend(
            _pocket_atoms_in_sphere(structure, sphere_center, selection_radius,
                                    amino, element_from_tail))
        report_counts()

    return binding_pocket, ligand
def append_vectors(self, hetero_file):
    '''
    Append each docked result as a vector to the dict.

    ``hetero_file`` is read line by line and cut into poses at every line
    containing ``END``. Each pose is written to a temporary PDB file, parsed,
    and rasterised onto a 20 x 20 x 20 grid with unit spacing (8000 cells,
    flattened as ``x * 400 + y * 20 + z``). A cell hit by a pose atom holds
    ``<atom name>_<HETERO_PART>``; a cell that is still empty and is hit by an
    atom of ``self.protein`` holds ``<atom name>_<PROTEIN_PART>``; every other
    cell stays ``0``.

    Poses with three atoms or fewer are skipped, and so are poses spanning
    more than 20 units along any axis.

    :param hetero_file: path of the file holding one or more docked poses
    :return: nothing, but adds one entry per accepted pose to
             ``self.heterodict`` (keyed by ``str(self.ct)``) and increments
             ``self.ct``
    '''
    # need to split the files
    # NOTE(review): fixed name in the working directory, rewritten for every
    # pose; concurrent runs would presumably overwrite each other's file.
    TEMP = 'temp.pdb'
    one_pdb = ''
    # The context manager closes the input even when a pose fails to parse.
    with open(hetero_file, 'r') as o:
        for line in o:
            one_pdb += line
            if 'END' not in line:
                continue

            # write a temporary file holding only the pose read so far
            # (text mode: the buffer is a str, not bytes)
            with open(TEMP, 'w') as w:
                w.write(one_pdb)
            one_pdb = ''

            pdb = pd.parsePDB(TEMP)
            if pdb.numAtoms() <= 3:
                continue

            # Get coordinate of center
            xyz = pdb.getCoords()
            middle = pd.calcCenter(pdb)
            # Largest per-axis distance from the center to any atom.
            scale = max(
                max(xyz[:, 0]) - middle[0], middle[0] - min(xyz[:, 0]),
                max(xyz[:, 1]) - middle[1], middle[1] - min(xyz[:, 1]),
                max(xyz[:, 2]) - middle[2], middle[2] - min(xyz[:, 2]))
            if scale > 10:
                logging.warning(
                    'Warning! {} has a ligand out of box scale with {} atom distance to center'
                    .format(self.PDBname, scale))
                # Now shifting the boxes:
                max_scale = max(
                    max(xyz[:, 0]) - min(xyz[:, 0]),
                    max(xyz[:, 1]) - min(xyz[:, 1]),
                    max(xyz[:, 2]) - min(xyz[:, 2]))
                if max_scale > 20:
                    # The pose cannot fit in the box wherever it is centered.
                    logging.error(
                        'Assertion failed, {} has a ligand out of box completely with scale {}'
                        .format(self.PDBname, scale))
                    continue
                # Try to move to the new center (middle of the bounding box)
                middle = [(max(xyz[:, 0]) + min(xyz[:, 0])) / 2,
                          (max(xyz[:, 1]) + min(xyz[:, 1])) / 2,
                          (max(xyz[:, 2]) + min(xyz[:, 2])) / 2]

            # The grid has 20 points per axis at unit spacing, centered on
            # ``middle``, so its lowest corner sits 9.5 below the center on
            # every axis. Only that corner is needed to bin coordinates.
            origin = (middle[0] - 9.5, middle[1] - 9.5, middle[2] - 9.5)

            num_vector = [0] * 8000
            for atom in pdb.iterAtoms():
                x, y, z = atom.getCoords()
                x_pos = int(round(x - origin[0]))
                y_pos = int(round(y - origin[1]))
                z_pos = int(round(z - origin[2]))
                if 0 <= x_pos <= 19 and 0 <= y_pos <= 19 and 0 <= z_pos <= 19:
                    # Simply change here to fulfill the mark as 'H_1'
                    num_vector[x_pos * 400 + y_pos * 20 + z_pos] = (
                        atom.getName() + '_' + str(HETERO_PART))

            # quick,dirty way to find atoms of protein in cubic boxes
            pd.defSelectionMacro(
                'inbox',
                'abs(x-{}) < 10 and abs(y-{}) < 10 and abs(z-{}) < 10'.
                format(middle[0], middle[1], middle[2]))
            nearby = self.protein.select('inbox')
            if nearby is not None:
                for atom in nearby.iterAtoms():
                    x, y, z = atom.getCoords()
                    x_pos = int(round(x - origin[0]))
                    y_pos = int(round(y - origin[1]))
                    z_pos = int(round(z - origin[2]))
                    # Pose atoms take precedence: only empty cells are filled.
                    if (0 <= x_pos <= 19 and 0 <= y_pos <= 19
                            and 0 <= z_pos <= 19
                            and num_vector[x_pos * 400 + y_pos * 20
                                           + z_pos] == 0):
                        # Simply change here to fulfill the mark as 'C_2'
                        num_vector[x_pos * 400 + y_pos * 20 + z_pos] = (
                            atom.getName() + '_' + str(PROTEIN_PART))
                    else:
                        # Reached for out-of-grid atoms and for occupied cells.
                        logging.warning(
                            'Coorinate {} {} {} found at {}'.format(
                                x_pos, y_pos, z_pos, self.PDBname))

            # Save into the dict for future locating
            self.heterodict[str(self.ct)] = {
                'raw_vector': num_vector,
                'center': middle,
                'filename': hetero_file,
                'id': hetero_file.split('/')[-1].split('.')[0] + '_'
                      + str(self.ct)
            }
            self.ct += 1
def find_possible_ifgs_rmsd(self, comb, rmsd_threshold=1.0):
    """Return selections whose geometry matches a query within an RMSD cutoff.

    Stage 1 screens the residues of segment A / ``self.pdb_chain`` that match
    ``comb.ifg_seq_str_query``. Residues are taken one at a time when
    ``comb.num_res_ifg_query`` is 1 and in consecutive pairs otherwise. A
    candidate is kept only when the selection contains as many atoms as
    ``comb.ifg_sele_dict_query`` names for it and all of its residue numbers
    are positive. ``comb.total_possible_ifgs`` grows by the number kept.

    Stage 2 looks, for every candidate, at residues owning
    ``comb.query_names[1]`` atoms within ``comb.query_distance`` of the center
    of the candidate's ``comb.query_names[0]`` atoms. The stacked coordinates
    of both atom sets are compared with each entry of ``comb.query_coords``
    after applying the transform returned by ``get_rot_trans``.

    :param comb: query definition object; also receives the candidate count.
    :param rmsd_threshold: largest RMSD still accepted as a match.
    :return: list with the second-query selection of every residue that
             matched at least one query coordinate set.
    """
    candidates = []
    single_residue_ifg = comb.num_res_ifg_query == 1

    # Both modes start from the same sequence-based selection.
    sequence_hits = self.prody_pdb.select(
        'segment A and chain ' + self.pdb_chain + ' sequence "' +
        comb.ifg_seq_str_query + '"')

    if sequence_hits is not None:
        # One entry per residue: its index and the name taken at its first atom.
        resindices, first_atom = np.unique(
            sequence_hits.getResindices(), return_index=True)
        resnames = sequence_hits.getResnames()[first_atom]

        if single_residue_ifg:
            for resindex, resname in zip(resindices, resnames):
                atom_names = comb.ifg_sele_dict_query[1][resname]
                picked = self.prody_pdb.select(
                    'resindex ' + str(resindex) + ' and name ' + atom_names)
                if picked is None:
                    continue
                # Every expected atom must be present.
                if len(picked) != len(atom_names.split()):
                    continue
                if all(picked.getResnums() > 0):
                    candidates.append(picked)
        else:
            for start in range(0, len(resindices), 2):
                resind1, resind2 = resindices[start:start + 2]
                resname1, resname2 = resnames[start:start + 2]
                try:
                    picked = self.prody_pdb.select(
                        '(resindex ' + str(resind1) + ' and name ' +
                        comb.ifg_sele_dict_query[1][resname1] + ')' +
                        ' or (resindex ' + str(resind2) + ' and name ' +
                        comb.ifg_sele_dict_query[2][resname2] + ')')
                except KeyError:
                    print('Non-canonical residue in iFG, skipping.')
                    picked = None
                if picked is None:
                    continue
                expected_names = (
                    comb.ifg_sele_dict_query[1][resname1].split() +
                    comb.ifg_sele_dict_query[2][resname2].split())
                if len(picked) != len(expected_names):
                    continue
                if all(picked.getResnums() > 0):
                    candidates.append(picked)

        comb.total_possible_ifgs += len(candidates)

    matches = []
    for candidate in candidates:
        anchor = pr.calcCenter(
            candidate.select('name ' + ' '.join(comb.query_names[0])))
        partner_atoms = self.prody_pdb.select(
            'name ' + ' '.join(comb.query_names[1]) + ' within ' +
            str(comb.query_distance) + ' of center',
            center=anchor)
        if partner_atoms is None:
            continue

        partner_resindices = np.unique(partner_atoms.getResindices())
        # First coordinate of each first-query atom, in query order.
        candidate_coords = [
            candidate.select('name ' + atom_name).getCoords()[0]
            for atom_name in comb.query_names[0]
        ]

        for resind in partner_resindices:
            partner = self.prody_pdb.select(
                'name ' + ' '.join(comb.query_names[1]) + ' and resindex ' +
                str(resind))
            # Skip residues missing any of the second-query atoms.
            if len(partner) != len(comb.query_names[1]):
                continue
            partner_coords = [
                partner.select('name ' + atom_name).getCoords()[0]
                for atom_name in comb.query_names[1]
            ]
            target = np.vstack((candidate_coords, partner_coords))

            for template in comb.query_coords:
                # presumably a rotation plus the two centroids - confirm
                # against get_rot_trans
                rotation, template_center, target_center = get_rot_trans(
                    template, target)
                fitted = np.dot(
                    (template - template_center), rotation) + target_center
                if pr.calcRMSD(fitted, target) <= rmsd_threshold:
                    # This only takes the query2 selection as the iFG.
                    matches.append(partner)
                    break  # one matching template is enough
    return matches
def compute_center_of_chain(pdb_object, chain="L"):
    """
    Computes the center of the atoms that belong to a single chain.

    :param pdb_object: The structure to query; it must expose ``select``
    (presumably a prody structure -- confirm against callers).
    :param chain: The chain id placed in the ``"chain <id>"`` selection.

    :return: The value ``prody.calcCenter`` yields for the selected atoms.
    """
    return prody.calcCenter(pdb_object.select("chain {}".format(chain)))