def read(self, ag):
    sio = io.StringIO()
    prody.writePDBStream(sio, ag)
    cont = sio.getvalue().rstrip()
    sio.close()
    mol = openbabel.OBMol()
    self.ReadString(mol, cont)
    return mol

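# A minimal, self-contained sketch (not from the source) of the same round trip: write the
# AtomGroup to an in-memory PDB block with prody.writePDBStream, then let an OBConversion
# configured for PDB input parse it into an OBMol. The PDB id is only an example.
import io

import prody
from openbabel import openbabel  # Open Babel 3.x layout; older builds expose a top-level `openbabel`

ag = prody.parsePDB('1ubi')                    # any ProDy AtomGroup
sio = io.StringIO()
prody.writePDBStream(sio, ag)

conv = openbabel.OBConversion()
conv.SetInFormat('pdb')                        # tell Open Babel the string holds PDB records
mol = openbabel.OBMol()
conv.ReadString(mol, sio.getvalue().rstrip())
print(mol.NumAtoms())
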
def add_mol(self, mol, keep_chains=False, keep_resi=False):
    """This behaves badly when molecules have the same chain names"""
    ag1 = self.ag.copy()
    ag2 = mol.ag.copy()

    if ag1.numCoordsets() != ag2.numCoordsets():
        raise RuntimeError('Atom groups have different numbers of coordinate sets')
    nsets = ag1.numCoordsets()

    chains1 = list(set(ag1.getChids()))
    chains2 = list(set(ag2.getChids()))
    all_chains = chains1 + chains2

    if len(set(all_chains)) != len(all_chains) and keep_chains:
        logger.warning('Two atom groups contain same chain IDs, merging can go wrong')
        if keep_resi and len(set(ag1.getResnums()).intersection(set(ag2.getResnums()))) > 0:
            raise RuntimeError('Refusing to merge atom groups which contain same chain IDs AND residue IDs')

    if not keep_chains:
        if len(chains1) + len(chains2) > len(self._chain_order):
            raise RuntimeError('Total number of chains is too large, out of chain ID letters')

        # relabel chains of both atom groups with non-clashing IDs
        iter_chains = iter(self._chain_order)
        map1 = {x: next(iter_chains) for x in chains1}
        map2 = {x: next(iter_chains) for x in chains2}
        for old, new in map1.items():
            ag1.select('chain ' + old).setChids(new)
        for old, new in map2.items():
            ag2.select('chain ' + old).setChids(new)

    if not keep_resi:
        # renumber residues consecutively across both atom groups
        resi = 1
        for r in ag1.getHierView().iterResidues():
            r.setResnum(resi)
            resi += 1
        for r in ag2.getHierView().iterResidues():
            r.setResnum(resi)
            resi += 1

    buf = StringIO()
    for i in range(nsets):
        if nsets > 1:
            buf.write('MODEL%9i\n' % (i + 1))
        prody.writePDBStream(buf, ag1, csets=i)
        prody.writePDBStream(buf, ag2, csets=i)
        if nsets > 1:
            buf.write('ENDMDL\n')
        else:
            buf.write('END\n')
    buf.seek(0)

    joint = BasePDB(ag=prody.parsePDBStream(buf))
    joint.renumber(keep_resi=True, keep_chains=True)
    buf.close()
    return joint

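# A hedged usage sketch for add_mol (not from the source). It assumes BasePDB can be built
# from a ProDy AtomGroup via the `ag` keyword, as the constructor call at the end of add_mol
# suggests; the file names are placeholders.
import prody

receptor = BasePDB(ag=prody.parsePDB('receptor.pdb'))
ligand = BasePDB(ag=prody.parsePDB('ligand.pdb'))

# Default behavior relabels chains from self._chain_order and renumbers residues from 1,
# so clashing chain IDs or residue numbers in the two inputs cannot collide.
merged = receptor.add_mol(ligand)

# keep_chains/keep_resi preserve the original labels; add_mol warns (or refuses outright)
# when both inputs reuse the same chain IDs and residue numbers.
merged_keep = receptor.add_mol(ligand, keep_chains=True, keep_resi=True)
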
def RDKit_Mol_from_ProDy(prody_instance, removeHs=True):
    """
    Creates an RDKit Mol object from a ProDy AtomGroup instance
    :return: an RDKit Mol, or None if RDKit fails to parse the PDB block
    """
    residue_io = io.StringIO()
    prody.writePDBStream(residue_io, prody_instance)
    return Chem.MolFromPDBBlock(residue_io.getvalue(), removeHs=removeHs)

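# A short usage sketch (an assumption, not from the source): pass any ProDy selection through
# the helper above and check for None, since Chem.MolFromPDBBlock returns None when parsing
# fails. The PDB id and selection string are placeholders.
import prody
from rdkit import Chem

structure = prody.parsePDB('1ubi')                   # example structure
selection = structure.select('protein and chain A')  # any Atomic selection works
mol = RDKit_Mol_from_ProDy(selection, removeHs=True)
if mol is not None:
    print(mol.GetNumAtoms(), 'atoms parsed by RDKit')
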
def add_hydrogens(self, trim=True, csets=None):
    raise NotImplementedError()

    output = []
    natoms = -1
    csets = self._make_csets(csets)

    for i in csets:
        # pipe the structure through Reduce: optionally trim existing hydrogens first,
        # then add/flip hydrogens
        if trim:
            p_start = Popen([define.REDUCE_EXE, '-Quiet', '-Trim', '-'],
                            stdin=PIPE, stdout=PIPE, stderr=STDOUT)
            p_finish = Popen([define.REDUCE_EXE, '-Quiet', '-FLIP', '-'],
                             stdin=p_start.stdout, stdout=PIPE, stderr=STDOUT)
        else:
            p_start = Popen([define.REDUCE_EXE, '-Quiet', '-FLIP', '-'],
                            stdin=PIPE, stdout=PIPE, stderr=STDOUT)
            p_finish = p_start

        prody.writePDBStream(p_start.stdin, self.ag, csets=i)
        p_start.stdin.close()

        output += ['MODEL%9i\n' % (i + 1)]
        reduced = []
        while p_finish.poll() is None:
            reduced = p_finish.stdout.readlines()
        p_start.wait()
        p_finish.wait()
        print(reduced)

        natoms_cur = len(list(filter(lambda x: x.startswith('ATOM') or x.startswith('HETATM'), reduced)))
        if i == csets[0]:
            natoms = natoms_cur
        elif natoms != natoms_cur:
            raise RuntimeError('Number of atoms in reduced model %i is different from the first model (%i, %i)' % (
                i, natoms_cur, natoms))

        output += reduced
        output += ['ENDMDL\n']

        status = p_finish.poll()
        if status != 0:
            logger.error('Called process returned ' + str(status))

    self.ag = prody.parsePDBStream(StringIO(''.join(output)))
    self.renumber()
    return self

def print_pdb(self, ifg, parsed_pdb, comb):
    vdm_renum = renumber_chids_resnums(self, 'X')
    ifg_renum = renumber_chids_resnums(ifg, 'Y')
    filename = comb.output_dir_pdb + 'iFG_' + str(ifg.count) + '_vdM_' + str(ifg.vdm_count) \
               + '_' + comb.file_tag + '.pdb.gz'
    with gzip.open(filename, 'wt') as pdbfile:
        pr.writePDBStream(pdbfile, vdm_renum)
        pr.writePDBStream(pdbfile, ifg_renum)
        if ifg.contact_atoms_water:
            pr.writePDBStream(pdbfile, ifg.contact_atoms_water)
        if ifg.contact_atoms_metal:
            pr.writePDBStream(pdbfile, ifg.contact_atoms_metal)
        if ifg.contact_atoms_ligand:
            pr.writePDBStream(
                pdbfile,
                parsed_pdb.prody_pdb.select('resindex ' + ' '.join(
                    str(ri) for ri in np.unique(ifg.contact_atoms_ligand.getResindices()))))

def __str__(self):
    ss = io.StringIO()
    prody.writePDBStream(ss, self._pdb)
    return ss.getvalue()

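# The same pattern outside a class (a minimal sketch, not tied to the wrapper above):
# serialize any ProDy Atomic object to a PDB-format string entirely in memory.
import io

import prody

def atomgroup_to_pdb_string(atoms):
    buf = io.StringIO()
    prody.writePDBStream(buf, atoms)  # writes the ATOM/HETATM records into the buffer
    return buf.getvalue()             # getvalue() ignores the cursor position, unlike read()

pdb_text = atomgroup_to_pdb_string(prody.parsePDB('1ubi'))
print(pdb_text.splitlines()[0])
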
def searchDali(pdb, chain=None, subset='fullPDB', daliURL=None, **kwargs):
    """Search Dali server with input of PDB ID (or local PDB file) and chain ID.

    Dali server: http://ekhidna2.biocenter.helsinki.fi/dali/

    :arg pdb: PDB code or local PDB file for the protein to be searched

    :arg chain: chain identifier (only one chain can be assigned for PDB)
    :type chain: str

    :arg subset: fullPDB, PDB25, PDB50, PDB90
    :type subset: str
    """
    import requests

    LOGGER.timeit('_dali')
    # timeout = 120
    timeout = kwargs.pop('timeout', 120)

    if daliURL is None:
        daliURL = "http://ekhidna2.biocenter.helsinki.fi/cgi-bin/sans/dump.cgi"

    if isinstance(pdb, Atomic):
        atoms = pdb
        chain_set = set(atoms.getChids())
        if chain and not chain in chain_set:
            raise ValueError('input structure (%s) does not have chain %s' % (atoms.getTitle(), chain))
        if len(chain_set) > 1:
            if not chain:
                raise TypeError('the structure (%s) contains more than one chain, therefore a chain identifier '
                                'needs to be specified' % pdb.getTitle())
            atoms = atoms.select('chain ' + chain)
        else:
            chain = chain_set.pop()

        stream = createStringIO()
        writePDBStream(stream, atoms)
        data = stream.getvalue()
        stream.close()
        files = {"file1": data}

        pdbId = atoms.getTitle()
        pdb_chain = ''
        dali_title = 'Title_' + pdbId + chain
    elif isinstance(pdb, str):
        if os.path.isfile(pdb):
            atoms = parsePDB(pdb)
            chain_set = set(atoms.getChids())
            # pdbId = "s001"
            filename = os.path.basename(pdb)
            filename, ext = os.path.splitext(filename)
            if ext.lower() == '.gz':
                filename2, ext2 = os.path.splitext(filename)
                if ext2.lower() == '.pdb':
                    filename = filename2
            pdbId = filename

            if chain and not chain in chain_set:
                raise ValueError('input PDB file does not have chain ' + chain)

            if len(chain_set) > 1:
                if not chain:
                    raise TypeError('PDB file (%s) contains more than one chain, therefore a chain identifier '
                                    'needs to be specified' % pdb)
                atoms = atoms.select('chain ' + chain)
                #local_temp_pdb = pdbId+chain+'.pdb'
                #local_temp_pdb = 's001'+chain+'.pdb'
                stream = createStringIO()
                writePDBStream(stream, atoms)
                data = stream.getvalue()
                stream.close()
            else:
                data = open(pdb, "rb")
                chain = chain_set.pop()
            files = {"file1": data}
            # case: multiple chains. apply fetch ? multiple times?

            pdb_chain = ''
            dali_title = 'Title_' + pdbId + chain
        else:
            pdbId, ch = _getPDBid(pdb)
            if not chain:
                chain = ch
            if not chain:
                raise TypeError('a chain identifier is needed for the search')
            pdb_chain = pdbId + chain
            dali_title = 'Title_' + pdb_chain
            files = ''

    parameters = {'cd1': pdb_chain, 'method': 'search', 'title': dali_title, 'address': ''}
    # enc_params = urllib.urlencode(parameters).encode('utf-8')
    # request = urllib2.Request(daliURL, enc_params)
    request = requests.post(daliURL, parameters, files=files)

    try_error = 3
    while try_error >= 0:
        try:
            # url = urllib2.urlopen(request).url
            url = request.url
            break
        except:
            try_error -= 1
            if try_error >= 0:
                LOGGER.sleep(2, '. Connection error happened. Trying to reconnect...')
                continue
            else:
                # url = urllib2.urlopen(request).url
                url = request.url
                break

    if url.split('.')[-1].lower() in ['html', 'php']:
        # print('test -1: '+url)
        url = url.replace(url.split('/')[-1], '')
    LOGGER.debug('Submitted Dali search for PDB "{0}{1}".'.format(pdbId, chain))
    LOGGER.info(url)
    LOGGER.clear()

    return DaliRecord(url, pdbId, chain, subset=subset, timeout=timeout, **kwargs)

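# A usage sketch following ProDy's Dali workflow: submit the search, poll until the server
# has finished, then filter the hits. The isSuccess/fetch()/filter() names come from the
# documented DaliRecord interface; verify them against your ProDy version.
dali_rec = searchDali('1p38', 'A')      # PDB id plus a single chain
while not dali_rec.isSuccess:           # the Dali search runs asynchronously; poll until done
    dali_rec.fetch()
pdb_ids = dali_rec.filter(cutoff_len=0.7, cutoff_rmsd=1.0)
print(len(pdb_ids), 'structures passed the filter')
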
def save_cluster_elements(elements, ids, out_pdb_name, data_handler, options, cluster_sizes=None):
    """
    Saves a pdb file containing the most representative elements of the clustering.

    @param elements: A list of the representative elements of the clustering we want to extract.

    @param ids: A list with the cluster ids (1 to 1 mapping with 'elements').

    @param out_pdb_name: The complete path of the produced file.

    @param data_handler: The trajectory handler for this run or an array with pdb file paths.

    @param options: postprocessing options to generate the file. Currently a dict with any of these:

        "keep_remarks" - Will add each model's remarks before the model header if present.
        Possible values are:
            - "NONE": not to store remarks (Default)
            - "STANDARD": stores remarks that follow the pdb standard
            - "NOT STANDARD": stores remarks not following the pdb standard
            - "ALL": stores all remarks

        "add_source_details" - Will add two remarks before the model tag: the path of the
        source file and the original model number.

    @param cluster_sizes: specific for the representatives case. Each element of this array
    holds the size of its cluster.
    """
    keep_remarks = options.get_value("keep_remarks", default_value="NONE")
    add_source_details = options.get_value("add_source_details", default_value=False)

    file_handler_out = open(out_pdb_name, "w")
    data = data_handler.get_data()
    merged_structure = data.get_all_elements()
    file_handler_out.write("REMARK 000 File created using Prody and pyProCT\n")

    if keep_remarks == "NONE" and not add_source_details:
        prody.writePDBStream(file_handler_out, merged_structure, csets=elements)
    else:
        all_remarks = filter_remarks(data.get_all_remarks(), subset=keep_remarks)
        all_model_numbers = data.get_all_model_numbers()

        current_model = 0
        for i, element_id in enumerate(elements):
            if keep_remarks != "NONE":
                remarks = all_remarks[element_id]
                file_handler_out.write("".join(remarks))

            if add_source_details:
                model_number = all_model_numbers[element_id]
                conf_source = data_handler.get_source_of_element(element_id).get_path()
                file_handler_out.write("REMARK 000 source : %s\n" % conf_source)
                file_handler_out.write("REMARK 000 original model nr : %d\n" % model_number)
                file_handler_out.write("REMARK 000 cluster id : %s\n" % ids[i])
                file_handler_out.write("REMARK 000 cluster element : %d\n" % element_id)
                if cluster_sizes is not None:
                    file_handler_out.write("REMARK 000 cluster population : %s\n" % (cluster_sizes[i]))

            file_handler_out.write("MODEL" + str(current_model).rjust(9) + "\n")

            pdb_handler = cStringIO.StringIO()
            prody.writePDBStream(pdb_handler, merged_structure, csets=element_id)
            # drop any REMARK, MODEL and ENDMDL records that prody wrote for this coordinate set
            lines = filter(lambda line: line[0:6] != "REMARK" and line[0:5] != "MODEL" and line[0:6] != "ENDMDL",
                           pdb_handler.getvalue().splitlines(True))
            pdb_handler.close()

            file_handler_out.write("".join(lines))
            file_handler_out.write("ENDMDL\n")
            current_model += 1

    file_handler_out.close()

def alignment_monstrosity(self, rmsd_cutoff=0.5, use_local_pdb_database=False, verify_substructure=True):
    """
    Consequences of not thinking ahead...
    For each fragment, align all fragment-containing ligands to fragment
    Generate PDBs with aligned coordinate systems
    :param args:
    :param rmsd_cutoff: fragment alignment RMSD cutoff, anything higher gets rejected
    :return:
    """
    # Create directory for processed PDBs
    rejected_dict = self.load_previously_rejected_pdbs()

    # Create directories...
    if not use_local_pdb_database:
        os.makedirs(self.pdb_bank_dir, exist_ok=True)
    os.makedirs(self.processed_PDBs_path, exist_ok=True)

    # If use_local_pdb_database=False, use PDB FTP to download all structures
    # Otherwise, all relevant structures should be found in the local PDB database
    if not use_local_pdb_database:
        prody.pathPDBFolder(folder=self.pdb_bank_dir)

        for current_fragment in self.pdb_ligand_json:

            # Only download PDBs that aren't already in PDB bank directory
            existing_PDBs = [pdb[:4].lower() for pdb in os.listdir(self.pdb_bank_dir)]
            PDBs_to_download = list(set(self.pdb_ligand_json[current_fragment]['PDBs']) - set(existing_PDBs))

            if len(PDBs_to_download) > 0:
                print(f'Downloading PDBs for {current_fragment}...\n')
                prody.fetchPDBviaFTP(*PDBs_to_download)
            else:
                print(f'All relevant PDBs for {current_fragment} found in {self.pdb_bank_dir}!\n')

    # Fragment_1, Fragment_2, ...
    for current_fragment in self.pdb_ligand_json:

        # Create directory for processed PDBs
        processed_dir = os.path.join(self.processed_PDBs_path, current_fragment)
        processed_dir_exists = os.path.exists(processed_dir)
        os.makedirs(processed_dir, exist_ok=True)

        # Get list of already processed PDBs for current_fragment
        already_processed_pdbs = [file[:4].lower() for file in os.listdir(processed_dir)]

        # Save ideal_ligand_containers for each fragment so things are only downloaded once
        ideal_ligand_dict = dict()
        ideal_ligand_dict['Ligands'] = dict()
        ideal_ligand_dict['Failed'] = list()

        # Align_PDB class holds all information for the current fragment
        align = Align_PDB(self.user_defined_dir,
                          current_fragment,
                          self.sanitized_smiles_dict[current_fragment],
                          verify_substructure=verify_substructure)

        # Get PDB IDs that are viable for extracting protein-fragment contacts
        reject_pdbs = rejected_dict[current_fragment] if current_fragment in rejected_dict.keys() else list()
        if not processed_dir_exists:
            reject_pdbs = list()
        reject_pdbs.append('3k87')  # DEBUGGING

        viable_pdbs = list(set(self.pdb_ligand_json[current_fragment]['PDBs'])
                           - set(reject_pdbs)
                           - set(already_processed_pdbs))

        # For each PDB containing a fragment-containing compound
        for pdbid in viable_pdbs:

            # Return path of PDB file to use for processing
            found_pdb, pdb_path = self.return_PDB_to_use_for_alignments(pdbid, use_local_pdb_database=use_local_pdb_database)

            if not found_pdb:
                print(f'Cannot find {pdbid}!')
                continue

            # Proceed with processing if the current PDB passes all filters
            print("\n\nProcessing {}...".format(pdbid))

            # --- Check which ligands contain relevant fragments --- #

            relevant_ligands = self.return_substructure_containing_ligands(pdb_path, self.pdb_ligand_json, current_fragment)

            # Set things up! Get ligands from Ligand Expo if haven't already tried and failed
            for ligand in relevant_ligands:
                if not ideal_ligand_dict['Ligands'].get(ligand) and ligand not in ideal_ligand_dict['Failed']:
                    ideal_ligand_container = Ideal_Ligand_PDB_Container(ligand)

                    if ideal_ligand_container.success:
                        ideal_ligand_dict['Ligands'][ligand] = ideal_ligand_container
                    else:
                        ideal_ligand_dict['Failed'].append(ligand)

            # Create a temp list for ligands that will be pulled from the current PDB
            ligand_container_dict_for_current_pdb = {lig: ideal_ligand_dict['Ligands'][lig]
                                                     for lig in ideal_ligand_dict['Ligands']
                                                     if lig in relevant_ligands}
            relevant_ligands_prody_dict = align.extract_ligand_records(pdb_path, ligand_container_dict_for_current_pdb)

            # Reject if no ligands with all atoms represented can be found for the given PDB
            if len(relevant_ligands_prody_dict) < 1:
                if current_fragment in rejected_dict.keys():
                    rejected_dict[current_fragment].append(pdbid)
                else:
                    rejected_dict[current_fragment] = [pdbid]
                print('REJECTED - no target ligands were fully represented in the PDB')
                continue

            # --- Perform alignment of PDB fragment substructure (mobile) onto defined fragment (target) --- #

            # ...if PDB has not been processed, rejected, or excluded by the user
            else:
                # Iterate over ligands found to contain fragments as substructures
                for ligand_resname, ligand_chain, ligand_resnum in relevant_ligands_prody_dict:

                    # Mapping of fragment atoms to target ligand atoms
                    target_ligand_ideal_smiles = ligand_container_dict_for_current_pdb[ligand_resname].smiles

                    # todo: catch ligands with missing SMILES strings earlier...
                    if target_ligand_ideal_smiles is None:
                        continue

                    target_ligand_pdb_string = io.StringIO()
                    target_ligand_prody = relevant_ligands_prody_dict[(ligand_resname, ligand_chain, ligand_resnum)].select('not hydrogen')
                    prody.writePDBStream(target_ligand_pdb_string, target_ligand_prody)

                    mapping_successful, fragment_target_map = align.fragment_target_mapping(target_ligand_ideal_smiles, target_ligand_pdb_string)

                    if not mapping_successful:
                        if current_fragment in rejected_dict.keys():
                            rejected_dict[current_fragment].append(pdbid)
                        else:
                            rejected_dict[current_fragment] = [pdbid]
                        print('REJECTED - failed atom mapping between target and reference fragment')
                        continue

                    print(f'\n{len(fragment_target_map)} possible mapping(s) of fragment onto {pdbid}:{ligand} found...\n')

                    # Iterate over possible mappings of fragment onto current ligand
                    rmsd_success = False
                    for count, mapping in enumerate(fragment_target_map):

                        # todo: refactor to use RDKit's atom.GetMonomerInfo() for atom selections...

                        # Determine translation vector and rotation matrix
                        target_coords_and_serials, frag_atom_coords, transformation_matrix = align.determine_rotation_and_translation(mapping, target_ligand_prody)
                        trgt_atom_coords, target_fragment_atom_serials = target_coords_and_serials

                        # Apply transformation to protein_ligand complex if rmsd is below cutoff
                        # Use information from PubChem fragment SMILES in determining correct mappings
                        # Actually, map fragment onto source ligand and use valence information to determine correct mappings
                        rmsd = prody.calcRMSD(frag_atom_coords,
                                              prody.applyTransformation(transformation_matrix, trgt_atom_coords))
                        print('RMSD of target onto reference fragment:\t{}'.format(rmsd))

                        if rmsd < rmsd_cutoff:
                            transformed_pdb = align.apply_transformation(pdb_path, ligand_resnum,
                                                                         target_fragment_atom_serials,
                                                                         transformation_matrix)

                            # Continue if transformed_pdb - ligand is None
                            if transformed_pdb.select(f'not (resname {ligand_resname})') is None:
                                continue

                            transformed_pdb_name = f'{pdbid}_{ligand_resname}_{ligand_chain}_{ligand_resnum}-{count}.pdb'
                            prody.writePDB(os.path.join(processed_dir, transformed_pdb_name), transformed_pdb)
                            rmsd_success = True

                        else:
                            print('REJECTED - high RMSD upon alignment to reference fragment')

                    if rmsd_success is False:
                        if current_fragment in rejected_dict.keys():
                            rejected_dict[current_fragment].append(pdbid)
                        else:
                            rejected_dict[current_fragment] = [pdbid]

    # Remember rejected PDBs
    with open(self.rejected_dict_pickle, 'wb') as reject_pickle:
        pickle.dump(rejected_dict, reject_pickle)