def get_pdb_from_remote_or_db(pdb_id, selection, source_folder=""):
    """
    If called without "source_folder", this function behaves exactly like 'get_pdb'.
    If "source_folder" is set to a user-defined folder, ProDy will first look for the
    PDB there before trying to download it from the RCSB PDB (which is slower).
    """
    prody.pathPDBFolder(source_folder)
    return get_pdb(pdb_id, selection)
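A minimal usage sketch for the helper above; the PDB ID "1ubq", the selection string, and the cache path are illustrative, and it assumes 'get_pdb' (not shown in this snippet) returns a ProDy selection for the given PDB ID.

# illustrative only: cache downloads under /tmp/pdb_cache and select CA atoms
ca_atoms = get_pdb_from_remote_or_db("1ubq", "name CA", source_folder="/tmp/pdb_cache")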
def get_pdb_chain(pdbid, chain, as_object=False, folder="results", local_path=None):
    """Parse a PDB ID and chain name, then write out the selected chain.

    Non-standard MSE residues are renamed to MET. To parse PDBs from a local
    folder, set the path in *local_path*.

    pdbid      : a PDB identifier or a filename
    chain      : the PDB chain identifier
    as_object  : if True, return the ProDy Chain object instead of writing a file
    folder     : **path** folder in which results are stored
    local_path : **path** to a local folder of PDB files
    """
    folder_result = make_folder(folder_result=folder)
    if local_path is not None:
        if os.path.isdir(local_path):
            pdbs = prody.findPDBFiles(path=local_path)
            parse = prody.parsePDB(pdbs[pdbid])
        else:
            raise IOError("{0} is not a valid path".format(local_path))
    else:
        prody.pathPDBFolder(folder=folder_result)
        parse = prody.parsePDB(pdbid)
    protein = parse.select("protein")
    p_chain = protein.select("chain %s" % chain)
    if p_chain is None:
        return
    hv = p_chain.getHierView()
    hvc = hv[chain]
    for i, r in enumerate(hvc):
        if str(r)[:3] == "MSE":
            r.setResname("MET")
    if as_object:
        return hvc
    return prody.writePDB(folder_result + "/%s_%s" % (pdbid, chain), hvc)
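Hypothetical calls to get_pdb_chain, assuming network access (or a local folder of PDB files) and that make_folder creates the "results" directory; the PDB ID "2ci2" and chain "A" are placeholders.

# return the ProDy Chain object directly
chain_a = get_pdb_chain("2ci2", "A", as_object=True)
# or write the MET-corrected chain into the results folder
get_pdb_chain("2ci2", "A", folder="results")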
def compare():
    # Get PDB files from the data bank that are associated with each protein for later use.
    # Point ProDy at a folder that will hold all downloaded PDB files.
    prody.pathPDBFolder(wd + '/challengedata/PDBfiles')

    # Collect the names of the challenge directories whose entries need to be downloaded
    weeks = []
    for _, dirnames, _ in os.walk(wd + '/challengedata'):
        for dirname in dirnames:
            if dirname in ('latest.txt', 'answers', 'rdkit-scripts'):
                continue
            if dirname not in weeks:
                weeks.append(dirname)
    proteins = [x for x in weeks if 'celpp' not in x]

    # Download each PDB entry using ProDy
    for x in proteins:
        if x in ('rdkit-scripts', 'PDBfiles', 'answers'):
            continue
        protein = prody.fetchPDB(x)
import protocols

DEBUG_MODE = False

LOGGER._setprefix('')
LOGGER.info(f'Started on {datetime.datetime.now()}')
LOGGER.info('')

# set PDB folder
old_verbosity = LOGGER.verbosity
LOGGER._setverbosity('none')
home_dir = os.environ['HOME']
pdb_dir = os.path.join(home_dir, 'PDBs')
if not os.path.isdir(pdb_dir):
    os.mkdir(pdb_dir)
pd.pathPDBFolder(pdb_dir)
LOGGER._setverbosity(old_verbosity)

# check Rhapsody installation
rd.initialSetup()

if DEBUG_MODE:
    time.sleep(5)
else:
    # run appropriate protocol
    if os.path.isfile('input-sm_query.txt'):
        # perform saturation mutagenesis
        rh = protocols.sat_mutagen()
    elif os.path.isfile('input-batch_query.txt'):
        # analyse batch query
        rh = protocols.batch_query()
""" pattern = domain + r"\s+(?P<pdbid>\S{4})\s+(?P<desc>\S+)" m = re.search(pattern, ASTRAL_FILE_DATA) print(domain, m.group('pdbid'), m.group('desc')) return m.group('pdbid'), m.group('desc') def main(): d = torch.load(args.input_pn_dict) for pnid, data in d.items(): try: pdb_id, model_id, chain_id = pnid.split("_") except ValueError: print(pnid) continue print(pdb_id, model_id, chain_id) if __name__ == "__main__": parser = argparse.ArgumentParser(description="Parses the ProteinNet dictionary for PDB IDs so they may be " "downloaded and processed for the all-atom ProteinTransformer.") parser.add_argument('input_pn_dict', type=str, help='Path to PN-parsed dictionary file') parser.add_argument("--pdb_dir", default="/home/jok120/pdb/", type=str, help="Path for ProDy-downloaded PDB files.") args = parser.parse_args() with open(ASTRAL_FILE, "r") as f: ASTRAL_FILE_DATA = f.read() pr.pathPDBFolder(args.pdb_dir) main()
def get_relative_solvent_accessibility(pdb_id, residue_mapper, chain,
                                       full_pdb_solvent_accessibility=True,
                                       aa_surface_area=AA_SA_VOL):
    """
    Run DSSP on a PDB file and return the relative solvent accessibility of the mapped residues.

    Parameters
    ----------
    pdb_id
        String containing the PDB ID
    residue_mapper
        Dictionary of residue - UniProt mappings
    chain
        String containing the selected chain ID(s) from the residue mapper
    full_pdb_solvent_accessibility
        Boolean to use the full PDB for solvent accessibility calculations --
        otherwise only the chain residues will be selected. Default is True.
    aa_surface_area
        Dictionary with amino acid abbreviations as keys and surface area
        values as values

    Returns
    -------
    A numpy array containing the relative solvent accessibility of each mapped residue
    """
    if full_pdb_solvent_accessibility:
        dssp_chain = None
    else:
        dssp_chain = chain

    with tempfile.TemporaryDirectory() as tdir:
        pdb_file = os.path.join(tdir, '.'.join([pdb_id, 'pdb']))
        dssp_file = os.path.join(tdir, '.'.join([pdb_id, 'dssp']))

        # DSSP doesn't work with CIF-based atom groups, so must re-run here
        pd.pathPDBFolder(tdir)
        structure = pd.parsePDB(pdb_id, chain=dssp_chain)

        # Must write PDB file for DSSP with only chain selections
        # TODO how to silence output from the DSSP functions
        pd.writePDB(pdb_file, structure)
        pd.execDSSP(pdb_file, outputdir=tdir)
        pd.parseDSSP(dssp_file, structure)

    # Gather results
    # There should not be missing residues
    mapped_residue_list = list(residue_mapper.keys())
    mapped_residue_list = ' '.join([str(x) for x in mapped_residue_list])
    selection_string = f"resnum {mapped_residue_list}"
    if dssp_chain is not None:
        selection_string += f" AND chain {chain}"
    iter_resi_list = sorted(set(structure.select(selection_string).getResnums()))

    rel_acc_list = list()
    for resi in iter_resi_list:
        dssp_resi = structure[(chain, resi)]
        surface_accessibility = dssp_resi.getData('dssp_acc')[0]
        resn = dssp_resi.getResname()
        rel_surface_accessibility = surface_accessibility / aa_surface_area[resn]
        rel_acc_list.append(rel_surface_accessibility)

    return np.array(rel_acc_list)
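An illustrative call of the function above; the PDB ID, chain, and residue keys are placeholders, AA_SA_VOL must be defined elsewhere, and a DSSP executable must be installed for prody.execDSSP to succeed.

# hypothetical residue -> UniProt mapping for two residues of chain A
residue_mapper = {10: "P69905:10", 11: "P69905:11"}
rsa = get_relative_solvent_accessibility("1hho", residue_mapper, "A",
                                         full_pdb_solvent_accessibility=False)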
import sys
import os
import numpy as np
import prody as pd

# check if rhapsody can be imported correctly
sys.path.append('../../')
import rhapsody as rd

# set folders
if not os.path.isdir('workspace'):
    os.mkdir('workspace')
old_rhaps_dir = rd.pathRhapsodyFolder()
old_EVmut_dir = rd.pathEVmutationFolder()
old_prody_dir = pd.pathPDBFolder()
rd.pathRhapsodyFolder('./workspace')
rd.pathEVmutationFolder('./data')
pd.pathPDBFolder('./data')

# test cases
test_SAVs = [
    'O00294 496 A T',  # "good" SAV where all features are well-defined
    'O00238 31 R H'    # "bad" SAV with no PDB structure (but has Pfam domain)
]

# initialize a rhapsody object
rh = rd.Rhapsody()

# import precomputed PolyPhen-2 output file
rh.importPolyPhen2output('data/pph2-full.txt')
help="Path for ProDy-downloaded PDB files.") parser.add_argument('--training_set', type=int, default=100, help='Which thinning of the training set to parse. ' '{30,50,70,90,95,100}. Default 100.') args = parser.parse_args() VALID_SPLITS = [10, 20, 30, 40, 50, 70, 90] TRAIN_FILE = f"training_{args.training_set}.pt" PN_TRAIN_DICT, PN_VALID_DICT, PN_TEST_DICT = None, None, None ASTRAL_FILE = "../data/proteinnet/astral_pdb_map.txt" #"data/fullDict.txt" # combined previous versions of dir.des.scope.2.xx-stable.txt into one big dict ASTRAL_ID_MAPPING = parse_astral_summary_file(ASTRAL_FILE) SUFFIX = str( datetime.datetime.today().strftime("%y%m%d")) + f"_{args.training_set}" match = re.search(r"casp\d+", args.input_dir, re.IGNORECASE) assert match, "The input_dir is not titled with 'caspX'." CASP_VERSION = match.group(0) pr.pathPDBFolder(args.pdb_dir) # Set PDB download location np.set_printoptions( suppress=True) # suppresses scientific notation when printing np.set_printoptions( threshold=sys.maxsize) # suppresses '...' when printing try: main() except Exception as e: ERRORS.summarize() raise e
def alignment_monstrosity(self, rmsd_cutoff=0.5, use_local_pdb_database=False, verify_substructure=True):
    """
    Consequences of not thinking ahead...
    For each fragment, align all fragment-containing ligands to the fragment
    Generate PDBs with aligned coordinate systems

    :param rmsd_cutoff: fragment alignment RMSD cutoff, anything higher gets rejected
    :param use_local_pdb_database: look for structures in a local PDB database instead of downloading them
    :param verify_substructure: passed to Align_PDB to verify fragment substructures
    :return:
    """
    # Load previously rejected PDBs
    rejected_dict = self.load_previously_rejected_pdbs()

    # Create directories...
    if not use_local_pdb_database:
        os.makedirs(self.pdb_bank_dir, exist_ok=True)
        os.makedirs(self.processed_PDBs_path, exist_ok=True)

    # If use_local_pdb_database=False, use PDB FTP to download all structures
    # Otherwise, all relevant structures should be found in the local PDB database
    if not use_local_pdb_database:
        prody.pathPDBFolder(folder=self.pdb_bank_dir)

        for current_fragment in self.pdb_ligand_json:

            # Only download PDBs that aren't already in PDB bank directory
            existing_PDBs = [pdb[:4].lower() for pdb in os.listdir(self.pdb_bank_dir)]
            PDBs_to_download = list(set(self.pdb_ligand_json[current_fragment]['PDBs']) - set(existing_PDBs))

            if len(PDBs_to_download) > 0:
                print(f'Downloading PDBs for {current_fragment}...\n')
                prody.fetchPDBviaFTP(*PDBs_to_download)
            else:
                print(f'All relevant PDBs for {current_fragment} found in {self.pdb_bank_dir}!\n')

    # Fragment_1, Fragment_2, ...
    for current_fragment in self.pdb_ligand_json:

        # Create directory for processed PDBs
        processed_dir = os.path.join(self.processed_PDBs_path, current_fragment)
        processed_dir_exists = os.path.exists(processed_dir)
        os.makedirs(processed_dir, exist_ok=True)

        # Get list of already processed PDBs for current_fragment
        already_processed_pdbs = [file[:4].lower() for file in os.listdir(processed_dir)]

        # Save ideal_ligand_containers for each fragment so things are only downloaded once
        ideal_ligand_dict = dict()
        ideal_ligand_dict['Ligands'] = dict()
        ideal_ligand_dict['Failed'] = list()

        # Align_PDB class holds all information for the current fragment
        align = Align_PDB(self.user_defined_dir,
                          current_fragment,
                          self.sanitized_smiles_dict[current_fragment],
                          verify_substructure=verify_substructure)

        # Get PDB IDs that are viable for extracting protein-fragment contacts
        reject_pdbs = rejected_dict[current_fragment] if current_fragment in rejected_dict.keys() else list()
        if not processed_dir_exists:
            reject_pdbs = list()
        reject_pdbs.append('3k87')  # DEBUGGING

        viable_pdbs = list(set(self.pdb_ligand_json[current_fragment]['PDBs'])
                           - set(reject_pdbs)
                           - set(already_processed_pdbs))

        # For each PDB containing a fragment-containing compound
        for pdbid in viable_pdbs:

            # Return path of PDB file to use for processing
            found_pdb, pdb_path = self.return_PDB_to_use_for_alignments(
                pdbid, use_local_pdb_database=use_local_pdb_database)

            if not found_pdb:
                print(f'Cannot find {pdbid}!')
                continue

            # Proceed with processing if the current PDB passes all filters
            print("\n\nProcessing {}...".format(pdbid))

            # --- Check which ligands contain relevant fragments --- #

            relevant_ligands = self.return_substructure_containing_ligands(
                pdb_path, self.pdb_ligand_json, current_fragment)

            # Set things up! Get ligands from Ligand Expo if we haven't already tried and failed
            for ligand in relevant_ligands:
                if not ideal_ligand_dict['Ligands'].get(ligand) and ligand not in ideal_ligand_dict['Failed']:
                    ideal_ligand_container = Ideal_Ligand_PDB_Container(ligand)

                    if ideal_ligand_container.success:
                        ideal_ligand_dict['Ligands'][ligand] = ideal_ligand_container
                    else:
                        ideal_ligand_dict['Failed'].append(ligand)

            # Create a temp dict for ligands that will be pulled from the current PDB
            ligand_container_dict_for_current_pdb = {
                lig: ideal_ligand_dict['Ligands'][lig]
                for lig in ideal_ligand_dict['Ligands'] if lig in relevant_ligands}
            relevant_ligands_prody_dict = align.extract_ligand_records(
                pdb_path, ligand_container_dict_for_current_pdb)

            # Reject if no ligands with all atoms represented can be found for the given PDB
            if len(relevant_ligands_prody_dict) < 1:
                if current_fragment in rejected_dict.keys():
                    rejected_dict[current_fragment].append(pdbid)
                else:
                    rejected_dict[current_fragment] = [pdbid]
                print('REJECTED - no target ligands were fully represented in the PDB')
                continue

            # --- Perform alignment of PDB fragment substructure (mobile) onto defined fragment (target) --- #

            # ...if PDB has not been processed, rejected, or excluded by the user
            else:
                # Iterate over ligands found to contain fragments as substructures
                for ligand_resname, ligand_chain, ligand_resnum in relevant_ligands_prody_dict:

                    # Mapping of fragment atoms to target ligand atoms
                    target_ligand_ideal_smiles = ligand_container_dict_for_current_pdb[ligand_resname].smiles

                    # todo: catch ligands with missing SMILES strings earlier...
                    if target_ligand_ideal_smiles is None:
                        continue

                    target_ligand_pdb_string = io.StringIO()
                    target_ligand_prody = relevant_ligands_prody_dict[
                        (ligand_resname, ligand_chain, ligand_resnum)].select('not hydrogen')
                    prody.writePDBStream(target_ligand_pdb_string, target_ligand_prody)

                    mapping_successful, fragment_target_map = align.fragment_target_mapping(
                        target_ligand_ideal_smiles, target_ligand_pdb_string)

                    if not mapping_successful:
                        if current_fragment in rejected_dict.keys():
                            rejected_dict[current_fragment].append(pdbid)
                        else:
                            rejected_dict[current_fragment] = [pdbid]
                        print('REJECTED - failed atom mapping between target and reference fragment')
                        continue

                    print(f'\n{len(fragment_target_map)} possible mapping(s) of fragment onto {pdbid}:{ligand} found...\n')

                    # Iterate over possible mappings of fragment onto current ligand
                    rmsd_success = False
                    for count, mapping in enumerate(fragment_target_map):

                        # todo: refactor to use RDKit's atom.GetMonomerInfo() for atom selections...

                        # Determine translation vector and rotation matrix
                        target_coords_and_serials, frag_atom_coords, transformation_matrix = \
                            align.determine_rotation_and_translation(mapping, target_ligand_prody)
                        trgt_atom_coords, target_fragment_atom_serials = target_coords_and_serials

                        # Apply transformation to protein-ligand complex if rmsd is below cutoff
                        # Use information from PubChem fragment SMILES in determining correct mappings
                        # Actually, map fragment onto source ligand and use valence information to determine correct mappings
                        rmsd = prody.calcRMSD(frag_atom_coords,
                                              prody.applyTransformation(transformation_matrix, trgt_atom_coords))
                        print('RMSD of target onto reference fragment:\t{}'.format(rmsd))

                        if rmsd < rmsd_cutoff:
                            transformed_pdb = align.apply_transformation(
                                pdb_path, ligand_resnum, target_fragment_atom_serials, transformation_matrix)

                            # Continue if transformed_pdb - ligand is None
                            if transformed_pdb.select(f'not (resname {ligand_resname})') is None:
                                continue

                            transformed_pdb_name = f'{pdbid}_{ligand_resname}_{ligand_chain}_{ligand_resnum}-{count}.pdb'
                            prody.writePDB(os.path.join(processed_dir, transformed_pdb_name), transformed_pdb)
                            rmsd_success = True

                        else:
                            print('REJECTED - high RMSD upon alignment to reference fragment')

                    if rmsd_success is False:
                        if current_fragment in rejected_dict.keys():
                            rejected_dict[current_fragment].append(pdbid)
                        else:
                            rejected_dict[current_fragment] = [pdbid]

    # Remember rejected PDBs
    with open(self.rejected_dict_pickle, 'wb') as reject_pickle:
        pickle.dump(rejected_dict, reject_pickle)
__date__ = "December 2019" __maintainer__ = "Luca Ponzoni" __email__ = "*****@*****.**" __status__ = "Production" # temporarily switch to new set of folders if not os.path.isdir('workspace'): os.mkdir('workspace') if not os.path.isdir('workspace/pickles'): os.mkdir('workspace/pickles') old_rhaps_dir = pd.SETTINGS.get('rhapsody_local_folder') old_EVmut_dir = pd.SETTINGS.get('EVmutation_local_folder') old_prody_dir = pd.SETTINGS.get('pdb_local_folder') pd.SETTINGS['rhapsody_local_folder'] = os.path.abspath('./workspace') pd.SETTINGS['EVmutation_local_folder'] = os.path.abspath('./data') pd.pathPDBFolder('./data') # test cases test_SAVs = [ 'O00294 496 A T', # "good" SAV where all features are well-defined 'O00238 31 R H' # "bad" SAV with no PDB structure (but has Pfam domain) ] # initialize a rhapsody object rh = rd.Rhapsody() # import precomputed PolyPhen-2 output file rh.importPolyPhen2output('data/pph2-full.txt') # we would like to compute all features rh.setFeatSet('all')