def convertSomePDBtoDSSP(self, quantity):  # converts a given number of pdb files to DSSP
    for root, dirs, files in os.walk(self.pdbdir):
        for filename in files:
            quantity -= 1
            if quantity < 0:
                break
            print(quantity, ' )', filename, ' converted to dssp')
            # here uses ProDy's execDSSP
            from prody import execDSSP
            execDSSP(self.pdbdir + filename, outputdir=self.dsspdir)
        break  # only walk the top-level pdb directory
def _launchDSSP(self, ag):
    LOGGER.info('Running DSSP...')
    LOGGER.timeit('_DSSP')
    try:
        pdb_file = writePDB('_temp.pdb', ag, secondary=False)
        dssp_file = execDSSP(pdb_file, outputname='_temp')
        ag = parseDSSP(dssp_file, ag)
    finally:
        # clean up the temporary files whether or not DSSP succeeded
        os.remove('_temp.pdb')
        os.remove('_temp.dssp')
    LOGGER.report('DSSP finished in %.1fs.', '_DSSP')
    return ag
def _launchDSSP(self, ag):
    LOGGER.info('Running DSSP...')
    LOGGER.timeit('_DSSP')
    pdb_file = writePDB('_temp.pdb', ag, secondary=False)
    try:
        dssp_file = execDSSP(pdb_file, outputname='_temp')
    except EnvironmentError:
        raise EnvironmentError("dssp executable not found: please install "
                               "with 'sudo apt install dssp'")
    ag = parseDSSP(dssp_file, ag)
    os.remove('_temp.pdb')
    os.remove('_temp.dssp')
    LOGGER.report('DSSP finished in %.1fs.', '_DSSP')
    return ag
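# A minimal standalone sketch of the same write -> execDSSP -> parseDSSP round
# trip as the two _launchDSSP variants above, but using a temporary directory
# so the intermediate files are cleaned up even if writePDB or DSSP fails.
# run_dssp is a hypothetical name, not part of ProDy.
import os
import tempfile

from prody import execDSSP, parseDSSP, writePDB

def run_dssp(ag):
    with tempfile.TemporaryDirectory() as tdir:
        pdb_file = writePDB(os.path.join(tdir, '_temp.pdb'), ag, secondary=False)
        dssp_file = execDSSP(pdb_file, outputname='_temp', outputdir=tdir)
        return parseDSSP(dssp_file, ag)  # stores dssp_* data labels on ag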
def get_features(pdb_file: str, dssp_dir: str, only_dssp=True, force_overwrite=True):
    """
    Extract features from a pdb_file

    Parameters
    ----------
    pdb_file
    dssp_dir
        directory to store tmp dssp files
    only_dssp
        extract only dssp features (use if not interested in the other features)
    force_overwrite
        force rerun DSSP

    Returns
    -------
    dict of features
    """
    pdb_file = str(pdb_file)
    name = Path(pdb_file).stem
    protein = pd.parsePDB(pdb_file).select("protein")
    pdb_file = str(Path(dssp_dir) / f"{name}.pdb")
    pd.writePDB(pdb_file, protein)
    protein = pd.parsePDB(pdb_file)
    dssp_file = Path(dssp_dir) / f"{name}.dssp"
    if force_overwrite or not dssp_file.exists():
        dssp_file = pd.execDSSP(str(pdb_file), outputname=name, outputdir=str(dssp_dir))
    protein = pd.parseDSSP(dssp=str(dssp_file), ag=protein, parseall=True)
    data = get_dssp_features(protein)
    if only_dssp:
        return data
    data = {**data, **get_fluctuations(protein)}
    data = {**data, **get_residue_depths(pdb_file)}
    return data
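# Hypothetical call of get_features above; "1abc.pdb" and "dssp_tmp" are
# placeholder paths, and the keys of the returned dict depend on
# get_dssp_features and the other helpers.
features = get_features("1abc.pdb", "dssp_tmp", only_dssp=True)
print(sorted(features.keys()))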
def __init__(self, comb, pdb_acc_code, chain, **kwargs):
    """
    :comb: arg: instance of cls Comb with attributes pdbchain_dict, ifg_selection_info
    :pdb_acc_code: type: str: 4 character pdb accession code
    :param kwargs:
        path_to_pdb
        path_to_dssp
    """
    # search for acc code in input_dir_pdb from comb object.
    assert isinstance(pdb_acc_code, str), 'PDB accession code needs to be a string'
    pdb_file = [file.name for file in os.scandir(comb.input_dir_pdb)
                if pdb_acc_code in file.name]
    try:
        if pdb_file:
            pdb_file = pdb_file[0]
            self.prody_pdb = pr.parsePDB(comb.input_dir_pdb + pdb_file,
                                         altloc='A', model=1)
        elif 'path_to_pdb' in kwargs:
            self.prody_pdb = pr.parsePDB(kwargs.get('path_to_pdb'),
                                         altloc='A', model=1)
        else:
            # NEED TO UPDATE: note if going to fetch pdb, it should be sent
            # through Reduce first...
            try:
                os.mkdir(comb.input_dir_pdb + 'raw')
                os.mkdir(comb.input_dir_pdb + 'reduce')
            except OSError:
                pass  # directories may already exist
            pr.fetchPDB(pdb_acc_code, compressed=False,
                        folder=comb.input_dir_pdb + 'raw')
            os.system(comb.path_to_reduce + comb.reduce + ' -FLIP -Quiet -DB '
                      + comb.path_to_reduce + 'reduce_wwPDB_het_dict.txt '
                      + comb.input_dir_pdb + 'raw/' + pdb_acc_code.lower()
                      + '.pdb > ' + comb.input_dir_pdb + 'reduce/'
                      + pdb_acc_code.lower() + 'H.pdb')
            self.prody_pdb = pr.parsePDB(comb.input_dir_pdb + 'reduce/'
                                         + pdb_acc_code.lower() + 'H.pdb',
                                         altloc='A', model=1)
    except NameError:
        raise NameError('ParsePDB instance needs a pdb file path or a valid '
                        'pdb accession code.')

    self.pdb_acc_code = pdb_acc_code.lower()
    self.pdb_chain = chain
    if len(self.prody_pdb) == len(self.prody_pdb.select('icode _')) \
            and self.prody_pdb.select('protein and chain ' + self.pdb_chain) is not None:
        self.contacts = pr.Contacts(self.prody_pdb)
        self.set_bonds()
        if pdb_file:
            self.fs_struct = freesasa.Structure(comb.input_dir_pdb + pdb_file)
        elif 'path_to_pdb' in kwargs:
            self.fs_struct = freesasa.Structure(kwargs.get('path_to_pdb'))
        else:
            path = comb.input_dir_pdb + 'reduce/'
            self.fs_struct = freesasa.Structure(path + next(
                file.name for file in os.scandir(path)
                if self.pdb_acc_code in file.name))
        self.fs_result = freesasa.calc(self.fs_struct)
        self.fs_result_cb_3A = self.freesasa_cb(probe_radius=3)
        self.fs_result_cb_4A = self.freesasa_cb(probe_radius=4)
        self.fs_result_cb_5A = self.freesasa_cb(probe_radius=5)
        self.prody_pdb_bb_cb_atom_ind = self.prody_pdb.select(
            'protein and (backbone or name CB) '
            'and not element H D').getIndices()
        dssp_file = [file.name for file in os.scandir(comb.input_dir_dssp)
                     if pdb_acc_code in file.name]
        if dssp_file:
            dssp_file = dssp_file[0]
            self.dssp = pr.parseDSSP(comb.input_dir_dssp + dssp_file,
                                     self.prody_pdb)
        elif 'path_to_dssp' in kwargs:
            self.dssp = pr.parseDSSP(kwargs.get('path_to_dssp'), self.prody_pdb)
        else:
            if pdb_file:
                pr.execDSSP(comb.input_dir_pdb + pdb_file,
                            outputdir=comb.input_dir_dssp)
            elif 'path_to_pdb' in kwargs:
                pr.execDSSP(kwargs.get('path_to_pdb'),
                            outputdir=comb.input_dir_dssp)
            else:
                path = comb.input_dir_pdb + 'reduce/' + next(
                    file.name for file in os.scandir(comb.input_dir_pdb + 'reduce')
                    if pdb_acc_code in file.name)
                pr.execDSSP(path, outputdir=comb.input_dir_dssp)
            self.dssp = pr.parseDSSP(
                comb.input_dir_dssp + next(
                    file.name for file in os.scandir(comb.input_dir_dssp)
                    if pdb_acc_code in file.name),
                self.prody_pdb)
        self.possible_ifgs = self.find_possible_ifgs(comb)
    else:
        self.possible_ifgs = None

    # valence and hydrogen bond data for vandermers and iFGs of ParsedPDB protein instance
    # iFG specific:
    self._ifg_pdb_info = []
    self._ifg_atom_density = []
    self._ifg_contact_water = []
    self._ifg_contact_ligand = []
    self._ifg_contact_metal = []
    # vdM specific:
    self._vdm_pdb_info = []
    self._vdm_sasa_info = []
    self._ifg_contact_vdm = []
    self._ifg_hbond_vdm = []
    self._ifg_hbond_water = []
    self._ifg_hbond_ligand = []
    self._ifg_ca_hbond_vdm = []
def convertiSinglePDBtoDSSP(self, file):  # filename with extension
    if os.path.isfile(self.pdbdir + file):
        execDSSP(self.pdbdir + file, outputdir=self.dsspdir)
    else:
        print("File " + self.pdbdir + file + " not found.\n")
def get_relative_solvent_accessibility(pdb_id, residue_mapper, chain,
                                       full_pdb_solvent_accessibility=True,
                                       aa_surface_area=AA_SA_VOL):
    """
    Run DSSP on a PDB file and return relative solvent accessibility values

    Parameters
    ----------
    pdb_id
        String containing PDB ID
    residue_mapper
        Dictionary of residue - uniprot mappings
    chain
        String containing the selected chain ID(s) from the residue mapper
    full_pdb_solvent_accessibility
        Boolean to use the full PDB for solvent accessibility calculations --
        otherwise only the chain residues will be selected. Default is True.
    aa_surface_area
        Dictionary with amino acid abbreviations as keys and surface area
        calculations as values

    Returns
    -------
    a numpy array containing relative solvent accessibility measurements for residues
    """
    if full_pdb_solvent_accessibility:
        dssp_chain = None
    else:
        dssp_chain = chain
    with tempfile.TemporaryDirectory() as tdir:
        pdb_file = os.path.join(tdir, '.'.join([pdb_id, 'pdb']))
        dssp_file = os.path.join(tdir, '.'.join([pdb_id, 'dssp']))
        # DSSP doesn't work with CIF-based atom groups, so must re-run here
        pd.pathPDBFolder(tdir)
        structure = pd.parsePDB(pdb_id, chain=dssp_chain)
        # Must write PDB file for DSSP with only chain selections
        # TODO how to silence output from the DSSP functions
        pd.writePDB(pdb_file, structure)
        pd.execDSSP(pdb_file, outputdir=tdir)
        pd.parseDSSP(dssp_file, structure)

    # Gather results
    # There should not be missing residues
    mapped_residue_list = list(residue_mapper.keys())
    mapped_residue_list = ' '.join([str(x) for x in mapped_residue_list])
    selection_string = f"resnum {mapped_residue_list}"
    if dssp_chain is not None:
        selection_string += f" AND chain {chain}"
    iter_resi_list = sorted(set(structure.select(selection_string).getResnums()))
    rel_acc_list = list()
    for resi in iter_resi_list:
        dssp_resi = structure[(chain, resi)]
        surface_accessibility = dssp_resi.getData('dssp_acc')[0]
        resn = dssp_resi.getResname()
        rel_surface_accessibility = surface_accessibility / aa_surface_area[resn]
        rel_acc_list.append(rel_surface_accessibility)
    return np.array(rel_acc_list)
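# Usage sketch for get_relative_solvent_accessibility; the PDB ID, residue
# numbers, and chain are made-up inputs, and AA_SA_VOL is assumed to map
# three-letter residue names to reference maximum surface areas.
rsa = get_relative_solvent_accessibility(
    '1abc', {101: 'P12345_87', 102: 'P12345_88'}, 'A',
    full_pdb_solvent_accessibility=True)
print(rsa)  # one value per mapped residue, roughly in [0, 1]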
def generateFeatures(modelFilename):
    tempdir = tempfile.mkdtemp()
    datadict = {}
    modelPDB = prody.parsePDB(modelFilename)
    if not modelPDB:
        raise Exception('Model file %s cannot be parsed' % modelFilename)

    # if model has no chainID, let's assign one. That makes STRIDE parser happy
    if np.all(np.unique(modelPDB.getChids()) == ' '):
        modelPDB.all.setChids('A')
    modelFilename = os.path.join(tempdir, os.path.basename(modelFilename))
    modelFilename = prody.writePDB(modelFilename, modelPDB, autoext=False)

    # run STRIDE
    prody.parseSTRIDE(prody.execSTRIDE(modelFilename, outputdir=tempdir), modelPDB)
    datadict['STRIDEarea'] = pd.Series(modelPDB.ca.getData('stride_area'))
    ss = pd.Series(modelPDB.ca.getSecstrs())
    ss[ss == ''] = '-'  # empty strings cause trouble in csv load/save
    datadict['STRIDEss'] = ss

    # run DSSP
    prody.parseDSSP(prody.execDSSP(modelFilename, outputdir=tempdir), modelPDB)
    datadict['DSSPacc'] = pd.Series(modelPDB.ca.getData('dssp_acc'))
    ss = pd.Series(modelPDB.ca.getSecstrs())
    ss[ss == ''] = '-'
    datadict['DSSPss'] = ss

    # run NetSurfP
    netsurfp.parseNetSurfP(netsurfp.execNetSurfP(modelFilename, outputdir=tempdir),
                           modelPDB)
    datadict['NetSurfP_exp'] = pd.Series(modelPDB.ca.getData('netsurfp_exposure'))
    datadict['NetSurfP_asa'] = pd.Series(modelPDB.ca.getData('netsurfp_asa'))
    datadict['NetSurfP_rsa'] = pd.Series(modelPDB.ca.getData('netsurfp_rsa'))
    datadict['NetSurfP_alpha'] = pd.Series(modelPDB.ca.getData('netsurfp_alphascore'))
    datadict['NetSurfP_beta'] = pd.Series(modelPDB.ca.getData('netsurfp_betascore'))
    datadict['NetSurfP_coil'] = pd.Series(modelPDB.ca.getData('netsurfp_coilscore'))

    # run PSIPRED
    psipred.parsePSIPRED(psipred.execPSIPRED(modelFilename, outputdir=tempdir),
                         modelPDB)
    datadict['PSIPRED_ss'] = pd.Series(modelPDB.ca.getData('psipred_ss'))
    datadict['PSIPRED_coilscore'] = pd.Series(modelPDB.ca.getData('psipred_coilscore'))
    datadict['PSIPRED_helixscore'] = pd.Series(modelPDB.ca.getData('psipred_helixscore'))
    datadict['PSIPRED_strandscore'] = pd.Series(modelPDB.ca.getData('psipred_strandscore'))

    dataframe = pd.DataFrame(datadict)

    # remove temporary directory
    shutil.rmtree(tempdir, ignore_errors=True)
    return dataframe
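# Example invocation of generateFeatures; 'model1.pdb' is a placeholder and
# the call assumes dssp, stride, netsurfp, and psipred are all on PATH.
df = generateFeatures('model1.pdb')
df.to_csv('model1_features.csv', index=False)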
def generateTrainingSet(inputdict, distance, output=None, combineOutput=True):
    # make sure all external tools are available before doing any work
    devnull = open(os.devnull, 'w')
    subprocess.check_call('dssp --version', shell=True, stdout=devnull, stderr=devnull)
    subprocess.check_call('stride -h', shell=True, stdout=devnull, stderr=devnull)
    subprocess.check_call('netsurfp -h', shell=True, stdout=devnull, stderr=devnull)
    subprocess.check_call('runpsipred', shell=True, stdout=devnull, stderr=devnull)
    devnull.close()

    finalData = pd.DataFrame() if combineOutput else []
    for target, models in inputdict.items():
        targetPDB = prody.parsePDB(target)
        assert distance, "Distance is not valid"
        tempdir = tempfile.mkdtemp()

        # we don't want to run NetSurfP and PSIPRED over and over again for all
        # model structures. we compute them for target structure and just reuse
        # on model structures
        netsurfp.parseNetSurfP(netsurfp.execNetSurfP(target, outputdir=tempdir),
                               targetPDB)
        psipred.parsePSIPRED(psipred.execPSIPRED(target, outputdir=tempdir),
                             targetPDB)

        for i, modelFilename in enumerate(models):
            datadict = {}
            modelPDB = prody.parsePDB(modelFilename)
            if not modelPDB:
                print('Model file %s cannot be parsed, skipping...' % modelFilename)
                continue

            # if model has no chainID, let's assign one. That makes STRIDE parser happy
            if np.all(np.unique(modelPDB.getChids()) == ' '):
                modelPDB.all.setChids('A')
            modelFilename = os.path.join(tempdir, os.path.basename(modelFilename))
            modelFilename = prody.writePDB(modelFilename, modelPDB, autoext=False)

            # superimpose model onto target structure
            match = prody.matchAlign(modelPDB, targetPDB, tarsel='calpha',
                                     seqid=50, overlap=20)
            mapmodel = match[1]
            maptarget = match[2]

            # and copy NetSurfP and PSIPRED data from target to model
            copyDataFromTarget(targetPDB, modelPDB)

            # run STRIDE
            prody.parseSTRIDE(prody.execSTRIDE(modelFilename, outputdir=tempdir),
                              modelPDB)
            datadict['STRIDEarea'] = \
                pd.Series(modelPDB.ca.getData('stride_area')[mapmodel.getResindices()],
                          index=maptarget.getResindices())
            ss = pd.Series(mapmodel.getSecstrs(), index=maptarget.getResindices())
            ss[ss == ''] = '-'  # empty strings cause trouble in csv load/save
            datadict['STRIDEss'] = ss

            # run DSSP
            prody.parseDSSP(prody.execDSSP(modelFilename, outputdir=tempdir),
                            modelPDB)
            datadict['DSSPacc'] = \
                pd.Series(modelPDB.ca.getData('dssp_acc')[mapmodel.getResindices()],
                          index=maptarget.getResindices())
            ss = pd.Series(mapmodel.getSecstrs(), index=maptarget.getResindices())
            ss[ss == ''] = '-'
            datadict['DSSPss'] = ss

            # save NetSurfP data
            datadict['NetSurfP_exp'] = \
                pd.Series(modelPDB.ca.getData('netsurfp_exposure')[mapmodel.getResindices()],
                          index=maptarget.getResindices())
            datadict['NetSurfP_asa'] = \
                pd.Series(modelPDB.ca.getData('netsurfp_asa')[mapmodel.getResindices()],
                          index=maptarget.getResindices())
            datadict['NetSurfP_rsa'] = \
                pd.Series(modelPDB.ca.getData('netsurfp_rsa')[mapmodel.getResindices()],
                          index=maptarget.getResindices())
            datadict['NetSurfP_alpha'] = \
                pd.Series(modelPDB.ca.getData('netsurfp_alphascore')[mapmodel.getResindices()],
                          index=maptarget.getResindices())
            datadict['NetSurfP_beta'] = \
                pd.Series(modelPDB.ca.getData('netsurfp_betascore')[mapmodel.getResindices()],
                          index=maptarget.getResindices())
            datadict['NetSurfP_coil'] = \
                pd.Series(modelPDB.ca.getData('netsurfp_coilscore')[mapmodel.getResindices()],
                          index=maptarget.getResindices())

            # save PSIPRED data
            datadict['PSIPRED_ss'] = \
                pd.Series(modelPDB.ca.getData('psipred_ss')[mapmodel.getResindices()],
                          index=maptarget.getResindices())
            datadict['PSIPRED_coilscore'] = \
                pd.Series(modelPDB.ca.getData('psipred_coilscore')[mapmodel.getResindices()],
                          index=maptarget.getResindices())
            datadict['PSIPRED_helixscore'] = \
                pd.Series(modelPDB.ca.getData('psipred_helixscore')[mapmodel.getResindices()],
                          index=maptarget.getResindices())
            datadict['PSIPRED_strandscore'] = \
                pd.Series(modelPDB.ca.getData('psipred_strandscore')[mapmodel.getResindices()],
                          index=maptarget.getResindices())

            # compute class labels based on the distance argument
            datadict['ClassLabel'] = \
                pd.Series((np.abs(prody.calcDistance(maptarget.copy(),
                                                     mapmodel.copy())) < distance).astype(int),
                          index=maptarget.getResindices())

            if combineOutput:
                finalData = pd.concat([finalData, pd.DataFrame(datadict)])
            else:
                finalData.append(pd.DataFrame(datadict))

        # remove temporary directory
        shutil.rmtree(tempdir, ignore_errors=True)

    if output:
        if combineOutput:
            finalData.to_csv(output, index=False, quoting=csv.QUOTE_NONNUMERIC)
            # dataframe.to_csv(output)
        else:
            print('Warning! Output must be combined to be saved into a CSV file')
    return finalData
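# Sketch of driving generateTrainingSet; the paths and the 3.5 A cutoff are
# placeholder choices. Residues whose CA moves less than 'distance' between
# superimposed model and target are labeled 1 in the ClassLabel column.
inputdict = {'target.pdb': ['model1.pdb', 'model2.pdb']}
training = generateTrainingSet(inputdict, distance=3.5, output='train.csv',
                               combineOutput=True)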
def make_dssp(pdbfile, outdir="results"):
    """Make dssp for a pdb file.

    May need the DSSP binary name changed to 'mkdssp' in ProDy's dssp.py."""
    return prody.execDSSP(pdbfile, outputdir=outdir)
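# make_dssp only shells out through ProDy; parse the result back onto the
# structure to use it. 'protein.pdb' is a placeholder path.
import prody

ag = prody.parsePDB('protein.pdb')
dssp_out = make_dssp('protein.pdb')
prody.parseDSSP(dssp_out, ag)
print(ag.ca.getSecstrs()[:10])  # DSSP secondary structure codes per residue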