Example #1
 def _launchDSSP(self, ag):
     LOGGER.info('Running DSSP...')
     LOGGER.timeit('_DSSP')
     try:
         pdb_file = writePDB('_temp.pdb', ag, secondary=False)
         dssp_file = execDSSP(pdb_file, outputname='_temp')
         ag = parseDSSP(dssp_file, ag)
     finally:
         # Clean up temporary files even on failure; the .dssp file may be
         # missing if execDSSP itself raised, so guard each removal.
         for fname in ('_temp.pdb', '_temp.dssp'):
             if os.path.exists(fname):
                 os.remove(fname)
     LOGGER.report('DSSP finished in %.1fs.', '_DSSP')
     return ag
Example #2
File: PDB.py  Project: shulp2211/rhapsody
 def _launchDSSP(self, ag):
     LOGGER.info('Running DSSP...')
     LOGGER.timeit('_DSSP')
     pdb_file = writePDB('_temp.pdb', ag, secondary=False)
     try:
         dssp_file = execDSSP(pdb_file, outputname='_temp')
     except EnvironmentError:
         raise EnvironmentError("dssp executable not found: please install "
                                "with 'sudo apt install dssp'")
     ag = parseDSSP(dssp_file, ag)
     os.remove('_temp.pdb')
     os.remove('_temp.dssp')
     LOGGER.report('DSSP finished in %.1fs.', '_DSSP')
     return ag
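For reference, a minimal standalone sketch of the same writePDB → execDSSP → parseDSSP round trip, assuming the dssp executable is on the PATH; the '5kqm' accession and file names are illustrative, and a temporary directory makes cleanup automatic:

import os
import tempfile
import prody

with tempfile.TemporaryDirectory() as tmpdir:
    ag = prody.parsePDB('5kqm')  # fetches the entry if not already cached
    pdb_file = prody.writePDB(os.path.join(tmpdir, 'model.pdb'), ag)
    dssp_file = prody.execDSSP(pdb_file, outputdir=tmpdir)
    ag = prody.parseDSSP(dssp_file, ag)
    print(ag.getData('dssp_acc')[:5])  # per-atom accessibility values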
Example #3
def calc_sasa(pdb, sel='protein', per_res=False,
              path_dssp=r'C:\Python27\Scripts\dssp-2.0.4-win32.exe'):
    import os
    import tempfile
    import prody
    import numpy as np

    dssp_in = 'dssp.in'

    prevdir = os.getcwd()
    with tempfile.TemporaryDirectory() as tmpdir:
        try:
            os.chdir(os.path.expanduser(tmpdir))

            prody.writePDB(dssp_in + '.pdb', pdb.select(sel))
            pdb_dssp = prody.parsePDB(dssp_in + '.pdb')
            prody.execDSSP(dssp_in + '.pdb', dssp=path_dssp)
            prody.parseDSSP(dssp_in + '.dssp', pdb_dssp)

            if per_res:
                return pdb_dssp._data['dssp_acc']
            return np.sum(pdb_dssp._data['dssp_acc'])
        finally:
            # Restore the working directory before leaving; in the original
            # this chdir sat after the early returns and never ran.
            os.chdir(prevdir)
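Hypothetical usage of calc_sasa (the '1ubi' accession is only an example, and path_dssp must point at a working DSSP binary on your system):

import prody

ubq = prody.parsePDB('1ubi')  # fetches the entry if not already cached
total_sasa = calc_sasa(ubq, sel='protein', per_res=False, path_dssp='dssp')
per_res_acc = calc_sasa(ubq, sel='protein', per_res=True, path_dssp='dssp')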
Example #4
def get_features(pdb_file: str,
                 dssp_dir: str,
                 only_dssp=True,
                 force_overwrite=True):
    """
    Extract features from a pdb_file

    Parameters
    ----------
    pdb_file
    dssp_dir
        directory to store tmp dssp files
    only_dssp
        extract only DSSP features (use if not interested in the other features)
    force_overwrite
        force rerun DSSP

    Returns
    -------
    dict of features
    """
    pdb_file = str(pdb_file)
    name = Path(pdb_file).stem
    protein = pd.parsePDB(pdb_file).select("protein")
    pdb_file = str(Path(dssp_dir) / f"{name}.pdb")
    pd.writePDB(pdb_file, protein)
    protein = pd.parsePDB(pdb_file)
    dssp_file = Path(dssp_dir) / f"{name}.dssp"
    if force_overwrite or not dssp_file.exists():
        dssp_file = pd.execDSSP(str(pdb_file),
                                outputname=name,
                                outputdir=str(dssp_dir))
    protein = pd.parseDSSP(dssp=str(dssp_file), ag=protein, parseall=True)
    data = get_dssp_features(protein)
    if only_dssp:
        return data
    else:
        data = {**data, **get_fluctuations(protein)}
        data = {**data, **get_residue_depths(pdb_file)}
        return data
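Hypothetical usage, with 'model.pdb' and '/tmp/dssp' as placeholder paths (the directory only holds the intermediate .pdb and .dssp files):

features = get_features('model.pdb', dssp_dir='/tmp/dssp', only_dssp=True)
print(sorted(features.keys()))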
Example #5
    def __init__(self, comb, pdb_acc_code, chain, **kwargs):
        """
        :param comb: instance of class Comb, with attributes pdbchain_dict and ifg_selection_info
        :param pdb_acc_code: str, 4-character PDB accession code
        :param chain: str, chain identifier
        :param kwargs:
            path_to_pdb
            path_to_dssp
        """
        #search for acc code in input_dir_pdb from comb object.
        assert isinstance(pdb_acc_code,
                          str), 'PDB accession code needs to be a string'
        pdb_file = [
            file.name for file in os.scandir(comb.input_dir_pdb)
            if pdb_acc_code in file.name
        ]
        try:
            if pdb_file:
                pdb_file = pdb_file[0]
                self.prody_pdb = pr.parsePDB(comb.input_dir_pdb + pdb_file,
                                             altloc='A',
                                             model=1)
            elif 'path_to_pdb' in kwargs:
                self.prody_pdb = pr.parsePDB(kwargs.get('path_to_pdb'),
                                             altloc='A',
                                             model=1)
            else:  # NEED TO UPDATE: note if going to fetch pdb, it should be sent through Reduce first...
                os.makedirs(comb.input_dir_pdb + 'raw', exist_ok=True)
                os.makedirs(comb.input_dir_pdb + 'reduce', exist_ok=True)
                pr.fetchPDB(pdb_acc_code,
                            compressed=False,
                            folder=comb.input_dir_pdb + 'raw')
                os.system(comb.path_to_reduce + comb.reduce +
                          ' -FLIP -Quiet -DB ' + comb.path_to_reduce +
                          'reduce_wwPDB_het_dict.txt ' + comb.input_dir_pdb +
                          'raw/' + pdb_acc_code.lower() + '.pdb > ' +
                          comb.input_dir_pdb + 'reduce/' +
                          pdb_acc_code.lower() + 'H.pdb')
                self.prody_pdb = pr.parsePDB(comb.input_dir_pdb + 'reduce/' +
                                             pdb_acc_code.lower() + 'H.pdb',
                                             altloc='A',
                                             model=1)
        except NameError:
            raise NameError(
                'ParsePDB instance needs a pdb file path or a valid pdb accession code.'
            )

        self.pdb_acc_code = pdb_acc_code.lower()
        self.pdb_chain = chain
        if len(self.prody_pdb) == len(self.prody_pdb.select('icode _')) \
                and self.prody_pdb.select('protein and chain ' + self.pdb_chain) is not None:
            self.contacts = pr.Contacts(self.prody_pdb)
            self.set_bonds()

            if pdb_file:
                self.fs_struct = freesasa.Structure(comb.input_dir_pdb +
                                                    pdb_file)
            elif 'path_to_pdb' in kwargs:
                self.fs_struct = freesasa.Structure(kwargs.get('path_to_pdb'))
            else:
                path = comb.input_dir_pdb + 'reduce/'
                self.fs_struct = freesasa.Structure(path + next(
                    file.name for file in os.scandir(path)
                    if self.pdb_acc_code in file.name))

            self.fs_result = freesasa.calc(self.fs_struct)

            self.fs_result_cb_3A = self.freesasa_cb(probe_radius=3)
            self.fs_result_cb_4A = self.freesasa_cb(probe_radius=4)
            self.fs_result_cb_5A = self.freesasa_cb(probe_radius=5)
            self.prody_pdb_bb_cb_atom_ind = self.prody_pdb.select(
                'protein and (backbone or name CB) '
                'and not element H D').getIndices()

            dssp_file = [
                file.name for file in os.scandir(comb.input_dir_dssp)
                if pdb_acc_code in file.name
            ]
            if dssp_file:
                dssp_file = dssp_file[0]
                self.dssp = pr.parseDSSP(comb.input_dir_dssp + dssp_file,
                                         self.prody_pdb)
            elif 'path_to_dssp' in kwargs:
                self.dssp = pr.parseDSSP(kwargs.get('path_to_dssp'),
                                         self.prody_pdb)
            else:
                if pdb_file:
                    pr.execDSSP(comb.input_dir_pdb + pdb_file,
                                outputdir=comb.input_dir_dssp)
                elif 'path_to_pdb' in kwargs:
                    pr.execDSSP(kwargs.get('path_to_pdb'),
                                outputdir=comb.input_dir_dssp)
                else:
                    path = comb.input_dir_pdb + 'reduce/' + next(
                        file.name
                        for file in os.scandir(comb.input_dir_pdb + 'reduce')
                        if pdb_acc_code in file.name)
                    pr.execDSSP(path, outputdir=comb.input_dir_dssp)

                self.dssp = pr.parseDSSP(
                    comb.input_dir_dssp +
                    next(file.name for file in os.scandir(comb.input_dir_dssp)
                         if pdb_acc_code in file.name), self.prody_pdb)
            self.possible_ifgs = self.find_possible_ifgs(comb)
        else:
            self.possible_ifgs = None
        # valence and hydrogen bond data for vandermers and iFGs of ParsedPDB protein instance
        # iFG specific:
        self._ifg_pdb_info = []
        self._ifg_atom_density = []
        self._ifg_contact_water = []
        self._ifg_contact_ligand = []
        self._ifg_contact_metal = []
        # vdM specific:
        self._vdm_pdb_info = []
        self._vdm_sasa_info = []
        self._ifg_contact_vdm = []
        self._ifg_hbond_vdm = []
        self._ifg_hbond_water = []
        self._ifg_hbond_ligand = []
        self._ifg_ca_hbond_vdm = []
Example #6
def get_relative_solvent_accessibility(pdb_id,
                                       residue_mapper,
                                       chain,
                                       full_pdb_solvent_accessibility=True,
                                       aa_surface_area=AA_SA_VOL):
    """
    Run DSSP on a PDB entry and compute relative solvent accessibility per residue
    
    Parameters
    ----------
    pdb_id
        String containing PDB ID
    residue_mapper
        Dictionary of residue - UniProt mappings
    chain
        String containing the selected chain ID(s) from the residue mapper
    full_pdb_solvent_accessibility
        Boolean to use the full PDB for solvent accessibility calculations -- otherwise
        only the chain residues will be selected. Default is True.
    aa_surface_area
        Dictionary with amino acid abbreviations as keys and surface area 
        calculations as values

    Returns
    -------
    a numpy array containing relative solvent accessibility measurements for residues
    """

    if full_pdb_solvent_accessibility:
        dssp_chain = None
    else:
        dssp_chain = chain

    with tempfile.TemporaryDirectory() as tdir:
        pdb_file = os.path.join(tdir, '.'.join([pdb_id, 'pdb']))
        dssp_file = os.path.join(tdir, '.'.join([pdb_id, 'dssp']))

        # DSSP doesn't work with CIF-based atom groups, so must re-run here
        pd.pathPDBFolder(tdir)
        structure = pd.parsePDB(pdb_id, chain=dssp_chain)

        # Must write PDB file for DSSP with only chain selections
        # TODO how to silence output from the DSSP functions
        pd.writePDB(pdb_file, structure)
        pd.execDSSP(pdb_file, outputdir=tdir)
        pd.parseDSSP(dssp_file, structure)

    # Gather results
    # There should not be missing residues
    mapped_residue_list = list(residue_mapper.keys())
    mapped_residue_list = ' '.join([str(x) for x in mapped_residue_list])

    selection_string = f"resnum {mapped_residue_list}"
    if dssp_chain is not None:
        selection_string += f" AND chain {chain}"

    iter_resi_list = sorted(
        set(structure.select(selection_string).getResnums()))
    rel_acc_list = list()

    for resi in iter_resi_list:
        dssp_resi = structure[(chain, resi)]
        surface_accessibility = dssp_resi.getData('dssp_acc')[0]
        resn = dssp_resi.getResname()
        rel_surface_accessibility = surface_accessibility / aa_surface_area[resn]
        rel_acc_list.append(rel_surface_accessibility)

    return np.array(rel_acc_list)
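The key step above is the normalization: DSSP's absolute accessibility ('dssp_acc', in Å²) is divided by the maximum surface area of that residue type. A minimal sketch, with illustrative maxima rather than the module's actual AA_SA_VOL values:

# Illustrative maxima only; the real module's AA_SA_VOL defines these.
aa_surface_area = {'ALA': 129.0, 'GLY': 104.0}  # Å²
surface_accessibility = 40.0  # DSSP 'dssp_acc' for one alanine residue
relative = surface_accessibility / aa_surface_area['ALA']  # ~0.31, partially exposed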
Example #7
def generateFeatures(modelFilename):
    tempdir = tempfile.mkdtemp()

    datadict = {}
    modelPDB = prody.parsePDB(modelFilename)

    if not modelPDB:
        raise Exception('Model file %s cannot be parsed' % modelFilename)

    #if model has no chainID, let's assign one. That makes STRIDE parser
    #happy
    if set(modelPDB.getChids()) == {' '}:
        modelPDB.all.setChids('A')
        modelFilename = os.path.join(tempdir,
                                     os.path.basename(modelFilename))

        modelFilename = prody.writePDB(modelFilename,
                                       modelPDB,
                                       autoext=False)

    #run STRIDE
    prody.parseSTRIDE(prody.execSTRIDE(modelFilename, outputdir=tempdir),
                      modelPDB)

    datadict['STRIDEarea'] = pd.Series(modelPDB.ca.getData('stride_area'))
    ss = pd.Series(modelPDB.ca.getSecstrs())
    ss[ss == ''] = '-' #empty strings cause trouble in csv load/save
    datadict['STRIDEss'] = ss

    #run DSSP
    prody.parseDSSP(prody.execDSSP(modelFilename, outputdir=tempdir), modelPDB)

    datadict['DSSPacc'] = pd.Series(modelPDB.ca.getData('dssp_acc'))
    ss = pd.Series(modelPDB.ca.getSecstrs())
    ss[ss == ''] = '-'
    datadict['DSSPss'] = ss

    #run NetSurfP
    netsurfp.parseNetSurfP(netsurfp.execNetSurfP(modelFilename,
                                                 outputdir=tempdir),
                           modelPDB)
    datadict['NetSurfP_exp'] = \
        pd.Series(modelPDB.ca.getData('netsurfp_exposure'))
    datadict['NetSurfP_asa'] = \
        pd.Series(modelPDB.ca.getData('netsurfp_asa'))
    datadict['NetSurfP_rsa'] = \
        pd.Series(modelPDB.ca.getData('netsurfp_rsa'))
    datadict['NetSurfP_alpha'] = \
        pd.Series(modelPDB.ca.getData('netsurfp_alphascore'))
    datadict['NetSurfP_beta'] = \
        pd.Series(modelPDB.ca.getData('netsurfp_betascore'))
    datadict['NetSurfP_coil'] = \
        pd.Series(modelPDB.ca.getData('netsurfp_coilscore'))

    #run PSIPRED
    psipred.parsePSIPRED(psipred.execPSIPRED(modelFilename,
                                             outputdir=tempdir),
                         modelPDB)
    datadict['PSIPRED_ss'] = \
        pd.Series(modelPDB.ca.getData('psipred_ss'))
    datadict['PSIPRED_coilscore'] = \
        pd.Series(modelPDB.ca.getData('psipred_coilscore'))
    datadict['PSIPRED_helixscore'] = \
        pd.Series(modelPDB.ca.getData('psipred_helixscore'))
    datadict['PSIPRED_strandscore'] = \
        pd.Series(modelPDB.ca.getData('psipred_strandscore'))


    dataframe = pd.DataFrame(datadict)

    #remove temporary directory
    shutil.rmtree(tempdir, ignore_errors=True)

    return dataframe
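Hypothetical usage ('model.pdb' is a placeholder path); dssp, stride, netsurfp and psipred must all be installed for the full feature set:

df = generateFeatures('model.pdb')
print(df[['DSSPss', 'STRIDEss', 'PSIPRED_ss']].head())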
Example #8
def generateTrainingSet(inputdict, distance, output=None, combineOutput=True):

    devnull = open(os.devnull, 'w')
    subprocess.check_call('dssp --version', shell=True, stdout=devnull,
                          stderr=devnull)
    subprocess.check_call('stride -h', shell=True, stdout=devnull,
                          stderr=devnull)
    subprocess.check_call('netsurfp -h', shell=True, stdout=devnull,
                          stderr=devnull)
    subprocess.check_call('runpsipred', shell=True, stdout=devnull,
                          stderr=devnull)
    devnull.close()

    finalData = pd.DataFrame() if combineOutput else []

    for target, models in inputdict.items():

        targetPDB = prody.parsePDB(target)
        assert distance, "Distance is not valid"

        tempdir = tempfile.mkdtemp()

        # we don't want to run NetSurfP and PSIPRED over and over again for all
        # model structures. we compute them for target structure and just reuse
        # on model structures
        netsurfp.parseNetSurfP(netsurfp.execNetSurfP(target, outputdir=tempdir),
                               targetPDB)

        psipred.parsePSIPRED(psipred.execPSIPRED(target, outputdir=tempdir),
                             targetPDB)

        for i, modelFilename in enumerate(models):
            datadict = {}
            modelPDB = prody.parsePDB(modelFilename)

            if not modelPDB:
                print('Model file %s cannot be parsed, skipping...' %
                      modelFilename)
                continue

            #if model has no chainID, let's assign one. That makes STRIDE parser
            #happy
            if set(modelPDB.getChids()) == {' '}:
                modelPDB.all.setChids('A')
                modelFilename = os.path.join(tempdir,
                                             os.path.basename(modelFilename))

                modelFilename = prody.writePDB(modelFilename,
                                               modelPDB,
                                               autoext=False)

            #superimpose model onto target structure
            match = prody.matchAlign(modelPDB, targetPDB, tarsel='calpha',
                                     seqid=50, overlap=20)
            if match is None:
                print('Model %s could not be aligned to target, skipping...' %
                      modelFilename)
                continue
            mapmodel = match[1]
            maptarget = match[2]

            #and copy NetSurfP and PSIPRED data from target to model
            copyDataFromTarget(targetPDB, modelPDB)

            #run STRIDE
            prody.parseSTRIDE(prody.execSTRIDE(modelFilename, outputdir=tempdir),
                              modelPDB)

            datadict['STRIDEarea'] = \
                pd.Series(modelPDB.ca.getData('stride_area')[mapmodel.getResindices()],
                          index=maptarget.getResindices())

            ss = pd.Series(mapmodel.getSecstrs(), index=maptarget.getResindices())
            ss[ss == ''] = '-' #empty strings cause trouble in csv load/save
            datadict['STRIDEss'] = ss

            #run DSSP
            prody.parseDSSP(prody.execDSSP(modelFilename, outputdir=tempdir), modelPDB)

            datadict['DSSPacc'] = \
                pd.Series(modelPDB.ca.getData('dssp_acc')[mapmodel.getResindices()],
                          index=maptarget.getResindices())

            ss = pd.Series(mapmodel.getSecstrs(), index=maptarget.getResindices())
            ss[ss == ''] = '-' #empty strings cause trouble in csv load/save
            datadict['DSSPss'] = ss


            #save NetSurfP data
            datadict['NetSurfP_exp'] = \
                pd.Series(modelPDB.ca.getData('netsurfp_exposure')[mapmodel.getResindices()],
                          index=maptarget.getResindices())

            datadict['NetSurfP_asa'] = \
                pd.Series(modelPDB.ca.getData('netsurfp_asa')[mapmodel.getResindices()],
                          index=maptarget.getResindices())
            datadict['NetSurfP_rsa'] = \
                pd.Series(modelPDB.ca.getData('netsurfp_rsa')[mapmodel.getResindices()],
                          index=maptarget.getResindices())
            datadict['NetSurfP_alpha'] = \
                pd.Series(modelPDB.ca.getData('netsurfp_alphascore')[mapmodel.getResindices()],
                          index=maptarget.getResindices())
            datadict['NetSurfP_beta'] = \
                pd.Series(modelPDB.ca.getData('netsurfp_betascore')[mapmodel.getResindices()],
                          index=maptarget.getResindices())
            datadict['NetSurfP_coil'] = \
                pd.Series(modelPDB.ca.getData('netsurfp_coilscore')[mapmodel.getResindices()],
                          index=maptarget.getResindices())

            #save PSIPRED data
            datadict['PSIPRED_ss'] = \
                pd.Series(modelPDB.ca.getData('psipred_ss')[mapmodel.getResindices()],
                          index=maptarget.getResindices())
            datadict['PSIPRED_coilscore'] = \
                pd.Series(modelPDB.ca.getData('psipred_coilscore')[mapmodel.getResindices()],
                          index=maptarget.getResindices())
            datadict['PSIPRED_helixscore'] = \
                pd.Series(modelPDB.ca.getData('psipred_helixscore')[mapmodel.getResindices()],
                          index=maptarget.getResindices())
            datadict['PSIPRED_strandscore'] = \
                pd.Series(modelPDB.ca.getData('psipred_strandscore')[mapmodel.getResindices()],
                          index=maptarget.getResindices())

            #Compute class labels based on the distance argument
            datadict['ClassLabel'] = pd.Series((np.abs(
                prody.calcDistance(maptarget.copy(), mapmodel.copy())) < distance).astype(int),
                index=maptarget.getResindices())

            if combineOutput:
                finalData = pd.concat([finalData, pd.DataFrame(datadict)])
            else:
                finalData.append(pd.DataFrame(datadict))

        #remove temporary directory
        shutil.rmtree(tempdir, ignore_errors=True)

    if output:
        if combineOutput:
            finalData.to_csv(output, index=False, quoting=csv.QUOTE_NONNUMERIC)
        else:
            print('Warning! Output must be combined to be saved into a CSV '
                  'file')

    return finalData
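Hypothetical usage: one target with two candidate models and a 3.5 Å distance cutoff for the binary class labels (all file names are placeholders):

inputs = {'target.pdb': ['model1.pdb', 'model2.pdb']}
training_df = generateTrainingSet(inputs, distance=3.5, output='train.csv',
                                  combineOutput=True)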