Exemplo n.º 1
0
    def _get_protein_features(self, pdb_code, file_path, chain_selection):
        """
        Run DSSP on a PDB file and return the per-residue output as a DataFrame.

        :param pdb_code: (str) String containing four letter PDB accession; the
            file is looked up as ``self.pdb_dir + pdb_code + '.pdb'``
        :param file_path: (str) file path to PDB file; takes precedence over
            ``pdb_code`` when both are given
        :param chain_selection: ``'all'`` to keep every chain, otherwise a
            list-like of chain identifiers to keep
        :return df (pd.DataFrame): Dataframe containing output of DSSP (Solvent
            accessibility, secondary structure for each residue)
        :raises ValueError: if neither ``pdb_code`` nor ``file_path`` is given
        """
        # Resolve the input file once so DSSP is never run twice
        if file_path:
            pdb_path = file_path
        elif pdb_code:
            pdb_path = self.pdb_dir + pdb_code + '.pdb'
        else:
            raise ValueError('Either pdb_code or file_path must be provided')

        # Run DSSP on relevant PDB file
        dssp_values, dssp_keys = dssp_dict_from_pdb_file(pdb_path)

        # Parse DSSP output to DataFrame; each key is (chain, (het, resnum, icode))
        rows = []
        for key in dssp_keys:
            chain, residue = key
            rows.append([chain, residue[1], residue[2]] + list(dssp_values[key]))

        cols = [
            'chain', 'resnum', 'icode', 'aa', 'ss', 'exposure_rsa', 'phi',
            'psi', 'dssp_index', 'NH_O_1_relidx', 'NH_O_1_energy',
            'O_NH_1_relidx', 'O_NH_1_energy', 'NH_O_2_relidx', 'NH_O_2_energy',
            'O_NH_2_relidx', 'O_NH_2_energy'
        ]
        df = pd.DataFrame.from_records(rows, columns=cols)

        # Subset dataframe to those in chain_selection
        if chain_selection != 'all':
            df = df.loc[df['chain'].isin(chain_selection)]

        # Work on an independent copy so the assignments below never write
        # into a view of the unfiltered frame
        df = df.copy()

        # Rename cysteines to 'C' (DSSP marks disulfide-bonded cysteines with
        # lowercase letters). regex=True is required: pandas >= 2.0 treats the
        # pattern as a literal string by default.
        df['aa'] = df['aa'].str.replace('[a-z]', 'C', regex=True)
        df = df[df['aa'].isin(list(aa1))]

        # Drop alt_loc residues
        df = df.loc[df['icode'] == ' '].copy()

        # Add additional Columns
        df['aa_three'] = df['aa'].apply(one_to_three)
        df['max_acc'] = df['aa_three'].map(residue_max_acc['Sander'].get)
        df[['exposure_rsa', 'max_acc']] = df[['exposure_rsa',
                                              'max_acc']].astype(float)
        # dssp_dict_from_pdb_file reports the absolute accessibility (ACC
        # column of the DSSP file), so that raw value is the ASA and the
        # relative value is obtained by dividing by the residue maximum.
        df['exposure_asa'] = df['exposure_rsa']
        df['exposure_rsa'] = df['exposure_asa'] / df['max_acc']
        df['index'] = (df['chain'] + ':' + df['aa_three'] + ':' +
                       df['resnum'].apply(str))
        return df
Exemplo n.º 2
0
def calc_ss(pdbfile) -> [str]:
    """
    Calculate the secondary structure of the protein.

    Returns one DSSP code per C-alpha atom that also has a DSSP record, in
    the order the atoms appear in the parsed structure.

    Code Structure
    H 	Alpha helix (4-12)
    B 	Isolated beta-bridge residue
    E 	Strand
    G 	3-10 helix
    I 	Pi helix
    T 	Turn
    S 	Bend
    - 	None
    """
    # dssp_dict_from_pdb_file runs the DSSP executable and parses its output,
    # src: http://biopython.org/DIST/docs/api/Bio.PDB.DSSP%27-pysrc.html#dssp_dict_from_pdb_file
    # Records are keyed by (chainid, res_id), eg ('A', (' ', 12, ' ')).
    dssp_records, _dssp_keys = dssp_dict_from_pdb_file(pdbfile)

    # Walk the structure itself so the residue order matches the C-alpha file
    # (i.e. 'modes_CA.pdb'); fluctuations are plotted against this order.
    structure = set_parser(pdbfile).get_structure(pdbfile[:4], pdbfile)

    def _ca_keys():
        for atom in structure.get_atoms():
            if not is_ca(atom):
                continue
            atom_id = atom.get_full_id()
            # full id fields 2 and 3 are the chain id and the residue id
            yield (atom_id[2], atom_id[3])

    return [
        dssp_records[key][1] for key in _ca_keys() if key in dssp_records
    ]
Exemplo n.º 3
0
def generate_2structures(pdbs_to_process,output_path,pdb_id,logger):
    """Download PDB entries, run DSSP on them and record per-chain structure.

    For every id in ``pdbs_to_process`` the PDB file is fetched from RCSB into
    ``output_path``. Each downloaded file is run through DSSP, and the
    secondary-structure string of every chain is stored in the module-level
    ``secondary_map`` under the key ``"<pdb file stem>_<chain id>"``.

    :param pdbs_to_process: iterable of PDB ids to download
    :param output_path: directory the ``.pdb`` files are written to
    :param pdb_id: id forwarded to ``get_primary_map``
    :param logger: unused by this function
    :return: the result of ``generate_secondary_fasta``
    """
    # Local import: keeps this function self-contained.
    import os

    print(pdbs_to_process)
    pdb_files = []
    for pdbs_id in pdbs_to_process:
        url = "https://files.rcsb.org/download/" + pdbs_id + ".pdb"
        target = output_path + "/" + pdbs_id + ".pdb"
        try:
            urllib.request.urlretrieve(url, target)
        except Exception as e:
            # Best effort: a failed download is reported and skipped.
            print(str(e))
        else:
            pdb_files.append(target)

    for pdb_file in pdb_files:
        # The first tuple item is a dict {key: structure data}; the second is
        # the list of keys, each of the form ("CHAIN", ('', residue number, '')).
        # A chain is spread over many keys that share the chain id and differ
        # in residue number.
        dssp_dict, dssp_keys = dssp_dict_from_pdb_file(pdb_file)

        chain_map = {}
        for key in dssp_keys:
            chain_map.setdefault(key[0], []).append(key)

        # Take the file name itself rather than a fixed path component, so
        # the key is correct whatever the depth of output_path.
        pdb_stem = os.path.basename(pdb_file).split('.')[0]
        for chain, keys in chain_map.items():
            seq = "".join(dssp_dict[chain_part][1] for chain_part in keys)
            secondary_map[pdb_stem + "_" + chain] = seq

    return generate_secondary_fasta(get_primary_map(pdb_id,output_path),output_path)
Exemplo n.º 4
0
def get_dssp_df(pdb_file, pdb_name, dir, dssp_exec='dssp'):
    """Run DSSP on a PDB file, save the per-residue table as CSV and return it.

    Args:
        pdb_file: Path to the PDB file.
        pdb_name: Name used for the output file, ``dir + pdb_name + '_sasa.csv'``.
        dir: Output directory prefix (concatenated as-is, so include the
            trailing separator).
        dssp_exec: DSSP executable to run.

    Returns:
        pandas.DataFrame: one row per standard amino-acid residue.
    """
    dssp_values, dssp_keys = dssp_dict_from_pdb_file(pdb_file, DSSP=dssp_exec)

    # Each key is (chain, (het, resnum, icode)); the value is the DSSP tuple.
    rows = []
    for key in dssp_keys:
        chain, residue = key
        rows.append([chain, residue[1], residue[2]] + list(dssp_values[key]))

    # Field order of the tuples produced by dssp_dict_from_pdb_file. It
    # differs from the order used by the Bio.PDB.DSSP.DSSP object, where
    # dssp_index comes first.
    dssp_order = ['chain', 'resnum', 'icode',
                  'aa', 'ss', 'exposure_rsa', 'phi', 'psi', 'dssp_index',
                  'NH_O_1_relidx', 'NH_O_1_energy', 'O_NH_1_relidx',
                  'O_NH_1_energy', 'NH_O_2_relidx', 'NH_O_2_energy',
                  'O_NH_2_relidx', 'O_NH_2_energy']

    # Column layout exposed to callers and written to the CSV.
    cols = ['chain', 'resnum', 'icode',
            'dssp_index', 'aa', 'ss', 'exposure_rsa', 'phi', 'psi',
            'NH_O_1_relidx', 'NH_O_1_energy', 'O_NH_1_relidx',
            'O_NH_1_energy', 'NH_O_2_relidx', 'NH_O_2_energy',
            'O_NH_2_relidx', 'O_NH_2_energy']

    df = pd.DataFrame.from_records(rows, columns=dssp_order)[cols]

    # Keep standard amino acids only; copy so the new columns are written to
    # an independent frame rather than a slice.
    df = df[df['aa'].isin(list(aa1))].copy()

    # Adding additional columns
    df['aa_three'] = df['aa'].apply(one_to_three)
    df['max_acc'] = df['aa_three'].map(residue_max_acc['Sander'].get)
    df[['exposure_rsa', 'max_acc']] = df[['exposure_rsa', 'max_acc']].astype(float)
    # dssp_dict_from_pdb_file reports the absolute accessibility (ACC column
    # of the DSSP file), so that raw value is the ASA and the relative value
    # is obtained by dividing by the residue maximum.
    df['exposure_asa'] = df['exposure_rsa']
    df['exposure_rsa'] = df['exposure_asa'] / df['max_acc']

    df.to_csv(dir + pdb_name + '_sasa.csv')
    return df
Exemplo n.º 5
0
def pdb2cd(name):
    """Return the secondary-structure composition of ``name + ".pdb"``.

    DSSP is run on the file and every residue it reports is put in one of
    three classes: ``'H'`` counts as alpha helix, ``'E'`` as beta sheet and
    every other DSSP code as coil.

    :param name: path of the PDB file without the ``.pdb`` extension
    :return: ``[alpha, beta, coil]`` fractions of the residues reported by
        DSSP (NaN entries if DSSP reports no residues)
    """
    dssp_dict, dssp_keys = dssp_dict_from_pdb_file(name + ".pdb")

    # 0 = helix, 1 = sheet, 2 = coil. Each residue is classified on its own
    # code, so a residue never inherits the class of its neighbour.
    class_of = {'H': 0, 'E': 1}
    ss = np.array([class_of.get(dssp_dict[key][1], 2) for key in dssp_keys],
                  dtype=int)

    # Returns the fractional composition of alpha helix, beta sheet or random coil.
    total = len(ss)
    alpha = (ss == 0).sum() / total
    beta = (ss == 1).sum() / total
    coil = (ss == 2).sum() / total
    return [alpha, beta, coil]
Exemplo n.º 6
0
def get_nf1(pdb, res, chain, nf1_window):
    """Encode the DSSP secondary structure in a window around a residue.

    DSSP is run on ``PDB_Data/<pdb>.pdb`` next to this module. Residue numbers
    ``res - nf1_window - 1`` up to ``res + nf1_window - 1`` are looked up and
    encoded as:

        1  H, G, I
        2  T, S
        3  B
        4  E
        5  any other DSSP code
        6  residue not present in the DSSP output

    :param pdb: PDB id (file name without extension)
    :param res: residue number at the centre of the window
    :param chain: chain id (see the review note in the body)
    :param nf1_window: number of residues on each side of ``res``
    :return: list of integer codes, one per residue number in the window
    """
    PROJECT_PATH = os.path.dirname(__file__) + "/"
    filename_pdb = PROJECT_PATH + '/PDB_Data/' + pdb + '.pdb'
    dssp = dssp_dict_from_pdb_file(filename_pdb)[0]

    # NOTE(review): the chain argument is replaced by the chain id of the
    # first DSSP record, so the caller's value is only used when DSSP returns
    # nothing. Kept as-is; confirm whether the argument should win instead.
    for first_chain, _first_res_id in dssp:
        chain = first_chain
        break

    codes = {'H': 1, 'G': 1, 'I': 1, 'T': 2, 'S': 2, 'B': 3, 'E': 4}

    start = res - nf1_window
    end = res + nf1_window
    nf1 = []
    for j in range(start - 1, end):
        try:
            structure = dssp[chain, (' ', j, ' ')][1]
        except KeyError:
            # Residue number missing from the DSSP output.
            nf1.append(6)
        else:
            nf1.append(codes.get(structure, 5))

    print("NF1_" + str(nf1_window) + ": " + str(nf1))

    return nf1
Exemplo n.º 7
0
def get_dssp_dict_for_pdb_file(pdb_filename):
    """Return the DSSP residue dictionary for a PDB file, or ``{}`` on failure.

    Any error raised while running or parsing DSSP is logged at INFO level
    and swallowed, so callers always receive a dictionary.
    """
    try:
        return dssp_dict_from_pdb_file(pdb_filename)[0]
    except Exception:
        logging.info("No DSSP features found for {:}".format(pdb_filename))
        return {}
Exemplo n.º 8
0
def compute_dssp(fname):
    """
    Return the DSSP secondary-structure code of every residue in ``fname``.

    The codes are listed in the iteration order of the dictionary returned
    by ``dssp_dict_from_pdb_file``.
    source: https://biopython.org/docs/1.75/api/Bio.PDB.DSSP.html
    """
    assert os.path.isfile(fname), 'no such file'
    residue_records = dssp_dict_from_pdb_file(fname)[0]
    # Field 1 of each record is the secondary-structure code.
    return [record[1] for record in residue_records.values()]
Exemplo n.º 9
0
def create_dssp_csv(pdb_chain_file, dssp_csv_file):
    """Run DSSP on a PDB chain file and dump the result as CSV.

    Every row is the residue key followed by the DSSP record of that
    residue; keys and records are paired positionally.

    Parameters
    ----------
    pdb_chain_file : str
        The file location of the pdb chain
    dssp_csv_file : str
        The file location of the output dssp_csv
    """
    dssp_records, residue_keys = dssp_dict_from_pdb_file(pdb_chain_file)

    rows = []
    for residue_key, record in zip(residue_keys, dssp_records.values()):
        # Both are tuples, so "+" concatenates them into one flat row.
        rows.append(residue_key + record)

    frame = pd.DataFrame(rows)
    frame.to_csv(dssp_csv_file, index=False)
Exemplo n.º 10
0
def get_secondary_structure_residues(chain, pdb_code='1KX5'):
    """Return residue numbers of ``chain`` whose DSSP code is not T, S or '-'.

    The PDB entry is retrieved through ``PDBList`` (an existing local copy is
    not overwritten) and run through DSSP.

    :param chain: chain id to report residues for
    :param pdb_code: PDB accession of the entry to analyse
    :return: list of residue numbers, in DSSP dictionary order
    """
    pdb_path = PDBList().retrieve_pdb_file(pdb_code=pdb_code,
                                           file_format='pdb',
                                           overwrite=False)
    records = dssp_dict_from_pdb_file(pdb_path)[0]

    # Keys are (chain id, residue id); field 1 of the residue id is the
    # residue number and field 1 of the record is the DSSP code.
    return [
        residue_id[1]
        for (chain_id, residue_id), record in records.items()
        if record[1] not in 'TS-' and chain_id == chain
    ]
Exemplo n.º 11
0
def add_dssp_df(G: nx.Graph, dssp_config: Optional[DSSPConfig]) -> nx.Graph:
    """
    Construct DSSP dataframe and add as graph level variable to protein graph

    The PDB file is expected at ``<config.pdb_dir>/<pdb_id>.pdb``; when it is
    not there it is downloaded with ``download_pdb``.

    :param G: Input protein graph
    :type G: nx.Graph
    :param dssp_config: DSSPConfig object. Specifies which executable to run. Located in graphein.protein.config.
        When ``None``, the default executable of ``dssp_dict_from_pdb_file`` is used.
    :type dssp_config: DSSPConfig, optional
    :return: Protein graph with DSSP dataframe added
    :rtype: nx.Graph
    """

    config = G.graph["config"]
    pdb_id = G.graph["pdb_id"]

    # TODO - Check for DSSP installation

    # Check for existence of pdb file. If not, download it. The same path
    # (including the ".pdb" extension) is used for the check and for the
    # DSSP run; os.fspath accepts both str and Path-like pdb_dir values.
    pdb_file = os.path.join(os.fspath(config.pdb_dir), pdb_id + ".pdb")
    if not os.path.isfile(pdb_file):
        pdb_file = download_pdb(config, pdb_id)

    # Extract DSSP executable
    dssp_kwargs = {}
    if dssp_config is not None:
        executable = dssp_config.executable
        dssp_kwargs["DSSP"] = executable
        if config.verbose:
            print(f"Using DSSP executable '{executable}'")

    # Run DSSP
    dssp_df = dssp_dict_from_pdb_file(pdb_file, **dssp_kwargs)
    dssp_df = parse_dssp_df(dssp_df)
    dssp_df = process_dssp_df(dssp_df)

    if config.verbose:
        print(dssp_df)

    # Assign DSSP dataframe
    G.graph["dssp_df"] = dssp_df

    return G
Exemplo n.º 12
0
    def __init__(self, pdb_file, AA_kind='common'):
        """Read a PDB file, run DSSP on it and build per-chain strings.

        When both the PDB parse and the DSSP run succeed, three dictionaries
        keyed by chain id are filled:

        * ``Seq_dict``  - one-letter amino-acid sequence
        * ``SS_dict_8`` - DSSP secondary structure (8 classes)
        * ``SS_dict_3`` - secondary structure reduced to 3 classes

        Gaps in the residue numbering are padded with ``'x'`` and residues
        without a DSSP record get the structure code ``'M'``. A chain that
        contains a residue name missing from ``AA_dict`` gets ``None`` in all
        three dictionaries.

        :param pdb_file: path of the PDB file
        :param AA_kind: ``'common'`` for the standard residues plus ``UNK``;
            any other value additionally accepts ``SEC``
        """
        self.AA_dict = {
            'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C',
            'GLN': 'Q', 'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
            'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P',
            'SER': 'S', 'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V',
            'UNK': 'X',
        }
        if AA_kind != 'common':
            self.AA_dict['SEC'] = 'B'

        self.ss_dict_8_3 = {
            'H': 'H', 'G': 'H', 'I': 'H',
            'E': 'E', 'B': 'E',
            'S': 'C', 'T': 'C', '-': 'C', 'C': 'C',
            'X': 'X', 'x': 'x', 'M': 'M',
        }  # 8-classes to 3-classes and 3 to 3

        self.pdb_file = pdb_file
        self.protein_dict = read_pdb(pdb_file)

        # DSSP failures are tolerated and recorded in dssp_read; only
        # ordinary errors are caught so Ctrl-C and SystemExit still propagate.
        try:
            dssp_dict = dssp_dict_from_pdb_file(pdb_file)[0]
        except Exception:
            dssp_dict = None
            self.dssp_read = False
        else:
            self.dssp_read = True

        self.pdb_read = isinstance(self.protein_dict, dict)

        if not (self.pdb_read and self.dssp_read):
            return

        self.Seq_dict = {}
        self.SS_dict_8 = {}
        self.SS_dict_3 = {}

        for chain, residues in self.protein_dict.items():
            # index_split yields (numeric index, insertion suffix, raw key)
            index_info = sorted((index_split(i) for i in residues),
                                key=lambda x: x[0])

            if not index_info:
                # A chain without residues has empty strings.
                self.Seq_dict[chain] = ''
                self.SS_dict_8[chain] = ''
                self.SS_dict_3[chain] = ''
                continue

            seq_parts = []
            ss8_parts = []
            ss3_parts = []
            indv_pre = index_info[0][0] - 1
            resi_dict_problem = False

            for index_value, index_foot, index in index_info:
                residue = residues[index]

                if residue['resi'] not in self.AA_dict:
                    resi_dict_problem = True
                    break
                resi_abbre = self.AA_dict[residue['resi']]

                dssp_key = (chain, (' ', index_value, index_foot))
                if dssp_key in dssp_dict:
                    dssp_record = dssp_dict[dssp_key]
                    resi_ss = dssp_record[1]
                    residue['SeconStru'] = resi_ss

                    if resi_abbre != dssp_record[0]:
                        print('Residue Error! %s and %s do not match!' %
                              (residue['resi'], dssp_record[0]))
                    else:
                        residue['AminoAci'] = dssp_record[0]
                else:
                    # No DSSP record for this residue
                    resi_ss = 'M'

                # Pad missing residue numbers; a non-positive count (e.g.
                # insertion codes sharing a number) yields no padding.
                gap = 'x' * (index_value - indv_pre - 1)
                seq_parts.append(gap + resi_abbre)
                ss8_parts.append(gap + resi_ss)
                ss3_parts.append(gap + self.ss_dict_8_3[resi_ss])

                indv_pre = index_value

            if resi_dict_problem:
                self.Seq_dict[chain] = None
                self.SS_dict_8[chain] = None
                self.SS_dict_3[chain] = None
            else:
                self.Seq_dict[chain] = ''.join(seq_parts)
                self.SS_dict_8[chain] = ''.join(ss8_parts)
                self.SS_dict_3[chain] = ''.join(ss3_parts)
Exemplo n.º 13
0
# Third and fourth columns of the input table `ds` (defined earlier in the
# script): residue and chain.
res = ds.iloc[:, 2]
chain = ds.iloc[:, 3]

# Structures
# H,G,I: 1
# T: 2 (T, S)
# S: 3
# B: 4
# E: 5
# - 6
# Exception: 7

ssf_list = []
p = PDBParser()
# Prime a one-entry cache: the loop below reuses `last_dssp` whenever the
# file it needs equals `last_file`, so consecutive rows for the same
# structure do not re-run DSSP.
# NOTE(review): the path here uses pdb[0] as-is while the loop lower-cases
# the id first, so the primed entry is only reused if pdb[0] is already
# lower case - confirm.
last_file = '../../../../pdb/' + str(pdb[0]) + '.pdb'
last_dssp = dssp_dict_from_pdb_file(last_file)
for i in range(len(pdb)):
    try:
        pdb_id = str(pdb[i])
        print(pdb_id)
        try:
            file = '../../../../pdb/' + pdb_id.lower() + '.pdb'
            if file == last_file:
                dssp = last_dssp
            else:
                last_file = file
                dssp = dssp_dict_from_pdb_file(file)
                last_dssp = dssp
        except:
            file = '../../../../pdb/' + pdb_id.upper() + '.pdb'
            if file == last_file:
Exemplo n.º 14
0
from Bio.PDB import PDBParser
from Bio.PDB.DSSP import DSSP
from Bio.PDB.DSSP import dssp_dict_from_pdb_file
import json
import sys

# p = PDBParser()
# structure = p.get_structure("3S7I", "./3s7i.pdb")
# model = structure[0]
# dssp = DSSP(model, "./3s7i.pdb", acc_array="Miller")

# print(dssp['A', (' ', 173, ' ')])
# Path of the PDB file to analyse, taken from the first command-line argument.
pdbFile = sys.argv[1]

# Run the mkdssp executable located in the current working directory.
# The result is a tuple; its first item is the per-residue dictionary.
dssp_tup = dssp_dict_from_pdb_file(pdbFile, DSSP="./mkdssp")
dssp = dssp_tup[0]

# (dssp index, amino acid, secondary structure, relative ASA, phi, psi,
# NH_O_1_relidx, NH_O_1_energy, O_NH_1_relidx, O_NH_1_energy,
# NH_O_2_relidx, NH_O_2_energy, O_NH_2_relidx, O_NH_2_energy)
# ^ if using DSSP object, different to dssp_dict_from_pdb_file

# Accumulators initialised here, one per kind of output.
# construct sequence from dssp output
sequence = ""
dssp_array = []
# accessible surface area
asa = []
# secondary structure
ss = []
# residue ids
res_id = []
Exemplo n.º 15
0
def dssp_sse_extract_from_pdb(filename):
    """
    Construct a list of SSEs from an un-annotated PDB file.
    @author: Travis Peters

    DSSP is run on ``filename`` and consecutive residues with the same kind
    of secondary structure are merged into one element. Helix codes (H, G, I)
    form ``'HELIX'`` elements and sheet codes (B, E) form ``'SHEET'``
    elements; every other code ends the current element. An element is
    emitted as ``SSE(type, first_resnum, last_resnum)`` when its residue
    counter reaches ``REQUIRED_SSE_RES_NUM``. The counter starts at 0 on the
    first residue of an element.

    :param filename: path of the PDB file
    :return: list of ``SSE`` objects in the order DSSP reports the residues
    """

    # Assumes dssp executable is located in root of project directory
    if sys.platform.startswith("win"):
        DSSP_EXEC = "dssp"
    else:
        DSSP_EXEC = "./dssp"

    # NOTE: The DSSP codes for secondary structure used here are:
    # - H        Alpha helix (4-12)
    # - G        3-10 helix
    # - I        pi helix
    # - B        Isolated beta-bridge residue
    # - E        Strand
    # - T        Turn
    # - S        Bend
    # - -        None
    HELIX = ['H', 'G', 'I']
    SHEET = ['B', 'E']

    # DSSP call returns a dictionary that maps (chainid, resid) to
    # (amino acid type, secondary structure code, and accessibility),
    # plus the list of keys in file order.
    dssp_values, dssp_keys = dssp_dict_from_pdb_file(filename, DSSP_EXEC)

    sses = []
    sse_start = None
    sse_type = None
    res_count = 0
    resnum = None
    for key in dssp_keys:

        # Extract the residue number
        resnum = key[1][1]

        # Classify the SSE code of this residue
        sse_code = dssp_values[key][1]
        if sse_code in HELIX:
            current_type = 'HELIX'
        elif sse_code in SHEET:
            current_type = 'SHEET'
        else:
            current_type = None

        # Same element continues
        if current_type is not None and current_type == sse_type:
            res_count += 1
            continue

        # The type changed: emit the element that just ended (if any and if
        # long enough), then start recording from this residue.
        if sse_type is not None and res_count >= REQUIRED_SSE_RES_NUM:
            sses.append( SSE(sse_type, sse_start, resnum-1) )

        res_count = 0
        sse_start = resnum
        sse_type = current_type

    # Emit an element that runs up to the last residue; nothing follows it
    # to trigger the emission inside the loop.
    if sse_type is not None and res_count >= REQUIRED_SSE_RES_NUM:
        sses.append( SSE(sse_type, sse_start, resnum) )

    return sses
Exemplo n.º 16
0
def get_dssp_df_on_file(pdb_file, outfile=None, outdir=None, outext='_dssp.df', force_rerun=False):
    """Run DSSP directly on a structure file with the Biopython method Bio.PDB.DSSP.dssp_dict_from_pdb_file

    Avoids errors like: PDBException: Structure/DSSP mismatch at <Residue MSE het=  resseq=19 icode= >
        by not matching information to the structure file (DSSP fills in the ID "X" for unknown residues)

    Args:
        pdb_file: Path to PDB file
        outfile: Name of output file
        outdir: Path to output directory
        outext: Extension of output file
        force_rerun: If DSSP should be rerun if the outfile exists

    Returns:
        Pandas DataFrame: DSSP results, summarized. An empty DataFrame is
        returned when DSSP fails.

    """
    # TODO: function unfinished
    # Create the output file name
    outfile = ssbio.utils.outfile_maker(inname=pdb_file, outname=outfile, outdir=outdir, outext=outext)

    if not ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
        log.debug('{}: already ran DSSP and force_rerun={}, loading results'.format(outfile, force_rerun))
        return pd.read_csv(outfile, index_col=0)

    # An except clause needs an exception class; Biopython signals a failed
    # DSSP run with a plain Exception.
    try:
        dssp_values, dssp_keys = dssp_dict_from_pdb_file(pdb_file)
    except Exception:
        log.error('{}: unable to run DSSP'.format(pdb_file))
        return pd.DataFrame()

    # Each key is (chain, (het, resnum, icode)); the value is the DSSP tuple.
    rows = []
    for key in dssp_keys:
        chain, residue = key
        rows.append([chain, residue[1], residue[2]] + list(dssp_values[key]))

    # Field order of the tuples produced by dssp_dict_from_pdb_file. It
    # differs from the order used by the Bio.PDB.DSSP.DSSP object (and so
    # from get_dssp_df), where dssp_index comes first.
    dssp_order = ['chain', 'resnum', 'icode',
                  'aa', 'ss', 'exposure_rsa', 'phi', 'psi', 'dssp_index',
                  'NH_O_1_relidx', 'NH_O_1_energy', 'O_NH_1_relidx',
                  'O_NH_1_energy', 'NH_O_2_relidx', 'NH_O_2_energy',
                  'O_NH_2_relidx', 'O_NH_2_energy']

    # Column layout exposed to callers and written to the output file.
    cols = ['chain', 'resnum', 'icode',
            'dssp_index', 'aa', 'ss', 'exposure_rsa', 'phi', 'psi',
            'NH_O_1_relidx', 'NH_O_1_energy', 'O_NH_1_relidx',
            'O_NH_1_energy', 'NH_O_2_relidx', 'NH_O_2_energy',
            'O_NH_2_relidx', 'O_NH_2_energy']

    df = pd.DataFrame.from_records(rows, columns=dssp_order)[cols]

    # Keep standard amino acids only; copy so the new columns are written to
    # an independent frame rather than a slice.
    df = df[df['aa'].isin(list(aa1))].copy()

    # Adding additional columns
    df['aa_three'] = df['aa'].apply(one_to_three)
    df['max_acc'] = df['aa_three'].map(residue_max_acc['Sander'].get)
    df[['exposure_rsa', 'max_acc']] = df[['exposure_rsa', 'max_acc']].astype(float)
    # dssp_dict_from_pdb_file reports the absolute accessibility (ACC column
    # of the DSSP file), so that raw value is the ASA and the relative value
    # is obtained by dividing by the residue maximum.
    df['exposure_asa'] = df['exposure_rsa']
    df['exposure_rsa'] = df['exposure_asa'] / df['max_acc']

    df.to_csv(outfile)
    return df
Exemplo n.º 17
0
 def get_dssp_dict(self):
     """Run DSSP on ``self.pdb_fname`` and return its per-residue dictionary."""
     dssp_result = dssp_dict_from_pdb_file(self.pdb_fname)
     # First item of the result is the dictionary; the key list is not needed.
     return dssp_result[0]