Example #1
def _create_interval_dict(df):
    """Create a dictionary of intervals.

    The first interval corresponds to RES_BEG, RES_END, the 1-based
    indices on the PDB chain. This is required because the PDB chain
    does not always cover the full UniProt entry. The second interval
    corresponds to SP_BEG, SP_END, the matching 1-based interval on the
    full UniProt entry.

    Args:
        df (DataFrame): A pre-filtered dataframe from pdb_chain_uniprot.tsv.

    Returns:
        interval_dict (dictionary): A dictionary in the following form:
            {
                '11BG_A_Q3E840':
                    [
                        [
                            [1, 124],
                            [27, 150]
                        ]
                    ]
            }

    """
    interval_dict = {}
    progress = ProgressBar(
        len(df.index),
        start_msg="Creating interval dictionary..",
        end_msg="Done creating interval dictionary."
    )
    for i, row in df.iterrows():
        uni_id = row.SP_PRIMARY
        pdb_chain_uni = '_'.join([row.PDB, row.CHAIN, uni_id])
        intervals = [[row.RES_BEG, row.RES_END], [row.SP_BEG, row.SP_END]]
        if pdb_chain_uni in interval_dict:
            interval_dict[pdb_chain_uni].append(intervals)
        else:
            interval_dict[pdb_chain_uni] = [intervals]
        progress.inc()
    return interval_dict
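
A minimal standalone sketch of the same grouping logic, using a hypothetical two-row slice of pdb_chain_uniprot.tsv (column values are illustrative only; ProgressBar is omitted):

import pandas as pd

# Hypothetical slice of pdb_chain_uniprot.tsv (values are illustrative only).
rows = pd.DataFrame({
    'PDB': ['11bg', '11bg'],
    'CHAIN': ['A', 'B'],
    'SP_PRIMARY': ['Q3E840', 'Q3E840'],
    'RES_BEG': [1, 1], 'RES_END': [124, 124],
    'SP_BEG': [27, 27], 'SP_END': [150, 150],
})

interval_dict = {}
for _, row in rows.iterrows():
    # Key is PDB id, chain and UniProt ID; each value pairs the PDB-chain
    # interval with the corresponding UniProt interval.
    key = '_'.join([row.PDB, row.CHAIN, row.SP_PRIMARY])
    intervals = [[row.RES_BEG, row.RES_END], [row.SP_BEG, row.SP_END]]
    interval_dict.setdefault(key, []).append(intervals)

# interval_dict now maps keys like '11bg_A_Q3E840' to [[[1, 124], [27, 150]]].
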
Example #2
def create_intervals(pdb_df, uni_df):
    """Add a column to uni_df with the missing interval regions.

    Missing interval regions are of the following form:
        [
            ['conserved', (0, 10)],
            ['conflict', (34, 45)]
        ]

    Args:
        pdb_df (DataFrame): DataFrame with PDB_chains and composite structure.
        uni_df (DataFrame): DataFrame with UniProt IDs and
            composite UniProt structure.

    Returns:
        uni_df (DataFrame): A DataFrame which includes a column for
            the missing interval regions.

    """
    uni_df["MISSING"] = ""
    progress = ProgressBar(
        len(uni_df.index),
        approx_percentage=1,
        start_msg="Adding a column to uni_df with missing interval regions.",
        end_msg="Done adding columns.",
    )
    for i, row in uni_df.iterrows():
        struct_indexes = []
        uni_id = row.SP_PRIMARY
        uni_struct = row.STRUCT
        pdbs = pdb_df[pdb_df.SP_PRIMARY == uni_id]
        indexes = _find_indexes(uni_struct)
        for ind in indexes:
            struct_intervals = []
            for j, line in pdbs.iterrows():
                assert len(line.SEC_STRUCT) == len(uni_struct)
                struct_intervals.append(line.SEC_STRUCT[ind[0] : ind[1]])
            for struct in struct_intervals:
                assert len(struct) == len(struct_intervals[0])
            assert len(struct_intervals) == len(pdbs.index)
            dis_type = determine_dis_type(struct_intervals)
            struct_indexes.append([dis_type, ind])
        assert len(indexes) == len(struct_indexes)
        uni_df.at[i, "MISSING"] = struct_indexes
        progress.inc()
    return uni_df
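
The helpers _find_indexes and determine_dis_type are not shown above. The sketch below is one plausible reading of them, assuming '-' marks a residue missing from the composite structure and that a region is 'conserved' when every chain's slice over it is identical, 'conflict' otherwise; the names find_missing_regions and classify_region and the encoding are assumptions, not the project's actual implementation.

import re

def find_missing_regions(uni_struct):
    # Assumed stand-in for _find_indexes: (start, end) slice bounds for
    # each run of '-' in the composite UniProt structure string.
    return [m.span() for m in re.finditer(r'-+', uni_struct)]

def classify_region(chain_slices):
    # Assumed stand-in for determine_dis_type: 'conserved' when every
    # chain agrees over the region, 'conflict' otherwise.
    return 'conserved' if len(set(chain_slices)) == 1 else 'conflict'

uni_struct = 'HHHH----EEEE--'        # toy composite UniProt structure
chain_structs = ['HHHH----EEEE--',   # toy per-chain SEC_STRUCT strings
                 'HHHH----EEEEHH']

missing = []
for beg, end in find_missing_regions(uni_struct):
    slices = [s[beg:end] for s in chain_structs]
    missing.append([classify_region(slices), (beg, end)])

print(missing)   # [['conserved', (4, 8)], ['conflict', (12, 14)]]
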
Example #3
def create_uni_struct(pdb_df):
    """ Creates a DataFrame that has UniProt ID and composite
    structure for UniProt.

    Args:
        pdb_df (DataFrame): A DataFrame produced by create_composite.

    Returns:
        df (DataFrame): A new DataFrame with UniProt ID and
            composite UniProt structure.

    """
    uni_struct = {'SP_PRIMARY': [], 'STRUCT': []}
    uni_list = read_pdb_chain_uniprot_uniIDs(pdb_df)
    progress = ProgressBar(
        len(uni_list),
        approx_percentage=1,
        start_msg="Creating DataFrame with ID and composite structure.",
        end_msg="Done creating DataFrame with ID and composite structure."
    )
    for uni in uni_list:
        pdbs = pdb_df[pdb_df.SP_PRIMARY == uni]
        struct_list = []
        assert len(pdbs.index) > 1
        for i, row in pdbs.iterrows():
            struct_list.append(row.SEC_STRUCT)
        assert len(struct_list) > 1
        for struct in struct_list:
            assert len(struct) > 0
            assert len(struct) == len(struct_list[0])
        assert len(struct_list) == len(pdbs.index)
        comp_struct = _uni_struct(struct_list)
        uni_struct['SP_PRIMARY'].append(uni)
        uni_struct['STRUCT'].append(comp_struct)
        progress.inc()

    df = pd.DataFrame(uni_struct)
    return df
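
_uni_struct is not shown above; below is one plausible positionwise merge, assuming '-' means an unobserved residue and that any chain reporting a structure character wins. The name merge_structs and the encoding are assumptions for illustration only.

def merge_structs(struct_list):
    # Hypothetical stand-in for _uni_struct: at each position take the
    # first chain that reports an observed residue ('-' assumed to mean
    # unobserved), keeping '-' only when no chain covers that position.
    merged = []
    for chars in zip(*struct_list):
        observed = [c for c in chars if c != '-']
        merged.append(observed[0] if observed else '-')
    return ''.join(merged)

print(merge_structs(['HHH--', '---EE']))   # 'HHHEE'
print(merge_structs(['HHH--', 'HHH--']))   # 'HHH--'
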
Example #4
def filter_single(df):
    """Removes UniProt IDs with only one unique PDB chain.

    This filters out any UniProt ID that does not have more than one
    unique PDB chain. It accepts a DataFrame that has PDB_CHAIN combined,
    or PDB and CHAIN as separate columns.

    Note that the same protein can appear multiple times on a single
    chain, which is why I iterate through and check for unique chains
    for those tsv files that have PDB and CHAIN separated. Note also
    that a unique key is formed from the PDB ID, chain, and UniProt ID.

    Args:
        df (DataFrame): A DataFrame that has PDB_CHAIN combined, or that
            has PDB and CHAIN as separate columns.

    Returns:
        A filtered DataFrame.

    """
    log_error = getLogger('pdb_app_logger')
    if 'PDB_CHAIN' in df.columns:
        uni_list = read_pdb_chain_uniprot_uniIDs(df)
        drop_list = []
        progress = ProgressBar(
            len(uni_list),
            start_msg="Removing UniProt IDs with only one unique PDB chain.",
            end_msg="Done removing UniProt IDs."
        )
        for uni in uni_list:
            pdbs = df[df.SP_PRIMARY == uni]
            if len(pdbs.index) < 2:
                drop_list.append(uni)
            progress.inc()
        df = df[~df.SP_PRIMARY.isin(drop_list)]

    else:
        uni_list = read_pdb_chain_uniprot_uniIDs(df)
        drop_list = []
        progress = ProgressBar(
            len(uni_list),
            start_msg="Removing UniProt IDs with only one unique PDB chain.",
            end_msg="Done removing UniProt IDs."
        )
        for uni in uni_list:
            pdbs = df[df.SP_PRIMARY == uni]
            pdb_chains = []
            for i, row in pdbs.iterrows():
                try:
                    new_chain = '_'.join([row.PDB, row.CHAIN])
                except TypeError as append_err:
                    log_error.warning(
                        "Error appending row. Error was:\n"
                        "\t{}\n"
                        "\trow.PDB was: [{}] {}\n"
                        "\trow.CHAIN was: [{}]{}".format(
                            append_err.args,
                            type(row.PDB),
                            row.PDB,
                            type(row.CHAIN),
                            row.CHAIN
                        )
                    )
                else:
                    pdb_chains.append(new_chain)

            if len(set(pdb_chains)) < 2:
                drop_list.append(uni)
            progress.inc()
        df = df[~df.SP_PRIMARY.isin(drop_list)]

    return df
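
For comparison, the same filter can be sketched without the explicit loop by counting unique chains per UniProt ID with pandas groupby/nunique; this assumes the separate PDB and CHAIN columns hold clean strings and skips the ProgressBar and TypeError handling:

import pandas as pd

def filter_single_vectorized(df):
    # Build the PDB_chain key, count unique keys per UniProt ID, and keep
    # only UniProt IDs that map to more than one unique chain.
    chains = df.PDB + '_' + df.CHAIN
    n_unique = chains.groupby(df.SP_PRIMARY).nunique()
    keep = n_unique[n_unique > 1].index
    return df[df.SP_PRIMARY.isin(keep)]
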
Example #5
def add_pdbseq_to_df(df, ss_dis):
    """ Removes rows not in ss_dis, adds a column with PDB peptide sequence.

    Removes any rows for pdb_chains not in ss_dis.
    Adds a column called 'PDB_SEQ' that has the section of the PDB
    chain corresponding to the interval in RES_BEG:RES_END.

    Notes:
        Download Uniprot files after this step.

        RES_BEG, RES_END start with a numbering of 1.

        This function adds a section of the PDB sequence to the
        DataFrame; it is removed later, after the comparison. My
        reasoning for adding it now is that I have to go through and
        remove rows anyway, so the PDB sequence data is already
        available at this point, and it makes the next step quicker.

    Args:
        df (DataFrame): A pre-filtered DataFrame from pdb_chain_uniprot.tsv.
        ss_dis (dictionary): A dictionary extracted from ss_dis.txt,
            with the following form:
                ss_dis[pdb_A] = {
                    'sequence': '',
                    'secstr': '',
                    'disorder': ''
                }

    Returns:
        A filtered DataFrame with an added column.

    """
    log_pdb = getLogger('pdb_app_logger')
    log_root = getLogger('root')
    df['PDB_SEQ'] = ''
    progress = ProgressBar(
        len(df.index),
        start_msg="Removing rows for pdb_chains not in ss_dis.",
        end_msg="Done removing rows for pdb_chains not in ss_dis.."
    )
    log_root.info("Adding PDB_SEQ can take quite a while.")
    for i, row in df.iterrows():
        try:
            id_chain = '_'.join([row.PDB, row.CHAIN])
        except TypeError as append_err:
            log_root.warning(
                "Error appending row. Error was:\n"
                "\t{}\n"
                "\trow.PDB was: [{}] {}\n"
                "\trow.CHAIN was: [{}]{}".format(
                    append_err.args,
                    type(row.PDB),
                    row.PDB,
                    type(row.CHAIN),
                    row.CHAIN
                )
            )
            # id_chain is undefined after a failed join, so skip this row.
            progress.inc()
            continue
        if id_chain in ss_dis:
            peptide = ss_dis[id_chain]['sequence'][row.RES_BEG-1:row.RES_END]
            df.at[i, 'PDB_SEQ'] = peptide
        else:
            try:
                df.drop(i, inplace=True)
            except MemoryError as mem_err:
                log_pdb.error(
                    "Memory error while adding PDB peptide "
                    "sequence to DataFrame:\n"
                    "\t{}\n".format(mem_err.args)
                )
                from struct import calcsize
                int_struct_size = calcsize("P") * 8
                if int_struct_size == 32:
                    log_root.warning(
                        "It appears you're on a 32-bit system and/or "
                        "using a 32-bit version of Python. You may need "
                        "to use a 64-bit version to avoid memory "
                        "errors for these large calculations."
                    )
        progress.inc()
    return df
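
As a quick illustration of the 1-based RES_BEG/RES_END slicing against ss_dis (the entry below is made up; 'secstr' and 'disorder' are omitted since only 'sequence' is used here):

# Toy ss_dis entry keyed by PDB_CHAIN; values are for illustration only.
ss_dis = {'101M_A': {'sequence': 'MVLSEGEWQLVLHVWAKVE'}}

res_beg, res_end = 4, 9   # 1-based, inclusive, as in pdb_chain_uniprot.tsv
peptide = ss_dis['101M_A']['sequence'][res_beg - 1:res_end]
print(peptide)            # 'SEGEWQ'
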