Пример #1
0
def filter_pdb_chain_uniprot(df, obs, xray):
    """Step 1 filtering of the DataFrame from pdb_chain_uniprot.tsv.

    Removes the PDB_BEG, PDB_END columns.
    Converts all PDB IDs to upper case.
    Removes any rows where the PDB ID isn't in the xray list.
    Removes any rows where the PDB ID is in the obs list.
    Removes any rows where the RES_BEG or SP_BEG are < 1.
    Removes any rows where RES_END - RES_BEG differs from SP_END - SP_BEG.
    Removes any rows where RES_END - RES_BEG is <= 3.
    Passes the remaining rows through filter_single.

    The DataFrame passed in is not modified; all changes are made on a
    new DataFrame, which is returned.

    Args:
        df (DataFrame): A pandas DataFrame read from pdb_chain_uniprot.tsv.
        obs (list of Unicode): A list of PDB IDs that are obsolete entries.
        xray (list of Unicode): A list of PDB IDs that are xray entries.

    Returns:
        A new, filtered DataFrame.

    Raises:
        KeyError: If the PDB_BEG or PDB_END column is missing from df.

    """
    # drop() without inplace=True returns a new frame, so neither the
    # column removal nor the upper-casing below leaks into the caller's df.
    df = df.drop(['PDB_BEG', 'PDB_END'], axis=1)
    df['PDB'] = df['PDB'].str.upper()

    df = df[df.PDB.isin(xray)]
    # `~` is the boolean inversion operator for a mask; unary `-` is not.
    df = df[~df.PDB.isin(obs)]
    df = df[(df.RES_BEG > 0) & (df.SP_BEG > 0)]

    # Both interval checks use the same span, so compute it once.
    res_span = df.RES_END - df.RES_BEG
    sp_span = df.SP_END - df.SP_BEG
    df = df[(res_span == sp_span) & (res_span > 3)]

    # filter_single is defined elsewhere in the project.
    return filter_single(df)
Пример #2
0
def final_filtering(dirs):
    """Build and write the PDB composite TSV unless it already exists.

    When pdb_initial_composite_df.tsv is absent from dirs.tsv_data, this
    reads pdb_seq_uni_filtered.tsv from dirs.working, builds the composite
    with create_pdb_composite, passes it through filter_single and writes
    the result to pdb_initial_composite_df.tsv. When the file is already
    present, only a notice is printed and nothing is recomputed.

    Progress is reported on stdout with print().

    Args:
        dirs (ProjectFolders): A named tuple of directory paths. The
            tsv_data, working and uni_data fields are used.

    Returns:
        None
    """
    composite_path = os.path.join(
        dirs.tsv_data, 'pdb_initial_composite_df.tsv'
    )
    filtered_path = os.path.join(dirs.working, 'pdb_seq_uni_filtered.tsv')

    # Guard clause: reuse a previously written composite.
    if os.path.exists(composite_path):
        found_msg = "Found {}. Using local file:\n\t{}".format(
            basename(composite_path),
            composite_path
        )
        print(found_msg)
        print("")
        return None

    read_options = {
        'sep': '\t',
        'index_col': 0,
        'keep_default_na': False,
        'na_values': ['NULL', 'N/A'],
    }
    df = pd.read_csv(filtered_path, **read_options)
    ss_dis = fetch_ss_dis(dirs.working)

    print("Creating PDB composite.")
    df = create_pdb_composite(df, ss_dis, dirs.uni_data)
    print("\nPDB composite finished.")

    rows_before = len(df.index)
    print(
        "Removing UniProt entries with < 2 PDB "
        "chains. Starting with {0} rows".format(rows_before)
    )
    df = filter_single(df)
    rows_after = len(df.index)
    print("Entries removed. There are now {0} rows".format(rows_after))

    print("Writing final PDB chain DataFrame.")
    tsv_sep = create_delimiter('\t')
    df.to_csv(composite_path, sep=tsv_sep, encoding='utf-8')

    written_template = (
        "Finished writing {}:\n"
        "\t{}\n"
        "This is the final PDB_CHAIN DataFrame.\n"
        "Note that only pdb_chain_uniprot.tsv provides a "
        "unique key"
    )
    print(written_template.format(basename(composite_path), composite_path))
    print("")
    return None
Пример #3
0
def initial_filtering(dirs):
    """Create pdb_seq.tsv from pdb_chain_uniprot.tsv and supplementary files.

    If pdb_seq.tsv already exists in dirs.working it is reused and no
    processing takes place. Otherwise pdb_chain_uniprot.tsv is read from
    dirs.tsv_data, filtered, and written to pdb_seq.tsv.

    Supplementary file processing steps:
        1. Removes the PDB_BEG, PDB_END columns.
        2. Converts all PDB IDs to upper case.
        3. Removes any rows where the PDB ID isn't in the xray list.
        4. Removes any rows where the PDB ID is in the obs list.
        5. Removes any rows where the RES_BEG or SP_BEG are < 1.
        6. Removes any rows where the length of the intervals doesn't match.
        7. Removes any rows where the length of the interval is <= 3.
        8. Removes any rows for pdb_chains not in ss_dis.
        9. Removes uniIDs with < 2 pdb chains.
        10. Adds a column called 'PDB_SEQ' that has the section of the PDB
            chain corresponding to the interval in RES_BEG:RES_END.

    Progress is reported through the logger named 'root'.

    Args:
        dirs (ProjectFolders): A named tuple of directory paths. The
            working and tsv_data fields are used.

    Returns:
        bool: True if an existing pdb_seq.tsv was found and reused,
        False if the file was generated by this call.

    """
    # The flag is returned (rather than only logged) so that unit tests
    # can tell which branch ran without capturing logger output.
    output_path = os.path.join(dirs.working, 'pdb_seq.tsv')
    log = getLogger('root')

    # Guard clause: a local copy short-circuits the whole pipeline.
    if os.path.exists(output_path):
        log.info(
            "Found and using local {filename}: \n"
            "\t{filepath}".format(
                filename=basename(output_path),
                filepath=output_path
            )
        )
        log.info('COMPLETE: Initial filtering.')
        return True

    obs_path = os.path.join(dirs.working, 'obs.yaml')
    xray_path = os.path.join(dirs.working, 'xray.yaml')
    chain_path = os.path.join(dirs.tsv_data, 'pdb_chain_uniprot.tsv')
    row_report = "DataFrame now has {} rows."

    log.info('START: Initial filtering.')

    log.debug("START: Fetch ss_dis.tsv.")
    ss_dis = fetch_ss_dis(dirs.working)
    log.debug("COMPLETE: Fetch ss_dis.tsv.")

    log.debug("START: Read obs.yaml.")
    obs = read_yaml(obs_path)
    log.debug("COMPLETE: Read obs.yaml.")

    log.debug("START: Read xray.yaml.")
    xray = read_yaml(xray_path)
    log.debug("COMPLETE: Read xray.yaml.")

    log.debug("START: Create initial DataFrame.")
    read_options = {
        'sep': '\t',
        'header': 1,
        'encoding': 'utf-8',
        'keep_default_na': False,
        'na_values': ['NULL', 'N/A'],
    }
    df = pd.read_csv(chain_path, **read_options)
    log.debug("COMPLETE: Create initial DataFrame.")
    log.debug("Initial DataFrame has {} rows.".format(len(df.index)))

    xray_step = "Remove rows where the PDB ID is not in the xray list."
    log.debug("START: " + xray_step)
    df = filter_pdb_chain_uniprot(df, obs, xray)
    log.debug("COMPLETE: " + xray_step)
    log.debug(row_report.format(len(df.index)))

    ss_dis_step = "Remove entries not in ss_dis and add the PDB peptide."
    log.debug("START: " + ss_dis_step)
    df = add_pdbseq_to_df(df, ss_dis)
    log.debug("COMPLETE: " + ss_dis_step)
    log.debug(row_report.format(len(df.index)))

    single_step = "Remove UniProt IDs with < 2 pdb chains."
    log.debug("START: " + single_step)
    df = filter_single(df)
    log.debug("COMPLETE: " + single_step)
    log.debug(row_report.format(len(df.index)))

    log.debug("START: Writing DataFrame to TSV file.")
    tsv_sep = create_delimiter('\t')
    df.to_csv(output_path, sep=tsv_sep, encoding='utf-8')
    log.debug("COMPLETE: Writing DataFrame to TSV file.")
    log.info(
        "Wrote {} to:\n\t{}".format(basename(output_path), output_path)
    )
    log.info('COMPLETE: Initial filtering.')

    return False