# Example no. 1 (score: 0)
    def test_generate_json(self):
        """fetch_ss_dis should write a timestamped ss_dis JSON file."""
        stamp = now_utc()
        # Expected name is ss_dis.<timestamp>.json inside the temp dir.
        target_name = "{}.{}.{}".format('ss_dis', stamp, 'json')
        target_path = os.path.join(self.temp_dir, target_name)

        self.write_fasta(stamp)
        # The file must be absent before the fetch and present afterwards.
        self.assertFalse(os.path.isfile(target_path))
        ss.fetch_ss_dis(self.temp_dir)
        self.assertTrue(os.path.isfile(target_path))
        return None
# Example no. 2 (score: 0)
 def test_generate_json_and_return_dic(self):
     """fetch_ss_dis should return the parsed ss_dis data as a dict.

     The expected fixture maps '<PDB ID>_<chain>' keys to records with
     'disorder', 'secstr', and 'sequence' fields.
     """
     # TODO: Value of expected updated to include '\r' after from __future__ import unicode_literals and from io import os. Validate this data.
     expected = {
         u'1DBO_A': {
             u'disorder': u'',
             u'secstr': u'',
             u'sequence': u'MKMLNKLAGYLLPIMVLLNVAPCLGQVVASNETLYQVVKEVKPGGLVQIADGTYKDVQLIVSNSGKSGLPITIKALNPGKVFFTGDAKVELRGEHLILEGIWFKDGNRAIQAWKSHGPGLVAIYGSYNRITACVFDCFDEANSAYITTSLTEDGKVPQHCRIDHCSFTDKITFDQVINLNNTARAIKDGSVGGPGMYHRVDHCFFSNPQKPGNAGGGIRIGYYRNDIGRCLVDSNLFMRQDSEAEIITSKSQENVYYGNTYLNCQGTMNFRHGDHQVAINNFYIGNDQRFGYGGMFVWGSRHVIACNYFELSETIKSRGNAALYLNPGAMASEHALAFDMLIANNAFINVNGYAIHFNPLDERRKEYCAANRLKFETPHQLMLKGNLFFKDKPYVYPFFKDDYFIAGKNSWTGNVALGVEKGIPVNISANRSAYKPVKIKDIQPIEGIALDLNALISKGITGKPLSWDEVRPYWLKEMPGTYALTARLSADRAAKFKAVIKRNKEH'
         },
         u'5C1Z_B': {
             u'disorder': u'',
             u'secstr': u' EEEEESSSSS EEEE  TT BHHHHHHHHHHHHTS GGGEEEEETTEEE TT BHHHHT  TT EEEEEE    TT     EEEETTTT EEEEEEEEEEETTT SS EEESS   SHHHHHSTT SBEEE STT    BEEEEEEESSS   TT   EE TTEE  TT  B TTT  B SSEEE  STT BEEEHHHHHHHHHHHHHTT  EEETTTEEE   TT  TT   S GGGGGG',
             u'sequence': u'MIVFVRFNSSHGFPVEVDSDTSIFQLKEVVAKRQGVPADQLRVIFAGKELRNDWTVQNCDLDQQSIVHIVQRPWRKGQEMNATNSFYVYCKGPCQRVQPGKLRVQCSTCRQATLTLTQGPSCWDDVLIPNRMSGECQSPHCPGTSAEFFFKCGAHPTSDKETSVALHLIATNSRNITCITCTDVRSPVLVFQCNSRHVICLDCFHLYCVTRLNDRQFVHDPQLGYSLPCVAGCPNSLIKELHHFRILGEEQYNRYQQYGAEECVLQMGGVLCPRPGCGAGLLPEPDQRKVTCEGGNGLGCGFAFCRECKEAYHEGECSAVFEASGTTTQAYRVDERAAEQARWEAASKETIKKTTKPCPRCHVPVEKNGGCMHMKCPQPQCRLEWCWNCGCEWNRVCMGDHWFDV'
         }
     }
     time_stamp = now_utc()
     self.write_fasta(time_stamp)
     result = ss.fetch_ss_dis(self.temp_dir)
     # The returned mapping must match the fixture data exactly.
     self.assertEqual(expected, result)
     return None
# Example no. 3 (score: 0)
def final_filtering(dirs):
    """Create PDB composite.

    Builds pdb_initial_composite_df.tsv from the UniProt-filtered
    sequence table, unless the composite already exists on disk.

    Args:
        dirs (ProjectFolders): A named tuple of directory paths.

    Returns:
        None
    """
    composite_path = os.path.join(
        dirs.tsv_data,
        'pdb_initial_composite_df.tsv'
    )
    filtered_path = os.path.join(
        dirs.working,
        'pdb_seq_uni_filtered.tsv'
    )

    if os.path.exists(composite_path):
        # A previously generated composite exists; reuse it.
        print(
            "Found {}. Using local file:\n"
            "\t{}".format(
                basename(composite_path),
                composite_path
            )
        )
    else:
        frame = pd.read_csv(
            filtered_path,
            sep='\t',
            index_col=0,
            keep_default_na=False,
            na_values=['NULL', 'N/A']
        )
        ss_dis = fetch_ss_dis(dirs.working)
        print("Creating PDB composite.")
        frame = create_pdb_composite(frame, ss_dis, dirs.uni_data)
        print("\nPDB composite finished.")

        print(
            "Removing UniProt entries with < 2 PDB "
            "chains. Starting with {0} rows".format(len(frame.index))
        )
        frame = filter_single(frame)
        print(
            "Entries removed. There are now {0} rows".format(len(frame.index))
        )

        print("Writing final PDB chain DataFrame.")
        frame.to_csv(
            composite_path,
            sep=create_delimiter('\t'),
            encoding='utf-8'
        )
        print(
            "Finished writing {}:\n"
            "\t{}\n"
            "This is the final PDB_CHAIN DataFrame.\n"
            "Note that only pdb_chain_uniprot.tsv provides a "
            "unique key".format(
                basename(composite_path),
                composite_path,
            )
        )
    print("")
    return None
# Example no. 4 (score: 0)
def initial_filtering(dirs):
    """Creates a dataframe from pdb_chain_uniprot.tsv.

    Perform initial filtering with pdb_chain_uniprot.tsv
    and supplementary files.

    Supplementary file processing steps:
        1. Removes the PDB_BEG, PDB_END columns.
        2. Converts all PDB IDs to upper case.
        3. Removes any rows where the PDB ID isn't in the xray list.
        4. Removes any rows where the PDB ID is in the obs list.
        5. Removes any rows where the RES_BEG or SP_BEG are < 1.
        6. Removes any rows where the length of the intervals doesn't match.
        7. Removes any rows where the length of the interval is <= 3.
        8. Removes any rows for pdb_chains not in ss_dis.
        9. Removes uniIDs with < 2 pdb chains.
        10. Adds a column called 'PDB_SEQ' that has the section of the PDB
            chain corresponding to the interval in RES_BEG:RES_END.

    Args:
        dirs (ProjectFolders): A named tuple of directory paths.

    Returns:
        bool: True when a previously written pdb_seq.tsv was found and
            reused; False when the DataFrame was built from scratch.

    """
    # Return used_local for unittest because of problems capturing stdout
    # with logging instance.
    used_local = False
    pdb_seq_fp = os.path.join(dirs.working, 'pdb_seq.tsv')
    msg = getLogger('root')

    if not os.path.exists(pdb_seq_fp):
        obs_fp = os.path.join(dirs.working, 'obs.yaml')
        xray_fp = os.path.join(dirs.working, 'xray.yaml')
        chain_fp = os.path.join(dirs.tsv_data, 'pdb_chain_uniprot.tsv')

        msg.info('START: Initial filtering.')

        msg.debug("START: Fetch ss_dis.tsv.")
        ss_dis = fetch_ss_dis(dirs.working)
        msg.debug("COMPLETE: Fetch ss_dis.tsv.")

        msg.debug("START: Read obs.yaml.")
        obs = read_yaml(obs_fp)
        msg.debug("COMPLETE: Read obs.yaml.")

        msg.debug("START: Read xray.yaml.")
        xray = read_yaml(xray_fp)
        msg.debug("COMPLETE: Read xray.yaml.")

        msg.debug("START: Create initial DataFrame.")
        # header=1 skips the SIFTS citation line at the top of the file.
        df = pd.read_csv(
            chain_fp,
            sep='\t',
            header=1,
            encoding='utf-8',
            keep_default_na=False,
            na_values=['NULL', 'N/A'])
        msg.debug("COMPLETE: Create initial DataFrame.")
        msg.debug("Initial DataFrame has {} rows.".format(len(df.index)))

        msg.debug("START: Remove rows where "
                  "the PDB ID is not in the xray list.")
        df = filter_pdb_chain_uniprot(df, obs, xray)
        msg.debug("COMPLETE: Remove rows where "
                  "the PDB ID is not in the xray list.")
        msg.debug("DataFrame now has {} rows.".format(len(df.index)))

        msg.debug("START: Remove entries not in ss_dis "
                  "and add the PDB peptide.")
        df = add_pdbseq_to_df(df, ss_dis)
        msg.debug("COMPLETE: Remove entries not in ss_dis "
                  "and add the PDB peptide.")
        msg.debug("DataFrame now has {} rows.".format(len(df.index)))

        msg.debug("START: Remove UniProt IDs with < 2 pdb chains.")
        df = filter_single(df)
        msg.debug("COMPLETE: Remove UniProt IDs with < 2 pdb chains.")
        msg.debug("DataFrame now has {} rows.".format(len(df.index)))

        msg.debug("START: Writing DataFrame to TSV file.")
        delimiter = create_delimiter('\t')
        df.to_csv(pdb_seq_fp, sep=delimiter, encoding='utf-8')
        msg.debug("COMPLETE: Writing DataFrame to TSV file.")
        msg.info(
            "Wrote {} to:\n\t{}".format(basename(pdb_seq_fp), pdb_seq_fp)
        )
        msg.info('COMPLETE: Initial filtering.')

    else:
        used_local = True
        # BUG FIX: the format string previously read "local (unknown)" and
        # had no {filename} placeholder, so the passed filename= keyword was
        # silently ignored and the log never showed the file name.
        msg.info(
            "Found and using local {filename}:\n"
            "\t{filepath}".format(
                filename=basename(pdb_seq_fp),
                filepath=pdb_seq_fp
            )
        )
        msg.info('COMPLETE: Initial filtering.')

    return used_local