def test_read_hmm_evalue(self):
    """
    Check that read_hmm filters the hmm hits on their evalue.

    The same result file is read twice: with ``evalue=1.95e-25`` the hit
    with evalue 1.9e-25 is expected back, whereas with ``evalue=1.9e-25``
    an empty dataframe (with typed columns) is expected.
    """
    rep_name = "acba.007.p01.13"
    replicon_id = 'ACBA.007.P01_13'
    # Column order expected in the dataframes compared below.
    columns = ["Accession_number", "query_name", "ID_query", "ID_prot",
               "strand", "pos_beg", "pos_end", "evalue"]

    replicon_path = self.find_data(
        os.path.join('Replicons', rep_name + '.fst'))
    prot_file = self.find_data(
        os.path.join('Proteins', replicon_id + '.prt'))

    args = argparse.Namespace(gembase=False, replicon=replicon_path)
    cfg = Config(args)

    seq_iter = read_multi_prot_fasta(replicon_path)
    first_replicon = next(seq_iter)
    proteins = ProdigalDB(first_replicon, cfg, prot_file=prot_file)

    result_dir = "Results_Integron_Finder_{}".format(rep_name)
    tmp_dir = "tmp_{}".format(replicon_id)
    res_name = "{}_intI.res".format(replicon_id)
    hmm_result = self.find_data(os.path.join(result_dir, tmp_dir, res_name))

    # Threshold just above the hit evalue: the hit is expected.
    kept = read_hmm(rep_name, proteins, hmm_result, cfg, evalue=1.95e-25)
    expected_kept = pd.DataFrame(
        data={"Accession_number": rep_name,
              "query_name": "intI_Cterm",
              "ID_query": "-",
              "ID_prot": "ACBA.007.P01_13_1",
              "strand": 1,
              "pos_beg": 55,
              "pos_end": 1014,
              "evalue": 1.9e-25},
        index=[0])
    expected_kept = expected_kept[columns]
    pdt.assert_frame_equal(kept, expected_kept)

    # Threshold equal to the hit evalue: nothing is expected.
    rejected = read_hmm(replicon_id, proteins, hmm_result, cfg,
                        evalue=1.9e-25)
    expected_empty = pd.DataFrame(columns=columns)
    int_columns = ["pos_beg", "pos_end", "strand"]
    float_columns = ["evalue"]
    expected_empty[int_columns] = expected_empty[int_columns].astype(int)
    expected_empty[float_columns] = expected_empty[float_columns].astype(float)
    pdt.assert_frame_equal(rejected, expected_empty)
def test_read_multi(self):
    """
    Check read_hmm on a result file containing multiple hits.

    - 2 hits on the same protein: only the one with the best evalue is kept.
    - 2 hits on 2 different proteins: both proteins are kept.
    """
    replicon_id = 'ACBA.0917.00019'
    contig_id = 'ACBA.0917.00019.0001'
    # Column order expected in the dataframe compared below.
    columns = ["Accession_number", "query_name", "ID_query", "ID_prot",
               "strand", "pos_beg", "pos_end", "evalue"]

    result_dir_expected = self.find_data(
        "Results_Integron_Finder_{}.gembase".format(replicon_id))
    replicon_path = self.find_data(
        os.path.join('Gembase', 'Replicons', replicon_id + '.fna'))
    tmp_dir = "tmp_{}".format(contig_id)
    prot_file = os.path.join(result_dir_expected, tmp_dir, contig_id + '.prt')

    args = argparse.Namespace(gembase=True, replicon=replicon_path)
    cfg = Config(args)

    seq_iter = read_multi_prot_fasta(replicon_path)
    first_replicon = next(seq_iter)
    proteins = GembaseDB(first_replicon, cfg, prot_file=prot_file)

    res_name = "{}_intI_multi.res".format(contig_id)
    hmm_result = self.find_data(os.path.join('fictive_results', res_name))

    hits = read_hmm(contig_id, proteins, hmm_result, cfg)

    expected = pd.DataFrame(
        data={"Accession_number": [contig_id, contig_id],
              "query_name": ["Phage_integrase", "Phage_integrase"],
              "ID_query": ["PF00589.16", "PF00589.16"],
              "ID_prot": ["ACBA.0917.00019.i0001_00298",
                          "ACBA.0917.00019.i0001_00338"],
              "strand": [-1, -1],
              "pos_beg": [311597, 350328],
              "pos_end": [312631, 351248],
              "evalue": [5.5e-66, 3.4e-51]},
        index=[0, 1])
    expected = expected[columns]
    pdt.assert_frame_equal(hits, expected)
def test_read_hmm_gembase(self):
    """
    Check that the hmm hits are read correctly in gembase mode
    (the .prt file is provided, prodigal is not used to find the proteins).
    """
    replicon_id = 'ACBA.0917.00019'
    contig_id = 'ACBA.0917.00019.0001'
    # Column order expected in the dataframe compared below.
    columns = ["Accession_number", "query_name", "ID_query", "ID_prot",
               "strand", "pos_beg", "pos_end", "evalue"]

    result_dir_expected = self.find_data(
        "Results_Integron_Finder_{}.gembase".format(replicon_id))
    replicon_path = self.find_data(
        os.path.join('Gembase', 'Replicons', replicon_id + '.fna'))

    # Protein file and hmm result both live in the contig tmp directory.
    tmp_dir = "tmp_{}".format(contig_id)
    prot_file = os.path.join(result_dir_expected, tmp_dir, contig_id + '.prt')
    hmm_result = os.path.join(result_dir_expected, tmp_dir,
                              "{}_intI.res".format(contig_id))

    args = argparse.Namespace(gembase=True, replicon=replicon_path)
    cfg = Config(args)

    seq_iter = read_multi_prot_fasta(replicon_path)
    first_replicon = next(seq_iter)
    proteins = GembaseDB(first_replicon, cfg, prot_file=prot_file)

    hits = read_hmm(contig_id, proteins, hmm_result, cfg)

    expected = pd.DataFrame(
        data={"Accession_number": contig_id,
              "query_name": "intI_Cterm",
              "ID_query": "-",
              "ID_prot": "ACBA.0917.00019.i0001_00298",
              "strand": -1,
              "pos_beg": 311597,
              "pos_end": 312631,
              "evalue": 3.6e-25},
        index=[0])
    expected = expected[columns]
    pdt.assert_frame_equal(hits, expected)
def test_read_hmm(self):
    """
    Check that the hmm hits are read correctly with the default
    read_hmm thresholds.
    """
    rep_name = "acba.007.p01.13"
    replicon_id = 'ACBA.007.P01_13'
    # Column order expected in the dataframe compared below.
    columns = ["Accession_number", "query_name", "ID_query", "ID_prot",
               "strand", "pos_beg", "pos_end", "evalue"]

    replicon_path = self.find_data(
        os.path.join('Replicons', rep_name + '.fst'))
    prot_file = self.find_data(
        os.path.join('Proteins', replicon_id + '.prt'))

    args = argparse.Namespace(gembase=False, replicon=replicon_path)
    cfg = Config(args)

    seq_iter = read_multi_prot_fasta(replicon_path)
    first_replicon = next(seq_iter)
    proteins = ProdigalDB(first_replicon, cfg, prot_file=prot_file)

    result_dir = "Results_Integron_Finder_{}".format(rep_name)
    tmp_dir = "tmp_{}".format(replicon_id)
    res_name = "{}_intI.res".format(replicon_id)
    hmm_result = self.find_data(os.path.join(result_dir, tmp_dir, res_name))

    hits = read_hmm(rep_name, proteins, hmm_result, cfg)

    expected = pd.DataFrame(
        data={"Accession_number": rep_name,
              "query_name": "intI_Cterm",
              "ID_query": "-",
              "ID_prot": "ACBA.007.P01_13_1",
              "strand": 1,
              "pos_beg": 55,
              "pos_end": 1014,
              "evalue": 1.9e-25},
        index=[0])
    expected = expected[columns]
    pdt.assert_frame_equal(hits, expected)
def test_read_hmm_cov2(self):
    """
    Check that read_hmm only returns the hits whose coverage is above the
    given threshold (here ``coverage=0.7``).
    """
    rep_name = "acba.007.p01.13"
    replicon_id = 'ACBA.007.P01_13'
    # Column order expected in the dataframe compared below.
    columns = ["Accession_number", "query_name", "ID_query", "ID_prot",
               "strand", "pos_beg", "pos_end", "evalue"]

    replicon_path = self.find_data(
        os.path.join('Replicons', rep_name + '.fst'))
    prot_file = self.find_data(
        os.path.join('Proteins', replicon_id + '.prt'))

    args = argparse.Namespace(gembase=False, replicon=replicon_path)
    cfg = Config(args)

    seq_iter = read_multi_prot_fasta(replicon_path)
    first_replicon = next(seq_iter)
    proteins = ProdigalDB(first_replicon, cfg, prot_file=prot_file)

    res_name = "{}_intI.res".format(replicon_id)
    hmm_result = self.find_data(os.path.join("fictive_results", res_name))

    hits = read_hmm(rep_name, proteins, hmm_result, cfg, coverage=0.7)

    expected = pd.DataFrame(
        data={"Accession_number": [rep_name, rep_name],
              "query_name": ["intI_Cterm", "intI_Cterm"],
              "ID_query": ["-", "-"],
              "ID_prot": ["ACBA.007.P01_13_1", "ACBA.007.P01_13_2"],
              "strand": [1, -1],
              "pos_beg": [55, 905],
              "pos_end": [1014, 1609],
              "evalue": [1.9e-25, 1e-3]},
        index=[0, 1])
    expected = expected[columns]
    pdt.assert_frame_equal(hits, expected)
def test_read_empty(self):
    """
    Check that an hmm result file without any hit yields an empty
    dataframe (with typed columns) and raises no error.
    """
    rep_name = "acba.007.p01.13"
    replicon_id = 'ACBA.007.P01_13'

    replicon_path = self.find_data(
        os.path.join('Replicons', rep_name + '.fst'))
    prot_file = self.find_data(
        os.path.join('Proteins', replicon_id + '.prt'))

    args = argparse.Namespace(gembase=False, replicon=replicon_path)
    cfg = Config(args)

    seq_iter = read_multi_prot_fasta(replicon_path)
    first_replicon = next(seq_iter)
    proteins = ProdigalDB(first_replicon, cfg, prot_file=prot_file)

    res_name = "{}_intI-empty.res".format(replicon_id)
    hmm_result = self.find_data(os.path.join("fictive_results", res_name))

    hits = read_hmm(rep_name, proteins, hmm_result, cfg)

    # Empty frame whose numeric columns carry the int / float dtypes.
    columns = ["Accession_number", "query_name", "ID_query", "ID_prot",
               "strand", "pos_beg", "pos_end", "evalue"]
    expected = pd.DataFrame(columns=columns)
    int_columns = ["pos_beg", "pos_end", "strand"]
    float_columns = ["evalue"]
    expected[int_columns] = expected[int_columns].astype(int)
    expected[float_columns] = expected[float_columns].astype(float)
    pdt.assert_frame_equal(hits, expected)