def test_filter_evalue_thres(self):
    """
    Test that hits are filtered by the e-value threshold.

    ``read_infernal`` is called with ``evalue=1e-8``; the only hit expected
    in the returned dataframe is the one whose e-value is 1e-9.
    """
    filename = self.find_data(
        os.path.join("Results_Integron_Finder_{}".format(self.replicon_name),
                     "tmp_{}".format(self.replicon_id),
                     "{}_attc_table.res".format(self.replicon_id)))
    df = infernal.read_infernal(filename, self.replicon_id, self.length_cm, evalue=1e-8)

    columns = ["Accession_number", "cm_attC", "cm_debut", "cm_fin",
               "pos_beg", "pos_end", "sens", "evalue"]
    # Build the expected frame in one step: DataFrame.append was removed in
    # pandas 2.0, so appending row by row no longer works.
    expect = pd.DataFrame(
        [(self.replicon_id, "attC_4", 1, 47, 17825, 17884, "-", 1e-9)],
        columns=columns)
    # Align column dtypes with the ones declared on the test case.
    expect = expect.astype(self.dtype)
    pdt.assert_frame_equal(df, expect)
def test_no_total_cm_match_strandm(self):
    """
    Test that when the model did not completely match on the sequence,
    the start and end positions of hit are well recalculated.
    All hits are on strand -
    """
    filename = self.find_data(
        os.path.join("fictive_results",
                     "{}_attc_table-partialm.res".format(self.replicon_id)))
    df = infernal.read_infernal(filename, self.replicon_id, self.length_cm)

    columns = ["Accession_number", "cm_attC", "cm_debut", "cm_fin",
               "pos_beg", "pos_end", "sens", "evalue"]
    # One tuple per expected hit, in the same order as `columns`.
    # The frame is built directly because DataFrame.append was removed in
    # pandas 2.0.
    expect = pd.DataFrame(
        [
            (self.replicon_id, "attC_4", 1, 40, 17818, 17884, "-", 1e-9),
            (self.replicon_id, "attC_4", 1, 47, 19080, 19149, "-", 1e-4),
            (self.replicon_id, "attC_4", 10, 47, 19618, 19735, "-", 1.1e-7),
        ],
        columns=columns)
    # convert positions to int
    intcols = ["cm_debut", "cm_fin", "pos_beg", "pos_end"]
    expect[intcols] = expect[intcols].astype(int)
    pdt.assert_frame_equal(df, expect)
def test_generate_df(self):
    """
    Test that if the infernal file exists and there are hits, it returns the
    dataframe corresponding to it.
    """
    filename = self.find_data(
        os.path.join("Results_Integron_Finder_{}".format(self.replicon_name),
                     "tmp_{}".format(self.replicon_id),
                     "{}_attc_table.res".format(self.replicon_id)))
    df = infernal.read_infernal(filename, self.replicon_id, self.length_cm)

    columns = ["Accession_number", "cm_attC", "cm_debut", "cm_fin",
               "pos_beg", "pos_end", "sens", "evalue"]
    # One tuple per expected hit, in the same order as `columns`.
    # The frame is built directly because DataFrame.append was removed in
    # pandas 2.0.
    expect = pd.DataFrame(
        [
            (self.replicon_id, "attC_4", 1, 47, 17825, 17884, "-", 1e-9),
            (self.replicon_id, "attC_4", 1, 47, 19080, 19149, "-", 1e-4),
            (self.replicon_id, "attC_4", 1, 47, 19618, 19726, "-", 1.1e-7),
        ],
        columns=columns)
    # convert positions to int
    intcols = ["cm_debut", "cm_fin", "pos_beg", "pos_end"]
    expect[intcols] = expect[intcols].astype(int)
    pdt.assert_frame_equal(df, expect)
def test_nohit(self):
    """
    An existing infernal file that contains no hit must yield an empty
    dataframe carrying only the expected column names.
    """
    empty_table = os.path.join("fictive_results",
                               "{}_attc_table-empty.res".format(self.replicon_id))
    filename = self.find_data(empty_table)
    df = infernal.read_infernal(filename, self.replicon_id, self.length_cm)

    # Expected result: no row, only the column layout.
    expected_columns = ["Accession_number", "cm_attC", "cm_debut", "cm_fin",
                        "pos_beg", "pos_end", "sens", "evalue"]
    expect = pd.DataFrame(columns=expected_columns)
    pdt.assert_frame_equal(df, expect)
def test_nofile(self):
    """
    When the given infernal file does not exist, the function must return
    an empty dataframe.
    """
    # Bare file name that is not resolved through find_data.
    missing_file = "infernal.txt"
    df = infernal.read_infernal(missing_file, self.replicon_id, self.length_cm)

    expected_columns = ["Accession_number", "cm_attC", "cm_debut", "cm_fin",
                        "pos_beg", "pos_end", "sens", "evalue"]
    expect = pd.DataFrame(columns=expected_columns)
    pdt.assert_frame_equal(df, expect)
def test_evalue_thres(self):
    """
    The infernal file exists and contains hits, but the e-value threshold
    given (1e-10) is smaller than the e-value of every hit: nothing is kept
    and an empty dataframe is expected.
    """
    results_dir = "Results_Integron_Finder_{}".format(self.replicon_name)
    tmp_dir = "tmp_{}".format(self.replicon_id)
    table_name = "{}_attc_table.res".format(self.replicon_id)
    filename = self.find_data(os.path.join(results_dir, tmp_dir, table_name))

    df = infernal.read_infernal(filename, self.replicon_id, self.length_cm,
                                evalue=1e-10)

    expected_columns = ["Accession_number", "cm_attC", "cm_debut", "cm_fin",
                        "pos_beg", "pos_end", "sens", "evalue"]
    expect = pd.DataFrame(columns=expected_columns)
    pdt.assert_frame_equal(df, expect)
def test_search_attc_empty(self):
    """
    With no attC site detected, the list of attC arrays must be empty.
    """
    empty_table = os.path.join("fictive_results",
                               self.replicon_id + "_attc_table-empty.res")
    attc_file = self.find_data(empty_table)
    # Build the attC dataframe from the (empty) infernal output.
    attc_df = infernal.read_infernal(attc_file, self.replicon_id, self.length_cm)
    attc_array = attc.search_attc(attc_df, True, self.dist_threshold,
                                  self.replicon_size)
    # Check both the size and the value itself.
    self.assertEqual(len(attc_array), 0)
    self.assertEqual(attc_array, [])
def test_filter_evalue_thres(self):
    """
    Test that hits are filtered by the e-value threshold.

    ``read_infernal`` is called with ``evalue=1e-8``; the only hit expected
    in the returned dataframe is the one whose e-value is 1e-9.
    """
    filename = self.find_data(
        os.path.join("Results_Integron_Finder_{}".format(self.replicon_name),
                     "tmp_{}".format(self.replicon_id),
                     "{}_attc_table.res".format(self.replicon_id)))
    df = infernal.read_infernal(filename, self.replicon_id, self.length_cm, evalue=1e-8)

    columns = ["Accession_number", "cm_attC", "cm_debut", "cm_fin",
               "pos_beg", "pos_end", "sens", "evalue"]
    # Build the expected frame in one step: DataFrame.append was removed in
    # pandas 2.0, so appending row by row no longer works.
    expect = pd.DataFrame(
        [(self.replicon_id, "attC_4", 1, 47, 17825, 17884, "-", 1e-9)],
        columns=columns)
    # convert positions to int
    intcols = ["cm_debut", "cm_fin", "pos_beg", "pos_end"]
    expect[intcols] = expect[intcols].astype(int)
    pdt.assert_frame_equal(df, expect)
def test_attcsize_minthres(self):
    """
    Test that the filter by a minimum attc size works.

    ``read_infernal`` is called with ``size_min_attc=60``; two hits are
    expected in the returned dataframe.
    """
    filename = self.find_data(
        os.path.join("Results_Integron_Finder_{}".format(self.replicon_name),
                     "tmp_{}".format(self.replicon_id),
                     "{}_attc_table.res".format(self.replicon_id)))
    df = infernal.read_infernal(filename, self.replicon_id, self.length_cm,
                                size_min_attc=60)

    columns = ["Accession_number", "cm_attC", "cm_debut", "cm_fin",
               "pos_beg", "pos_end", "sens", "evalue"]
    # One tuple per expected hit, in the same order as `columns`.
    # The frame is built directly because DataFrame.append was removed in
    # pandas 2.0.
    expect = pd.DataFrame(
        [
            (self.replicon_id, "attC_4", 1, 47, 19080, 19149, "-", 1e-4),
            (self.replicon_id, "attC_4", 1, 47, 19618, 19726, "-", 1.1e-7),
        ],
        columns=columns)
    # convert positions to int
    intcols = ["cm_debut", "cm_fin", "pos_beg", "pos_end"]
    expect[intcols] = expect[intcols].astype(int)
    pdt.assert_frame_equal(df, expect)
def test_search_attc_uniq(self):
    """
    Test that it finds a unique attc array when giving a table with 3 attC
    sites on the same strand and separated by less than 4kb each.
    """
    attc_file = self.find_data(
        os.path.join("Results_Integron_Finder_{}".format(self.replicon_name),
                     "tmp_{}".format(self.replicon_id),
                     "{}_attc_table.res".format(self.replicon_id)))
    # Construct attC dataframe (read from infernal file)
    attc_df = infernal.read_infernal(attc_file, self.replicon_id, self.length_cm)
    # search attC arrays, keeping palindromes
    # 2 attc sites are in the same array if they are on the same strand,
    # and separated by a distance less than 4kb
    attc_array = attc.search_attc(attc_df, True, self.dist_threshold,
                                  self.replicon_size)
    self.assertEqual(len(attc_array), 1)

    # Construct expected output. The frame is built directly from its rows
    # because DataFrame.append was removed in pandas 2.0.
    columns = ["Accession_number", "cm_attC", "cm_debut", "cm_fin",
               "pos_beg", "pos_end", "sens", "evalue"]
    attc_res = pd.DataFrame(
        [
            (self.replicon_id, "attC_4", 1, 47, 17825, 17884, "-", 1e-9),
            (self.replicon_id, "attC_4", 1, 47, 19080, 19149, "-", 1e-4),
            (self.replicon_id, "attC_4", 1, 47, 19618, 19726, "-", 1.1e-7),
        ],
        columns=columns)
    # convert positions to int
    intcols = ["cm_debut", "cm_fin", "pos_beg", "pos_end"]
    attc_res[intcols] = attc_res[intcols].astype(int)
    pdt.assert_frame_equal(attc_res, attc_array[0])
def test_search_attc_dist_diff_strand(self):
    """
    Test that it finds a size 3 attc array when giving a table with:
    - 3 attC sites on the same strand (-) and separated by less than 4 kb
    - 2 other attC sites separated by less than 4kb but on the other strand (+)
    - 1 other attC site , also on strand +, but separated by more than 4kb.

    Three arrays are expected in total.
    """
    attc_file = self.find_data(
        os.path.join("Results_Integron_Finder_{}".format(self.replicon_name),
                     "tmp_{}".format(self.replicon_id),
                     "{}_attc_table.res".format(self.replicon_id)))
    columns = ["Accession_number", "cm_attC", "cm_debut", "cm_fin",
               "pos_beg", "pos_end", "sens", "evalue"]
    intcols = ["cm_debut", "cm_fin", "pos_beg", "pos_end"]

    # Construct attC dataframe (read from infernal file)
    attc_df = infernal.read_infernal(attc_file, self.replicon_id, self.length_cm)
    # Add three attC sites on strand +. pd.concat replaces the row-by-row
    # DataFrame.append calls, which were removed in pandas 2.0.
    extra_sites = pd.DataFrame(
        [
            (self.replicon_id, "attC_4", 1, 47, 15800, 16000, "+", 1e-3),
            (self.replicon_id, "attC_4", 1, 47, 12000, 12500, "+", 1e-3),
            (self.replicon_id, "attC_4", 1, 47, 7100, 8200, "+", 1e-3),
        ],
        columns=columns)
    attc_df = pd.concat([attc_df, extra_sites], ignore_index=True)
    attc_df.sort_values(["Accession_number", "pos_beg", "evalue"], inplace=True)

    # search attC arrays, keeping palindromes
    # 2 attc sites are in the same array if they are on the same strand,
    # and separated by a distance less than 4kb
    attc_array = attc.search_attc(attc_df, True, self.dist_threshold,
                                  self.replicon_size)
    self.assertEqual(len(attc_array), 3)

    # Construct expected outputs:
    # the three sites on strand -
    attc_res = pd.DataFrame(
        [
            (self.replicon_id, "attC_4", 1, 47, 17825, 17884, "-", 1e-9),
            (self.replicon_id, "attC_4", 1, 47, 19080, 19149, "-", 1e-4),
            (self.replicon_id, "attC_4", 1, 47, 19618, 19726, "-", 1.1e-7),
        ],
        columns=columns)
    # the two sites on strand + expected in the same array
    attc_res2 = pd.DataFrame(
        [
            (self.replicon_id, "attC_4", 1, 47, 12000, 12500, "+", 1e-03),
            (self.replicon_id, "attC_4", 1, 47, 15800, 16000, "+", 1e-03),
        ],
        columns=columns)
    # the site on strand + expected alone in its array
    attc_res3 = pd.DataFrame(
        [(self.replicon_id, "attC_4", 1, 47, 7100, 8200, "+", 1e-03)],
        columns=columns)

    # convert positions to int
    attc_res[intcols] = attc_res[intcols].astype(int)
    attc_res2[intcols] = attc_res2[intcols].astype(int)
    attc_res3[intcols] = attc_res3[intcols].astype(int)

    pdt.assert_frame_equal(attc_res, attc_array[2])
    pdt.assert_frame_equal(attc_res2, attc_array[1])
    pdt.assert_frame_equal(attc_res3, attc_array[0])
def test_find_integron_attC_is_df(self):
    """
    Run find_integron with the attC hits given as the value returned by
    read_infernal (instead of a file path) and check the logged summary and
    the content of the single integron found.
    """
    replicon_name = 'acba.007.p01.13'
    replicon_id = 'ACBA.007.P01_13'
    replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    prot_file = self.find_data(os.path.join('Proteins', replicon_id + '.prt'))

    topologies = Topology('lin')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    # All pre-computed result files live in the same temporary directory.
    tmp_dir = os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                           'tmp_{}'.format(replicon.id))
    attc_path = self.find_data(
        os.path.join(tmp_dir, '{}_attc_table.res'.format(replicon.id)))
    intI_file = self.find_data(
        os.path.join(tmp_dir, '{}_intI.res'.format(replicon.id)))
    phageI_file = self.find_data(
        os.path.join(tmp_dir, '{}_phage_int.res'.format(replicon.id)))

    args = argparse.Namespace(
        no_proteins=True,
        keep_palindromes=True,
        attc_model='attc_4.cm',
        evalue_attc=1.0,
        max_attc_size=200,
        min_attc_size=40,
        distance_threshold=4000,
        calin_threshold=2,
        local_max=False,
    )
    cfg = Config(args)
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

    len_model_attc = 47  # length in 'CLEN' (value for model attc_4.cm)
    attc_hits = read_infernal(attc_path, replicon_name, len_model_attc,
                              evalue=cfg.evalue_attc,
                              size_max_attc=cfg.max_attc_size,
                              size_min_attc=cfg.min_attc_size)
    prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

    # NOTE(review): the expected message is reproduced exactly as it appears
    # in the reviewed source (on a single line); confirm whether the logged
    # message contains line breaks.
    exp_msg = """In replicon {}, there are: - 0 complete integron(s) found with a total 0 attC site(s) - 1 CALIN element(s) found with a total of 3 attC site(s) - 0 In0 element(s) found with a total of 0 attC site""".format(replicon.id)
    with self.catch_log() as log:
        integrons = find_integron(replicon, prot_db, attc_hits, intI_file,
                                  phageI_file, cfg)
        catch_msg = log.get_value().strip()
    self.assertEqual(catch_msg, exp_msg)

    self.assertEqual(len(integrons), 1)
    integron = integrons[0]
    self.assertEqual(integron.replicon.name, replicon_id)

    # Expected attC table of the integron.
    attc_content = {
        'annotation': ['attC'] * 3,
        'distance_2attC': [np.nan, 1196.0, 469.0],
        'evalue': [1.000000e-09, 1.000000e-04, 1.100000e-07],
        'model': ['attc_4'] * 3,
        'pos_beg': [17825, 19080, 19618],
        'pos_end': [17884, 19149, 19726],
        'strand': [-1, -1, -1],
        'type_elt': 'attC',
    }
    exp = pd.DataFrame(attc_content,
                       columns=self.columns,
                       index=['attc_001', 'attc_002', 'attc_003'])
    pdt.assert_frame_equal(integron.attC, exp)

    # Every other table of the integron is expected to be empty.
    exp = pd.DataFrame(columns=self.columns)
    exp = exp.astype(dtype=self.dtype)
    pdt.assert_frame_equal(integron.integrase, exp)
    pdt.assert_frame_equal(integron.promoter, exp)
    pdt.assert_frame_equal(integron.attI, exp)
    pdt.assert_frame_equal(integron.proteins, exp)
def test_find_integron_attC_is_df(self):
    """
    Check find_integron when the attC information is passed as the object
    returned by read_infernal rather than as a path: the logged summary and
    the tables of the single integron found are verified.
    """
    replicon_name = 'acba.007.p01.13'
    replicon_id = 'ACBA.007.P01_13'
    replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    prot_file = self.find_data(os.path.join('Proteins', replicon_id + '.prt'))

    topologies = Topology('lin')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    # Directory holding the pre-computed result files for this replicon.
    results_dir = 'Results_Integron_Finder_{}'.format(replicon_name)
    tmp_dir = 'tmp_{}'.format(replicon.id)
    attc_path = self.find_data(
        os.path.join(results_dir, tmp_dir, '{}_attc_table.res'.format(replicon.id)))
    intI_file = self.find_data(
        os.path.join(results_dir, tmp_dir, '{}_intI.res'.format(replicon.id)))
    phageI_file = self.find_data(
        os.path.join(results_dir, tmp_dir, '{}_phage_int.res'.format(replicon.id)))

    options = {
        'no_proteins': True,
        'keep_palindromes': True,
        'attc_model': 'attc_4.cm',
        'evalue_attc': 1.0,
        'max_attc_size': 200,
        'min_attc_size': 40,
        'distance_threshold': 4000,
        'calin_threshold': 2,
        'local_max': False,
    }
    args = argparse.Namespace(**options)
    cfg = Config(args)
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

    len_model_attc = 47  # length in 'CLEN' (value for model attc_4.cm)
    attc_hits = read_infernal(attc_path, replicon_name, len_model_attc,
                              evalue=cfg.evalue_attc,
                              size_max_attc=cfg.max_attc_size,
                              size_min_attc=cfg.min_attc_size)
    prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

    # NOTE(review): the expected message is reproduced exactly as it appears
    # in the reviewed source (on a single line); confirm whether the logged
    # message contains line breaks.
    exp_msg = """In replicon {}, there are: - 0 complete integron(s) found with a total 0 attC site(s) - 1 CALIN element(s) found with a total of 3 attC site(s) - 0 In0 element(s) found with a total of 0 attC site""".format(replicon.id)
    with self.catch_log() as log:
        integrons = find_integron(replicon, prot_db, attc_hits, intI_file,
                                  phageI_file, cfg)
        catch_msg = log.get_value().strip()
    self.assertEqual(catch_msg, exp_msg)

    self.assertEqual(len(integrons), 1)
    integron = integrons[0]
    self.assertEqual(integron.replicon.name, replicon_id)

    # Expected attC table of the integron.
    site_ids = ['attc_001', 'attc_002', 'attc_003']
    exp = pd.DataFrame(
        {
            'annotation': ['attC'] * 3,
            'distance_2attC': [np.nan, 1196.0, 469.0],
            'evalue': [1.000000e-09, 1.000000e-04, 1.100000e-07],
            'model': ['attc_4'] * 3,
            'pos_beg': [17825, 19080, 19618],
            'pos_end': [17884, 19149, 19726],
            'strand': [-1, -1, -1],
            'type_elt': 'attC',
        },
        columns=self.columns,
        index=site_ids)
    pdt.assert_frame_equal(integron.attC, exp)

    # The remaining tables of the integron are expected to be empty.
    exp = pd.DataFrame(columns=self.columns)
    exp = exp.astype(dtype=self.dtype)
    pdt.assert_frame_equal(integron.integrase, exp)
    pdt.assert_frame_equal(integron.promoter, exp)
    pdt.assert_frame_equal(integron.attI, exp)
    pdt.assert_frame_equal(integron.proteins, exp)