def test_find_integrase_no_gembase_no_protfile(self):
    """
    find_integrase must raise EmptyFileError when it is given an empty
    protein file (``args.gembase`` is False, replicon length patched to 500000).
    """
    cfg = Config(self.args)
    self.args.gembase = False
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

    replicon_name = 'acba.007.p01.13'
    replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    topologies = Topology('lin')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    # The class-level __len__ is patched only once everything needed to
    # restore it is bound, so the ``finally`` clause can never fail on an
    # unbound name and hide a genuine setup error.
    replicon_cls = replicon.__class__
    len_ori = replicon_cls.__len__
    replicon_cls.__len__ = lambda x: 500000
    try:
        prot_file = os.path.join(self.tmp_dir, "foo.prt")
        # create an empty protein file
        with open(prot_file, 'w'):
            pass
        with self.catch_log():
            with self.assertRaises(EmptyFileError):
                integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)
    finally:
        replicon_cls.__len__ = len_ori
def test_find_integrase_gembase(self):
    """
    Run find_integrase with ``args.gembase`` set to True on a copy of the
    reference protein file and check that the four result files exist.
    """
    cfg = Config(self.args)
    self.args.gembase = True
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

    name = 'acba.007.p01.13'
    fasta_path = self.find_data(os.path.join('Replicons', name + '.fst'))
    topo = Topology('lin')
    with FastaIterator(fasta_path) as seq_iter:
        seq_iter.topologies = topo
        replicon = next(seq_iter)

    # work on a copy of the reference proteins located in the temporary directory
    prot_file = os.path.join(self.tmp_dir, name + ".prt")
    reference_prt = self.find_data(os.path.join('Proteins', replicon.id + ".prt"))
    shutil.copyfile(reference_prt, prot_file)

    integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)

    result_suffixes = ('_intI.res', '_intI_table.res', '_phage_int.res', '_phage_int_table.res')
    for res in (os.path.join(self.tmp_dir, replicon.id + sfx) for sfx in result_suffixes):
        self.assertTrue(os.path.exists(res))
def test_find_integrase_no_gembase_no_protfile_no_prodigal(self):
    """
    find_integrase must raise RuntimeError when ``args.hmmsearch`` is set to
    'foo'; the message must report the "No such file or directory" failure.
    """
    self.args.hmmsearch = 'foo'
    self.args.gembase = False
    cfg = Config(self.args)
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

    replicon_name = 'acba.007.p01.13'
    replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    topologies = Topology('lin')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    # Patch __len__ only when the original is safely stored, so that the
    # ``finally`` clause cannot raise on an unbound name and mask a setup error.
    replicon_cls = replicon.__class__
    len_ori = replicon_cls.__len__
    replicon_cls.__len__ = lambda x: 500000
    try:
        prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")
        shutil.copyfile(self.find_data(os.path.join('Proteins', replicon.id + ".prt")),
                        prot_file)
        with self.assertRaises(RuntimeError) as ctx:
            integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)
        # raw string: '\[' and '\]' are regex escapes, not string escapes
        self.assertRegex(str(ctx.exception),
                         r"failed : \[Errno 2\] No such file or directory: 'foo'")
    finally:
        replicon_cls.__len__ = len_ori
def test_getitem(self):
    """
    Check ``GembaseDB.__getitem__`` on two Gembase replicons.

    Every reference protein whose id matches the pattern built from the
    replicon id must be returned with the same id and sequence, and an
    unknown id must raise KeyError.
    """
    file_name = (('ACBA.0917.00019', '.fna'), ('ESCO001.C.00001.C001', '.fst'))
    for seq_name, ext in file_name:
        replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', seq_name + ext))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        seq_db = read_multi_prot_fasta(replicon_path)
        replicon = next(seq_db)
        os.makedirs(cfg.tmp_dir(replicon.id))
        with self.catch_log():
            db = GembaseDB(replicon, cfg)
        exp = read_multi_prot_fasta(
            self.find_data(os.path.join('Gembase', 'Proteins', seq_name + '.prt')))

        specie, date, strain, contig = replicon.id.split('.')
        # raw string: '\.' and '\w' are regex escapes, not string escapes
        pattern = r'{}\.{}\.{}\.\w?{}'.format(specie, date, strain, contig)

        for prot_expected in exp:
            if re.match(pattern, prot_expected.id):
                prot_received = db[prot_expected.id]
                self.assertEqual(prot_received.id, prot_expected.id)
                self.assertEqual(prot_received.seq, prot_expected.seq)

        with self.assertRaises(KeyError) as ctx:
            db['nimport_naoik']
        self.assertEqual(str(ctx.exception), "'nimport_naoik'")
def test_find_integrase_no_gembase_with_protfile_empty(self):
    """
    find_integrase must raise EmptyFileError, with an explicit message, when
    the protein file named after the replicon is empty
    (``args.gembase`` is False, replicon length patched to 200).
    """
    cfg = Config(self.args)
    self.args.gembase = False
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

    replicon_name = 'acba.007.p01.13'
    replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    topologies = Topology('lin')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    # Patch __len__ only when the original is safely stored, so that the
    # ``finally`` clause cannot raise on an unbound name and mask a setup error.
    replicon_cls = replicon.__class__
    len_ori = replicon_cls.__len__
    replicon_cls.__len__ = lambda x: 200
    try:
        prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")
        # create an empty protein file
        with open(prot_file, 'w'):
            pass
        with self.assertRaises(EmptyFileError) as ctx:
            with self.catch_log():
                integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)
        self.assertRegex(str(ctx.exception),
                         "^The protein file: '.*' is empty cannot perform hmmsearch on it.$")
    finally:
        replicon_cls.__len__ = len_ori
def test_get_description(self):
    """
    ``GembaseDB.get_description`` must return the expected SeqDesc for known
    protein ids, raise IntegronError for a malformed id and KeyError for a
    well-formed id that is not in the database.
    """
    # SeqDesc(id, strand, start, stop)
    expected_by_replicon = {
        ('ACBA.0917.00019', '.fna'): {
            'ACBA.0917.00019.b0001_00001': SeqDesc('ACBA.0917.00019.b0001_00001', -1, 266, 1480),
            'ACBA.0917.00019.i0001_03957': SeqDesc('ACBA.0917.00019.i0001_03957', -1, 4043755, 4044354),
        },
    }
    for (seq_name, ext), descriptions in expected_by_replicon.items():
        fasta_path = self.find_data(os.path.join('Gembase', 'Replicons', seq_name + ext))
        self.args.replicon = fasta_path
        cfg = Config(self.args)
        records = read_multi_prot_fasta(fasta_path)
        replicon = next(records)
        replicon.path = fasta_path
        os.makedirs(cfg.tmp_dir(replicon.id))
        db = GembaseDB(replicon, cfg)

        for seq_id in descriptions:
            self.assertEqual(descriptions[seq_id], db.get_description(seq_id))

        # identifier which does not look like a Gembase protein id
        with self.assertRaises(IntegronError) as ctx:
            db.get_description('nimport_naoik')
        self.assertEqual(str(ctx.exception),
                         "'nimport_naoik' is not a valid Gembase protein identifier.")

        # well-formed identifier which is unknown
        with self.assertRaises(KeyError) as ctx:
            db.get_description('FOO.BAR.00019.i0001_03924')
        self.assertEqual(str(ctx.exception), "'FOO.BAR.00019.i0001_03924'")
def test_make_protfile(self):
    """
    The protein file generated by GembaseDB must list the same protein ids,
    in the same order, as the reference file, and contain the expected
    number of sequences.
    """
    file_name = (('ACBA.0917.00019', '.fna', 3870), ('ESCO001.C.00001.C001', '.fst', 3870))
    for seq_name, ext, expected_nb in file_name:
        replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', seq_name + ext))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        seq_db = read_multi_prot_fasta(replicon_path)
        replicon = next(seq_db)
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))
        with self.catch_log():
            db = GembaseDB(replicon, cfg)

        reference = read_multi_prot_fasta(
            self.find_data(os.path.join('Gembase', 'Proteins', seq_name + '.prt')))
        generated = read_multi_prot_fasta(db.protfile)

        # A dedicated counter is used so that the expected number is not
        # overwritten by the loop variable (the previous check compared a
        # variable with itself and could never fail).
        seq_count = 0
        for seq_count, (expected, test) in enumerate(zip(reference, generated), 1):
            self.assertEqual(expected.id, test.id)
        # NOTE(review): the expected counts (3870 for both replicons) were never
        # enforced before -- confirm them against the fixtures.
        self.assertEqual(seq_count, expected_nb)
def test_getitem(self):
    """
    Check item access on a GembaseDB built from two Gembase replicons.

    For each replicon, the reference proteins whose id matches the pattern
    derived from the replicon id are fetched from the database and compared
    (id and sequence); an unknown id must raise KeyError.
    """
    file_name = (('ACBA.0917.00019', '.fna'), ('ESCO001.C.00001.C001', '.fst'))
    for seq_name, ext in file_name:
        replicon_path = self.find_data(
            os.path.join('Gembase', 'Replicons', seq_name + ext))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        seq_db = read_multi_prot_fasta(replicon_path)
        replicon = next(seq_db)
        os.makedirs(cfg.tmp_dir(replicon.id))
        with self.catch_log():
            db = GembaseDB(replicon, cfg)
        exp = read_multi_prot_fasta(
            self.find_data(
                os.path.join('Gembase', 'Proteins', seq_name + '.prt')))

        specie, date, strain, contig = replicon.id.split('.')
        # raw string so that '\.' and '\w' reach the regex engine untouched
        pattern = r'{}\.{}\.{}\.\w?{}'.format(specie, date, strain, contig)

        for prot_expected in exp:
            if re.match(pattern, prot_expected.id):
                prot_received = db[prot_expected.id]
                self.assertEqual(prot_received.id, prot_expected.id)
                self.assertEqual(prot_received.seq, prot_expected.seq)

        with self.assertRaises(KeyError) as ctx:
            db['nimport_naoik']
        self.assertEqual(str(ctx.exception), "'nimport_naoik'")
def test_find_integrase_no_gembase_no_protfile_short_seq(self):
    """
    Run find_integrase with ``args.gembase`` set to False and the replicon
    length patched to 200, then check that the four result files exist.
    """
    cfg = Config(self.args)
    self.args.gembase = False
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

    replicon_name = 'acba.007.p01.13'
    replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    prot_name = 'ACBA.007.P01_13'
    prot_path = self.find_data(os.path.join('Proteins', prot_name + '.prt'))

    topologies = Topology('lin')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    # Patch __len__ only when the original is safely stored, so that the
    # ``finally`` clause cannot raise on an unbound name and mask a setup error.
    replicon_cls = replicon.__class__
    len_ori = replicon_cls.__len__
    replicon_cls.__len__ = lambda x: 200
    try:
        prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")
        shutil.copyfile(prot_path, prot_file)

        integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)
        for suffix in ('_intI.res', '_intI_table.res', '_phage_int.res', '_phage_int_table.res'):
            res = os.path.join(self.tmp_dir, replicon.id + suffix)
            self.assertTrue(os.path.exists(res))
    finally:
        replicon_cls.__len__ = len_ori
def test_iter(self):
    """
    Iterating over a GembaseDB must yield the protein ids of the replicon.

    Checked on a Gembase draft replicon (exact match with the reference
    protein file) and on a Gembase complete replicon (match except for
    three known ids).
    """
    # test Gembase Draft
    seq_name = 'ACBA.0917.00019'
    ext = '.fna'
    replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', seq_name + ext))
    self.args.replicon = replicon_path
    cfg = Config(self.args)
    seq_db = read_multi_prot_fasta(replicon_path)
    replicon = next(seq_db)
    replicon.path = replicon_path
    db = GembaseDB(replicon, cfg)
    idx = SeqIO.index(self.find_data(os.path.join('Gembase', 'Proteins', seq_name + '.prt')),
                      'fasta',
                      alphabet=Seq.IUPAC.extended_protein)

    specie, date, strain, contig = replicon.id.split('.')
    # raw string: '\.' and '\w' are regex escapes, not string escapes
    pattern = r'{}\.{}\.{}\.\w?{}'.format(specie, date, strain, contig)
    self.assertListEqual(sorted(i for i in idx if re.match(pattern, i)),
                         sorted(db))

    # test Gembase Complete
    seq_name = 'ESCO001.C.00001.C001'
    ext = '.fst'
    replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', seq_name + ext))
    self.args.replicon = replicon_path
    cfg = Config(self.args)
    seq_db = read_multi_prot_fasta(replicon_path)
    replicon = next(seq_db)
    replicon.path = replicon_path
    with self.catch_log():
        db = GembaseDB(replicon, cfg)
    idx = SeqIO.index(self.find_data(os.path.join('Gembase', 'Proteins', seq_name + '.prt')),
                      'fasta',
                      alphabet=Seq.IUPAC.extended_protein)

    specie, date, strain, contig = replicon.id.split('.')
    pattern = r'{}\.{}\.{}\.\w?{}'.format(specie, date, strain, contig)
    seqid_from_gembase_protfile = {i for i in idx if re.match(pattern, i)}
    seqid_from_if = set(db)
    non_common_seq = seqid_from_gembase_protfile ^ seqid_from_if
    # In Gembase complete, the annotation in LSTINFO comes from GenBank and
    # some CDS are sometimes not translated into proteins.
    # In the test data, 3 genes listed in LSTINFO are absent from the .prt file.
    diff = {
        'ESCO001.C.00001.C001_03974',
        'ESCO001.C.00001.C001_01509',
        'ESCO001.C.00001.C001_04162'
    }
    self.assertSetEqual(non_common_seq, diff)
def test_ProteinDB(self):
    """
    A ProdigalDB built from a replicon must expose that replicon
    (checked through the replicon id).
    """
    file_name = 'acba.007.p01.13'
    replicon_path = self.find_data(os.path.join('Replicons', file_name + '.fst'))
    self.args.replicon = replicon_path
    cfg = Config(self.args)
    seq_db = read_multi_prot_fasta(replicon_path)
    replicon = next(seq_db)
    replicon.path = replicon_path
    os.makedirs(cfg.tmp_dir(replicon.id))

    db = ProdigalDB(replicon, cfg)
    # assertEqual, not assertTrue: with assertTrue the second argument is only
    # the failure message, so the two ids were never actually compared.
    self.assertEqual(db.replicon.id, replicon.id)
def test_ProteinDB_no_prodigal(self):
    """
    Building a ProdigalDB after ``args.prodigal`` has been set to None
    must raise RuntimeError.
    """
    name = 'acba.007.p01.13'
    fasta_path = self.find_data(os.path.join('Replicons', name + '.fst'))
    self.args.replicon = fasta_path
    cfg = Config(self.args)

    records = read_multi_prot_fasta(fasta_path)
    replicon = next(records)
    replicon.path = fasta_path
    os.makedirs(cfg.tmp_dir(replicon.id))

    # the prodigal setting is cleared on args once the Config already exists
    self.args.prodigal = None
    with self.assertRaises(RuntimeError) as ctx:
        ProdigalDB(replicon, cfg)
def test_ProteinDB(self):
    """
    Build a ProdigalDB from the first record of the replicon file and check
    that the database references the same replicon id.
    """
    file_name = 'acba.007.p01.13'
    replicon_path = self.find_data(
        os.path.join('Replicons', file_name + '.fst'))
    self.args.replicon = replicon_path
    cfg = Config(self.args)
    seq_db = read_multi_prot_fasta(replicon_path)
    replicon = next(seq_db)
    replicon.path = replicon_path
    os.makedirs(cfg.tmp_dir(replicon.id))

    db = ProdigalDB(replicon, cfg)
    # assertTrue(a, b) only checks that ``a`` is truthy (``b`` is the failure
    # message); assertEqual performs the intended comparison.
    self.assertEqual(db.replicon.id, replicon.id)
def test_protfile(self):
    """
    ``ProdigalDB.protfile`` must be '<replicon tmp dir>/ACBA.007.P01_13.prt'.
    """
    name = 'acba.007.p01.13'
    expected_basename = 'ACBA.007.P01_13.prt'

    fasta_path = self.find_data(os.path.join('Replicons', name + '.fst'))
    self.args.replicon = fasta_path
    cfg = Config(self.args)

    records = read_multi_prot_fasta(fasta_path)
    replicon = next(records)
    replicon.path = fasta_path
    os.makedirs(cfg.tmp_dir(replicon.id))

    db = ProdigalDB(replicon, cfg)
    expected_path = os.path.join(cfg.tmp_dir(replicon.id), expected_basename)
    self.assertEqual(expected_path, db.protfile)
def test_protfile(self):
    """
    ``GembaseDB.protfile`` must be '<replicon tmp dir>/<replicon id>.prt'
    for each of the two Gembase replicons.
    """
    replicons = (('ACBA.0917.00019', '.fna'), ('ESCO001.C.00001.C001', '.fst'))
    for base_name, extension in replicons:
        fasta_path = self.find_data(os.path.join('Gembase', 'Replicons', base_name + extension))
        self.args.replicon = fasta_path
        cfg = Config(self.args)

        records = read_multi_prot_fasta(fasta_path)
        replicon = next(records)
        replicon.path = fasta_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        with self.catch_log():
            db = GembaseDB(replicon, cfg)

        expected_path = os.path.join(cfg.tmp_dir(replicon.id), replicon.id + '.prt')
        self.assertEqual(expected_path, db.protfile)
def test_ProteinDB_no_prodigal(self):
    """
    ProdigalDB construction must fail with RuntimeError once
    ``args.prodigal`` is None.
    """
    base = 'acba.007.p01.13'
    path = self.find_data(os.path.join('Replicons', base + '.fst'))
    self.args.replicon = path
    cfg = Config(self.args)

    fasta_records = read_multi_prot_fasta(path)
    replicon = next(fasta_records)
    replicon.path = path
    os.makedirs(cfg.tmp_dir(replicon.id))

    # args is modified after the Config has been created from it
    self.args.prodigal = None
    with self.assertRaises(RuntimeError) as ctx:
        ProdigalDB(replicon, cfg)
def test_protfile(self):
    """
    The protein file path exposed by a ProdigalDB must be located in the
    temporary directory of the replicon and be named 'ACBA.007.P01_13.prt'.
    """
    base = 'acba.007.p01.13'
    prt_basename = 'ACBA.007.P01_13.prt'

    path = self.find_data(os.path.join('Replicons', base + '.fst'))
    self.args.replicon = path
    cfg = Config(self.args)

    fasta_records = read_multi_prot_fasta(path)
    replicon = next(fasta_records)
    replicon.path = path
    os.makedirs(cfg.tmp_dir(replicon.id))

    db = ProdigalDB(replicon, cfg)
    self.assertEqual(
        os.path.join(cfg.tmp_dir(replicon.id), prt_basename),
        db.protfile,
    )
def test_ProteinDB(self):
    """
    A GembaseDB built from a replicon must expose that replicon
    (checked through the replicon id), for two Gembase replicon files.
    """
    # From Gembase Draft , Gembase Complete
    file_names = ('ACBA.0917.00019.fna', 'ESCO001.C.00001.C001.fst')
    for file_name in file_names:
        replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', file_name))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        seq_db = read_multi_prot_fasta(replicon_path)
        replicon = next(seq_db)
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        with self.catch_log():
            db = GembaseDB(replicon, cfg)
        # assertEqual, not assertTrue: with assertTrue the second argument is
        # only the failure message, so the ids were never actually compared.
        self.assertEqual(db.replicon.id, replicon.id)
def test_ProteinDB(self):
    """
    Build a GembaseDB for each Gembase replicon file and check that the
    database references the replicon it was built from.
    """
    # From Gembase Draft , Gembase Complete
    file_names = ('ACBA.0917.00019.fna', 'ESCO001.C.00001.C001.fst')
    for file_name in file_names:
        replicon_path = self.find_data(
            os.path.join('Gembase', 'Replicons', file_name))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        seq_db = read_multi_prot_fasta(replicon_path)
        replicon = next(seq_db)
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        with self.catch_log():
            db = GembaseDB(replicon, cfg)
        # assertTrue(a, b) only checks that ``a`` is truthy (``b`` is the
        # failure message); assertEqual performs the intended comparison.
        self.assertEqual(db.replicon.id, replicon.id)
def setUp(self):
    """
    Prepare the fixture: locate the integron home, declare the column names
    and dtypes shared by the tests, and build a Config from an empty
    argparse namespace.
    """
    env_home = os.environ.get('INTEGRON_HOME')
    self.local_install = env_home is not None
    if self.local_install:
        self.integron_home = env_home
    else:
        # fall back on the directory two levels above this file
        two_levels_up = os.path.join(os.path.dirname(__file__), '..', '..')
        self.integron_home = os.path.normpath(os.path.abspath(two_levels_up))

    self.columns = ['pos_beg', 'pos_end', 'strand', 'evalue',
                    'type_elt', 'model', 'distance_2attC', 'annotation']
    self.dtype = {"pos_beg": 'int',
                  "pos_end": 'int',
                  "strand": 'int',
                  "evalue": 'float',
                  "type_elt": 'str',
                  "annotation": 'str',
                  "model": 'str',
                  "distance_2attC": 'float'}

    self.cfg = Config(argparse.Namespace())
    # NOTE(review): the data prefix is stored on the test case itself, not on
    # self.cfg -- confirm this is intended.
    self._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
def test_iter(self):
    """
    Iterating over a ProdigalDB must yield the sequence ids in the same
    order as the index of the reference protein file (pairwise comparison).
    """
    name = 'acba.007.p01.13'
    prt_basename = 'ACBA.007.P01_13.prt'

    fasta_path = self.find_data(os.path.join('Replicons', name + '.fst'))
    self.args.replicon = fasta_path
    cfg = Config(self.args)

    records = read_multi_prot_fasta(fasta_path)
    replicon = next(records)
    replicon.path = fasta_path
    os.makedirs(cfg.tmp_dir(replicon.id))

    db = ProdigalDB(replicon, cfg)
    reference_index = SeqIO.index(self.find_data(os.path.join('Proteins', prt_basename)),
                                  'fasta',
                                  alphabet=Seq.IUPAC.extended_protein)
    # zip stops at the shortest of the two iterables
    for pair in zip(reference_index, db):
        exp_seq_id, get_seq_id = pair
        self.assertEqual(exp_seq_id, get_seq_id)
def test_get_description(self):
    """
    ``ProdigalDB.get_description`` must return the expected SeqDesc
    for two known protein ids.
    """
    name = 'acba.007.p01.13'
    fasta_path = self.find_data(os.path.join('Replicons', name + '.fst'))
    self.args.replicon = fasta_path
    cfg = Config(self.args)

    records = read_multi_prot_fasta(fasta_path)
    replicon = next(records)
    replicon.path = fasta_path
    os.makedirs(cfg.tmp_dir(replicon.id))

    db = ProdigalDB(replicon, cfg)

    # SeqDesc(id, strand, start, stop)
    expected = {
        'ACBA.007.P01_13_23': SeqDesc('ACBA.007.P01_13_23', -1, 19721, 20254),
        'ACBA.007.P01_13_1': SeqDesc('ACBA.007.P01_13_1', 1, 55, 1014),
    }
    for seq_id in expected:
        self.assertEqual(expected[seq_id], db.get_description(seq_id))
def test_iter(self):
    """
    The ids yielded by a ProdigalDB must match, pairwise and in order,
    the ids of the indexed reference protein file.
    """
    base = 'acba.007.p01.13'
    prt_name = 'ACBA.007.P01_13.prt'

    path = self.find_data(os.path.join('Replicons', base + '.fst'))
    self.args.replicon = path
    cfg = Config(self.args)

    fasta_records = read_multi_prot_fasta(path)
    replicon = next(fasta_records)
    replicon.path = path
    os.makedirs(cfg.tmp_dir(replicon.id))

    db = ProdigalDB(replicon, cfg)
    prt_path = self.find_data(os.path.join('Proteins', prt_name))
    expected_ids = SeqIO.index(prt_path, 'fasta', alphabet=Seq.IUPAC.extended_protein)

    # the comparison stops at the end of the shortest iterable
    for expected_id, received_id in zip(expected_ids, db):
        self.assertEqual(expected_id, received_id)
def test_protfile(self):
    """
    For each Gembase replicon, the GembaseDB protein file must be located in
    the temporary directory of the replicon and be named '<replicon id>.prt'.
    """
    cases = (('ACBA.0917.00019', '.fna'), ('ESCO001.C.00001.C001', '.fst'))
    for stem, suffix in cases:
        path = self.find_data(os.path.join('Gembase', 'Replicons', stem + suffix))
        self.args.replicon = path
        cfg = Config(self.args)

        fasta_records = read_multi_prot_fasta(path)
        replicon = next(fasta_records)
        replicon.path = path
        os.makedirs(cfg.tmp_dir(replicon.id))

        with self.catch_log():
            db = GembaseDB(replicon, cfg)

        self.assertEqual(
            os.path.join(cfg.tmp_dir(replicon.id), replicon.id + '.prt'),
            db.protfile,
        )
def test_find_gembase_file_basename_file_not_in_gembase(self):
    """
    test if find_gembase_file_basename get the right basename
    for files not located in gembase and whose name is the output of a
    split operation:

     - a file containing one contig
     - a file representing a chunk

    and that GembaseDB raises FileNotFoundError for a replicon which
    has no matching lst file.
    """
    gembase_path = self.find_data('Gembase')
    chunk_path = os.path.join(self.tmp_dir, 'ESCO001.C.00001.C001_chunk_1.fst')
    # expected basename -> path of the replicon file
    # NOTE(review): the expected basenames are assumed to be extension-less, as
    # in test_find_gembase_file_basename (os.path.splitext) -- confirm.
    file_names = {
        'ACBA.0917.00019': self.find_data(os.path.join('Replicons', 'ACBA.0917.00019.0001.fst')),
        'ESCO001.C.00001.C001': chunk_path,
    }
    shutil.copyfile(os.path.join(gembase_path, 'Replicons', 'ESCO001.C.00001.C001.fst'),
                    chunk_path)

    for base_file_name, replicon_path in file_names.items():
        self.args.replicon = replicon_path
        self.args.gembase_path = gembase_path
        cfg = Config(self.args)
        seq_db = read_multi_prot_fasta(replicon_path)
        replicon = next(seq_db)
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        with self.catch_log():
            db = GembaseDB(replicon, cfg, gembase_path=gembase_path)
        # assertEqual, not assertTrue: with assertTrue the second argument is
        # only the failure message, so nothing was actually compared.
        self.assertEqual(db._find_gembase_file_basename(gembase_path, replicon_path),
                         base_file_name)

    # replicon for which GembaseDB is expected to find no lst file
    replicon_path = self.find_data(os.path.join('Replicons', 'acba.007.p01.13.fst'))
    self.args.replicon = replicon_path
    self.args.gembase_path = gembase_path
    cfg = Config(self.args)
    seq_db = read_multi_prot_fasta(replicon_path)
    replicon = next(seq_db)
    replicon.path = replicon_path
    os.makedirs(cfg.tmp_dir(replicon.id))

    with self.assertRaises(FileNotFoundError) as ctx:
        with self.catch_log():
            GembaseDB(replicon, cfg, gembase_path=gembase_path)
    self.assertEqual(str(ctx.exception),
                     'cannot find lst file matching {} sequence'.format(replicon_path))
def test_read_hmm_evalue(self):
    """
    Check that read_hmm reads the hmm hits correctly and returns only the
    hits whose evalue is lower than the given threshold.
    """
    rep_name = "acba.007.p01.13"
    replicon_id = 'ACBA.007.P01_13'
    column_order = ["Accession_number", "query_name", "ID_query", "ID_prot",
                    "strand", "pos_beg", "pos_end", "evalue"]

    replicon_path = self.find_data(os.path.join('Replicons', rep_name + '.fst'))
    prot_file = self.find_data(os.path.join('Proteins', replicon_id + '.prt'))

    args = argparse.Namespace()
    args.gembase = False
    args.replicon = replicon_path
    cfg = Config(args)

    sequences_db = read_multi_prot_fasta(replicon_path)
    replicon = next(sequences_db)
    prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

    infile = self.find_data(os.path.join("Results_Integron_Finder_{}".format(rep_name),
                                         "tmp_{}".format(replicon_id),
                                         "{}_intI.res".format(replicon_id)))

    # threshold just above the evalue of the hit: the hit is kept
    df1 = read_hmm(rep_name, prot_db, infile, cfg, evalue=1.95e-25)
    hit = {"Accession_number": rep_name,
           "query_name": "intI_Cterm",
           "ID_query": "-",
           "ID_prot": "ACBA.007.P01_13_1",
           "strand": 1,
           "pos_beg": 55,
           "pos_end": 1014,
           "evalue": 1.9e-25}
    exp1 = pd.DataFrame(data=hit, index=[0])
    exp1 = exp1[column_order]
    pdt.assert_frame_equal(df1, exp1)

    # threshold equal to the evalue of the hit: the hit is discarded
    df2 = read_hmm(replicon_id, prot_db, infile, cfg, evalue=1.9e-25)
    exp2 = pd.DataFrame(columns=column_order)
    intcols = ["pos_beg", "pos_end", "strand"]
    floatcol = ["evalue"]
    exp2[intcols] = exp2[intcols].astype(int)
    exp2[floatcol] = exp2[floatcol].astype(float)
    pdt.assert_frame_equal(df2, exp2)
def test_find_integron_calin_threshold(self):
    """
    find_integron must return 2 integrons with ``calin_threshold`` set to 2
    and only 1 with ``calin_threshold`` set to 3, on the same input data.
    """
    replicon_name = 'ESCO001.B.00018.P002'
    replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    prot_file = self.find_data(os.path.join('Proteins', replicon_name + '.prt'))
    topologies = Topology('circ')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    results_dir = self.find_data(os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                                              'tmp_{}'.format(replicon.id)))
    attc_file = os.path.join(results_dir, '{}_attc_table.res'.format(replicon.id))
    intI_file = os.path.join(results_dir, '{}_intI.res'.format(replicon.id))
    phageI_file = os.path.join(results_dir, '{}_phage_int.res'.format(replicon.id))
    data_prefix = os.path.join(os.path.dirname(__file__), 'data')

    args = argparse.Namespace()
    args.no_proteins = False
    args.keep_palindromes = True
    args.distance_threshold = 4000
    args.attc_model = 'attc_4.cm'
    args.evalue_attc = 1.0
    args.max_attc_size = 200
    args.min_attc_size = 40
    args.local_max = False
    args.gembase = False
    args.union_integrases = False

    # first run: calin_threshold = 2
    args.calin_threshold = 2
    cfg = Config(args)
    cfg._prefix_data = data_prefix
    prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)
    with self.catch_log() as log:
        integrons = find_integron(replicon, prot_db, attc_file, intI_file, phageI_file, cfg)
    self.assertEqual(len(integrons), 2)

    # second run: calin_threshold = 3, same protein db
    args.calin_threshold = 3
    cfg = Config(args)
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
    with self.catch_log() as log:
        integrons = find_integron(replicon, prot_db, attc_file, intI_file, phageI_file, cfg)
    self.assertEqual(len(integrons), 1)
def test_make_protfile(self):
    """
    The protein file generated by ProdigalDB must list the same protein ids,
    in the same order, as the reference file; 23 pairs are expected.
    """
    name = 'acba.007.p01.13'
    prt_basename = 'ACBA.007.P01_13.prt'

    fasta_path = self.find_data(os.path.join('Replicons', name + '.fst'))
    self.args.replicon = fasta_path
    cfg = Config(self.args)

    records = read_multi_prot_fasta(fasta_path)
    replicon = next(records)
    replicon.path = fasta_path
    os.makedirs(cfg.tmp_dir(replicon.id))

    db = ProdigalDB(replicon, cfg)

    reference = read_multi_prot_fasta(self.find_data(os.path.join('Proteins', prt_basename)))
    generated = read_multi_prot_fasta(db.protfile)
    # seq_nb keeps the rank (1-based) of the last compared pair
    for seq_nb, (expected, test) in enumerate(zip(reference, generated), 1):
        self.assertEqual(expected.id, test.id)
    self.assertEqual(seq_nb, 23)
def test_make_protfile(self):
    """
    For each Gembase replicon, the protein file generated by GembaseDB must
    contain the same protein ids, in the same order, as the reference file
    and the expected number of sequences.
    """
    file_name = (('ACBA.0917.00019', '.fna', 3870), ('ESCO001.C.00001.C001', '.fst', 3870))
    for seq_name, ext, expected_nb in file_name:
        replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', seq_name + ext))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        seq_db = read_multi_prot_fasta(replicon_path)
        replicon = next(seq_db)
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))
        with self.catch_log():
            db = GembaseDB(replicon, cfg)

        reference_prt = self.find_data(os.path.join('Gembase', 'Proteins', seq_name + '.prt'))
        pairs = zip(read_multi_prot_fasta(reference_prt), read_multi_prot_fasta(db.protfile))

        # The loop counter has its own name: it used to overwrite the expected
        # number, which turned the final check into a comparison of a
        # variable with itself.
        seq_count = 0
        for seq_count, (expected, test) in enumerate(pairs, 1):
            self.assertEqual(expected.id, test.id)
        # NOTE(review): the expected counts (3870 for both replicons) were never
        # enforced before -- confirm them against the fixtures.
        self.assertEqual(seq_count, expected_nb)
def test_find_gembase_file_basename(self):
    """
    test if find_gembase_file_basename get the right basename
    for files in gembase
    """
    gembase_path = self.find_data('Gembase')
    file_names = ('ACBA.0917.00019.fna', 'ESCO001.C.00001.C001.fst')
    for file_name in file_names:
        replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', file_name))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        seq_db = read_multi_prot_fasta(replicon_path)
        replicon = next(seq_db)
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        with self.catch_log():
            db = GembaseDB(replicon, cfg)
        # assertEqual, not assertTrue: with assertTrue the second argument is
        # only the failure message, so the basename was never compared.
        self.assertEqual(db._find_gembase_file_basename(gembase_path, replicon_path),
                         os.path.splitext(file_name)[0])
def test_find_integrase_gembase_hmmer_error(self):
    """
    find_integrase must raise RuntimeError, with a message ending with
    'failed return code = 1', when ``args.cpu`` is set to 'foo'.
    """
    self.args.gembase = True
    self.args.cpu = 'foo'
    cfg = Config(self.args)
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

    name = 'acba.007.p01.13'
    fasta_path = os.path.join(self._data_dir, 'Replicons', name + '.fst')
    topo = Topology('lin')
    with FastaIterator(fasta_path) as seq_iter:
        seq_iter.topologies = topo
        replicon = next(seq_iter)

    # work on a copy of the reference proteins located in the temporary directory
    prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")
    reference_prt = os.path.join(self._data_dir, 'Proteins', replicon.id + ".prt")
    shutil.copyfile(reference_prt, prot_file)

    with self.assertRaises(RuntimeError) as ctx:
        integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)
    message = str(ctx.exception)
    self.assertTrue(message.endswith('failed return code = 1'))
def test_get_description(self):
    """
    Check the SeqDesc returned by ``ProdigalDB.get_description``
    for two protein ids of the replicon.
    """
    base = 'acba.007.p01.13'
    path = self.find_data(os.path.join('Replicons', base + '.fst'))
    self.args.replicon = path
    cfg = Config(self.args)

    fasta_records = read_multi_prot_fasta(path)
    replicon = next(fasta_records)
    replicon.path = path
    os.makedirs(cfg.tmp_dir(replicon.id))

    db = ProdigalDB(replicon, cfg)

    # SeqDesc(id, strand, start, stop)
    expected_descriptions = {
        'ACBA.007.P01_13_23': SeqDesc('ACBA.007.P01_13_23', -1, 19721, 20254),
        'ACBA.007.P01_13_1': SeqDesc('ACBA.007.P01_13_1', 1, 55, 1014),
    }
    for protein_id, expected_desc in expected_descriptions.items():
        received_desc = db.get_description(protein_id)
        self.assertEqual(expected_desc, received_desc)
def setUp(self):
    """
    Prepare the fixture: locate the integron home, (re)create an empty
    temporary directory, build the Config, read the first record of the
    'OBAL001.B.00005.C001' replicon, create an Integron on it and declare
    the column names / dtypes shared by the tests.
    """
    env_home = os.environ.get('INTEGRON_HOME')
    self.local_install = env_home is not None
    if self.local_install:
        self.integron_home = env_home
    else:
        # fall back on the directory two levels above this file
        two_levels_up = os.path.join(os.path.dirname(__file__), '..', '..')
        self.integron_home = os.path.normpath(os.path.abspath(two_levels_up))

    # always start from an empty temporary directory
    self.tmp_dir = os.path.join(tempfile.gettempdir(), 'tmp_test_integron_finder')
    if os.path.isdir(self.tmp_dir):
        shutil.rmtree(self.tmp_dir)
    os.makedirs(self.tmp_dir)

    args = argparse.Namespace()
    args.attc_model = 'attc_4.cm'
    args.max_attc_size = 200
    args.min_attc_size = 40
    args.distance_threshold = 4000  # (4kb at least between 2 different arrays)
    args.eagle_eyes = False
    args.local_max = False
    self.cfg = Config(args)
    self.cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

    name = 'OBAL001.B.00005.C001'
    fasta_path = self.find_data(os.path.join('Replicons', name + '.fst'))
    topo = Topology('lin')
    with FastaIterator(fasta_path) as seq_iter:
        seq_iter.topologies = topo
        self.replicon = next(seq_iter)

    self.integron = Integron(self.replicon, self.cfg)

    self.columns = ['pos_beg', 'pos_end', 'strand', 'evalue',
                    'type_elt', 'model', 'distance_2attC', 'annotation']
    self.dtype = {
        "pos_beg": 'int',
        "pos_end": 'int',
        "strand": 'int',
        "evalue": 'float',
        "type_elt": 'str',
        "annotation": 'str',
        "model": 'str',
        "distance_2attC": 'float',
    }
    self.max_dtype = {
        'Accession_number': 'str',
        'cm_attC': 'str',
        'cm_debut': 'int',
        'cm_fin': 'int',
        'pos_beg': 'int',
        'pos_end': 'int',
        'sens': 'str',
        'evalue': 'float',
    }
    self.max_cols = ['Accession_number', 'cm_attC', 'cm_debut', 'cm_fin',
                     'pos_beg', 'pos_end', 'sens', 'evalue']
def test_getitem(self):
    """
    Check item access on a ProdigalDB: each protein of the reference file
    must be returned with the same id and sequence, and an unknown id must
    raise KeyError.
    """
    base = 'acba.007.p01.13'
    prt_name = 'ACBA.007.P01_13.prt'

    path = self.find_data(os.path.join('Replicons', base + '.fst'))
    self.args.replicon = path
    cfg = Config(self.args)

    fasta_records = read_multi_prot_fasta(path)
    replicon = next(fasta_records)
    replicon.path = path
    os.makedirs(cfg.tmp_dir(replicon.id))

    db = ProdigalDB(replicon, cfg)

    reference_proteins = read_multi_prot_fasta(
        self.find_data(os.path.join('Proteins', prt_name)))
    for reference in reference_proteins:
        fetched = db[reference.id]
        self.assertEqual(fetched.id, reference.id)
        self.assertEqual(fetched.seq, reference.seq)

    # unknown identifier
    with self.assertRaises(KeyError) as ctx:
        db['nimport_naoik']
    self.assertEqual(str(ctx.exception), "'nimport_naoik'")
def test_find_gembase_file_basename(self):
    """
    test if find_gembase_file_basename get the right basename
    for files in gembase
    """
    gembase_path = self.find_data('Gembase')
    file_names = ('ACBA.0917.00019.fna', 'ESCO001.C.00001.C001.fst')
    for file_name in file_names:
        replicon_path = self.find_data(
            os.path.join('Gembase', 'Replicons', file_name))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        seq_db = read_multi_prot_fasta(replicon_path)
        replicon = next(seq_db)
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        with self.catch_log():
            db = GembaseDB(replicon, cfg)
        # assertTrue(a, b) only checks that ``a`` is truthy (``b`` is the
        # failure message); assertEqual performs the intended comparison
        # with the file name stripped of its extension.
        self.assertEqual(
            db._find_gembase_file_basename(gembase_path, replicon_path),
            os.path.splitext(file_name)[0])
def test_make_protfile_no_prodigal(self):
    """
    Building a ProdigalDB with ``args.prodigal`` set to 'foo_bar'
    must raise RuntimeError.
    """
    base = 'acba.007.p01.13'
    path = self.find_data(os.path.join('Replicons', base + '.fst'))
    self.args.replicon = path
    self.args.prodigal = 'foo_bar'
    cfg = Config(self.args)

    fasta_records = read_multi_prot_fasta(path)
    replicon = next(fasta_records)
    replicon.path = path

    with self.assertRaises(RuntimeError) as ctx:
        ProdigalDB(replicon, cfg)
def test_make_protfile(self):
    """
    Compare, pairwise and in order, the protein ids of the file generated by
    ProdigalDB with those of the reference file; 23 pairs are expected.
    """
    base = 'acba.007.p01.13'
    prt_name = 'ACBA.007.P01_13.prt'

    path = self.find_data(os.path.join('Replicons', base + '.fst'))
    self.args.replicon = path
    cfg = Config(self.args)

    fasta_records = read_multi_prot_fasta(path)
    replicon = next(fasta_records)
    replicon.path = path
    os.makedirs(cfg.tmp_dir(replicon.id))

    db = ProdigalDB(replicon, cfg)

    expected_proteins = read_multi_prot_fasta(
        self.find_data(os.path.join('Proteins', prt_name)))
    received_proteins = read_multi_prot_fasta(db.protfile)
    paired = zip(expected_proteins, received_proteins)
    # seq_nb holds the 1-based rank of the last pair once the loop is over
    for seq_nb, (expected, test) in enumerate(paired, 1):
        self.assertEqual(expected.id, test.id)
    self.assertEqual(seq_nb, 23)
def test_getitem(self):
    """
    ``ProdigalDB.__getitem__`` must return, for each id of the reference
    protein file, a protein with the same id and sequence; an unknown id
    must raise KeyError.
    """
    name = 'acba.007.p01.13'
    prt_basename = 'ACBA.007.P01_13.prt'

    fasta_path = self.find_data(os.path.join('Replicons', name + '.fst'))
    self.args.replicon = fasta_path
    cfg = Config(self.args)

    records = read_multi_prot_fasta(fasta_path)
    replicon = next(records)
    replicon.path = fasta_path
    os.makedirs(cfg.tmp_dir(replicon.id))

    db = ProdigalDB(replicon, cfg)

    prt_path = self.find_data(os.path.join('Proteins', prt_basename))
    for wanted in read_multi_prot_fasta(prt_path):
        got = db[wanted.id]
        self.assertEqual(got.id, wanted.id)
        self.assertEqual(got.seq, wanted.seq)

    # unknown identifier
    with self.assertRaises(KeyError) as ctx:
        db['nimport_naoik']
    self.assertEqual(str(ctx.exception), "'nimport_naoik'")
def test_find_integrase_gembase_no_hmmer_no_replicon(self):
    """
    find_integrase must raise RuntimeError, with an explicit message,
    when the protein file does not exist.
    """
    self.args.gembase = True
    self.args.hmmsearch = 'foo'
    cfg = Config(self.args)
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

    name = 'acba.007.p01.13'
    fasta_path = os.path.join(self._data_dir, 'Replicons', name + '.fst')
    topo = Topology('lin')
    with FastaIterator(fasta_path) as seq_iter:
        seq_iter.topologies = topo
        replicon = next(seq_iter)

    # this path is deliberately never created
    prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")

    with self.catch_log():
        with self.assertRaises(RuntimeError) as ctx:
            integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)

    expected_msg = "The protein file: '{}' does not exists cannot perform hmmsearch on it.".format(prot_file)
    self.assertEqual(expected_msg, str(ctx.exception))
def test_add_proteins(self):
    """
    After setting seven attC sites on an Integron, ``add_proteins`` must fill
    ``integron.proteins`` with the four expected proteins.
    """
    replicon_name = 'pssu.001.c01.13'
    replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    topologies = Topology('lin')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)
    prot_file = os.path.join(self._data_dir, '{}.prt.short'.format(replicon_name))

    args = argparse.Namespace()
    args.gembase = False
    args.annot_parser_name = None
    cfg = Config(args)
    integron = Integron(replicon, cfg)

    # attC sites given to the integron
    attc_nb = 7
    data_attc = {
        "pos_beg": [3072863, 3073496, 3074121, 3075059, 3075593, 3076281, 3076659],
        "pos_end": [3072931, 3073555, 3074232, 3075118, 3075652, 3076340, 3076718],
        "strand": [-1] * attc_nb,
        "evalue": [2.5e-06, 7e-08, 6.5e-08, 3.2e-06, 4.1e-07, 1.4e-08, 4e-08],
        "type_elt": ['attC'] * attc_nb,
        "annotation": ['attC'] * attc_nb,
        "model": ['attc_4'] * attc_nb,
        "distance_2attC": [np.nan, 565.0, 566.0, 827.0, 475.0, 629.0, 319.0],
    }
    attc_index = ['attc_00{}'.format(i) for i, _ in enumerate(data_attc['pos_beg'])]
    attC = pd.DataFrame(data_attc, columns=self.columns, index=attc_index)
    attC = attC.astype(dtype=self.dtype)
    integron.attC = attC

    prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)
    integron.add_proteins(prot_db)

    # proteins expected in the integron
    prot_nb = 4
    data_proteins = {
        'pos_beg': [3071974, 3072950, 3074243, 3076720],
        'pos_end': [3072855, 3073468, 3075055, 3077511],
        'strand': [-1] * prot_nb,
        'evalue': [np.nan] * prot_nb,
        'type_elt': ['protein'] * prot_nb,
        'annotation': ['protein'] * prot_nb,
        'model': ['NA'] * prot_nb,
        'distance_2attC': [np.nan] * prot_nb,
    }
    prot_index = ['PSSU.001.C01_13_281{}'.format(i) for i in range(5, 9)]
    exp_proteins = pd.DataFrame(data_proteins, index=prot_index, columns=self.columns)
    exp_proteins = exp_proteins.astype(dtype=self.dtype)

    pdt.assert_frame_equal(exp_proteins.sort_index(), integron.proteins.sort_index())
def test_get_description(self):
    """
    GembaseDB.get_description must return the expected SeqDesc for known
    sequence ids, raise IntegronError for an id which is not a valid
    Gembase protein identifier and KeyError for an id absent from the db.
    """
    # SeqDesc(id, strand, start, stop)
    expected_by_file = {
        ('ACBA.0917.00019', '.fna'): {
            'ACBA.0917.00019.b0001_00001': SeqDesc('ACBA.0917.00019.b0001_00001', -1, 266, 1480),
            'ACBA.0917.00019.i0001_03957': SeqDesc('ACBA.0917.00019.i0001_03957', -1, 4043755, 4044354),
        },
    }

    for (seq_name, ext), expected_descs in expected_by_file.items():
        replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', seq_name + ext))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        replicon = next(read_multi_prot_fasta(replicon_path))
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        db = GembaseDB(replicon, cfg)

        for seq_id, expected_desc in expected_descs.items():
            self.assertEqual(expected_desc, db.get_description(seq_id))

    # the error cases below are run against the db built in the last loop iteration
    with self.assertRaises(IntegronError) as ctx:
        db.get_description('nimport_naoik')
    self.assertEqual(str(ctx.exception),
                     "'nimport_naoik' is not a valid Gembase protein identifier.")

    with self.assertRaises(KeyError) as ctx:
        db.get_description('FOO.BAR.00019.i0001_03924')
    self.assertEqual(str(ctx.exception), "'FOO.BAR.00019.i0001_03924'")
def test_read_multi(self):
    """
    Test read_hmm on a result file holding several hits:

    - 2 hits on the same protein: only the one with the best evalue is kept.
    - 2 hits on 2 different proteins: both proteins are kept.
    """
    replicon_id = 'ACBA.0917.00019'
    contig_id = 'ACBA.0917.00019.0001'
    result_dir_expected = self.find_data("Results_Integron_Finder_{}.gembase".format(replicon_id))
    replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', replicon_id + '.fna'))
    contig_tmp_dir = os.path.join(result_dir_expected, "tmp_{}".format(contig_id))
    prot_file = os.path.join(contig_tmp_dir, contig_id + '.prt')

    args = argparse.Namespace(gembase=True, replicon=replicon_path)
    cfg = Config(args)

    replicon = next(read_multi_prot_fasta(replicon_path))
    prot_db = GembaseDB(replicon, cfg, prot_file=prot_file)

    infile = self.find_data(os.path.join('fictive_results', "{}_intI_multi.res".format(contig_id)))
    hits = read_hmm(contig_id, prot_db, infile, cfg)

    column_order = ["Accession_number", "query_name", "ID_query", "ID_prot",
                    "strand", "pos_beg", "pos_end", "evalue"]
    expected = pd.DataFrame(
        data={
            "Accession_number": [contig_id] * 2,
            "query_name": ["Phage_integrase"] * 2,
            "ID_query": ["PF00589.16"] * 2,
            "ID_prot": ["ACBA.0917.00019.i0001_00298", "ACBA.0917.00019.i0001_00338"],
            "strand": [-1, -1],
            "pos_beg": [311597, 350328],
            "pos_end": [312631, 351248],
            "evalue": [5.5e-66, 3.4e-51],
        },
        index=[0, 1])
    expected = expected[column_order]
    pdt.assert_frame_equal(hits, expected)
def test_find_gembase_file_basename_file_not_in_gembase(self):
    """
    Test _find_gembase_file_basename on replicon files which are not located
    in the gembase and whose name is the output of a split operation:

    - a file containing one contig
    - a file representing a chunk

    A replicon with no matching lst file must make GembaseDB raise
    a FileNotFoundError.
    """
    gembase_path = self.find_data('Gembase')
    chunk_key = 'ESCO001.C.00001.C001.fst'
    file_names = {
        'ACBA.0917.00019': self.find_data(os.path.join('Replicons', 'ACBA.0917.00019.0001.fst')),
        chunk_key: os.path.join(self.tmp_dir, 'ESCO001.C.00001.C001_chunk_1.fst'),
    }

    # the chunk file is a copy of a gembase replicon placed in the tmp dir
    shutil.copyfile(os.path.join(gembase_path, 'Replicons', 'ESCO001.C.00001.C001.fst'),
                    file_names[chunk_key])

    for base_file_name, replicon_path in file_names.items():
        self.args.replicon = replicon_path
        self.args.gembase_path = gembase_path
        cfg = Config(self.args)
        replicon = next(read_multi_prot_fasta(replicon_path))
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))
        with self.catch_log():
            db = GembaseDB(replicon, cfg, gembase_path=gembase_path)
        # NOTE(review): assertTrue takes (expr, msg), so base_file_name is only used
        # as the failure message and this merely checks that the returned basename
        # is truthy. assertEqual was presumably intended -- confirm the expected
        # values (notably the key ending with '.fst') before tightening the check.
        self.assertTrue(db._find_gembase_file_basename(gembase_path, replicon_path),
                        base_file_name)

    unknown_path = self.find_data(os.path.join('Replicons', 'acba.007.p01.13.fst'))
    self.args.replicon = unknown_path
    self.args.gembase_path = gembase_path
    cfg = Config(self.args)
    replicon = next(read_multi_prot_fasta(unknown_path))
    replicon.path = unknown_path
    os.makedirs(cfg.tmp_dir(replicon.id))

    with self.assertRaises(FileNotFoundError) as ctx:
        with self.catch_log():
            GembaseDB(replicon, cfg, gembase_path=gembase_path)
    self.assertEqual(str(ctx.exception),
                     'cannot find lst file matching {} sequence'.format(unknown_path))
def test_read_hmm_gembase(self):
    """
    Test that the hmm hits are correctly read when the gembase format is used
    (the .prt file is provided, prodigal is not used to find the proteins).
    """
    replicon_id = 'ACBA.0917.00019'
    contig_id = 'ACBA.0917.00019.0001'
    result_dir_expected = self.find_data("Results_Integron_Finder_{}.gembase".format(replicon_id))
    replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', replicon_id + '.fna'))
    contig_tmp_dir = os.path.join(result_dir_expected, "tmp_{}".format(contig_id))
    prot_file = os.path.join(contig_tmp_dir, contig_id + '.prt')
    infile = os.path.join(contig_tmp_dir, "{}_intI.res".format(contig_id))

    args = argparse.Namespace(gembase=True, replicon=replicon_path)
    cfg = Config(args)

    replicon = next(read_multi_prot_fasta(replicon_path))
    prot_db = GembaseDB(replicon, cfg, prot_file=prot_file)

    hits = read_hmm(contig_id, prot_db, infile, cfg)

    column_order = ["Accession_number", "query_name", "ID_query", "ID_prot",
                    "strand", "pos_beg", "pos_end", "evalue"]
    # a single hit is expected, described by scalar values on row 0
    expected = pd.DataFrame(
        data={
            "Accession_number": contig_id,
            "query_name": "intI_Cterm",
            "ID_query": "-",
            "ID_prot": "ACBA.0917.00019.i0001_00298",
            "strand": -1,
            "pos_beg": 311597,
            "pos_end": 312631,
            "evalue": 3.6e-25,
        },
        index=[0])
    expected = expected[column_order]
    pdt.assert_frame_equal(hits, expected)
def test_find_integron_calin_threshold(self):
    """
    Run find_integron twice on the same replicon and result files, changing
    only calin_threshold: 2 integrons are expected with a threshold of 2,
    and only 1 with a threshold of 3.
    """
    replicon_name = 'ESCO001.B.00018.P002'
    replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    prot_file = self.find_data(os.path.join('Proteins', replicon_name + '.prt'))
    topologies = Topology('circ')
    with FastaIterator(replicon_path) as seq_db:
        seq_db.topologies = topologies
        replicon = next(seq_db)

    results_dir = self.find_data(os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                                              'tmp_{}'.format(replicon.id)))
    attc_file = os.path.join(results_dir, '{}_attc_table.res'.format(replicon.id))
    intI_file = os.path.join(results_dir, '{}_intI.res'.format(replicon.id))
    phageI_file = os.path.join(results_dir, '{}_phage_int.res'.format(replicon.id))

    args = argparse.Namespace(no_proteins=False,
                              keep_palindromes=True,
                              distance_threshold=4000,
                              attc_model='attc_4.cm',
                              evalue_attc=1.0,
                              max_attc_size=200,
                              min_attc_size=40,
                              local_max=False,
                              gembase=False,
                              union_integrases=False,
                              calin_threshold=2)
    data_prefix = os.path.join(os.path.dirname(__file__), 'data')

    cfg = Config(args)
    cfg._prefix_data = data_prefix
    prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

    with self.catch_log():
        integrons = find_integron(replicon, prot_db, attc_file, intI_file, phageI_file, cfg)
    self.assertEqual(len(integrons), 2)

    # same inputs and same protein db, only the threshold (hence the config) changes
    args.calin_threshold = 3
    cfg = Config(args)
    cfg._prefix_data = data_prefix

    with self.catch_log():
        integrons = find_integron(replicon, prot_db, attc_file, intI_file, phageI_file, cfg)
    self.assertEqual(len(integrons), 1)
def test_find_integron_proteins_n_union_integrase(self):
    """
    Run find_integron with proteins and union_integrases=True on
    OBAL001.B.00005.C001, then check the logged summary, the number of
    integrons (5) and, for each of them, the integrase and attC tables;
    promoter, attI and proteins are expected to be empty.
    """
    replicon_name = 'OBAL001.B.00005.C001'
    replicon_id = 'OBAL001.B.00005.C001'
    replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    prot_file = self.find_data(os.path.join('Proteins', replicon_name + '.prt'))
    topologies = Topology('lin')
    with FastaIterator(replicon_path) as seq_db:
        seq_db.topologies = topologies
        replicon = next(seq_db)

    result_dir = 'Results_Integron_Finder_{}.union'.format(replicon_name)
    res_subdir = os.path.join(result_dir, 'tmp_{}'.format(replicon.id))
    attc_file = self.find_data(os.path.join(res_subdir, '{}_attc_table.res'.format(replicon.id)))
    intI_file = self.find_data(os.path.join(res_subdir, '{}_intI.res'.format(replicon.id)))
    phageI_file = self.find_data(os.path.join(res_subdir, '{}_phage_int.res'.format(replicon.id)))

    args = argparse.Namespace(
        evalue_attc=1.,
        max_attc_size=200,
        min_attc_size=40,
        distance_threshold=4000,  # (4kb at least between 2 different arrays)
        calin_threshold=2,
        attc_model='attc_4.cm',
        no_proteins=False,
        keep_palindromes=True,
        union_integrases=True,
        gembase=False,  # needed by read_hmm which is called when no_proteins == False
        local_max=False,
    )
    cfg = Config(args)
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
    prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

    # one log line per entry
    exp_msg = "\n".join([
        "In replicon {}, there are:",
        "- 3 complete integron(s) found with a total 4 attC site(s)",
        "- 0 CALIN element(s) found with a total of 0 attC site(s)",
        "- 2 In0 element(s) found with a total of 0 attC site",
    ]).format(replicon.id)

    with self.catch_log() as log:
        integrons = find_integron(replicon, prot_db, attc_file, intI_file, phageI_file, cfg)
        catch_msg = log.get_value().strip()
    self.assertEqual(catch_msg, exp_msg)

    self.assertEqual(len(integrons), 5)
    self.assertEqual(integrons[0].replicon.name, replicon_id)

    def typed_frame(rows, index):
        # rows are given positionally, in the order of self.columns
        return pd.DataFrame(rows, columns=self.columns, index=index).astype(dtype=self.dtype)

    empty = pd.DataFrame(columns=self.columns).astype(dtype=self.dtype)

    integrase_rows = [
        ('OBAL001.B.00005.C001_388',
         [418072, 419283, 1, 5.400000e-25, 'protein', 'Phage_integrase', np.nan, 'intI']),
        ('OBAL001.B.00005.C001_399',
         [434671, 440118, -1, 0.085, 'protein', 'Phage_integrase', np.nan, 'intI']),
        ('OBAL001.B.00005.C001_472',
         [516941, 517834, -1, 1.200000e-54, 'protein', 'Phage_integrase', np.nan, 'intI']),
        ('OBAL001.B.00005.C001_1793',
         [1940269, 1941171, 1, 4.200000e-43, 'protein', 'Phage_integrase', np.nan, 'intI']),
        ('OBAL001.B.00005.C001_1416',
         [1545830, 1546807, -1, 1.100000e-21, 'protein', 'intersection_tyr_intI', np.nan, 'intI']),
    ]
    exp_int = [typed_frame([row], [prot_id]) for prot_id, row in integrase_rows]

    exp_attC = [
        typed_frame([[421689, 421764, 1, 0.13, 'attC', 'attc_4', np.nan, 'attC']],
                    ['attc_001']),
        typed_frame([[442458, 442514, -1, 7.000000e-07, 'attC', 'attc_4', np.nan, 'attC']],
                    ['attc_001']),
        empty,
        empty,
        typed_frame([[1547800, 1547859, 1, 0.00049, 'attC', 'attc_4', np.nan, 'attC'],
                     [1548775, 1548834, 1, 0.00009, 'attC', 'attc_4', 916.0, 'attC']],
                    ['attc_001', 'attc_002']),
    ]

    for integron, one_exp_int, one_exp_attC in zip(integrons, exp_int, exp_attC):
        self.assertEqual(integron.replicon.name, replicon_id)
        pdt.assert_frame_equal(integron.integrase, one_exp_int)
        pdt.assert_frame_equal(integron.attC, one_exp_attC)
        pdt.assert_frame_equal(integron.promoter, empty)
        pdt.assert_frame_equal(integron.attI, empty)
        pdt.assert_frame_equal(integron.proteins, empty)
def test_find_integron_proteins_lin_replicon(self):
    """
    Run find_integron with proteins on the linear acba.007.p01.13 replicon,
    then check the logged summary, the number of integrons (2) and their
    content: the first one holds only an integrase, the second one only
    three attC sites; promoter, attI and proteins are expected to be empty.
    """
    replicon_name = 'acba.007.p01.13'
    replicon_id = 'ACBA.007.P01_13'
    replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    prot_file = self.find_data(os.path.join('Proteins', replicon_id + '.prt'))
    topologies = Topology('lin')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    exp_result_dir = 'Results_Integron_Finder_acba.007.p01.13.linear'
    attc_file = self.find_data(os.path.join(exp_result_dir, 'tmp_{}'.format(replicon.id),
                                            '{}_attc_table.res'.format(replicon.id)))
    intI_file = self.find_data(os.path.join(exp_result_dir, 'tmp_{}'.format(replicon.id),
                                            '{}_intI.res'.format(replicon.id)))
    phageI_file = self.find_data(os.path.join(exp_result_dir, 'tmp_{}'.format(replicon.id),
                                              '{}_phage_int.res'.format(replicon.id)))

    # a single Namespace is built: the options set below include every one of
    # those that used to be set on an earlier, immediately discarded, Namespace
    args = argparse.Namespace()
    args.evalue_attc = 1.
    args.max_attc_size = 200
    args.min_attc_size = 40
    args.distance_threshold = 4000  # (4kb at least between 2 different arrays)
    args.attc_model = 'attc_4.cm'
    args.no_proteins = False
    args.gembase = False  # needed by read_hmm which is called when no_proteins == False
    args.union_integrases = False
    args.keep_palindromes = True
    args.calin_threshold = 2
    args.local_max = False
    cfg = Config(args)
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
    prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

    # one log line per entry
    exp_msg = "\n".join([
        "In replicon {}, there are:",
        "- 0 complete integron(s) found with a total 0 attC site(s)",
        "- 1 CALIN element(s) found with a total of 3 attC site(s)",
        "- 1 In0 element(s) found with a total of 0 attC site",
    ]).format(replicon.id)

    with self.catch_log() as log:
        integrons = find_integron(replicon, prot_db, attc_file, intI_file, phageI_file, cfg)
        catch_msg = log.get_value().strip()
    self.assertEqual(catch_msg, exp_msg)

    self.assertEqual(len(integrons), 2)

    # built once and reused wherever an empty, correctly typed, table is expected
    empty = pd.DataFrame(columns=self.columns).astype(dtype=self.dtype)

    exp_integrase = pd.DataFrame({'annotation': 'intI',
                                  'distance_2attC': np.nan,
                                  'evalue': 1.900000e-25,
                                  'model': 'intersection_tyr_intI',
                                  'pos_beg': 55,
                                  'pos_end': 1014,
                                  'strand': 1,
                                  'type_elt': 'protein'},
                                 columns=self.columns,
                                 index=['ACBA.007.P01_13_1'])
    exp_integrase = exp_integrase.astype(dtype=self.dtype)
    exp_int = [exp_integrase, empty]

    exp_attc_sites = pd.DataFrame({'annotation': ['attC'] * 3,
                                   'distance_2attC': [np.nan, 1196.0, 469.0],
                                   'evalue': [1.000000e-09, 1.000000e-04, 1.100000e-07],
                                   'model': ['attc_4'] * 3,
                                   'pos_beg': [17825, 19080, 19618],
                                   'pos_end': [17884, 19149, 19726],
                                   'strand': [-1, -1, -1],
                                   'type_elt': 'attC'},
                                  columns=self.columns,
                                  index=['attc_001', 'attc_002', 'attc_003'])
    exp_attc_sites = exp_attc_sites.astype(dtype=self.dtype)
    exp_attC = [empty, exp_attc_sites]

    for i, integron in enumerate(integrons):
        self.assertEqual(integron.replicon.name, replicon_id)
        pdt.assert_frame_equal(integron.integrase, exp_int[i])
        pdt.assert_frame_equal(integron.attC, exp_attC[i])
        pdt.assert_frame_equal(integron.promoter, empty)
        pdt.assert_frame_equal(integron.attI, empty)
        pdt.assert_frame_equal(integron.proteins, empty)
def test_find_integron(self):
    """
    Run find_integron with no_proteins=True on the linear acba.007.p01.13
    replicon, then check the logged summary and the single integron found:
    three attC sites, and empty integrase, promoter, attI and proteins.
    """
    replicon_name = 'acba.007.p01.13'
    prot_name = 'ACBA.007.P01_13'
    replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    prot_file = self.find_data(os.path.join('Proteins', prot_name + '.prt'))
    topologies = Topology('lin')
    with FastaIterator(replicon_path) as seq_db:
        seq_db.topologies = topologies
        replicon = next(seq_db)

    results_dir = self.find_data(os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                                              'tmp_{}'.format(replicon.id)))
    attc_file = os.path.join(results_dir, '{}_attc_table.res'.format(replicon.id))
    intI_file = os.path.join(results_dir, '{}_intI.res'.format(replicon.id))
    phageI_file = os.path.join(results_dir, '{}_phage_int.res'.format(replicon.id))

    args = argparse.Namespace(no_proteins=True,
                              keep_palindromes=True,
                              distance_threshold=4000,
                              attc_model='attc_4.cm',
                              evalue_attc=1.0,
                              max_attc_size=200,
                              min_attc_size=40,
                              calin_threshold=2,
                              local_max=False)
    cfg = Config(args)
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
    prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

    # one log line per entry
    exp_msg = "\n".join([
        "In replicon {}, there are:",
        "- 0 complete integron(s) found with a total 0 attC site(s)",
        "- 1 CALIN element(s) found with a total of 3 attC site(s)",
        "- 0 In0 element(s) found with a total of 0 attC site",
    ]).format(replicon.id)

    with self.catch_log() as log:
        integrons = find_integron(replicon, prot_db, attc_file, intI_file, phageI_file, cfg)
        catch_msg = log.get_value().strip()
    self.assertEqual(catch_msg, exp_msg)

    self.assertEqual(len(integrons), 1)
    integron = integrons[0]
    self.assertEqual(integron.replicon.id, replicon.id)

    # this expected table is compared as built, without casting it to self.dtype
    exp_attc = pd.DataFrame({'annotation': ['attC'] * 3,
                             'distance_2attC': [np.nan, 1196.0, 469.0],
                             'evalue': [1.000000e-09, 1.000000e-04, 1.100000e-07],
                             'model': ['attc_4'] * 3,
                             'pos_beg': [17825, 19080, 19618],
                             'pos_end': [17884, 19149, 19726],
                             'strand': [-1, -1, -1],
                             'type_elt': 'attC'},
                            columns=self.columns,
                            index=['attc_001', 'attc_002', 'attc_003'])
    pdt.assert_frame_equal(integron.attC, exp_attc)

    empty = pd.DataFrame(columns=self.columns).astype(dtype=self.dtype)
    for element in ('integrase', 'promoter', 'attI', 'proteins'):
        pdt.assert_frame_equal(getattr(integron, element), empty)