def test_find_integrase_no_gembase_no_protfile(self):
        """
        find_integrase must raise EmptyFileError when the protein file it
        receives is empty (gembase mode off, replicon length patched to 500000).
        """
        cfg = Config(self.args)
        self.args.gembase = False
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        replicon_name = 'acba.007.p01.13'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)

        # create an empty protein file
        prot_file = os.path.join(self.tmp_dir, "foo.prt")
        with open(prot_file, 'w'):
            pass

        # The class-level patch is installed right before the try block so
        # that the finally clause never refers to unbound names when the
        # set-up above fails (which would hide the real error).
        replicon_cls = replicon.__class__
        len_ori = replicon_cls.__len__
        replicon_cls.__len__ = lambda x: 500000
        try:
            with self.catch_log():
                with self.assertRaises(EmptyFileError):
                    integrase.find_integrase(replicon.id, prot_file,
                                             self.tmp_dir, cfg)
        finally:
            # always restore the original __len__ for the other tests
            replicon_cls.__len__ = len_ori
    def test_find_integrase_gembase(self):
        """
        Run find_integrase with gembase mode on and check that the four
        expected result files exist in the temporary directory.
        """
        cfg = Config(self.args)
        self.args.gembase = True
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        name = 'acba.007.p01.13'
        fasta_path = self.find_data(os.path.join('Replicons', name + '.fst'))

        linear = Topology('lin')
        with FastaIterator(fasta_path) as seq_db:
            seq_db.topologies = linear
            replicon = next(seq_db)

        # work on a copy of the reference protein file
        prot_file = os.path.join(self.tmp_dir, name + ".prt")
        reference_prot = self.find_data(
            os.path.join('Proteins', replicon.id + ".prt"))
        shutil.copyfile(reference_prot, prot_file)

        integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)

        expected_suffixes = ('_intI.res', '_intI_table.res',
                             '_phage_int.res', '_phage_int_table.res')
        for suffix in expected_suffixes:
            result_path = os.path.join(self.tmp_dir, replicon.id + suffix)
            self.assertTrue(os.path.exists(result_path))
    def test_find_integrase_no_gembase_no_protfile_no_prodigal(self):
        """
        With the hmmsearch option set to 'foo', find_integrase must raise a
        RuntimeError whose message reports "[Errno 2] No such file or
        directory: 'foo'".
        """
        self.args.hmmsearch = 'foo'
        self.args.gembase = False
        cfg = Config(self.args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        replicon_name = 'acba.007.p01.13'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)

        prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")
        shutil.copyfile(
            self.find_data(os.path.join('Proteins', replicon.id + ".prt")),
            prot_file)

        # Patch only once everything else is ready: the finally clause must
        # never reference names that a failing set-up left unbound.
        replicon_cls = replicon.__class__
        len_ori = replicon_cls.__len__
        replicon_cls.__len__ = lambda x: 500000
        try:
            with self.assertRaises(RuntimeError) as ctx:
                integrase.find_integrase(replicon.id, prot_file, self.tmp_dir,
                                         cfg)
            # raw string: '\[' is a regex escape, not a string escape
            self.assertRegex(
                str(ctx.exception),
                r"failed : \[Errno 2\] No such file or directory: 'foo'")
        finally:
            # always restore the original __len__ for the other tests
            replicon_cls.__len__ = len_ori
예제 #4
0
    def test_getitem(self):
        """
        For each Gembase replicon, db[prot_id] must return a sequence with the
        same id and seq as the reference protein file; an unknown key must
        raise KeyError.
        """
        file_name = (('ACBA.0917.00019', '.fna'), ('ESCO001.C.00001.C001', '.fst'))
        for seq_name, ext in file_name:
            replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', seq_name + ext))
            self.args.replicon = replicon_path
            cfg = Config(self.args)
            seq_db = read_multi_prot_fasta(replicon_path)
            replicon = next(seq_db)
            os.makedirs(cfg.tmp_dir(replicon.id))

            with self.catch_log():
                db = GembaseDB(replicon, cfg)
            exp = read_multi_prot_fasta(self.find_data(os.path.join('Gembase', 'Proteins', seq_name + '.prt')))

            specie, date, strain, contig = replicon.id.split('.')
            # raw string: '\.' and '\w' are regex escapes, not string escapes
            pattern = r'{}\.{}\.{}\.\w?{}'.format(specie, date, strain, contig)

            # only the proteins whose id matches the current replicon are checked
            for prot_expected in exp:
                if re.match(pattern, prot_expected.id):
                    prot_received = db[prot_expected.id]
                    self.assertEqual(prot_received.id,
                                     prot_expected.id)
                    self.assertEqual(prot_received.seq,
                                     prot_expected.seq)
        # db is the database built during the last iteration
        with self.assertRaises(KeyError) as ctx:
            db['nimport_naoik']
        self.assertEqual(str(ctx.exception), "'nimport_naoik'")
    def test_find_integrase_no_gembase_with_protfile_empty(self):
        """
        find_integrase must raise EmptyFileError, with an explicit message,
        when the protein file is empty (gembase mode off, replicon length
        patched to 200).
        """
        cfg = Config(self.args)
        self.args.gembase = False
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        replicon_name = 'acba.007.p01.13'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)

        # create an empty protein file
        prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")
        with open(prot_file, 'w'):
            pass

        # Patch only once everything else is ready: the finally clause must
        # never reference names that a failing set-up left unbound.
        replicon_cls = replicon.__class__
        len_ori = replicon_cls.__len__
        replicon_cls.__len__ = lambda x: 200
        try:
            with self.assertRaises(EmptyFileError) as ctx:
                with self.catch_log():
                    integrase.find_integrase(replicon.id, prot_file,
                                             self.tmp_dir, cfg)
            self.assertTrue(
                re.match(
                    "^The protein file: '.*' is empty cannot perform hmmsearch on it.$",
                    str(ctx.exception)))
        finally:
            # always restore the original __len__ for the other tests
            replicon_cls.__len__ = len_ori
예제 #6
0
    def test_get_description(self):
        """
        GembaseDB.get_description must return the expected SeqDesc for known
        protein ids, raise IntegronError for an id which is not a valid Gembase
        identifier and KeyError for a well-formed but unknown id.
        """
        # SeqDesc(id, strand, start, stop)
        expected_by_file = {
            ('ACBA.0917.00019', '.fna'): {
                'ACBA.0917.00019.b0001_00001': SeqDesc('ACBA.0917.00019.b0001_00001', -1, 266, 1480),
                'ACBA.0917.00019.i0001_03957': SeqDesc('ACBA.0917.00019.i0001_03957', -1, 4043755, 4044354),
            },
        }

        for (seq_name, ext), descriptions in expected_by_file.items():
            replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', seq_name + ext))
            self.args.replicon = replicon_path
            cfg = Config(self.args)
            replicon = next(read_multi_prot_fasta(replicon_path))
            replicon.path = replicon_path
            os.makedirs(cfg.tmp_dir(replicon.id))

            db = GembaseDB(replicon, cfg)

            for seq_id, desc in descriptions.items():
                self.assertEqual(desc, db.get_description(seq_id))

        # the error cases are checked on the last database built above
        with self.assertRaises(IntegronError) as ctx:
            db.get_description('nimport_naoik')
        self.assertEqual(str(ctx.exception), "'nimport_naoik' is not a valid Gembase protein identifier.")

        with self.assertRaises(KeyError) as ctx:
            db.get_description('FOO.BAR.00019.i0001_03924')
        self.assertEqual(str(ctx.exception), "'FOO.BAR.00019.i0001_03924'")
예제 #7
0
    def test_make_protfile(self):
        """
        Check that the protein file built by GembaseDB holds, pairwise, the
        same sequence ids as the reference Gembase protein file.
        """
        cases = (('ACBA.0917.00019', '.fna', 3870),
                 ('ESCO001.C.00001.C001', '.fst', 3870))
        for seq_name, ext, seq_nb in cases:
            replicon_path = self.find_data(
                os.path.join('Gembase', 'Replicons', seq_name + ext))
            self.args.replicon = replicon_path
            cfg = Config(self.args)
            replicon = next(read_multi_prot_fasta(replicon_path))
            replicon.path = replicon_path
            os.makedirs(cfg.tmp_dir(replicon.id))

            with self.catch_log():
                db = GembaseDB(replicon, cfg)

            reference_path = self.find_data(
                os.path.join('Gembase', 'Proteins', seq_name + '.prt'))
            pairs = zip(read_multi_prot_fasta(reference_path),
                        read_multi_prot_fasta(db.protfile))
            for seq_nb, (expected, test) in enumerate(pairs, 1):
                self.assertEqual(expected.id, test.id)
            # NOTE(review): the enumerate() loop rebinds seq_nb, so the expected
            # count unpacked from `cases` is overwritten and this assertion
            # compares seq_nb with itself (always true). Confirm the expected
            # counts before comparing against them.
            self.assertEqual(seq_nb, seq_nb)
예제 #8
0
    def test_getitem(self):
        """
        For each Gembase replicon, db[prot_id] must return a sequence with the
        same id and seq as the reference protein file; an unknown key must
        raise KeyError.
        """
        file_name = (('ACBA.0917.00019', '.fna'), ('ESCO001.C.00001.C001',
                                                   '.fst'))
        for seq_name, ext in file_name:
            replicon_path = self.find_data(
                os.path.join('Gembase', 'Replicons', seq_name + ext))
            self.args.replicon = replicon_path
            cfg = Config(self.args)
            seq_db = read_multi_prot_fasta(replicon_path)
            replicon = next(seq_db)
            os.makedirs(cfg.tmp_dir(replicon.id))

            with self.catch_log():
                db = GembaseDB(replicon, cfg)
            exp = read_multi_prot_fasta(
                self.find_data(
                    os.path.join('Gembase', 'Proteins', seq_name + '.prt')))

            specie, date, strain, contig = replicon.id.split('.')
            # raw string: '\.' and '\w' are regex escapes, not string escapes
            pattern = r'{}\.{}\.{}\.\w?{}'.format(specie, date, strain, contig)

            # only the proteins whose id matches the current replicon are checked
            for prot_expected in exp:
                if re.match(pattern, prot_expected.id):
                    prot_received = db[prot_expected.id]
                    self.assertEqual(prot_received.id, prot_expected.id)
                    self.assertEqual(prot_received.seq, prot_expected.seq)
        # db is the database built during the last iteration
        with self.assertRaises(KeyError) as ctx:
            db['nimport_naoik']
        self.assertEqual(str(ctx.exception), "'nimport_naoik'")
    def test_find_integrase_no_gembase_no_protfile_short_seq(self):
        """
        Run find_integrase with gembase mode off and the replicon length
        patched to 200, then check that the four expected result files exist
        in the temporary directory.
        """
        cfg = Config(self.args)
        self.args.gembase = False
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        replicon_name = 'acba.007.p01.13'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        prot_name = 'ACBA.007.P01_13'
        prot_path = self.find_data(
            os.path.join('Proteins', prot_name + '.prt'))

        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)

        prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")
        shutil.copyfile(prot_path, prot_file)

        # Patch only once everything else is ready: the finally clause must
        # never reference names that a failing set-up left unbound.
        replicon_cls = replicon.__class__
        len_ori = replicon_cls.__len__
        replicon_cls.__len__ = lambda x: 200
        try:
            integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)
            for suffix in ('_intI.res', '_intI_table.res', '_phage_int.res',
                           '_phage_int_table.res'):
                res = os.path.join(self.tmp_dir, replicon.id + suffix)
                self.assertTrue(os.path.exists(res))
        finally:
            # always restore the original __len__ for the other tests
            replicon_cls.__len__ = len_ori
예제 #10
0
    def test_iter(self):
        """
        Iterating a GembaseDB must yield the ids of the proteins belonging to
        the replicon, for a Gembase Draft and for a Gembase Complete.
        """
        # test Gembase Draft
        seq_name = 'ACBA.0917.00019'
        ext = '.fna'
        replicon_path = self.find_data(
            os.path.join('Gembase', 'Replicons', seq_name + ext))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        seq_db = read_multi_prot_fasta(replicon_path)
        replicon = next(seq_db)
        replicon.path = replicon_path
        db = GembaseDB(replicon, cfg)

        idx = SeqIO.index(self.find_data(
            os.path.join('Gembase', 'Proteins', seq_name + '.prt')),
                          'fasta',
                          alphabet=Seq.IUPAC.extended_protein)

        specie, date, strain, contig = replicon.id.split('.')
        # raw string: '\.' and '\w' are regex escapes, not string escapes
        pattern = r'{}\.{}\.{}\.\w?{}'.format(specie, date, strain, contig)
        self.assertListEqual(sorted(i for i in idx if re.match(pattern, i)),
                             sorted(db))

        # test Gembase Complet
        seq_name = 'ESCO001.C.00001.C001'
        ext = '.fst'
        replicon_path = self.find_data(
            os.path.join('Gembase', 'Replicons', seq_name + ext))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        seq_db = read_multi_prot_fasta(replicon_path)
        replicon = next(seq_db)
        replicon.path = replicon_path
        with self.catch_log():
            db = GembaseDB(replicon, cfg)

        idx = SeqIO.index(self.find_data(
            os.path.join('Gembase', 'Proteins', seq_name + '.prt')),
                          'fasta',
                          alphabet=Seq.IUPAC.extended_protein)

        specie, date, strain, contig = replicon.id.split('.')
        pattern = r'{}\.{}\.{}\.\w?{}'.format(specie, date, strain, contig)
        seqid_from_gembase_protfile = {i for i in idx if re.match(pattern, i)}
        seqid_from_if = set(db)
        non_common_seq = seqid_from_gembase_protfile ^ seqid_from_if
        # In Gembase Complete the annotation of the lstinfo comes from genbank.
        # It sometimes happens that some CDS are not translated into proteins,
        # so in the data 3 genes from LSTINFO are not in the .prt file.
        diff = {
            'ESCO001.C.00001.C001_03974', 'ESCO001.C.00001.C001_01509',
            'ESCO001.C.00001.C001_04162'
        }
        self.assertSetEqual(non_common_seq, diff)
예제 #11
0
    def test_ProteinDB(self):
        """
        A ProdigalDB must expose, through its replicon attribute, the replicon
        it was built from.
        """
        file_name = 'acba.007.p01.13'
        replicon_path = self.find_data(os.path.join('Replicons', file_name + '.fst'))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        seq_db = read_multi_prot_fasta(replicon_path)
        replicon = next(seq_db)
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        db = ProdigalDB(replicon, cfg)
        # assertTrue(a, b) only checks the truthiness of a (b is the failure
        # message); assertEqual really compares the two ids.
        self.assertEqual(db.replicon.id, replicon.id)
예제 #12
0
    def test_ProteinDB_no_prodigal(self):
        """
        Building a ProdigalDB must raise RuntimeError when the prodigal option
        is None.
        """
        name = 'acba.007.p01.13'
        fasta_path = self.find_data(os.path.join('Replicons', name + '.fst'))
        self.args.replicon = fasta_path
        cfg = Config(self.args)
        replicon = next(read_multi_prot_fasta(fasta_path))
        replicon.path = fasta_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        # the option is removed after the Config has been created
        self.args.prodigal = None
        with self.assertRaises(RuntimeError):
            ProdigalDB(replicon, cfg)
예제 #13
0
    def test_ProteinDB(self):
        """
        A ProdigalDB must expose, through its replicon attribute, the replicon
        it was built from.
        """
        file_name = 'acba.007.p01.13'
        replicon_path = self.find_data(
            os.path.join('Replicons', file_name + '.fst'))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        seq_db = read_multi_prot_fasta(replicon_path)
        replicon = next(seq_db)
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        db = ProdigalDB(replicon, cfg)
        # assertTrue(a, b) only checks the truthiness of a (b is the failure
        # message); assertEqual really compares the two ids.
        self.assertEqual(db.replicon.id, replicon.id)
예제 #14
0
    def test_protfile(self):
        """
        ProdigalDB.protfile must be 'ACBA.007.P01_13.prt' located in the
        temporary directory of the replicon.
        """
        name = 'acba.007.p01.13'
        expected_basename = 'ACBA.007.P01_13.prt'
        fasta_path = self.find_data(os.path.join('Replicons', name + '.fst'))
        self.args.replicon = fasta_path
        cfg = Config(self.args)
        replicon = next(read_multi_prot_fasta(fasta_path))
        replicon.path = fasta_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        db = ProdigalDB(replicon, cfg)
        expected_path = os.path.join(cfg.tmp_dir(replicon.id), expected_basename)
        self.assertEqual(expected_path, db.protfile)
예제 #15
0
    def test_protfile(self):
        """
        GembaseDB.protfile must be '<replicon id>.prt' located in the temporary
        directory of the replicon, for a Gembase Draft and a Gembase Complete.
        """
        cases = (('ACBA.0917.00019', '.fna'), ('ESCO001.C.00001.C001', '.fst'))
        for seq_name, ext in cases:
            fasta_path = self.find_data(os.path.join('Gembase', 'Replicons', seq_name + ext))
            self.args.replicon = fasta_path
            cfg = Config(self.args)
            replicon = next(read_multi_prot_fasta(fasta_path))
            replicon.path = fasta_path
            os.makedirs(cfg.tmp_dir(replicon.id))

            with self.catch_log():
                db = GembaseDB(replicon, cfg)
            expected_path = os.path.join(cfg.tmp_dir(replicon.id), replicon.id + '.prt')
            self.assertEqual(expected_path, db.protfile)
예제 #16
0
    def test_ProteinDB_no_prodigal(self):
        """
        Building a ProdigalDB must raise RuntimeError when the prodigal option
        is None.
        """
        name = 'acba.007.p01.13'
        fasta_path = self.find_data(
            os.path.join('Replicons', name + '.fst'))
        self.args.replicon = fasta_path
        cfg = Config(self.args)
        replicon = next(read_multi_prot_fasta(fasta_path))
        replicon.path = fasta_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        # the option is removed after the Config has been created
        self.args.prodigal = None
        with self.assertRaises(RuntimeError):
            ProdigalDB(replicon, cfg)
예제 #17
0
    def test_protfile(self):
        """
        ProdigalDB.protfile must be 'ACBA.007.P01_13.prt' located in the
        temporary directory of the replicon.
        """
        name = 'acba.007.p01.13'
        expected_basename = 'ACBA.007.P01_13.prt'
        fasta_path = self.find_data(
            os.path.join('Replicons', name + '.fst'))
        self.args.replicon = fasta_path
        cfg = Config(self.args)
        replicon = next(read_multi_prot_fasta(fasta_path))
        replicon.path = fasta_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        db = ProdigalDB(replicon, cfg)
        expected_path = os.path.join(cfg.tmp_dir(replicon.id),
                                     expected_basename)
        self.assertEqual(expected_path, db.protfile)
예제 #18
0
    def test_ProteinDB(self):
        """
        A GembaseDB must expose, through its replicon attribute, the replicon
        it was built from (Gembase Draft and Gembase Complete).
        """
        # From Gembase Draft , Gembase Complete
        file_names = ('ACBA.0917.00019.fna', 'ESCO001.C.00001.C001.fst')
        for file_name in file_names:
            replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', file_name))
            self.args.replicon = replicon_path
            cfg = Config(self.args)
            seq_db = read_multi_prot_fasta(replicon_path)
            replicon = next(seq_db)
            replicon.path = replicon_path
            os.makedirs(cfg.tmp_dir(replicon.id))

            with self.catch_log():
                db = GembaseDB(replicon, cfg)
            # assertTrue(a, b) only checks the truthiness of a (b is the
            # failure message); assertEqual really compares the two ids.
            self.assertEqual(db.replicon.id, replicon.id)
예제 #19
0
    def test_ProteinDB(self):
        """
        A GembaseDB must expose, through its replicon attribute, the replicon
        it was built from (Gembase Draft and Gembase Complete).
        """
        # From Gembase Draft , Gembase Complete
        file_names = ('ACBA.0917.00019.fna', 'ESCO001.C.00001.C001.fst')
        for file_name in file_names:
            replicon_path = self.find_data(
                os.path.join('Gembase', 'Replicons', file_name))
            self.args.replicon = replicon_path
            cfg = Config(self.args)
            seq_db = read_multi_prot_fasta(replicon_path)
            replicon = next(seq_db)
            replicon.path = replicon_path
            os.makedirs(cfg.tmp_dir(replicon.id))

            with self.catch_log():
                db = GembaseDB(replicon, cfg)
            # assertTrue(a, b) only checks the truthiness of a (b is the
            # failure message); assertEqual really compares the two ids.
            self.assertEqual(db.replicon.id, replicon.id)
예제 #20
0
    def setUp(self):
        """
        Prepare the fixture: location of the integron home, names and dtypes of
        the expected columns, a Config built from an empty Namespace and the
        path of the test data directory.
        """
        env_home = os.environ.get('INTEGRON_HOME')
        self.local_install = env_home is not None
        if self.local_install:
            self.integron_home = env_home
        else:
            # fall back on the directory two levels above this file
            here = os.path.dirname(__file__)
            self.integron_home = os.path.normpath(
                os.path.abspath(os.path.join(here, '..', '..')))

        self.columns = ['pos_beg', 'pos_end', 'strand', 'evalue',
                        'type_elt', 'model', 'distance_2attC', 'annotation']
        self.dtype = {"pos_beg": 'int',
                      "pos_end": 'int',
                      "strand": 'int',
                      "evalue": 'float',
                      "type_elt": 'str',
                      "annotation": 'str',
                      "model": 'str',
                      "distance_2attC": 'float'}
        self.cfg = Config(argparse.Namespace())
        self._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
예제 #21
0
    def test_iter(self):
        """
        Iterating a ProdigalDB must yield, pairwise, the same sequence ids as
        the index of the reference protein file.
        """
        name = 'acba.007.p01.13'
        prot_basename = 'ACBA.007.P01_13.prt'
        fasta_path = self.find_data(os.path.join('Replicons', name + '.fst'))
        self.args.replicon = fasta_path
        cfg = Config(self.args)
        replicon = next(read_multi_prot_fasta(fasta_path))
        replicon.path = fasta_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        db = ProdigalDB(replicon, cfg)
        reference_path = self.find_data(os.path.join('Proteins', prot_basename))
        reference_idx = SeqIO.index(reference_path, 'fasta',
                                    alphabet=Seq.IUPAC.extended_protein)
        for expected_id, received_id in zip(reference_idx, db):
            self.assertEqual(expected_id, received_id)
예제 #22
0
    def test_get_description(self):
        """
        ProdigalDB.get_description must return the expected SeqDesc for two
        known protein ids.
        """
        name = 'acba.007.p01.13'
        fasta_path = self.find_data(os.path.join('Replicons', name + '.fst'))
        self.args.replicon = fasta_path
        cfg = Config(self.args)
        replicon = next(read_multi_prot_fasta(fasta_path))
        replicon.path = fasta_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        db = ProdigalDB(replicon, cfg)

        # SeqDesc(id, strand, start, stop)
        expected = {
            'ACBA.007.P01_13_23': SeqDesc('ACBA.007.P01_13_23', -1, 19721, 20254),
            'ACBA.007.P01_13_1': SeqDesc('ACBA.007.P01_13_1', 1, 55, 1014),
        }
        for seq_id in expected:
            self.assertEqual(expected[seq_id], db.get_description(seq_id))
예제 #23
0
    def test_iter(self):
        """
        Iterating a ProdigalDB must yield, pairwise, the same sequence ids as
        the index of the reference protein file.
        """
        name = 'acba.007.p01.13'
        prot_basename = 'ACBA.007.P01_13.prt'
        fasta_path = self.find_data(
            os.path.join('Replicons', name + '.fst'))
        self.args.replicon = fasta_path
        cfg = Config(self.args)
        replicon = next(read_multi_prot_fasta(fasta_path))
        replicon.path = fasta_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        db = ProdigalDB(replicon, cfg)
        reference_path = self.find_data(
            os.path.join('Proteins', prot_basename))
        reference_idx = SeqIO.index(reference_path,
                                    'fasta',
                                    alphabet=Seq.IUPAC.extended_protein)
        for expected_id, received_id in zip(reference_idx, db):
            self.assertEqual(expected_id, received_id)
예제 #24
0
    def test_protfile(self):
        """
        GembaseDB.protfile must be '<replicon id>.prt' located in the temporary
        directory of the replicon, for a Gembase Draft and a Gembase Complete.
        """
        cases = (('ACBA.0917.00019', '.fna'),
                 ('ESCO001.C.00001.C001', '.fst'))
        for seq_name, ext in cases:
            fasta_path = self.find_data(
                os.path.join('Gembase', 'Replicons', seq_name + ext))
            self.args.replicon = fasta_path
            cfg = Config(self.args)
            replicon = next(read_multi_prot_fasta(fasta_path))
            replicon.path = fasta_path
            os.makedirs(cfg.tmp_dir(replicon.id))

            with self.catch_log():
                db = GembaseDB(replicon, cfg)
            expected_path = os.path.join(cfg.tmp_dir(replicon.id),
                                         replicon.id + '.prt')
            self.assertEqual(expected_path, db.protfile)
예제 #25
0
    def test_find_gembase_file_basename_file_not_in_gembase(self):
        """
        Test _find_gembase_file_basename for replicon files which are not
        located in the gembase and whose name results from a split operation:

          - a file containing one contig
          - a file representing a chunk

        Also check that GembaseDB raises FileNotFoundError when no lst file
        matches the replicon.
        """
        gembase_path = self.find_data('Gembase')

        chunk_path = os.path.join(self.tmp_dir, 'ESCO001.C.00001.C001_chunk_1.fst')
        file_names = {
            'ACBA.0917.00019': self.find_data(os.path.join('Replicons', 'ACBA.0917.00019.0001.fst')),
            'ESCO001.C.00001.C001.fst': chunk_path,
        }

        # the chunk is simulated by a copy of the whole replicon file
        shutil.copyfile(os.path.join(gembase_path, 'Replicons', 'ESCO001.C.00001.C001.fst'),
                        chunk_path)

        for base_file_name, replicon_path in file_names.items():
            self.args.replicon = replicon_path
            self.args.gembase_path = gembase_path
            cfg = Config(self.args)
            replicon = next(read_multi_prot_fasta(replicon_path))
            replicon.path = replicon_path
            os.makedirs(cfg.tmp_dir(replicon.id))

            with self.catch_log():
                db = GembaseDB(replicon, cfg, gembase_path=gembase_path)
            # NOTE(review): assertTrue() takes (expr, msg), so base_file_name is
            # only used as the failure message and the returned basename is
            # never compared to it. Before switching to assertEqual, check the
            # expected keys (one of them ends with '.fst').
            self.assertTrue(db._find_gembase_file_basename(gembase_path, replicon_path),
                            base_file_name)

        # a replicon without any matching lst file in the gembase
        replicon_path = self.find_data(os.path.join('Replicons', 'acba.007.p01.13.fst'))
        self.args.replicon = replicon_path
        self.args.gembase_path = gembase_path
        cfg = Config(self.args)
        replicon = next(read_multi_prot_fasta(replicon_path))
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        with self.assertRaises(FileNotFoundError) as ctx:
            with self.catch_log():
                GembaseDB(replicon, cfg, gembase_path=gembase_path)
        expected_msg = 'cannot find lst file matching {} sequence'.format(replicon_path)
        self.assertEqual(str(ctx.exception), expected_msg)
예제 #26
0
    def test_read_hmm_evalue(self):
        """
        Check that read_hmm reports the hit when the evalue threshold is
        1.95e-25 and returns an empty, correctly typed, table when the
        threshold is 1.9e-25.
        """
        rep_name = "acba.007.p01.13"
        replicon_id = 'ACBA.007.P01_13'
        columns = ["Accession_number", "query_name", "ID_query", "ID_prot",
                   "strand", "pos_beg", "pos_end", "evalue"]

        replicon_path = self.find_data(
            os.path.join('Replicons', rep_name + '.fst'))
        prot_file = self.find_data(
            os.path.join('Proteins', replicon_id + '.prt'))

        args = argparse.Namespace(gembase=False, replicon=replicon_path)
        cfg = Config(args)

        replicon = next(read_multi_prot_fasta(replicon_path))
        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

        infile = self.find_data(
            os.path.join("Results_Integron_Finder_{}".format(rep_name),
                         "tmp_{}".format(replicon_id),
                         "{}_intI.res".format(replicon_id)))

        # threshold just above the evalue of the hit: the hit is kept
        received_hit = read_hmm(rep_name, prot_db, infile, cfg, evalue=1.95e-25)
        expected_hit = pd.DataFrame(
            data={"Accession_number": rep_name,
                  "query_name": "intI_Cterm",
                  "ID_query": "-",
                  "ID_prot": "ACBA.007.P01_13_1",
                  "strand": 1,
                  "pos_beg": 55,
                  "pos_end": 1014,
                  "evalue": 1.9e-25},
            index=[0])
        expected_hit = expected_hit[columns]
        pdt.assert_frame_equal(received_hit, expected_hit)

        # threshold equal to the evalue of the hit: nothing is returned
        received_empty = read_hmm(replicon_id, prot_db, infile, cfg, evalue=1.9e-25)
        expected_empty = pd.DataFrame(columns=columns)

        int_columns = ["pos_beg", "pos_end", "strand"]
        float_columns = ["evalue"]
        expected_empty[int_columns] = expected_empty[int_columns].astype(int)
        expected_empty[float_columns] = expected_empty[float_columns].astype(float)
        pdt.assert_frame_equal(received_empty, expected_empty)
    def test_find_integron_calin_threshold(self):
        """
        find_integron must return 2 integrons when calin_threshold is 2 and
        only 1 when calin_threshold is 3.
        """
        replicon_name = 'ESCO001.B.00018.P002'
        replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
        prot_file = self.find_data(os.path.join('Proteins', replicon_name + '.prt'))
        circular = Topology('circ')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = circular
            replicon = next(sequences_db)

        # pre-computed results used as input of find_integron
        results_dir = self.find_data(
            os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                         'tmp_{}'.format(replicon.id)))
        attc_file = os.path.join(results_dir, '{}_attc_table.res'.format(replicon.id))
        intI_file = os.path.join(results_dir, '{}_intI.res'.format(replicon.id))
        phageI_file = os.path.join(results_dir, '{}_phage_int.res'.format(replicon.id))

        args = argparse.Namespace(
            no_proteins=False,
            keep_palindromes=True,
            distance_threshold=4000,
            attc_model='attc_4.cm',
            evalue_attc=1.0,
            max_attc_size=200,
            min_attc_size=40,
            local_max=False,
            gembase=False,
            union_integrases=False,
            calin_threshold=2,
        )
        data_dir = os.path.join(os.path.dirname(__file__), 'data')

        cfg = Config(args)
        cfg._prefix_data = data_dir
        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

        with self.catch_log():
            integrons = find_integron(replicon, prot_db, attc_file, intI_file, phageI_file, cfg)
        self.assertEqual(len(integrons), 2)

        # same search with a stricter threshold
        args.calin_threshold = 3
        cfg = Config(args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        with self.catch_log():
            integrons = find_integron(replicon, prot_db, attc_file, intI_file, phageI_file, cfg)
        self.assertEqual(len(integrons), 1)
예제 #28
0
    def test_make_protfile(self):
        """
        Check that the protein file built by ProdigalDB holds, pairwise, the
        same sequence ids as the reference protein file and that 23 pairs
        were compared.
        """
        name = 'acba.007.p01.13'
        prot_basename = 'ACBA.007.P01_13.prt'

        fasta_path = self.find_data(os.path.join('Replicons', name + '.fst'))
        self.args.replicon = fasta_path
        cfg = Config(self.args)
        replicon = next(read_multi_prot_fasta(fasta_path))
        replicon.path = fasta_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        db = ProdigalDB(replicon, cfg)
        reference_path = self.find_data(os.path.join('Proteins', prot_basename))
        pairs = zip(read_multi_prot_fasta(reference_path),
                    read_multi_prot_fasta(db.protfile))
        for seq_nb, (expected, test) in enumerate(pairs, 1):
            self.assertEqual(expected.id, test.id)
        self.assertEqual(seq_nb, 23)
예제 #29
0
    def test_make_protfile(self):
        """
        Check that the protein file built by GembaseDB holds, pairwise, the
        same sequence ids as the reference Gembase protein file.
        """
        cases = (('ACBA.0917.00019', '.fna', 3870), ('ESCO001.C.00001.C001', '.fst', 3870))
        for seq_name, ext, seq_nb in cases:
            fasta_path = self.find_data(os.path.join('Gembase', 'Replicons', seq_name + ext))
            self.args.replicon = fasta_path
            cfg = Config(self.args)
            replicon = next(read_multi_prot_fasta(fasta_path))
            replicon.path = fasta_path
            os.makedirs(cfg.tmp_dir(replicon.id))

            with self.catch_log():
                db = GembaseDB(replicon, cfg)

            reference_path = self.find_data(os.path.join('Gembase', 'Proteins', seq_name + '.prt'))
            pairs = zip(read_multi_prot_fasta(reference_path),
                        read_multi_prot_fasta(db.protfile))
            for seq_nb, (expected, test) in enumerate(pairs, 1):
                self.assertEqual(expected.id, test.id)
            # NOTE(review): the enumerate() loop rebinds seq_nb, so the expected
            # count unpacked from `cases` is overwritten and this assertion
            # compares seq_nb with itself (always true). Confirm the expected
            # counts before comparing against them.
            self.assertEqual(seq_nb, seq_nb)
예제 #30
0
    def test_find_gembase_file_basename(self):
        """
        Test that _find_gembase_file_basename returns the right basename
        (the replicon file name without its extension) for files located
        in the gembase.
        """
        gembase_path = self.find_data('Gembase')
        file_names = ('ACBA.0917.00019.fna', 'ESCO001.C.00001.C001.fst')
        for file_name in file_names:
            replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', file_name))
            self.args.replicon = replicon_path
            cfg = Config(self.args)
            seq_db = read_multi_prot_fasta(replicon_path)
            replicon = next(seq_db)
            replicon.path = replicon_path
            os.makedirs(cfg.tmp_dir(replicon.id))

            with self.catch_log():
                db = GembaseDB(replicon, cfg)
            # assertTrue(a, b) only checks the truthiness of a (b is the
            # failure message); assertEqual really compares the basename
            # with the expected one.
            self.assertEqual(db._find_gembase_file_basename(gembase_path, replicon_path),
                             os.path.splitext(file_name)[0])
예제 #31
0
    def test_find_integrase_gembase_hmmer_error(self):
        """
        With the ``cpu`` option set to 'foo', find_integrase must raise a
        RuntimeError whose message ends with 'failed return code = 1'.
        """
        data_prefix = os.path.join(os.path.dirname(__file__), 'data')
        self.args.gembase = True
        self.args.cpu = 'foo'
        cfg = Config(self.args)
        cfg._prefix_data = data_prefix

        replicon_name = 'acba.007.p01.13'
        replicon_path = os.path.join(self._data_dir, 'Replicons', replicon_name + '.fst')
        linear = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = linear
            replicon = next(sequences_db)

        # Work on a copy of the reference protein file placed in the temporary directory
        prt_basename = replicon.id + ".prt"
        prot_file = os.path.join(self.tmp_dir, prt_basename)
        shutil.copyfile(os.path.join(self._data_dir, 'Proteins', prt_basename), prot_file)

        with self.assertRaises(RuntimeError) as ctx:
            integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)
        error_msg = str(ctx.exception)
        self.assertTrue(error_msg.endswith('failed return code = 1'))
예제 #32
0
    def test_get_description(self):
        """
        ProdigalDB.get_description must return the expected
        SeqDesc(id, strand, start, stop) for known protein identifiers.
        """
        file_name = 'acba.007.p01.13'
        replicon_path = self.find_data(os.path.join('Replicons', file_name + '.fst'))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        fasta_reader = read_multi_prot_fasta(replicon_path)
        replicon = next(fasta_reader)
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        db = ProdigalDB(replicon, cfg)

        # (protein id, expected description) pairs
        expectations = [
            ('ACBA.007.P01_13_23', SeqDesc('ACBA.007.P01_13_23', -1, 19721, 20254)),
            ('ACBA.007.P01_13_1', SeqDesc('ACBA.007.P01_13_1', 1, 55, 1014)),
        ]
        for seq_id, expected_desc in expectations:
            self.assertEqual(expected_desc, db.get_description(seq_id))
예제 #33
0
    def setUp(self):
        """
        Build the fixture shared by the tests.

        - resolve ``integron_home`` from the INTEGRON_HOME environment
          variable when set, otherwise from the location of this file;
        - (re)create an empty temporary directory;
        - build a Config, read the first sequence of the
          'OBAL001.B.00005.C001' replicon with a 'lin' topology and create
          an Integron from them;
        - store column names and dtypes for later use.
        """
        self.local_install = 'INTEGRON_HOME' in os.environ
        if self.local_install:
            self.integron_home = os.environ['INTEGRON_HOME']
        else:
            here = os.path.dirname(__file__)
            self.integron_home = os.path.normpath(os.path.abspath(os.path.join(here, '..', '..')))

        # Always start from an empty temporary directory
        self.tmp_dir = os.path.join(tempfile.gettempdir(), 'tmp_test_integron_finder')
        if os.path.isdir(self.tmp_dir):
            shutil.rmtree(self.tmp_dir)
        os.makedirs(self.tmp_dir)

        args = argparse.Namespace(
            attc_model='attc_4.cm',
            max_attc_size=200,
            min_attc_size=40,
            distance_threshold=4000,  # (4kb at least between 2 different arrays)
            eagle_eyes=False,
            local_max=False,
        )
        self.cfg = Config(args)
        self.cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        replicon_name = 'OBAL001.B.00005.C001'
        replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))

        linear = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = linear
            self.replicon = next(sequences_db)

        self.integron = Integron(self.replicon, self.cfg)

        self.columns = ['pos_beg', 'pos_end', 'strand', 'evalue', 'type_elt', 'model', 'distance_2attC', 'annotation']
        self.dtype = dict(pos_beg='int',
                          pos_end='int',
                          strand='int',
                          evalue='float',
                          type_elt='str',
                          annotation='str',
                          model='str',
                          distance_2attC='float')

        self.max_cols = ['Accession_number', 'cm_attC', 'cm_debut', 'cm_fin', 'pos_beg', 'pos_end', 'sens', 'evalue']
        # one dtype per entry of max_cols, in the same order
        max_types = ('str', 'str', 'int', 'int', 'int', 'int', 'str', 'float')
        self.max_dtype = dict(zip(self.max_cols, max_types))
예제 #34
0
    def test_getitem(self):
        """
        Indexing a ProdigalDB with a protein id must give a sequence with the
        same id and seq as in the reference protein file; indexing it with
        'nimport_naoik' must raise a KeyError.
        """
        file_name = 'acba.007.p01.13'
        prot_name = 'ACBA.007.P01_13.prt'
        replicon_path = self.find_data(os.path.join('Replicons', file_name + '.fst'))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        fasta_reader = read_multi_prot_fasta(replicon_path)
        replicon = next(fasta_reader)
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        db = ProdigalDB(replicon, cfg)
        reference_path = self.find_data(os.path.join('Proteins', prot_name))
        for reference in read_multi_prot_fasta(reference_path):
            fetched = db[reference.id]
            self.assertEqual(fetched.id, reference.id)
            self.assertEqual(fetched.seq, reference.seq)

        with self.assertRaises(KeyError) as ctx:
            db['nimport_naoik']
        raised_msg = str(ctx.exception)
        self.assertEqual(raised_msg, "'nimport_naoik'")
예제 #35
0
    def test_find_gembase_file_basename(self):
        """
        Test that _find_gembase_file_basename gets the right basename
        for files located in the gembase.

        The expected basename is the replicon file name without its extension.
        """
        gembase_path = self.find_data('Gembase')
        file_names = ('ACBA.0917.00019.fna', 'ESCO001.C.00001.C001.fst')
        for file_name in file_names:
            replicon_path = self.find_data(
                os.path.join('Gembase', 'Replicons', file_name))
            self.args.replicon = replicon_path
            cfg = Config(self.args)
            seq_db = read_multi_prot_fasta(replicon_path)
            replicon = next(seq_db)
            replicon.path = replicon_path
            os.makedirs(cfg.tmp_dir(replicon.id))

            with self.catch_log():
                db = GembaseDB(replicon, cfg)
            # assertEqual rather than assertTrue: the second argument of
            # assertTrue is only the failure message, so nothing was compared.
            self.assertEqual(
                db._find_gembase_file_basename(gembase_path, replicon_path),
                os.path.splitext(file_name)[0])
예제 #36
0
    def test_make_protfile_no_prodigal(self):
        """
        Creating a ProdigalDB must raise a RuntimeError when the ``prodigal``
        option is set to 'foo_bar'.
        """
        file_name = 'acba.007.p01.13'
        replicon_path = self.find_data(os.path.join('Replicons', file_name + '.fst'))
        self.args.replicon = replicon_path
        self.args.prodigal = 'foo_bar'
        cfg = Config(self.args)
        fasta_reader = read_multi_prot_fasta(replicon_path)
        replicon = next(fasta_reader)
        replicon.path = replicon_path

        # callable form of assertRaises: ProdigalDB(replicon, cfg) must raise
        self.assertRaises(RuntimeError, ProdigalDB, replicon, cfg)
예제 #37
0
    def test_make_protfile(self):
        """
        The protein file exposed by ProdigalDB must list the same sequence
        ids, in the same order, as the reference protein file; 23 pairs of
        sequences are expected to be compared.
        """
        file_name = 'acba.007.p01.13'
        prot_name = 'ACBA.007.P01_13.prt'

        replicon_path = self.find_data(os.path.join('Replicons', file_name + '.fst'))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        fasta_reader = read_multi_prot_fasta(replicon_path)
        replicon = next(fasta_reader)
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        db = ProdigalDB(replicon, cfg)
        reference_seqs = read_multi_prot_fasta(self.find_data(os.path.join('Proteins', prot_name)))
        produced_seqs = read_multi_prot_fasta(db.protfile)
        # pairs are numbered from 1, so seq_nb ends up being the number of pairs
        for seq_nb, (expected, test) in enumerate(zip(reference_seqs, produced_seqs), 1):
            self.assertEqual(expected.id, test.id)
        self.assertEqual(seq_nb, 23)
예제 #38
0
    def test_getitem(self):
        """
        Each protein of the reference file must be retrievable from the
        ProdigalDB by its id, with identical id and seq; an id which is not
        in the db ('nimport_naoik') must raise a KeyError.
        """
        file_name = 'acba.007.p01.13'
        prot_name = 'ACBA.007.P01_13.prt'
        fst_rel_path = os.path.join('Replicons', file_name + '.fst')
        replicon_path = self.find_data(fst_rel_path)
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        replicon_reader = read_multi_prot_fasta(replicon_path)
        replicon = next(replicon_reader)
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        db = ProdigalDB(replicon, cfg)
        prt_rel_path = os.path.join('Proteins', prot_name)
        expected_proteins = read_multi_prot_fasta(self.find_data(prt_rel_path))
        for wanted in expected_proteins:
            got = db[wanted.id]
            self.assertEqual(got.id, wanted.id)
            self.assertEqual(got.seq, wanted.seq)

        with self.assertRaises(KeyError) as ctx:
            db['nimport_naoik']
        self.assertEqual(str(ctx.exception), "'nimport_naoik'")
예제 #39
0
    def test_find_integrase_gembase_no_hmmer_no_replicon(self):
        """
        find_integrase must raise a RuntimeError with an explicit message
        when it is given the path of a protein file which this test never
        creates (the ``hmmsearch`` option is also set to 'foo').
        """
        self.args.gembase = True
        self.args.hmmsearch = 'foo'
        cfg = Config(self.args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        replicon_name = 'acba.007.p01.13'
        replicon_path = os.path.join(self._data_dir, 'Replicons', replicon_name + '.fst')
        linear = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = linear
            replicon = next(sequences_db)

        # only the path is computed here, nothing is written at this location
        prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")
        expected_msg = "The protein file: '{}' does not exists cannot perform hmmsearch on it.".format(prot_file)

        with self.catch_log():
            with self.assertRaises(RuntimeError) as ctx:
                integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)
            self.assertEqual(expected_msg, str(ctx.exception))
예제 #40
0
    def test_add_proteins(self):
        """
        Integron.add_proteins must set ``integron.proteins`` to the four
        expected proteins, given seven attC sites and the short protein file
        of the 'pssu.001.c01.13' replicon.
        """
        replicon_name = 'pssu.001.c01.13'
        replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
        linear = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = linear
            replicon = next(sequences_db)

        prot_file = os.path.join(self._data_dir, '{}.prt.short'.format(replicon_name))

        args = argparse.Namespace(gembase=False, annot_parser_name=None)
        cfg = Config(args)
        integron = Integron(replicon, cfg)

        # --- attC sites given to the integron ---
        attc_begins = [3072863, 3073496, 3074121, 3075059, 3075593, 3076281, 3076659]
        attc_ends = [3072931, 3073555, 3074232, 3075118, 3075652, 3076340, 3076718]
        nb_attc = len(attc_begins)
        data_attc = {"pos_beg": attc_begins,
                     "pos_end": attc_ends,
                     "strand": [-1] * nb_attc,
                     "evalue": [2.5e-06, 7e-08, 6.5e-08, 3.2e-06, 4.1e-07, 1.4e-08, 4e-08],
                     "type_elt": ['attC'] * nb_attc,
                     "annotation": ['attC'] * nb_attc,
                     "model": ['attc_4'] * nb_attc,
                     "distance_2attC": [np.nan, 565.0, 566.0, 827.0, 475.0, 629.0, 319.0]}
        attc_index = ['attc_00{}'.format(i) for i in range(nb_attc)]
        attC = pd.DataFrame(data_attc, columns=self.columns, index=attc_index)
        integron.attC = attC.astype(dtype=self.dtype)

        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)
        integron.add_proteins(prot_db)

        # --- proteins expected to be attached to the integron ---
        nb_prot = 4
        data_prot = {'pos_beg': [3071974, 3072950, 3074243, 3076720],
                     'pos_end': [3072855, 3073468, 3075055, 3077511],
                     'strand': [-1] * nb_prot,
                     'evalue': [np.nan] * nb_prot,
                     'type_elt': ['protein'] * nb_prot,
                     'annotation': ['protein'] * nb_prot,
                     'model': ['NA'] * nb_prot,
                     'distance_2attC': [np.nan] * nb_prot}
        prot_index = ['PSSU.001.C01_13_281{}'.format(i) for i in range(5, 9)]
        exp_proteins = pd.DataFrame(data_prot, index=prot_index, columns=self.columns)
        exp_proteins = exp_proteins.astype(dtype=self.dtype)

        # compare regardless of the row order
        pdt.assert_frame_equal(exp_proteins.sort_index(), integron.proteins.sort_index())
예제 #41
0
    def test_get_description(self):
        """
        GembaseDB.get_description must return the expected
        SeqDesc(id, strand, start, stop) for known protein identifiers,
        raise an IntegronError for 'nimport_naoik' and a KeyError for
        'FOO.BAR.00019.i0001_03924'.
        """
        # (replicon basename, extension) -> {protein id: expected description}
        file_name = {
            ('ACBA.0917.00019', '.fna'): {
                'ACBA.0917.00019.b0001_00001': SeqDesc('ACBA.0917.00019.b0001_00001', -1, 266, 1480),
                'ACBA.0917.00019.i0001_03957': SeqDesc('ACBA.0917.00019.i0001_03957', -1, 4043755, 4044354),
            },
        }

        for (seq_name, ext), descriptions in file_name.items():
            replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', seq_name + ext))
            self.args.replicon = replicon_path
            cfg = Config(self.args)
            fasta_reader = read_multi_prot_fasta(replicon_path)
            replicon = next(fasta_reader)
            replicon.path = replicon_path
            os.makedirs(cfg.tmp_dir(replicon.id))

            db = GembaseDB(replicon, cfg)

            for seq_id in descriptions:
                self.assertEqual(descriptions[seq_id], db.get_description(seq_id))

        # The checks below reuse the db built during the last loop iteration
        with self.assertRaises(IntegronError) as ctx:
            db.get_description('nimport_naoik')
        self.assertEqual(str(ctx.exception),
                         "'nimport_naoik' is not a valid Gembase protein identifier.")

        with self.assertRaises(KeyError) as ctx:
            db.get_description('FOO.BAR.00019.i0001_03924')
        self.assertEqual(str(ctx.exception), "'FOO.BAR.00019.i0001_03924'")
예제 #42
0
    def test_read_multi(self):
        """
        Test reading hmm results when there are multiple hits: 2 hits on the same protein: keep
        only the one with the best evalue. 2 hits on 2 different proteins: keep the 2 proteins.
        """
        replicon_id = 'ACBA.0917.00019'
        contig_id = 'ACBA.0917.00019.0001'
        result_dir_expected = self.find_data("Results_Integron_Finder_{}.gembase".format(replicon_id))
        replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', replicon_id + '.fna'))
        contig_tmp_dir = os.path.join(result_dir_expected, "tmp_{}".format(contig_id))
        prot_file = os.path.join(contig_tmp_dir, contig_id + '.prt')

        args = argparse.Namespace(gembase=True, replicon=replicon_path)
        cfg = Config(args)

        sequences_db = read_multi_prot_fasta(replicon_path)
        replicon = next(sequences_db)
        prot_db = GembaseDB(replicon, cfg, prot_file=prot_file)

        infile = self.find_data(os.path.join('fictive_results', "{}_intI_multi.res".format(contig_id)))

        df = read_hmm(contig_id, prot_db, infile, cfg)

        # two hits are expected, one per protein
        nb_hits = 2
        column_order = ["Accession_number", "query_name", "ID_query", "ID_prot",
                        "strand", "pos_beg", "pos_end", "evalue"]
        exp_data = {
            "Accession_number": [contig_id] * nb_hits,
            "query_name": ["Phage_integrase"] * nb_hits,
            "ID_query": ["PF00589.16"] * nb_hits,
            "ID_prot": ["ACBA.0917.00019.i0001_00298", "ACBA.0917.00019.i0001_00338"],
            "strand": [-1, -1],
            "pos_beg": [311597, 350328],
            "pos_end": [312631, 351248],
            "evalue": [5.5e-66, 3.4e-51],
        }
        exp = pd.DataFrame(data=exp_data, index=[0, 1])
        exp = exp[column_order]
        pdt.assert_frame_equal(df, exp)
예제 #43
0
    def test_find_gembase_file_basename_file_not_in_gembase(self):
        """
        Test that _find_gembase_file_basename gets the right basename
        for files not located in the gembase, when the file name is the
        output of a split operation:

        - a file containing one contig
        - a file representing a chunk

        Also check that building a GembaseDB for a replicon without a
        matching lst file raises a FileNotFoundError.
        """
        gembase_path = self.find_data('Gembase')

        # Put a chunk-named copy of a gembase replicon outside the gembase
        chunk_path = os.path.join(self.tmp_dir, 'ESCO001.C.00001.C001_chunk_1.fst')
        shutil.copyfile(
            os.path.join(gembase_path, 'Replicons', 'ESCO001.C.00001.C001.fst'),
            chunk_path)

        # expected gembase basename -> path of the replicon file
        # NOTE(review): the chunk entry was keyed 'ESCO001.C.00001.C001.fst';
        # the basename is assumed to carry no extension, as for files located
        # in the gembase - confirm against _find_gembase_file_basename.
        file_names = {
            'ACBA.0917.00019':
            self.find_data(
                os.path.join('Replicons', 'ACBA.0917.00019.0001.fst')),
            'ESCO001.C.00001.C001':
            chunk_path
        }

        for base_file_name, replicon_path in file_names.items():
            self.args.replicon = replicon_path
            self.args.gembase_path = gembase_path
            cfg = Config(self.args)
            seq_db = read_multi_prot_fasta(replicon_path)
            replicon = next(seq_db)
            replicon.path = replicon_path
            os.makedirs(cfg.tmp_dir(replicon.id))

            with self.catch_log():
                db = GembaseDB(replicon, cfg, gembase_path=gembase_path)
            # assertEqual rather than assertTrue: the second argument of
            # assertTrue is only the failure message, so nothing was compared.
            self.assertEqual(
                db._find_gembase_file_basename(gembase_path, replicon_path),
                base_file_name)

        # A replicon for which the gembase holds no lst file
        replicon_path = self.find_data(
            os.path.join('Replicons', 'acba.007.p01.13.fst'))
        self.args.replicon = replicon_path
        self.args.gembase_path = gembase_path
        cfg = Config(self.args)
        seq_db = read_multi_prot_fasta(replicon_path)
        replicon = next(seq_db)
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        with self.assertRaises(FileNotFoundError) as ctx:
            with self.catch_log():
                GembaseDB(replicon, cfg, gembase_path=gembase_path)
        self.assertEqual(
            str(ctx.exception),
            'cannot find lst file matching {} sequence'.format(replicon_path))
예제 #44
0
    def test_read_hmm_gembase(self):
        """
        Test that the hmm hits are well read, when the gembase format is used (.prt file is
        provided, prodigal is not used to find the proteins).
        """
        replicon_id = 'ACBA.0917.00019'
        contig_id = 'ACBA.0917.00019.0001'
        result_dir_expected = self.find_data("Results_Integron_Finder_{}.gembase".format(replicon_id))
        replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', replicon_id + '.fna'))
        # both the protein file and the hmm output live in the contig tmp directory
        contig_tmp_dir = os.path.join(result_dir_expected, "tmp_{}".format(contig_id))
        prot_file = os.path.join(contig_tmp_dir, contig_id + '.prt')
        infile = os.path.join(contig_tmp_dir, "{}_intI.res".format(contig_id))

        args = argparse.Namespace(gembase=True, replicon=replicon_path)
        cfg = Config(args)

        sequences_db = read_multi_prot_fasta(replicon_path)
        replicon = next(sequences_db)
        prot_db = GembaseDB(replicon, cfg, prot_file=prot_file)

        df = read_hmm(contig_id, prot_db, infile, cfg)

        # a single hit is expected
        column_order = ["Accession_number", "query_name", "ID_query", "ID_prot",
                        "strand", "pos_beg", "pos_end", "evalue"]
        single_hit = {
            "Accession_number": contig_id,
            "query_name": "intI_Cterm",
            "ID_query": "-",
            "ID_prot": "ACBA.0917.00019.i0001_00298",
            "strand": -1,
            "pos_beg": 311597,
            "pos_end": 312631,
            "evalue": 3.6e-25,
        }
        exp = pd.DataFrame(data=single_hit, index=[0])
        exp = exp[column_order]

        pdt.assert_frame_equal(df, exp)
    def test_find_integron_calin_threshold(self):
        """
        find_integron must return 2 integrons with ``calin_threshold`` set
        to 2 and only 1 with ``calin_threshold`` set to 3, on the
        'ESCO001.B.00018.P002' replicon read with a 'circ' topology.
        """
        replicon_name = 'ESCO001.B.00018.P002'
        replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
        prot_file = self.find_data(os.path.join('Proteins', replicon_name + '.prt'))
        circular = Topology('circ')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = circular
            replicon = next(sequences_db)

        # pre-computed result files used as inputs of find_integron
        replicon_results_path = self.find_data(
            os.path.join('Results_Integron_Finder_{}'.format(replicon_name), 'tmp_{}'.format(replicon.id)))
        attc_file = os.path.join(replicon_results_path, '{}_attc_table.res'.format(replicon.id))
        intI_file = os.path.join(replicon_results_path, '{}_intI.res'.format(replicon.id))
        phageI_file = os.path.join(replicon_results_path, '{}_phage_int.res'.format(replicon.id))

        args = argparse.Namespace(
            no_proteins=False,
            keep_palindromes=True,
            distance_threshold=4000,
            attc_model='attc_4.cm',
            evalue_attc=1.0,
            max_attc_size=200,
            min_attc_size=40,
            local_max=False,
            gembase=False,
            union_integrases=False,
            calin_threshold=2,
        )

        cfg = Config(args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

        with self.catch_log():
            integrons = find_integron(replicon, prot_db, attc_file, intI_file, phageI_file, cfg)
        self.assertEqual(len(integrons), 2)

        # same inputs and protein db, new configuration with a threshold of 3
        args.calin_threshold = 3
        cfg = Config(args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        with self.catch_log():
            integrons = find_integron(replicon, prot_db, attc_file, intI_file, phageI_file, cfg)
        self.assertEqual(len(integrons), 1)
    def test_find_integron_proteins_n_union_integrase(self):
        replicon_name = 'OBAL001.B.00005.C001'
        replicon_id = 'OBAL001.B.00005.C001'
        replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
        prot_file = self.find_data(os.path.join('Proteins', replicon_name + '.prt'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)
        result_dir = 'Results_Integron_Finder_{}.union'.format(replicon_name)
        attc_file = self.find_data(os.path.join(result_dir,
                                                'tmp_{}'.format(replicon.id),
                                                '{}_attc_table.res'.format(replicon.id)))
        intI_file = self.find_data(os.path.join(result_dir,
                                                'tmp_{}'.format(replicon.id),
                                                '{}_intI.res'.format(replicon.id)))
        phageI_file = self.find_data(os.path.join(result_dir,
                                                  'tmp_{}'.format(replicon.id),
                                                  '{}_phage_int.res'.format(replicon.id)))
        args = argparse.Namespace()
        args.evalue_attc = 1.
        args.max_attc_size = 200
        args.min_attc_size = 40
        args.distance_threshold = 4000  # (4kb at least between 2 different arrays)
        args.calin_threshold = 2
        args.attc_model = 'attc_4.cm'
        args.no_proteins = False
        args.keep_palindromes = True
        args.union_integrases = True
        args.gembase = False  # needed by read_hmm which is called when no_proteins == False
        args.local_max = False
        cfg = Config(args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)
        exp_msg = """In replicon {}, there are:
- 3 complete integron(s) found with a total 4 attC site(s)
- 0 CALIN element(s) found with a total of 0 attC site(s)
- 2 In0 element(s) found with a total of 0 attC site""".format(replicon.id)
        with self.catch_log() as log:
            integrons = find_integron(replicon,
                                      prot_db,
                                      attc_file,
                                      intI_file,
                                      phageI_file,
                                      cfg)
            catch_msg = log.get_value().strip()
        self.assertEqual(catch_msg, exp_msg)
        self.assertEqual(len(integrons), 5)
        integron = integrons[0]
        self.assertEqual(integron.replicon.name, replicon_id)

        empty = pd.DataFrame(columns=self.columns).astype(dtype=self.dtype)

        exp_int = []
        exp_int.append(pd.DataFrame(
            [[418072, 419283, 1, 5.400000e-25, 'protein', 'Phage_integrase', np.nan, 'intI']],
            columns=self.columns,
            index=['OBAL001.B.00005.C001_388']).astype(dtype=self.dtype))
        exp_int.append(pd.DataFrame(
            [[434671, 440118, -1, 0.085, 'protein', 'Phage_integrase', np.nan, 'intI']],
            columns=self.columns,
            index=['OBAL001.B.00005.C001_399']).astype(dtype=self.dtype))
        exp_int.append(pd.DataFrame(
            [[516941, 517834, -1, 1.200000e-54, 'protein', 'Phage_integrase', np.nan, 'intI']],
            columns=self.columns,
            index=['OBAL001.B.00005.C001_472']).astype(dtype=self.dtype))
        exp_int.append(pd.DataFrame(
            [[1940269, 1941171, 1, 4.200000e-43, 'protein', 'Phage_integrase', np.nan, 'intI']],
            columns=self.columns,
            index=['OBAL001.B.00005.C001_1793']).astype(dtype=self.dtype))
        exp_int.append(pd.DataFrame(
            [[1545830, 1546807, -1, 1.100000e-21, 'protein', 'intersection_tyr_intI', np.nan, 'intI']],
            columns=self.columns,
            index=['OBAL001.B.00005.C001_1416']).astype(dtype=self.dtype))

        exp_attC = []
        exp_attC.append(pd.DataFrame(
            [[421689, 421764, 1, 0.13, 'attC', 'attc_4', np.nan, 'attC']],
            columns=self.columns,
            index=['attc_001']).astype(dtype=self.dtype))
        exp_attC.append(pd.DataFrame(
            [[442458, 442514, -1, 7.000000e-07, 'attC', 'attc_4', np.nan, 'attC']],
            columns=self.columns,
            index=['attc_001']).astype(dtype=self.dtype))
        exp_attC.append(empty)
        exp_attC.append(empty)
        exp_attC.append(pd.DataFrame(
            [[1547800, 1547859, 1, 0.00049, 'attC', 'attc_4', np.nan, 'attC'],
             [1548775, 1548834, 1, 0.00009, 'attC', 'attc_4', 916.0, 'attC']],
            columns=self.columns,
            index=['attc_001', 'attc_002']).astype(dtype=self.dtype))

        for i, integron in enumerate(integrons):
            self.assertEqual(integron.replicon.name, replicon_id)
            pdt.assert_frame_equal(integron.integrase, exp_int[i])
            pdt.assert_frame_equal(integron.attC, exp_attC[i])
            pdt.assert_frame_equal(integron.promoter, empty)
            pdt.assert_frame_equal(integron.attI, empty)
            pdt.assert_frame_equal(integron.proteins, empty)
    def test_find_integron_proteins_lin_replicon(self):
        replicon_name = 'acba.007.p01.13'
        replicon_id = 'ACBA.007.P01_13'
        replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
        prot_file = self.find_data(os.path.join('Proteins', replicon_id + '.prt'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)
        exp_result_dir = 'Results_Integron_Finder_acba.007.p01.13.linear'
        attc_file = self.find_data(os.path.join(exp_result_dir,
                                                'tmp_{}'.format(replicon.id),
                                                '{}_attc_table.res'.format(replicon.id)))
        intI_file = self.find_data(os.path.join(exp_result_dir,
                                                'tmp_{}'.format(replicon.id),
                                                '{}_intI.res'.format(replicon.id)))
        phageI_file = self.find_data(os.path.join(exp_result_dir,
                                                  'tmp_{}'.format(replicon.id),
                                                  '{}_phage_int.res'.format(replicon.id)))
        args = argparse.Namespace()
        args.no_proteins = False
        args.keep_palindromes = True
        args.union_integrases = False
        args.gembase = False  # needed by read_hmm which is called when no_proteins == False

        args = argparse.Namespace()
        args.evalue_attc = 1.
        args.max_attc_size = 200
        args.min_attc_size = 40
        args.distance_threshold = 4000  # (4kb at least between 2 different arrays)
        args.attc_model = 'attc_4.cm'
        args.no_proteins = False
        args.gembase = False  # needed by read_hmm which is called when no_proteins == False
        args.union_integrases = False
        args.keep_palindromes = True
        args.calin_threshold = 2
        args.local_max = False

        cfg = Config(args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

        exp_msg = """In replicon {}, there are:
- 0 complete integron(s) found with a total 0 attC site(s)
- 1 CALIN element(s) found with a total of 3 attC site(s)
- 1 In0 element(s) found with a total of 0 attC site""".format(replicon.id)
        with self.catch_log() as log:
            integrons = find_integron(replicon,
                                      prot_db,
                                      attc_file,
                                      intI_file,
                                      phageI_file,
                                      cfg)
            catch_msg = log.get_value().strip()
        self.assertEqual(catch_msg, exp_msg)
        self.assertEqual(len(integrons), 2)

        exp_int = []
        exp = pd.DataFrame({'annotation': 'intI',
                            'distance_2attC': np.nan,
                            'evalue':  1.900000e-25,
                            'model': 'intersection_tyr_intI',
                            'pos_beg': 55,
                            'pos_end': 1014,
                            'strand': 1,
                            'type_elt': 'protein'},
                           columns=self.columns,
                           index=['ACBA.007.P01_13_1'])
        exp = exp.astype(dtype=self.dtype)
        exp_int.append(exp)
        exp_int.append(pd.DataFrame(columns=self.columns).astype(dtype=self.dtype))

        exp_attC = [pd.DataFrame(columns=self.columns).astype(dtype=self.dtype)]
        exp = pd.DataFrame({'annotation': ['attC'] * 3,
                            'distance_2attC': [np.nan, 1196.0,  469.0],
                            'evalue':  [1.000000e-09, 1.000000e-04, 1.100000e-07],
                            'model': ['attc_4'] * 3,
                            'pos_beg': [17825, 19080, 19618],
                            'pos_end': [17884, 19149, 19726],
                            'strand': [-1, -1, -1],
                            'type_elt': 'attC'},
                           columns=self.columns,
                           index=['attc_001', 'attc_002', 'attc_003'])
        exp = exp.astype(dtype=self.dtype)
        exp_attC.append(exp)
        empty = pd.DataFrame(columns=self.columns).astype(dtype=self.dtype)

        for i, integron in enumerate(integrons):
            self.assertEqual(integron.replicon.name, replicon_id)
            pdt.assert_frame_equal(integron.integrase, exp_int[i])
            pdt.assert_frame_equal(integron.attC, exp_attC[i])

            pdt.assert_frame_equal(integron.promoter, empty)
            pdt.assert_frame_equal(integron.attI, empty)
            pdt.assert_frame_equal(integron.proteins, empty)
    def test_find_integron(self):
        replicon_name = 'acba.007.p01.13'
        prot_name = 'ACBA.007.P01_13'
        replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
        prot_file = self.find_data(os.path.join('Proteins', prot_name + '.prt'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)
        replicon_results_path = self.find_data(os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                                                            'tmp_{}'.format(replicon.id)))
        attc_file = os.path.join(replicon_results_path, '{}_attc_table.res'.format(replicon.id))
        intI_file = os.path.join(replicon_results_path, '{}_intI.res'.format(replicon.id))
        phageI_file = os.path.join(replicon_results_path, '{}_phage_int.res'.format(replicon.id))

        args = argparse.Namespace()
        args.no_proteins = True
        args.keep_palindromes = True
        args.distance_threshold = 4000
        args.attc_model = 'attc_4.cm'
        args.evalue_attc = 1.0
        args.max_attc_size = 200
        args.min_attc_size = 40
        args.calin_threshold = 2
        args.local_max = False
        cfg = Config(args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

        exp_msg = """In replicon {}, there are:
- 0 complete integron(s) found with a total 0 attC site(s)
- 1 CALIN element(s) found with a total of 3 attC site(s)
- 0 In0 element(s) found with a total of 0 attC site""".format(replicon.id)
        with self.catch_log() as log:
            integrons = find_integron(replicon, prot_db, attc_file, intI_file, phageI_file, cfg)
            catch_msg = log.get_value().strip()

        self.assertEqual(catch_msg, exp_msg)
        self.assertEqual(len(integrons), 1)
        integron = integrons[0]
        self.assertEqual(integron.replicon.id, replicon.id)

        exp = pd.DataFrame({'annotation': ['attC'] * 3,
                            'distance_2attC': [np.nan, 1196.0, 469.0],
                            'evalue': [1.000000e-09, 1.000000e-04, 1.100000e-07],
                            'model': ['attc_4'] * 3,
                            'pos_beg': [17825, 19080, 19618],
                            'pos_end': [17884, 19149, 19726],
                            'strand': [-1, -1, -1],
                            'type_elt': 'attC'},
                           columns=self.columns,
                           index=['attc_001', 'attc_002', 'attc_003'])
        pdt.assert_frame_equal(integron.attC, exp)

        exp = pd.DataFrame(columns=self.columns,)
        exp = exp.astype(dtype=self.dtype)

        pdt.assert_frame_equal(integron.integrase, exp)
        pdt.assert_frame_equal(integron.promoter, exp)
        pdt.assert_frame_equal(integron.attI, exp)
        pdt.assert_frame_equal(integron.proteins, exp)