Exemplo n.º 1
0
    def test_iter(self):
        """
        Iterating over a GembaseDB must yield the protein ids of its replicon.

        The ids yielded by the database are compared with the ids of the
        Gembase ``.prt`` file which match the replicon id, first for a Draft
        entry, then for a Complet one.
        """
        # test Gembase Draft
        seq_name = 'ACBA.0917.00019'
        ext = '.fna'
        replicon_path = self.find_data(
            os.path.join('Gembase', 'Replicons', seq_name + ext))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        seq_db = read_multi_prot_fasta(replicon_path)
        replicon = next(seq_db)
        replicon.path = replicon_path
        db = GembaseDB(replicon, cfg)

        idx = SeqIO.index(self.find_data(
            os.path.join('Gembase', 'Proteins', seq_name + '.prt')),
                          'fasta',
                          alphabet=Seq.IUPAC.extended_protein)

        specie, date, strain, contig = replicon.id.split('.')
        # Raw string: '\.' and '\w' are regex escapes, not string escapes.
        pattern = r'{}\.{}\.{}\.\w?{}'.format(specie, date, strain, contig)
        self.assertListEqual(sorted(i for i in idx if re.match(pattern, i)),
                             sorted(db))

        # test Gembase Complet
        seq_name = 'ESCO001.C.00001.C001'
        ext = '.fst'
        replicon_path = self.find_data(
            os.path.join('Gembase', 'Replicons', seq_name + ext))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        seq_db = read_multi_prot_fasta(replicon_path)
        replicon = next(seq_db)
        replicon.path = replicon_path
        with self.catch_log():
            db = GembaseDB(replicon, cfg)

        idx = SeqIO.index(self.find_data(
            os.path.join('Gembase', 'Proteins', seq_name + '.prt')),
                          'fasta',
                          alphabet=Seq.IUPAC.extended_protein)

        specie, date, strain, contig = replicon.id.split('.')
        pattern = r'{}\.{}\.{}\.\w?{}'.format(specie, date, strain, contig)
        seqid_from_gembase_protfile = {i for i in idx if re.match(pattern, i)}
        seqid_from_if = set(db)
        # ids present on one side only
        non_common_seq = seqid_from_gembase_protfile ^ seqid_from_if
        # In a Gembase Complet entry the annotation in LSTINFO comes from
        # genbank, and some CDS are sometimes not translated into proteins.
        # In the test data, 3 genes listed in LSTINFO are absent from the
        # .prt file.
        diff = {
            'ESCO001.C.00001.C001_03974', 'ESCO001.C.00001.C001_01509',
            'ESCO001.C.00001.C001_04162'
        }
        self.assertSetEqual(non_common_seq, diff)
Exemplo n.º 2
0
    def test_find_gembase_file_basename_file_not_in_gembase(self):
        """
        Check ``_find_gembase_file_basename`` for replicon files which are not
        located in the Gembase tree and whose name is the output of a split
        operation:

        * a file containing one contig
        * a file representing a chunk

        Also check that a replicon without a matching lst file raises
        FileNotFoundError.
        """
        gembase_path = self.find_data('Gembase')

        contig_path = self.find_data(
            os.path.join('Replicons', 'ACBA.0917.00019.0001.fst'))
        chunk_path = os.path.join(self.tmp_dir,
                                  'ESCO001.C.00001.C001_chunk_1.fst')
        file_names = {
            'ACBA.0917.00019': contig_path,
            'ESCO001.C.00001.C001.fst': chunk_path,
        }

        # The chunk file is a copy of the Gembase replicon, stored in the
        # temporary directory of the test.
        gembase_replicon = os.path.join(gembase_path, 'Replicons',
                                        'ESCO001.C.00001.C001.fst')
        shutil.copyfile(gembase_replicon, chunk_path)

        for base_file_name, replicon_path in file_names.items():
            self.args.replicon = replicon_path
            self.args.gembase_path = gembase_path
            cfg = Config(self.args)
            replicon_reader = read_multi_prot_fasta(replicon_path)
            replicon = next(replicon_reader)
            replicon.path = replicon_path
            os.makedirs(cfg.tmp_dir(replicon.id))

            with self.catch_log():
                db = GembaseDB(replicon, cfg, gembase_path=gembase_path)
            found = db._find_gembase_file_basename(gembase_path,
                                                   replicon_path)
            # NOTE(review): assertTrue only checks that `found` is truthy,
            # base_file_name is used as the failure message and is never
            # compared. Confirm the expected names before switching to
            # assertEqual (one key carries a '.fst' extension).
            self.assertTrue(found, base_file_name)

        # A replicon for which no lst file can be found.
        orphan_path = self.find_data(
            os.path.join('Replicons', 'acba.007.p01.13.fst'))
        replicon_path = orphan_path
        self.args.replicon = replicon_path
        self.args.gembase_path = gembase_path
        cfg = Config(self.args)
        replicon_reader = read_multi_prot_fasta(replicon_path)
        replicon = next(replicon_reader)
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        with self.assertRaises(FileNotFoundError) as ctx:
            with self.catch_log():
                GembaseDB(replicon, cfg, gembase_path=gembase_path)
        expected_msg = 'cannot find lst file matching {} sequence'.format(
            replicon_path)
        self.assertEqual(str(ctx.exception), expected_msg)
Exemplo n.º 3
0
    def test_gembase_draft_parser(self):
        """
        Parse the lst file of a Draft entry and check the column names, the
        shape of the resulting table and its first and last rows.
        """
        replicon_name = 'ACBA.0917.00019'
        replicon_id = 'ACBA.0917.00019.0001'
        lst_file = os.path.join('Gembase', 'LSTINFO', replicon_name + '.lst')
        prots_info = GembaseDB.gembase_draft_parser(
            self.find_data(lst_file), replicon_id)

        expected_columns = [
            'start', 'end', 'strand', 'type', 'seq_id', 'gene_name',
            'description'
        ]
        self.assertListEqual(list(prots_info.columns), expected_columns)
        self.assertEqual(prots_info.shape, (3870, len(expected_columns)))

        expected_first = [
            266, 1480, 'C', 'CDS', 'ACBA.0917.00019.b0001_00001', 'tyrS',
            '| Tyrosine--tRNA ligase | 6.1.1.1 | similar to AA sequence:UniProtKB:P41256'
        ]
        self.assertListEqual(expected_first,
                             prots_info.iloc[0].values.tolist())

        expected_last = [
            4043755, 4044354, 'C', 'CDS', 'ACBA.0917.00019.i0001_03957',
            'yfcG_3',
            '| Disulfide-bond oxidoreductase YfcG | 1.8.4.- | similar to AA sequence:UniProtKB:P77526'
        ]
        # negative index: last row of the table
        self.assertListEqual(expected_last,
                             prots_info.iloc[-1].values.tolist())
Exemplo n.º 4
0
    def test_make_protfile(self):
        """
        Build a GembaseDB for a Draft and a Complet replicon and compare, pair
        by pair, the sequence ids of the reference ``.prt`` file with the ids
        of the protein file exposed by the database (``db.protfile``).
        """
        # (sequence name, replicon file extension, expected number of sequences)
        file_name = (('ACBA.0917.00019', '.fna', 3870),
                     ('ESCO001.C.00001.C001', '.fst', 3870))
        for seq_name, ext, seq_nb in file_name:
            replicon_path = self.find_data(
                os.path.join('Gembase', 'Replicons', seq_name + ext))
            self.args.replicon = replicon_path
            cfg = Config(self.args)
            seq_db = read_multi_prot_fasta(replicon_path)
            replicon = next(seq_db)
            replicon.path = replicon_path
            os.makedirs(cfg.tmp_dir(replicon.id))

            with self.catch_log():
                db = GembaseDB(replicon, cfg)
            # NOTE(review): the enumerate counter below reuses the name
            # `seq_nb` and therefore overwrites the expected count unpacked
            # from `file_name`.
            # zip stops at the shortest of the two files, so extra sequences
            # in either file go unnoticed.
            for seq_nb, seqs in enumerate(
                    zip(
                        read_multi_prot_fasta(
                            self.find_data(
                                os.path.join('Gembase', 'Proteins',
                                             seq_name + '.prt'))),
                        read_multi_prot_fasta(db.protfile)), 1):
                expected, test = seqs
                self.assertEqual(expected.id, test.id)
            # NOTE(review): compares the counter with itself, so this
            # assertion always passes; the expected counts (3870 for both
            # entries) are never actually checked - confirm the right values
            # before fixing.
            self.assertEqual(seq_nb, seq_nb)
Exemplo n.º 5
0
    def test_getitem(self):
        """
        Every protein of the reference ``.prt`` file whose id matches the
        replicon must be retrievable from the GembaseDB by its id, with the
        same id and the same sequence. An unknown id must raise KeyError.
        """
        file_name = (('ACBA.0917.00019', '.fna'), ('ESCO001.C.00001.C001',
                                                   '.fst'))
        for seq_name, ext in file_name:
            replicon_path = self.find_data(
                os.path.join('Gembase', 'Replicons', seq_name + ext))
            self.args.replicon = replicon_path
            cfg = Config(self.args)
            seq_db = read_multi_prot_fasta(replicon_path)
            replicon = next(seq_db)
            os.makedirs(cfg.tmp_dir(replicon.id))

            with self.catch_log():
                db = GembaseDB(replicon, cfg)
            exp = read_multi_prot_fasta(
                self.find_data(
                    os.path.join('Gembase', 'Proteins', seq_name + '.prt')))

            specie, date, strain, contig = replicon.id.split('.')
            # Raw string: '\.' and '\w' are regex escapes, not string escapes.
            # Compiled once per replicon, it is applied to every protein id.
            protein_id_re = re.compile(
                r'{}\.{}\.{}\.\w?{}'.format(specie, date, strain, contig))

            for prot_expected in exp:
                if protein_id_re.match(prot_expected.id):
                    prot_received = db[prot_expected.id]
                    self.assertEqual(prot_received.id, prot_expected.id)
                    self.assertEqual(prot_received.seq, prot_expected.seq)
        # `db` is the database built for the last replicon of the loop
        with self.assertRaises(KeyError) as ctx:
            db['nimport_naoik']
        self.assertEqual(str(ctx.exception), "'nimport_naoik'")
Exemplo n.º 6
0
    def test_gembase_complete_parser(self):
        """
        Parse the lst file of a Complet entry and check the column names, the
        shape of the resulting table and its first and last rows.
        """
        replicon_id = 'ESCO001.C.00001.C001'
        lst_file = os.path.join('Gembase', 'LSTINFO', replicon_id + '.lst')
        prots_info = GembaseDB.gembase_complete_parser(
            self.find_data(lst_file), replicon_id)

        expected_columns = [
            'start', 'end', 'strand', 'type', 'seq_id', 'valid', 'gene_name',
            'description'
        ]
        self.assertListEqual(list(prots_info.columns), expected_columns)
        self.assertEqual(prots_info.shape, (4139, len(expected_columns)))

        expected_first = [
            190, 255, 'D', 'CDS', 'ESCO001.C.00001.C001_00001', 'Valid',
            'thrL',
            '@b0001@NP_414542.1@ b0001 1 190 255 | leader; Amino acid biosynthesis:'
            ' Threonine thr operon leader peptide | ..'
        ]
        self.assertListEqual(expected_first,
                             prots_info.iloc[0].values.tolist())

        expected_last = [
            4640942, 4641628, 'D', 'CDS', 'ESCO001.C.00001.C001_04495',
            'Valid', 'yjtD',
            '@b4403@NP_418820.1@ b4403 1 4640942 4641628 | putative methyltransferase | ..'
        ]
        # negative index: last row of the table
        self.assertListEqual(expected_last,
                             prots_info.iloc[-1].values.tolist())
Exemplo n.º 7
0
 def test_gembase_sniffer(self):
     """
     Check that gembase_sniffer returns the expected Gembase type
     ('Draft' or 'Complet') for the lst file of each test entry.
     """
     expected_types = (
         ('ACBA.0917.00019', 'Draft'),
         ('ESCO001.C.00001.C001', 'Complet'),
     )
     for entry in expected_types:
         file_name, gem_type = entry
         lst_file = os.path.join('Gembase', 'LSTINFO', file_name + '.lst')
         sniffed_type = GembaseDB.gembase_sniffer(self.find_data(lst_file))
         self.assertEqual(sniffed_type, gem_type)
Exemplo n.º 8
0
    def test_find_gembase_file_basename_file_not_in_gembase(self):
        """
        Check ``_find_gembase_file_basename`` for replicon files which are not
        located in the Gembase tree and whose name is the output of a split
        operation:

        * a file containing one contig
        * a file representing a chunk

        Also check that a replicon without a matching lst file raises
        FileNotFoundError.
        """
        gembase_path = self.find_data('Gembase')

        single_contig = self.find_data(os.path.join('Replicons', 'ACBA.0917.00019.0001.fst'))
        chunk = os.path.join(self.tmp_dir, 'ESCO001.C.00001.C001_chunk_1.fst')
        file_names = {'ACBA.0917.00019': single_contig,
                      'ESCO001.C.00001.C001.fst': chunk}

        # The chunk file is a copy of the Gembase replicon, stored in the
        # temporary directory of the test.
        shutil.copyfile(os.path.join(gembase_path, 'Replicons', 'ESCO001.C.00001.C001.fst'),
                        chunk)

        for base_file_name in file_names:
            replicon_path = file_names[base_file_name]
            self.args.replicon = replicon_path
            self.args.gembase_path = gembase_path
            cfg = Config(self.args)
            records = read_multi_prot_fasta(replicon_path)
            replicon = next(records)
            replicon.path = replicon_path
            os.makedirs(cfg.tmp_dir(replicon.id))

            with self.catch_log():
                db = GembaseDB(replicon, cfg, gembase_path=gembase_path)
            basename = db._find_gembase_file_basename(gembase_path, replicon_path)
            # NOTE(review): assertTrue only checks that `basename` is truthy,
            # base_file_name is used as the failure message and is never
            # compared. Confirm the expected names before switching to
            # assertEqual (one key carries a '.fst' extension).
            self.assertTrue(basename, base_file_name)

        # A replicon for which no lst file can be found.
        replicon_path = self.find_data(os.path.join('Replicons', 'acba.007.p01.13.fst'))
        self.args.replicon = replicon_path
        self.args.gembase_path = gembase_path
        cfg = Config(self.args)
        records = read_multi_prot_fasta(replicon_path)
        replicon = next(records)
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        with self.assertRaises(FileNotFoundError) as ctx:
            with self.catch_log():
                GembaseDB(replicon, cfg, gembase_path=gembase_path)
        error_msg = 'cannot find lst file matching {} sequence'.format(replicon_path)
        self.assertEqual(str(ctx.exception), error_msg)
Exemplo n.º 9
0
    def test_find_gembase_file_basename(self):
        """
        Check that ``_find_gembase_file_basename`` returns the right basename
        (the file name without its extension) for replicon files located in
        the Gembase tree.
        """
        gembase_path = self.find_data('Gembase')
        file_names = ('ACBA.0917.00019.fna', 'ESCO001.C.00001.C001.fst')
        for file_name in file_names:
            replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', file_name))
            self.args.replicon = replicon_path
            cfg = Config(self.args)
            seq_db = read_multi_prot_fasta(replicon_path)
            replicon = next(seq_db)
            replicon.path = replicon_path
            os.makedirs(cfg.tmp_dir(replicon.id))

            with self.catch_log():
                db = GembaseDB(replicon, cfg)
            # assertEqual, not assertTrue: the second argument of assertTrue
            # is the failure message, so the basename was never compared.
            self.assertEqual(db._find_gembase_file_basename(gembase_path, replicon_path),
                             os.path.splitext(file_name)[0])
Exemplo n.º 10
0
    def test_get_description(self):
        """
        Check the SeqDesc returned by ``get_description`` for known protein
        ids, and the errors raised for a malformed id (IntegronError) and for
        a well-formed but unknown id (KeyError).
        """
        # SeqDesc(id, strand, start, stop)
        first_id = 'ACBA.0917.00019.b0001_00001'
        last_id = 'ACBA.0917.00019.i0001_03957'
        file_name = {('ACBA.0917.00019', '.fna'): {first_id: SeqDesc(first_id, -1, 266, 1480),
                                                   last_id: SeqDesc(last_id, -1, 4043755, 4044354)},
                     }

        for (seq_name, ext), descriptions in file_name.items():
            replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', seq_name + ext))
            self.args.replicon = replicon_path
            cfg = Config(self.args)
            records = read_multi_prot_fasta(replicon_path)
            replicon = next(records)
            replicon.path = replicon_path
            os.makedirs(cfg.tmp_dir(replicon.id))

            db = GembaseDB(replicon, cfg)

            for seq_id in descriptions:
                self.assertEqual(descriptions[seq_id], db.get_description(seq_id))

        # `db` is the database built for the last replicon of the loop
        with self.assertRaises(IntegronError) as ctx:
            db.get_description('nimport_naoik')
        self.assertEqual(str(ctx.exception), "'nimport_naoik' is not a valid Gembase protein identifier.")

        with self.assertRaises(KeyError) as ctx:
            db.get_description('FOO.BAR.00019.i0001_03924')
        self.assertEqual(str(ctx.exception), "'FOO.BAR.00019.i0001_03924'")
Exemplo n.º 11
0
    def test_find_gembase_file_basename(self):
        """
        Check that ``_find_gembase_file_basename`` returns the right basename
        (the file name without its extension) for replicon files located in
        the Gembase tree.
        """
        gembase_path = self.find_data('Gembase')
        file_names = ('ACBA.0917.00019.fna', 'ESCO001.C.00001.C001.fst')
        for file_name in file_names:
            replicon_path = self.find_data(
                os.path.join('Gembase', 'Replicons', file_name))
            self.args.replicon = replicon_path
            cfg = Config(self.args)
            seq_db = read_multi_prot_fasta(replicon_path)
            replicon = next(seq_db)
            replicon.path = replicon_path
            os.makedirs(cfg.tmp_dir(replicon.id))

            with self.catch_log():
                db = GembaseDB(replicon, cfg)
            # assertEqual, not assertTrue: the second argument of assertTrue
            # is the failure message, so the basename was never compared.
            self.assertEqual(
                db._find_gembase_file_basename(gembase_path, replicon_path),
                os.path.splitext(file_name)[0])
Exemplo n.º 12
0
    def test_ProteinDB(self):
        """
        A GembaseDB built from a Draft or a Complet replicon must expose the
        replicon it was built from.
        """
        # From Gembase Draft , Gembase Complete
        file_names = ('ACBA.0917.00019.fna', 'ESCO001.C.00001.C001.fst')
        for file_name in file_names:
            replicon_path = self.find_data(
                os.path.join('Gembase', 'Replicons', file_name))
            self.args.replicon = replicon_path
            cfg = Config(self.args)
            seq_db = read_multi_prot_fasta(replicon_path)
            replicon = next(seq_db)
            replicon.path = replicon_path
            os.makedirs(cfg.tmp_dir(replicon.id))

            with self.catch_log():
                db = GembaseDB(replicon, cfg)
            # assertEqual, not assertTrue: the second argument of assertTrue
            # is the failure message, so the ids were never compared.
            self.assertEqual(db.replicon.id, replicon.id)
Exemplo n.º 13
0
    def test_read_multi(self):
        """
        Read hmm results containing multiple hits.

        2 hits on the same protein: only the one with the best evalue is kept.
        2 hits on 2 different proteins: both proteins are kept.
        """
        replicon_id = 'ACBA.0917.00019'
        contig_id = 'ACBA.0917.00019.0001'
        result_dir_expected = self.find_data(
            "Results_Integron_Finder_{}.gembase".format(replicon_id))
        replicon_path = self.find_data(
            os.path.join('Gembase', 'Replicons', replicon_id + '.fna'))
        contig_tmp_dir = "tmp_{}".format(contig_id)
        prot_file = os.path.join(result_dir_expected, contig_tmp_dir,
                                 contig_id + '.prt')

        args = argparse.Namespace(gembase=True, replicon=replicon_path)
        cfg = Config(args)

        records = read_multi_prot_fasta(replicon_path)
        replicon = next(records)
        prot_db = GembaseDB(replicon, cfg, prot_file=prot_file)

        res_name = "{}_intI_multi.res".format(contig_id)
        infile = self.find_data(os.path.join('fictive_results', res_name))

        df = read_hmm(contig_id, prot_db, infile, cfg)

        column_order = [
            "Accession_number", "query_name", "ID_query", "ID_prot", "strand",
            "pos_beg", "pos_end", "evalue"
        ]
        expected_hits = {
            "Accession_number": [contig_id] * 2,
            "query_name": ["Phage_integrase"] * 2,
            "ID_query": ["PF00589.16"] * 2,
            "ID_prot":
            ["ACBA.0917.00019.i0001_00298", "ACBA.0917.00019.i0001_00338"],
            "strand": [-1, -1],
            "pos_beg": [311597, 350328],
            "pos_end": [312631, 351248],
            "evalue": [5.5e-66, 3.4e-51]
        }
        exp = pd.DataFrame(data=expected_hits, index=[0, 1])
        # enforce the column order of the expected table
        exp = exp[column_order]
        pdt.assert_frame_equal(df, exp)
Exemplo n.º 14
0
    def test_protfile(self):
        """
        The protein file of a GembaseDB must be ``<replicon id>.prt`` in the
        temporary directory which the configuration gives for the replicon.
        """
        for seq_name, ext in (('ACBA.0917.00019', '.fna'),
                              ('ESCO001.C.00001.C001', '.fst')):
            replicon_file = os.path.join('Gembase', 'Replicons',
                                         seq_name + ext)
            replicon_path = self.find_data(replicon_file)
            self.args.replicon = replicon_path
            cfg = Config(self.args)
            records = read_multi_prot_fasta(replicon_path)
            replicon = next(records)
            replicon.path = replicon_path
            os.makedirs(cfg.tmp_dir(replicon.id))

            with self.catch_log():
                db = GembaseDB(replicon, cfg)
            expected_protfile = os.path.join(cfg.tmp_dir(replicon.id),
                                             replicon.id + '.prt')
            self.assertEqual(expected_protfile, db.protfile)
Exemplo n.º 15
0
    def test_gembase_complete_parser(self):
        """
        Parse the lst file of a Complet entry and check the column names, the
        shape of the resulting table and its first and last rows.
        """
        replicon_id = 'ESCO001.C.00001.C001'
        lst_path = self.find_data(os.path.join('Gembase', 'LSTINFO', replicon_id + '.lst'))
        prots_info = GembaseDB.gembase_complete_parser(lst_path, replicon_id)

        columns = ['start', 'end', 'strand', 'type', 'seq_id', 'valid', 'gene_name', 'description']
        self.assertListEqual(list(prots_info.columns), columns)
        self.assertEqual(prots_info.shape, (4139, len(columns)))

        # (row position, expected content): first row, then last row
        expected_rows = (
            (0, [190, 255, 'D', 'CDS', 'ESCO001.C.00001.C001_00001', 'Valid', 'thrL',
                 '@b0001@NP_414542.1@ b0001 1 190 255 | leader; Amino acid biosynthesis:'
                 ' Threonine thr operon leader peptide | ..']),
            (-1, [4640942, 4641628, 'D', 'CDS', 'ESCO001.C.00001.C001_04495', 'Valid', 'yjtD',
                  '@b4403@NP_418820.1@ b4403 1 4640942 4641628 | putative methyltransferase | ..']),
        )
        for position, expected_row in expected_rows:
            received_row = prots_info.iloc[position].values.tolist()
            self.assertListEqual(expected_row, received_row)
Exemplo n.º 16
0
    def test_gembase_draft_parser(self):
        """
        Parse the lst file of a Draft entry and check the column names, the
        shape of the resulting table and its first and last rows.
        """
        replicon_name = 'ACBA.0917.00019'
        replicon_id = 'ACBA.0917.00019.0001'
        lst_path = self.find_data(os.path.join('Gembase', 'LSTINFO', replicon_name + '.lst'))
        prots_info = GembaseDB.gembase_draft_parser(lst_path, replicon_id)

        columns = ['start', 'end', 'strand', 'type', 'seq_id', 'gene_name', 'description']
        self.assertListEqual(list(prots_info.columns), columns)
        self.assertEqual(prots_info.shape, (3870, len(columns)))

        # (row position, expected content): first row, then last row
        expected_rows = (
            (0, [266, 1480, 'C', 'CDS', 'ACBA.0917.00019.b0001_00001', 'tyrS',
                 '| Tyrosine--tRNA ligase | 6.1.1.1 | similar to AA sequence:UniProtKB:P41256']),
            (-1, [4043755, 4044354, 'C', 'CDS', 'ACBA.0917.00019.i0001_03957', 'yfcG_3',
                  '| Disulfide-bond oxidoreductase YfcG | 1.8.4.- | similar to AA sequence:UniProtKB:P77526']),
        )
        for position, expected_row in expected_rows:
            received_row = prots_info.iloc[position].values.tolist()
            self.assertListEqual(expected_row, received_row)
Exemplo n.º 17
0
    def test_read_hmm_gembase(self):
        """
        Check that the hmm hits are read correctly when the gembase format is
        used (the .prt file is provided, prodigal is not used to find the
        proteins).
        """
        replicon_id = 'ACBA.0917.00019'
        contig_id = 'ACBA.0917.00019.0001'
        result_dir_expected = self.find_data(
            "Results_Integron_Finder_{}.gembase".format(replicon_id))
        replicon_path = self.find_data(
            os.path.join('Gembase', 'Replicons', replicon_id + '.fna'))
        contig_tmp_dir = os.path.join(result_dir_expected,
                                      "tmp_{}".format(contig_id))
        prot_file = os.path.join(contig_tmp_dir, contig_id + '.prt')
        infile = os.path.join(contig_tmp_dir, "{}_intI.res".format(contig_id))

        args = argparse.Namespace(gembase=True, replicon=replicon_path)
        cfg = Config(args)

        records = read_multi_prot_fasta(replicon_path)
        replicon = next(records)
        prot_db = GembaseDB(replicon, cfg, prot_file=prot_file)

        df = read_hmm(contig_id, prot_db, infile, cfg)

        column_order = [
            "Accession_number", "query_name", "ID_query", "ID_prot", "strand",
            "pos_beg", "pos_end", "evalue"
        ]
        expected_hit = {
            "Accession_number": contig_id,
            "query_name": "intI_Cterm",
            "ID_query": "-",
            "ID_prot": "ACBA.0917.00019.i0001_00298",
            "strand": -1,
            "pos_beg": 311597,
            "pos_end": 312631,
            "evalue": 3.6e-25
        }
        exp = pd.DataFrame(data=expected_hit, index=[0])
        # enforce the column order of the expected table
        exp = exp[column_order]

        pdt.assert_frame_equal(df, exp)
Exemplo n.º 18
0
    def test_get_description(self):
        """
        Check the SeqDesc returned by ``get_description`` for known protein
        ids, and the errors raised for a malformed id (IntegronError) and for
        a well-formed but unknown id (KeyError).
        """
        # SeqDesc(id, strand, start, stop)
        expected_desc = {
            'ACBA.0917.00019.b0001_00001':
            SeqDesc('ACBA.0917.00019.b0001_00001', -1, 266, 1480),
            'ACBA.0917.00019.i0001_03957':
            SeqDesc('ACBA.0917.00019.i0001_03957', -1, 4043755, 4044354)
        }
        file_name = {('ACBA.0917.00019', '.fna'): expected_desc}

        for replicon_key, descriptions in file_name.items():
            seq_name, ext = replicon_key
            replicon_path = self.find_data(
                os.path.join('Gembase', 'Replicons', seq_name + ext))
            self.args.replicon = replicon_path
            cfg = Config(self.args)
            records = read_multi_prot_fasta(replicon_path)
            replicon = next(records)
            replicon.path = replicon_path
            os.makedirs(cfg.tmp_dir(replicon.id))

            db = GembaseDB(replicon, cfg)

            for seq_id in descriptions:
                self.assertEqual(descriptions[seq_id],
                                 db.get_description(seq_id))

        # `db` is the database built for the last replicon of the loop
        with self.assertRaises(IntegronError) as ctx:
            db.get_description('nimport_naoik')
        self.assertEqual(
            str(ctx.exception),
            "'nimport_naoik' is not a valid Gembase protein identifier.")

        with self.assertRaises(KeyError) as ctx:
            db.get_description('FOO.BAR.00019.i0001_03924')
        self.assertEqual(str(ctx.exception), "'FOO.BAR.00019.i0001_03924'")
Exemplo n.º 19
0
 def test_gembase_sniffer(self):
     """
     Check that gembase_sniffer returns the expected Gembase type
     ('Draft' or 'Complet') for the lst file of each test entry.
     """
     cases = [('ACBA.0917.00019', 'Draft'), ('ESCO001.C.00001.C001', 'Complet')]
     for case in cases:
         file_name, gem_type = case
         lst_file = os.path.join('Gembase', 'LSTINFO', file_name + '.lst')
         lst_path = self.find_data(lst_file)
         self.assertEqual(GembaseDB.gembase_sniffer(lst_path), gem_type)
Exemplo n.º 20
0
def find_integron_in_one_replicon(replicon, config):
    """
    scan replicon for integron.

      * presence of integrase
      * presence of attC sites
      * presence of promoters and attI sites

    depending on the configuration

     * perform functional annotation

    produce a file containing presence of putative integrons

    depending on configuration

        * produce genbank file with replicon and annotations with integrons
        * produce schema of replicon with integrons (in pdf)

    The temporary directory of the replicon is removed at the end unless
    ``config.keep_tmp`` is set.

    :param replicon: the replicon to analyse.
    :type replicon: a :class:`Bio.SeqRecord` object.
    :param config: The configuration
    :type config: a :class:`integron_finder.config.Config` object.
    :returns: the path to the integron file (<replicon_id>.integrons)
              and the summary file (<replicon_id.summary>).
              if there is no integron the summary file is None.
              if the replicon is skipped (an EmptyFileError occurred)
              both values are empty strings.
    :rtype: tuple (str integron_file, str summary_file) or (str integron_file, None)
    :raises IntegronError: if functional annotation is requested but neither
                           'bank_hmm' nor ``config.func_annot_path`` exists.
    """
    result_tmp_dir = config.tmp_dir(replicon.id)
    # The directory may remain from a previous run: reuse it. Any other
    # OS error (permissions, path is a file, ...) is reported right here
    # instead of being silently swallowed.
    os.makedirs(result_tmp_dir, exist_ok=True)
    tmp_replicon_path = os.path.join(result_tmp_dir, replicon.id + '.fst')
    SeqIO.write(replicon, tmp_replicon_path, "fasta")
    # create attr path
    # used to generate protein file with prodigal
    replicon.path = tmp_replicon_path

    # fa_hmm is only filled when functional annotation is requested
    fa_hmm = None
    # func_annot_path is the canonical path for Functional_annotation
    # path_func_annot is the path provide on the command line
    if config.func_annot and not config.no_proteins and not config.path_func_annot:
        if os.path.exists('bank_hmm'):
            fa_hmm = scan_hmm_bank('bank_hmm')
        elif os.path.exists(config.func_annot_path):
            fa_hmm = scan_hmm_bank(config.func_annot_path)
        else:
            raise IntegronError(
                "the dir '{}' neither 'bank_hmm' exists, specify the location of hmm "
                "profile with --path-func-annot option".format(
                    config.func_annot_path))
        is_func_annot = True

    elif config.path_func_annot and config.no_proteins is False:
        fa_hmm = scan_hmm_bank(config.path_func_annot)
        is_func_annot = True
    else:
        is_func_annot = False

    if is_func_annot and not fa_hmm:
        _log.warning(
            "No hmm profiles for functional annotation detected, skip functional annotation step."
        )

    if config.gembase_path:
        protein_db = GembaseDB(replicon,
                               config,
                               gembase_path=config.gembase_path)
    elif config.gembase:
        protein_db = GembaseDB(replicon, config)
    else:
        protein_db = ProdigalDB(replicon, config)

    ##################
    # Default search #
    ##################
    intI_file = os.path.join(result_tmp_dir, replicon.id + "_intI.res")
    phageI_file = os.path.join(result_tmp_dir, replicon.id + "_phage_int.res")
    attC_default_file = os.path.join(result_tmp_dir,
                                     replicon.id + "_attc_table.res")

    try:
        if not config.no_proteins:
            # skip the integrase search if both result files already exist
            if not os.path.isfile(intI_file) or not os.path.isfile(
                    phageI_file):
                find_integrase(replicon.id, protein_db.protfile,
                               result_tmp_dir, config)
        _log.info("Starting Default search ... :")
        if not os.path.isfile(attC_default_file):
            # find attc with cmsearch
            find_attc(tmp_replicon_path,
                      replicon.name,
                      config.cmsearch,
                      result_tmp_dir,
                      config.model_attc_path,
                      incE=config.evalue_attc,
                      cpu=config.cpu)

        _log.info("Default search done... : ")
        integrons = find_integron(replicon, protein_db, attC_default_file,
                                  intI_file, phageI_file, config)

        #########################
        # Search with local_max #
        #########################
        if config.local_max:
            _log.info("Starting search with local_max...:")
            integron_max_path = os.path.join(result_tmp_dir,
                                             "integron_max.pickle")
            if not os.path.isfile(integron_max_path):
                circular = replicon.topology == 'circ'
                integron_max = find_attc_max(
                    integrons,
                    replicon,
                    config.distance_threshold,
                    config.model_attc_path,
                    max_attc_size=config.max_attc_size,
                    min_attc_size=config.min_attc_size,
                    circular=circular,
                    out_dir=result_tmp_dir,
                    cpu=config.cpu,
                    evalue_attc=config.evalue_attc)
                integron_max.to_pickle(integron_max_path)
                _log.info("Search with local_max done... :")

            else:
                # NOTE: this pickle is the cache written by the branch above
                # during a previous run; unpickling is not safe on data
                # coming from an untrusted source.
                integron_max = pd.read_pickle(integron_max_path)
                # re-apply the current evalue and size thresholds to the
                # cached hits
                integron_max = integron_max[
                    (integron_max.evalue < config.evalue_attc)
                    & (abs(integron_max.pos_end -
                           integron_max.pos_beg) < config.max_attc_size) &
                    (config.min_attc_size <
                     abs(integron_max.pos_end - integron_max.pos_beg))]
                _log.info(
                    "Search with local_max was already done, continue... :")

            integrons = find_integron(replicon, protein_db, integron_max,
                                      intI_file, phageI_file, config)

        ##########################
        # Add promoters and attI #
        ##########################
        for integron in integrons:
            integron_type = integron.type()
            if integron_type != "In0":  # complete & CALIN
                if not config.no_proteins:
                    _log.info("Adding proteins ... :")
                    integron.add_proteins(protein_db)

            if config.promoter_attI:
                _log.info("Adding promoters and attI ... :")
                if integron_type == "complete":
                    integron.add_promoter()
                    integron.add_attI()
                elif integron_type == "In0":
                    integron.add_attI()
                    integron.add_promoter()
        #########################
        # Functional annotation #
        #########################
        if is_func_annot and fa_hmm:
            _log.info("Starting functional annotation ...:")
            func_annot(integrons, replicon, protein_db, fa_hmm, config,
                       result_tmp_dir)

        #######################
        # Writing out results #
        #######################
        _log.info("Writing out results for replicon {}".format(replicon.id))

        if config.pdf:
            # only complete integrons are drawn, numbered from 1 according
            # to their position in the integrons list
            for j, integron in enumerate(integrons, 1):
                if integron.type() == "complete":
                    integron.draw_integron(file=os.path.join(
                        config.result_dir, "{}_{}.pdf".format(replicon.id, j)))

        base_outfile = os.path.join(config.result_dir, replicon.id)
        integron_file = base_outfile + ".integrons"
        _log.debug("Writing integron_file {}".format(integron_file))
        if integrons:
            integrons_report = results.integrons_report(integrons)
            integrons_report.to_csv(integron_file,
                                    sep="\t",
                                    index=False,
                                    na_rep="NA")

            summary = results.summary(integrons_report)
            summary_file = base_outfile + ".summary"
            summary.to_csv(summary_file,
                           sep="\t",
                           na_rep="NA",
                           index=False,
                           columns=[
                               'ID_replicon', 'ID_integron', 'complete', 'In0',
                               'CALIN'
                           ])
            if config.gbk:
                add_feature(replicon, integrons_report, protein_db,
                            config.distance_threshold)
                SeqIO.write(
                    replicon,
                    os.path.join(config.result_dir, replicon.id + ".gbk"),
                    "genbank")
        else:
            with open(integron_file, "w") as out_f:
                out_f.write("# No Integron found\n")
            summary_file = None
    except integron_finder.EmptyFileError:
        # the replicon is skipped: no result file is reported for it
        _log.warning('############ Skip replicon {} ############'.format(
            replicon.name))
        integron_file = ''
        summary_file = ''
    #########################
    # clean temporary files #
    #########################

    if not config.keep_tmp:
        try:
            shutil.rmtree(result_tmp_dir)
        except Exception as err:
            # best effort: a failed cleanup must not discard the results
            _log.warning("Cannot remove temporary results : '{} : {}'".format(
                result_tmp_dir, str(err)))

    return integron_file, summary_file