示例#1
0
    def setUp(self):
        """Prepare reference paths, a populated options object and the parser under test."""
        # Locations of the reference results and of the input genomes.
        self.identify_dir_reference = 'tests/data/identify_dir_reference/'
        self.align_dir_reference = 'tests/data/align_dir_reference/'
        self.genome_dir = 'gtdbtk/tests/data/genomes/'

        option_values = {
            # general options
            'batchfile': None,
            'prefix': 'gtdbtk',
            'cpus': 1,
            'extension': 'fna',
            'debug': False,
            # align options
            'skip_gtdb_refs': False,
            'taxa_filter': None,
            'custom_msa_filters': False,
            'min_consensus': None,
            'min_perc_taxa': None,
            'cols_per_gene': None,
            'max_consensus': None,
            'min_perc_aa': 50,
            # classify options
            'scratch_dir': None,
            # infer options
            'prot_model': 'WAG',
            'no_support': False,
            'no_gamma': True,
        }

        # The ArgumentParser instance is used purely as an attribute container
        # that is handed to the OptionsParser methods.
        self.options = argparse.ArgumentParser()
        for option_name, option_value in option_values.items():
            setattr(self.options, option_name, option_value)

        self.version = ' unittest'
        self.optionparser = OptionsParser(self.version)
        logger_setup(None, "gtdbtk.log", "GTDB-Tk", self.version, True)
        # NOTE(review): results go to a fixed relative path, so this presumably
        # depends on the working directory the tests are launched from.
        self.generic_out_path = 'tests/data/results'
示例#2
0
    def setUp(self):
        """Prepare reference paths, a populated options object, the parser and a temp output dir."""
        # The identify reference is resolved relative to this test module; the
        # other two paths are relative to the current working directory.
        test_module_dir = os.path.dirname(__file__)
        self.identify_dir_reference = os.path.join(
            test_module_dir, 'data/identify_dir_reference/')
        self.align_dir_reference = 'tests/data/align_dir_reference/'
        self.genome_dir = 'gtdbtk/tests/data/genomes/'

        option_groups = (
            # general options
            {
                'batchfile': None,
                'prefix': 'gtdbtk',
                'cpus': 1,
                'extension': 'fna',
                'debug': False,
                'force': False,
                'genes': False,
                'write_single_copy_genes': False,
            },
            # align options
            {
                'skip_gtdb_refs': False,
                'taxa_filter': None,
                'custom_msa_filters': False,
                'skip_trimming': False,
                'min_consensus': None,
                'min_perc_taxa': None,
                'cols_per_gene': None,
                'max_consensus': None,
                'min_perc_aa': 50,
                'rnd_seed': 42,
                'outgroup_taxon': None,
            },
            # classify options
            {
                'scratch_dir': None,
                'keep_ref_red': None,
                'pplacer_cpus': None,
                'min_af': None,
            },
            # infer options
            {
                'prot_model': 'WAG',
                'no_support': False,
                'no_gamma': True,
            },
        )

        # The ArgumentParser instance only serves as an attribute container.
        self.options = argparse.ArgumentParser()
        for group in option_groups:
            for option_name, option_value in group.items():
                setattr(self.options, option_name, option_value)

        self.version = ' unittest'
        self.optionparser = OptionsParser(self.version)
        logger_setup(None, "gtdbtk.log", "GTDB-Tk", self.version, True)
        # Every test gets its own freshly created output directory.
        self.generic_out_path = tempfile.mkdtemp(prefix='gtdbtk_tmp_')
示例#3
0
 def setUp(self):
     """Create the OptionsParser under test and a private scratch directory."""
     self.options_parser = OptionsParser('-1')
     # NOTE(review): nothing visible here removes this directory again;
     # presumably a matching tearDown does - confirm.
     self.dir_tmp = tempfile.mkdtemp(prefix='gtdbtk_tmp_')
示例#4
0
class TestOptionsParser(unittest.TestCase):
    """ Unit tests for OptionsParser: genome discovery, input validation and MSA utilities. """

    def setUp(self):
        """ Create the parser under test and a scratch directory for fixture files. """
        self.options_parser = OptionsParser('-1')
        self.dir_tmp = tempfile.mkdtemp(prefix='gtdbtk_tmp_')

    def tearDown(self):
        """ Delete the scratch directory together with every fixture written to it. """
        shutil.rmtree(self.dir_tmp)

    # ------------------------------------------------------------------
    # Fixture helpers
    # ------------------------------------------------------------------
    def _touch(self, file_name):
        """ Create an empty file in the scratch directory and return its path. """
        path = os.path.join(self.dir_tmp, file_name)
        open(path, 'a').close()
        return path

    def _write_batchfile(self, file_name, rows):
        """ Write the rows verbatim to a new file in the scratch directory and return its path. """
        path = os.path.join(self.dir_tmp, file_name)
        with open(path, 'w') as f:
            f.writelines(rows)
        return path

    def _write_msa(self, file_name, records):
        """ Write (genome id, sequence) pairs as FASTA in the scratch directory and return the path. """
        path = os.path.join(self.dir_tmp, file_name)
        with open(path, 'w') as f:
            for gid, seq in records:
                f.write('>%s\n' % gid)
                f.write('%s\n' % seq)
        return path

    @staticmethod
    def _read_fasta_records(path):
        """ Return {genome id: sequence} parsed from a FASTA file with one line per sequence. """
        with open(path, 'r') as f:
            return dict(re.findall(r'>(.+)\n(.+)\n', f.read()))

    @staticmethod
    def _repeat_to_length(motif, length):
        """ Return `motif` repeated and truncated to exactly `length` characters. """
        return (motif * (length // len(motif) + 1))[:length]

    @staticmethod
    def _sha256_hex(text):
        """ Return the SHA-256 hex digest of `text`.

        hashlib only accepts bytes on Python 3, so the text is encoded first.
        """
        return hashlib.sha256(text.encode('utf-8')).hexdigest()

    def _hash_trimmed_reference_msa(self, reference_mask, msa_length):
        """ Trim a synthetic two-genome MSA with a reference mask.

        The MSA consists of a periodic sequence of `msa_length` columns and its
        reverse. Returns {genome id: SHA-256 hex digest of the trimmed sequence}.
        """
        msa_str = self._repeat_to_length('ALGPVW', msa_length)
        path_untrimmed_msa = self._write_msa(
            'untrimmed_msa.fasta',
            [('genome_1', msa_str), ('genome_2', msa_str[::-1])])
        path_output = os.path.join(self.dir_tmp, 'trimmed_msa.fasta')

        options = argparse.ArgumentParser()
        # Required arguments
        options.untrimmed_msa = path_untrimmed_msa
        options.output = path_output
        # Mutex arguments
        options.mask_file = None
        options.reference_mask = reference_mask

        self.options_parser.trim_msa(options)

        trimmed = self._read_fasta_records(path_output)
        return {gid: self._sha256_hex(seq) for gid, seq in trimmed.items()}

    def _hash_exported_msa(self, domain):
        """ Export the untrimmed MSA of `domain` and return the SHA-256 hex digest of the file. """
        path_out = os.path.join(self.dir_tmp, 'output.fasta')

        options = argparse.ArgumentParser()
        options.domain = domain
        options.output = path_out

        self.options_parser.export_msa(options)

        with open(path_out, 'r') as f:
            return self._sha256_hex(f.read())

    # ------------------------------------------------------------------
    # Genome id validation
    # ------------------------------------------------------------------
    def test__verify_genome_id__valid(self):
        """ Test that a valid genome id returns True. """
        self.assertTrue(
            self.options_parser._assert_genome_id_valid('genome_1'))

    def test__verify_genome_id__invalid(self):
        """ Test that invalid genome ids throw an exception. """
        for c in '()[],;=':
            self.assertRaises(GenomeNameInvalid,
                              self.options_parser._assert_genome_id_valid,
                              'genome%s1' % c)

    # ------------------------------------------------------------------
    # Genome discovery
    # ------------------------------------------------------------------
    def test__genomes_to_process__genome_dir__valid(self):
        """ Test that the expected results are returned when using genome_dir. """
        path_genome_1 = self._touch('genome_1.fna')
        path_genome_2 = self._touch('genome_2.fna')
        # A file with a different extension must not be picked up.
        self._touch('other_file.txt')
        results = self.options_parser._genomes_to_process(
            self.dir_tmp, '', 'fna')
        expected = {'genome_1': path_genome_1, 'genome_2': path_genome_2}
        self.assertDictEqual(results, expected)

    def test__genomes_to_process__batchfile__valid(self):
        """ Test that the expected results are returned when using batchfile """
        path_genome_1 = self._touch('genome_1.fna')
        path_genome_2 = self._touch('genome_2.fna')
        path_batchfile = self._write_batchfile('batchfile.txt', [
            '%s\tgenome_1\n' % path_genome_1,
            '\n',
            '%s\tgenome_2\n' % path_genome_2,
        ])

        results = self.options_parser._genomes_to_process(
            '', path_batchfile, 'fna')
        expected = {'genome_1': path_genome_1, 'genome_2': path_genome_2}
        self.assertDictEqual(results, expected)

    def test__genomes_to_process__batchfile__invalid_columns(self):
        """ Test that a batchfile containing columns not equal to 2 throws an exception. """
        path_genome_1 = self._touch('genome_1.fna')
        path_genome_2 = self._touch('genome_2.fna')
        path_batchfile = self._write_batchfile('batchfile.txt', [
            '%s\tgenome_1\n' % path_genome_1,
            '\n',
            '%s\tgenome_2\tfoo\n' % path_genome_2,
        ])

        self.assertRaises(GenomeBatchfileMalformed,
                          self.options_parser._genomes_to_process, '',
                          path_batchfile, 'fna')

    def test__genomes_to_process__batchfile__blank_genome_path(self):
        """ Test that a batchfile containing a blank genome path throws an exception. """
        path_genome_1 = self._touch('genome_1.fna')
        self._touch('genome_2.fna')
        path_batchfile = self._write_batchfile('batchfile.txt', [
            '%s\tgenome_1\n' % path_genome_1,
            '\n',
            # The path column of the second genome is left empty on purpose.
            '\tgenome_2\n',
        ])

        self.assertRaises(GenomeBatchfileMalformed,
                          self.options_parser._genomes_to_process, '',
                          path_batchfile, 'fna')

    def test__genomes_to_process__batchfile__blank_genome_id(self):
        """ Test that a batchfile containing a blank genome id throws an exception. """
        path_genome_1 = self._touch('genome_1.fna')
        path_genome_2 = self._touch('genome_2.fna')
        path_batchfile = self._write_batchfile('batchfile.txt', [
            '%s\tgenome_1\n' % path_genome_1,
            '\n',
            '%s\t\n' % path_genome_2,
        ])

        self.assertRaises(GenomeBatchfileMalformed,
                          self.options_parser._genomes_to_process, '',
                          path_batchfile, 'fna')

    def test__genomes_to_process__batchfile__duplicate_genome_id(self):
        """ Test that a batchfile containing duplicate genome ids throws an exception. """
        path_genome_1 = self._touch('genome_1.fna')
        path_genome_2 = self._touch('genome_2.fna')
        path_batchfile = self._write_batchfile('batchfile.txt', [
            '%s\tgenome_1\n' % path_genome_1,
            '\n',
            # Second genome re-uses the id of the first one.
            '%s\tgenome_1\n' % path_genome_2,
        ])

        self.assertRaises(GenomeBatchfileMalformed,
                          self.options_parser._genomes_to_process, '',
                          path_batchfile, 'fna')

    def test__genomes_to_process__batchfile__invalid_genome_id(self):
        """ Test that genome ids starting with GB_, RS_ or UBA throw an exception. """
        path_genome_1 = self._touch('genome_1.fna')
        path_genome_2 = self._touch('genome_2.fna')

        # One batchfile per rejected prefix; each is checked independently.
        invalid_ids = ('GB_genome_2', 'RS_genome_2', 'UBAgenome_2')
        for index, invalid_id in enumerate(invalid_ids, start=1):
            path_batchfile = self._write_batchfile(
                'batchfile_%d.txt' % index, [
                    '%s\tgenome_1\n' % path_genome_1,
                    '\n',
                    '%s\t%s\n' % (path_genome_2, invalid_id),
                ])
            self.assertRaises(GenomeNameInvalid,
                              self.options_parser._genomes_to_process, '',
                              path_batchfile, 'fna')

    def test__genomes_to_process__no_files(self):
        """ Test that an exception is thrown if no files are found to process """
        # Branch 1 : genome_dir is specified
        tmp_genome_dir = tempfile.mkdtemp()
        try:
            self.assertRaises(NoGenomesFound,
                              self.options_parser._genomes_to_process,
                              tmp_genome_dir, '', 'fna')
        finally:
            shutil.rmtree(tmp_genome_dir)

        # Branch 2: batchfile is specified
        tmp_genome_dir = tempfile.mkdtemp()
        try:
            path_batchfile = os.path.join(tmp_genome_dir, 'batchfile.txt')
            open(path_batchfile, 'a').close()
            self.assertRaises(NoGenomesFound,
                              self.options_parser._genomes_to_process, '',
                              path_batchfile, 'fna')
        finally:
            shutil.rmtree(tmp_genome_dir)

    def test__marker_set_id(self):
        """ Test that the correct marker set id is returned """
        self.assertEqual(
            self.options_parser._marker_set_id(True, False, False), 'bac120')
        self.assertEqual(
            self.options_parser._marker_set_id(False, True, False), 'ar122')
        self.assertEqual(
            self.options_parser._marker_set_id(False, False, True), 'rps23')

    # ------------------------------------------------------------------
    # Sub-commands: missing inputs
    # ------------------------------------------------------------------
    def test_identify__genome_dir_raises_io_exception(self):
        """ Test that the identify method raises an exception on invalid genome_dir """
        options = argparse.ArgumentParser()
        options.genome_dir = os.path.join(tempfile.gettempdir(),
                                          'non-existent-dir')
        self.assertRaises(BioLibDirNotFound, self.options_parser.identify,
                          options)

    def test_identify__batchfile_raises_io_exception(self):
        """ Test that the identify method raises an exception on invalid batchfile """
        options = argparse.ArgumentParser()
        options.genome_dir = None
        options.batchfile = os.path.join(tempfile.gettempdir(),
                                         'non-existent-file.txt')
        self.assertRaises(BioLibFileNotFound, self.options_parser.identify,
                          options)

    def test_align__identify_dir_raises_io_exception(self):
        """ Test that the align method raises an exception on invalid identify dir """
        options = argparse.ArgumentParser()
        options.identify_dir = os.path.join(tempfile.gettempdir(),
                                            'non-existent-dir')
        self.assertRaises(BioLibDirNotFound, self.options_parser.align,
                          options)

    def test_infer__msa_raises_io_exception(self):
        """ Test that the infer method raises an exception on invalid MSA """
        options = argparse.ArgumentParser()
        options.msa_file = os.path.join(tempfile.gettempdir(),
                                        'non-existent-msa.txt')
        self.assertRaises(BioLibFileNotFound, self.options_parser.infer,
                          options)

    def test_run_test(self):
        """Test that the user-test method runs correctly"""
        options = argparse.ArgumentParser()
        options.out_dir = self.dir_tmp
        options.cpus = 3
        self.assertTrue(self.options_parser.run_test(options))

    def test_run_test__throws_exception(self):
        """Test that the user-test method fails correctly"""
        options = argparse.ArgumentParser()
        options.out_dir = self.dir_tmp
        # A pre-existing 'genomes' directory in out_dir is what provokes the failure.
        os.mkdir(os.path.join(self.dir_tmp, 'genomes'))
        options.cpus = 3
        self.assertRaises(GTDBTkTestFailure, self.options_parser.run_test,
                          options)

    def test_classify__align_dir_raises_io_exception(self):
        """ Test that the classify method raises an exception on invalid align dir """
        options = argparse.ArgumentParser()
        options.align_dir = os.path.join(tempfile.gettempdir(),
                                         'non-existent-dir')
        self.assertRaises(BioLibDirNotFound, self.options_parser.classify,
                          options)

    def test_root__no_tree_raises_io_exception(self):
        """ Test that the root method raises an exception on invalid tree """
        options = argparse.ArgumentParser()
        options.input_tree = os.path.join(tempfile.gettempdir(),
                                          'non-existent-tree.tree')
        self.assertRaises(BioLibFileNotFound, self.options_parser.root,
                          options)

    def test_decorate__no_tree_raises_io_exception(self):
        """ Test that the decorate method raises an exception on invalid tree """
        options = argparse.ArgumentParser()
        options.input_tree = os.path.join(tempfile.gettempdir(),
                                          'non-existent-tree.tree')
        self.assertRaises(BioLibFileNotFound, self.options_parser.decorate,
                          options)

    # ------------------------------------------------------------------
    # MSA trimming and export
    # ------------------------------------------------------------------
    def test_trim_msa__mask_file(self):
        """ Test that the expected result is returned when running trim_msa with mask_file """
        path_untrimmed_msa = self._write_msa(
            'untrimmed_msa.fasta',
            [('genome_1', 'ALGPVW'), ('genome_2', 'WVPGLA')])
        path_mask_file = os.path.join(self.dir_tmp, 'mask_file.txt')
        path_output = os.path.join(self.dir_tmp, 'trimmed_msa.fasta')

        # Only the 2nd and 5th columns are retained by this mask.
        with open(path_mask_file, 'w') as f:
            f.write('010010\n')

        options = argparse.ArgumentParser()
        # Required arguments
        options.untrimmed_msa = path_untrimmed_msa
        options.output = path_output
        # Mutex arguments
        options.mask_file = path_mask_file
        options.reference_mask = None

        self.options_parser.trim_msa(options)

        results = self._read_fasta_records(path_output)
        expected = {'genome_1': 'LV', 'genome_2': 'VL'}

        self.assertDictEqual(results, expected)

    def test_trim_msa__reference_mask_arc(self):
        """ Test that the expected result is returned when running trim_msa with archaeal reference_mask """
        # NOTE(review): 32675 is presumably the column count of the archaeal
        # reference MSA - confirm against the reference data.
        results = self._hash_trimmed_reference_msa('arc', 32675)

        expected = {
            'genome_1':
            '332b8cd125a36c375196064e136efab78db38e41bbd8bd8484243531bc57df6d',
            'genome_2':
            '84e91b9f5fa1ec0bedc0097233044e6dd0e79557bb6df3625928dc9573795989'
        }

        self.assertDictEqual(results, expected)

    def test_trim_msa__reference_mask_bac(self):
        """ Test that the expected result is returned when running trim_msa with bacterial reference_mask """
        # NOTE(review): 41155 is presumably the column count of the bacterial
        # reference MSA - confirm against the reference data.
        results = self._hash_trimmed_reference_msa('bac', 41155)

        expected = {
            'genome_1':
            '35e080f9ab7d318e8f4a7cef46ce6044bd9c538e6fbe8a69b17431df44bd5a81',
            'genome_2':
            'bb4beed69063dad0092a809ee5854ff124da0b55c651edd50c47b1d8fdff0d7b'
        }

        self.assertDictEqual(results, expected)

    def test_export_msa__arc(self):
        """ Test that the untrimmed archaeal MSA is exported correctly """
        self.assertEqual(
            self._hash_exported_msa('arc'),
            '11eb12b91ab20c43824abafb909ccc20bed84a8609a9bf82748b2cdbdd8b7aad')

    def test_export_msa__bac(self):
        """ Test that the untrimmed bacterial MSA is exported correctly """
        self.assertEqual(
            self._hash_exported_msa('bac'),
            '50dde1e96df9533def7c7047a1e8627d4ad566db10f8ab3de72751e62c4ac10a')
示例#5
0
    def setUp(self):
        """Instantiate the OptionsParser that the tests operate on."""
        # NOTE(review): the version argument is the integer -1 here, not a
        # string - confirm OptionsParser accepts both.
        self.options_parser = OptionsParser(-1)
示例#6
0
文件: __main__.py 项目: fplaza/GTDBTk
def _format_exception_report(headline, exc):
    """Build the multi-line log message that describes a fatal exception.

    Must be called while ``exc`` is being handled (inside the ``except``
    clause): the traceback text comes from ``traceback.format_exc()``, which
    reports on the exception currently being handled.

    :param headline: First line of the report.
    :param exc: The exception being reported.
    :return: The formatted report as a single string.
    """
    heavy_rule = '=' * 80
    light_rule = '_' * 80
    return (f'{headline}\n\n'
            f'{heavy_rule}\n'
            f'EXCEPTION: {type(exc).__name__}\n'
            f'  MESSAGE: {exc}\n'
            f'{light_rule}\n\n'
            f'{traceback.format_exc()}'
            f'{heavy_rule}')


def main():
    """Command-line entry point.

    Prints help or version information and exits with status 0 when asked to
    (or when called without arguments). Otherwise parses the arguments, sets up
    logging and hands over to ``OptionsParser.parse_options``. Any exception
    raised while doing so - including ``SystemExit`` and ``KeyboardInterrupt``
    - is logged and converted into exit status 1.
    """
    # -------------------------------------------------
    # get and check options
    args = None
    if len(sys.argv) == 1:
        print_help()
        sys.exit(0)
    elif sys.argv[1] in {'-v', '--v', '-version', '--version'}:
        print(f"gtdbtk: version {__version__} {__copyright__} {__author__}")

        # Warn the user they are not using the latest version (if possible)
        latest_ver = get_gtdbtk_latest_version()
        if latest_ver and latest_ver != __version__:
            print(f'Note: There is a newer version of GTDB-Tk available: v{latest_ver}')
        sys.exit(0)
    elif sys.argv[1] in {'-h', '--h', '-help', '--help'}:
        print_help()
        sys.exit(0)
    else:
        args = get_main_parser().parse_args()

    # setup logger
    # Not every sub-command defines out_dir / debug, hence the hasattr guards.
    logger_setup(args.out_dir if hasattr(args, 'out_dir') else None,
                 "gtdbtk.log", "GTDB-Tk", __version__, False,
                 hasattr(args, 'debug') and args.debug)
    logger = logging.getLogger('timestamp')

    # -------------------------------------------------
    # do what we came here to do
    try:
        gt_parser = OptionsParser(__version__)
        gt_parser.parse_options(args)
    except SystemExit:
        # Any SystemExit raised further down is reported and mapped to status 1,
        # regardless of the status code it carried.
        logger.error('Controlled exit resulting from early termination.')
        sys.exit(1)
    except KeyboardInterrupt:
        logger.error('Controlled exit resulting from interrupt signal.')
        sys.exit(1)
    except GTDBTkExit as e:
        # Expected termination: log the message (if any) without a traceback.
        if str(e):
            logger.error(str(e))
        logger.error('Controlled exit resulting from an unrecoverable error or warning.')
        sys.exit(1)
    except (GTDBTkException, BioLibError) as e:
        logger.error(_format_exception_report(
            'Controlled exit resulting from an unrecoverable error or warning.', e))
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: anything unexpected is logged with its traceback.
        logger.error(_format_exception_report(
            'Uncontrolled exit resulting from an unexpected error.', e))
        sys.exit(1)
示例#7
0
class TestCli(unittest.TestCase):
    """End-to-end tests that drive the GTDB-Tk sub-commands through OptionsParser."""

    def setUp(self):
        """Prepare reference paths, a populated options object, the parser and a temp output dir."""
        self.identify_dir_reference = os.path.join(os.path.dirname(__file__), 'data/identify_dir_reference/')
        self.align_dir_reference = 'tests/data/align_dir_reference/'
        self.genome_dir = 'gtdbtk/tests/data/genomes/'

        # The ArgumentParser instance only serves as an attribute container.
        self.options = argparse.ArgumentParser()
        self.options.batchfile = None
        self.options.prefix = 'gtdbtk'
        self.options.cpus = 1
        self.options.extension = 'fna'
        self.options.debug = False
        self.options.force = False
        self.options.genes = False
        self.options.write_single_copy_genes = False

        # align option
        self.options.skip_gtdb_refs = False
        self.options.taxa_filter = None
        self.options.custom_msa_filters = False
        self.options.skip_trimming = False
        self.options.min_consensus = None
        self.options.min_perc_taxa = None
        self.options.cols_per_gene = None
        self.options.max_consensus = None
        self.options.min_perc_aa = 50
        self.options.rnd_seed = 42
        self.options.outgroup_taxon = None

        # classify options
        self.options.scratch_dir = None
        self.options.keep_ref_red = None
        self.options.pplacer_cpus = None
        self.options.min_af = None

        # infer options
        self.options.prot_model = 'WAG'
        self.options.no_support = False
        self.options.no_gamma = True

        self.version = ' unittest'
        self.optionparser = OptionsParser(self.version)
        logger_setup(None, "gtdbtk.log", "GTDB-Tk", self.version, True)
        # Every test writes below its own temporary directory (removed in tearDown).
        self.generic_out_path = tempfile.mkdtemp(prefix='gtdbtk_tmp_')

    def tearDown(self):
        """Remove the temporary output directory created in setUp."""
        shutil.rmtree(self.generic_out_path)

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _random_folder_name():
        """Return a random 10-character folder name (uppercase letters and digits)."""
        return ''.join(random.choice(
            string.ascii_uppercase + string.digits) for _ in range(10))

    @staticmethod
    def _read_last_line(path):
        """Return the last line of the text file at `path`, without its line ending."""
        with open(path, 'r') as f:
            return f.read().splitlines()[-1]

    def _assert_user_msa_is_plausible(self, path_user_msa):
        """Check that the user MSA exists and that its last line looks like an aligned sequence."""
        self.assertTrue(os.path.isfile(path_user_msa))
        last_line = self._read_last_line(path_user_msa)
        self.assertGreater(len(last_line), 4500)
        self.assertLess(len(last_line), 5500)
        self.assertIn('-', last_line)
        self.assertFalse(any(char.isdigit() for char in last_line))

    def _assert_archaeal_summary(self, summary_out):
        """Check that the summary exists and that its last row has 20 columns and an archaeal taxonomy."""
        self.assertTrue(os.path.isfile(summary_out))
        infos = self._read_last_line(summary_out).split('\t')
        self.assertEqual(len(infos), 20)
        self.assertTrue(infos[1].startswith('d__Archaea'))

    # ------------------------------------------------------------------
    # Tests
    # ------------------------------------------------------------------
    def test_identify(self):
        """Test that identify writes both marker summaries with the expected genome_1 row."""
        tmp_folder = self._random_folder_name()
        identify_options = self.options
        identify_options.genome_dir = self.genome_dir
        identify_options.out_dir = os.path.join(
            self.generic_out_path, tmp_folder, 'identify')
        self.optionparser.identify(identify_options)

        ar53_marker_path = os.path.join(self.options.out_dir,
                                        PATH_AR53_MARKER_SUMMARY.format(prefix=self.options.prefix))

        self.assertTrue(os.path.isfile(
            os.path.join(self.options.out_dir, PATH_BAC120_MARKER_SUMMARY.format(prefix=self.options.prefix))))
        self.assertTrue(os.path.isfile(ar53_marker_path))

        # Map genome id -> remainder of the row, skipping the header line.
        results = {}
        with open(ar53_marker_path, 'r') as f:
            f.readline()
            for line in f:
                infos = line.split('\t', 1)
                results[infos[0]] = infos[1]
        # Fail with a clear assertion (not an AttributeError) if the genome is missing.
        self.assertIn('genome_1', results)
        self.assertTrue(results['genome_1'].startswith('120\t2\t0\t'))

    def test_align(self):
        """Test that align produces a plausible user MSA from the reference identify output."""
        tmp_folder = self._random_folder_name()
        align_options = self.options
        align_options.identify_dir = self.identify_dir_reference
        align_options.out_dir = os.path.join(
            self.generic_out_path, tmp_folder, 'align')
        self.optionparser.align(align_options)
        path_user_msa = os.path.join(align_options.out_dir, PATH_AR53_USER_MSA.format(prefix=align_options.prefix))
        self._assert_user_msa_is_plausible(path_user_msa)

    def test_identify_align(self):
        """Test that align works on the output of a fresh identify run."""
        tmp_folder = self._random_folder_name()

        identify_options = self.options
        identify_options.genome_dir = self.genome_dir
        identify_options.out_dir = os.path.join(
            self.generic_out_path, tmp_folder, 'identify')
        self.optionparser.identify(identify_options)

        align_options = self.options
        align_options.identify_dir = os.path.join(
            self.generic_out_path, tmp_folder, 'identify')
        align_options.out_dir = os.path.join(
            self.generic_out_path, tmp_folder, 'align')
        self.optionparser.align(align_options)
        path_user_msa = os.path.join(align_options.out_dir, PATH_AR53_USER_MSA.format(prefix=align_options.prefix))
        self._assert_user_msa_is_plausible(path_user_msa)

    def test_identify_align_classify(self):
        """Test identify, align and classify in sequence, then the intermediate-file cleanup."""
        tmp_folder = self._random_folder_name()

        identify_options = self.options
        identify_options.genome_dir = self.genome_dir
        identify_options.out_dir = os.path.join(
            self.generic_out_path, tmp_folder, 'identify')
        self.optionparser.identify(identify_options)

        align_options = self.options
        align_options.identify_dir = os.path.join(
            self.generic_out_path, tmp_folder, 'identify')
        align_options.out_dir = os.path.join(
            self.generic_out_path, tmp_folder, 'align')
        self.optionparser.align(align_options)
        path_user_msa = os.path.join(align_options.out_dir, PATH_AR53_USER_MSA.format(prefix=align_options.prefix))
        self._assert_user_msa_is_plausible(path_user_msa)

        # All *_options names alias self.options, so align_options.out_dir must be
        # read before out_dir is re-pointed at the classify directory.
        classify_options = self.options
        classify_options.genome_dir = self.genome_dir
        classify_options.full_tree = True
        classify_options.align_dir = align_options.out_dir
        classify_options.out_dir = os.path.join(
            self.generic_out_path, tmp_folder, 'classify')
        classify_options.recalculate_red = False
        self.optionparser.classify(classify_options)
        summary_out = os.path.join(classify_options.out_dir,
                                   PATH_AR53_SUMMARY_OUT.format(prefix=classify_options.prefix))
        # Checks that the file exists (asserting on the path string alone is always true).
        self._assert_archaeal_summary(summary_out)

        intermediate_dirs = [
            os.path.join(classify_options.out_dir, DIR_IDENTIFY_INTERMEDIATE),
            os.path.join(classify_options.out_dir, DIR_ALIGN_INTERMEDIATE),
            os.path.join(classify_options.out_dir, DIR_CLASSIFY_INTERMEDIATE),
        ]
        for intermediate_dir in intermediate_dirs:
            self.assertTrue(os.path.isdir(intermediate_dir))
        self.optionparser.remove_intermediate_files(classify_options.out_dir, 'classify_wf')
        for intermediate_dir in intermediate_dirs:
            self.assertFalse(os.path.exists(intermediate_dir))

    def test_classify_wf(self):
        """Test identify, align and classify when all steps share one output directory."""
        tmp_folder = self._random_folder_name()
        classify_wf_options = self.options
        classify_wf_options.genome_dir = self.genome_dir
        classify_wf_options.out_dir = os.path.join(
            self.generic_out_path, tmp_folder, 'classify_wf')
        self.optionparser.identify(classify_wf_options)
        classify_wf_options.identify_dir = classify_wf_options.out_dir
        classify_wf_options.align_dir = classify_wf_options.out_dir
        classify_wf_options.taxa_filter = None
        classify_wf_options.custom_msa_filters = False
        classify_wf_options.min_consensus = None
        classify_wf_options.min_perc_taxa = None
        classify_wf_options.skip_gtdb_refs = False
        classify_wf_options.cols_per_gene = None
        classify_wf_options.max_consensus = None
        classify_wf_options.recalculate_red = False
        classify_wf_options.full_tree = True
        self.optionparser.align(classify_wf_options)
        self.optionparser.classify(classify_wf_options)
        summary_out = os.path.join(classify_wf_options.out_dir,
                                   PATH_AR53_SUMMARY_OUT.format(prefix=classify_wf_options.prefix))
        self._assert_archaeal_summary(summary_out)

    def test_infer(self):
        """Test that infer completes on the reference MSA and the tree contains all three genomes."""
        tmp_folder = self._random_folder_name()
        infer_options = self.options
        path_user_msa = PATH_AR53_USER_MSA.format(prefix=self.options.prefix)
        infer_options.msa_file = os.path.join(self.align_dir_reference, path_user_msa)
        infer_options.out_dir = os.path.join(self.generic_out_path, tmp_folder, 'infer')
        infer_options.gamma = False
        self.optionparser.infer(infer_options)

        tree_log = os.path.join(infer_options.out_dir, PATH_TREE_LOG.format(prefix=self.options.prefix))
        self.assertEqual(self._read_last_line(tree_log).strip(), 'TreeCompleted')

        unrooted_tree = os.path.join(infer_options.out_dir, PATH_UNROOTED_TREE.format(prefix=self.options.prefix))
        last_line = self._read_last_line(unrooted_tree)
        self.assertIn('genome_1', last_line)
        self.assertIn('genome_2', last_line)
        self.assertIn('genome_3', last_line)

    def test_de_novo_wf(self):
        """Test that identify, align and infer run back to back without raising."""
        tmp_folder = self._random_folder_name()
        de_novo_wf_options = self.options
        de_novo_wf_options.genome_dir = self.genome_dir
        de_novo_wf_options.suffix = ".ar53"
        de_novo_wf_options.gamma = False
        de_novo_wf_options.out_dir = os.path.join(
            self.generic_out_path, tmp_folder, 'de_novo_wf')
        de_novo_wf_options.identify_dir = de_novo_wf_options.out_dir
        de_novo_wf_options.msa_file = os.path.join(
            de_novo_wf_options.out_dir, de_novo_wf_options.prefix + de_novo_wf_options.suffix + ".user_msa.fasta")
        self.optionparser.identify(de_novo_wf_options)
        self.optionparser.align(de_novo_wf_options)
        self.optionparser.infer(de_novo_wf_options)

    def test_root(self):
        """Test that rooting is successful when called through the CLI"""
        options = argparse.ArgumentParser()
        options.input_tree = 'tests/data/pplacer_dir_reference/gtdbtk.ar53.classify.tree'
        options.outgroup_taxon = 'p__Altiarchaeota'
        options.output_tree = os.path.join(self.generic_out_path, 'test.rooted.tree')
        options.custom_taxonomy_file = None
        options.gtdbtk_classification_file = None
        self.optionparser.root(options)
        self.assertTrue(os.path.isfile(options.output_tree))
示例#8
0
class TestCli(unittest.TestCase):
    """End-to-end tests that drive the GTDB-Tk steps through ``OptionsParser``.

    Every test writes into a randomly named folder below
    ``self.generic_out_path``; that root folder is removed in ``tearDown``.
    All tests mutate the single options object created in ``setUp``.
    """

    def setUp(self):
        """Create the shared options object, the parser under test and the logger."""
        self.identify_dir_reference = 'tests/data/identify_dir_reference/'
        self.align_dir_reference = 'tests/data/align_dir_reference/'
        self.genome_dir = 'gtdbtk/tests/data/genomes/'

        # An ArgumentParser instance is used purely as an attribute bag here.
        self.options = argparse.ArgumentParser()
        self.options.batchfile = None
        self.options.prefix = 'gtdbtk'
        self.options.cpus = 1
        self.options.extension = 'fna'
        self.options.debug = False

        # align option
        self.options.skip_gtdb_refs = False
        self.options.taxa_filter = None
        self.options.custom_msa_filters = False
        self.options.min_consensus = None
        self.options.min_perc_taxa = None
        self.options.cols_per_gene = None
        self.options.max_consensus = None
        self.options.min_perc_aa = 50

        # classify options
        self.options.scratch_dir = None

        # infer options
        self.options.prot_model = 'WAG'
        self.options.no_support = False
        self.options.no_gamma = True

        self.version = ' unittest'
        self.optionparser = OptionsParser(self.version)
        logger_setup(None, "gtdbtk.log", "GTDB-Tk", self.version, True)
        self.generic_out_path = 'tests/data/results'

    def tearDown(self):
        """Remove the output root, if any test step got far enough to create it."""
        # Guard against a missing folder so that a test which failed before
        # writing anything is not reported with an extra cleanup error.
        if os.path.isdir(self.generic_out_path):
            shutil.rmtree(self.generic_out_path)

    @staticmethod
    def _random_tmp_folder():
        """Return a random 10-character name made of uppercase letters and digits."""
        alphabet = string.ascii_uppercase + string.digits
        return ''.join(random.choice(alphabet) for _ in range(10))

    @staticmethod
    def _read_last_line(path):
        """Return the last line of the text file at ``path`` (without line ending)."""
        with open(path, 'r') as f:
            return f.read().splitlines()[-1]

    def _assert_user_msa(self, out_dir):
        """Check that ``gtdbtk.ar122.user_msa.fasta`` in ``out_dir`` looks like an MSA.

        The last line must be between 4500 and 5500 characters long
        (exclusive), contain at least one gap character and no digits.
        """
        path_msa = os.path.join(out_dir, 'gtdbtk.ar122.user_msa.fasta')
        self.assertTrue(os.path.isfile(path_msa))
        last_line = self._read_last_line(path_msa)
        self.assertGreater(len(last_line), 4500)
        self.assertLess(len(last_line), 5500)
        self.assertIn('-', last_line)
        self.assertFalse(any(char.isdigit() for char in last_line))

    def _assert_archaeal_summary(self, out_dir):
        """Check that ``gtdbtk.ar122.summary.tsv`` in ``out_dir`` ends with an archaeal row.

        The last line must have 17 tab-separated fields and its second field
        must start with ``d__Archaea``.
        """
        path_summary = os.path.join(out_dir, 'gtdbtk.ar122.summary.tsv')
        self.assertTrue(os.path.isfile(path_summary))
        infos = self._read_last_line(path_summary).split('\t')
        self.assertEqual(len(infos), 17)
        self.assertTrue(infos[1].startswith('d__Archaea'))

    def test_identify(self):
        """Run identify on the test genomes and check both marker summaries."""
        options = self.options
        options.genome_dir = self.genome_dir
        options.out_dir = os.path.join(self.generic_out_path,
                                       self._random_tmp_folder(), 'identify')
        self.optionparser.identify(options)

        for summary_name in ('gtdbtk_bac120_markers_summary.tsv',
                             'gtdbtk_ar122_markers_summary.tsv'):
            self.assertTrue(
                os.path.isfile(os.path.join(options.out_dir, summary_name)))

        results = {}
        path_ar122 = os.path.join(options.out_dir,
                                  'gtdbtk_ar122_markers_summary.tsv')
        with open(path_ar122, 'r') as f:
            f.readline()  # discard the first line (presumably the header)
            for line in f:
                genome_id, remainder = line.split('\t', 1)
                results[genome_id] = remainder
        self.assertIn('genome_1', results)
        self.assertTrue(results['genome_1'].startswith('120\t2\t0\t'))

    def test_align(self):
        """Run align on the reference identify output and check the user MSA."""
        options = self.options
        options.identify_dir = self.identify_dir_reference
        options.out_dir = os.path.join(self.generic_out_path,
                                       self._random_tmp_folder(), 'align')
        self.optionparser.align(options)
        self._assert_user_msa(options.out_dir)

    def test_classify(self):
        """Run classify on the reference align output and check the summary."""
        options = self.options
        options.genome_dir = self.genome_dir
        options.align_dir = self.align_dir_reference
        options.out_dir = os.path.join(self.generic_out_path,
                                       self._random_tmp_folder(), 'classify')
        self.optionparser.classify(options)
        self._assert_archaeal_summary(options.out_dir)

    def test_identify_align(self):
        """Chain identify and align, feeding the identify output into align."""
        tmp_folder = self._random_tmp_folder()
        identify_dir = os.path.join(self.generic_out_path, tmp_folder,
                                    'identify')
        align_dir = os.path.join(self.generic_out_path, tmp_folder, 'align')

        options = self.options
        options.genome_dir = self.genome_dir
        options.out_dir = identify_dir
        self.optionparser.identify(options)

        options.identify_dir = identify_dir
        options.out_dir = align_dir
        self.optionparser.align(options)
        self._assert_user_msa(align_dir)

    def test_identify_align_classify(self):
        """Chain identify, align and classify, each step reading the previous output."""
        tmp_folder = self._random_tmp_folder()
        identify_dir = os.path.join(self.generic_out_path, tmp_folder,
                                    'identify')
        align_dir = os.path.join(self.generic_out_path, tmp_folder, 'align')
        classify_dir = os.path.join(self.generic_out_path, tmp_folder,
                                    'classify')

        options = self.options
        options.genome_dir = self.genome_dir
        options.out_dir = identify_dir
        self.optionparser.identify(options)

        options.identify_dir = identify_dir
        options.out_dir = align_dir
        self.optionparser.align(options)
        self._assert_user_msa(align_dir)

        options.genome_dir = self.genome_dir
        options.align_dir = align_dir
        options.out_dir = classify_dir
        self.optionparser.classify(options)
        self._assert_archaeal_summary(classify_dir)

    def test_classify_wf(self):
        """Run identify, align and classify into one shared output folder."""
        options = self.options
        options.genome_dir = self.genome_dir
        options.out_dir = os.path.join(self.generic_out_path,
                                       self._random_tmp_folder(),
                                       'classify_wf')
        self.optionparser.identify(options)

        # Every step reads from and writes to the same folder.
        options.identify_dir = options.out_dir
        options.align_dir = options.out_dir
        options.taxa_filter = None
        options.custom_msa_filters = False
        options.min_consensus = None
        options.min_perc_taxa = None
        options.skip_gtdb_refs = False
        options.cols_per_gene = None
        options.max_consensus = None
        self.optionparser.align(options)
        self.optionparser.classify(options)

        self._assert_archaeal_summary(options.out_dir)

    def test_infer(self):
        """Run infer on the reference MSA and check the tree log and unrooted tree."""
        options = self.options
        options.msa_file = os.path.join(self.align_dir_reference,
                                        'gtdbtk.ar122.user_msa.fasta')
        options.out_dir = os.path.join(self.generic_out_path,
                                       self._random_tmp_folder(), 'infer')
        self.optionparser.infer(options)

        log_tail = self._read_last_line(
            os.path.join(options.out_dir, 'gtdbtk.tree.log'))
        self.assertEqual(log_tail.strip(), 'TreeCompleted')

        tree_tail = self._read_last_line(
            os.path.join(options.out_dir, 'gtdbtk.unrooted.tree'))
        for genome_id in ('genome_1', 'genome_2', 'genome_3'):
            self.assertIn(genome_id, tree_tail)

    def test_de_novo_wf(self):
        """Run identify, align and infer in sequence; passes if none of them raises."""
        options = self.options
        options.genome_dir = self.genome_dir
        options.suffix = ".ar122"
        options.out_dir = os.path.join(self.generic_out_path,
                                       self._random_tmp_folder(),
                                       'de_novo_wf')
        options.identify_dir = options.out_dir
        options.msa_file = os.path.join(
            options.out_dir,
            options.prefix + options.suffix + ".user_msa.fasta")
        self.optionparser.identify(options)
        self.optionparser.align(options)
        self.optionparser.infer(options)
示例#9
0
class TestOptionsParser(unittest.TestCase):
    """Unit tests for ``OptionsParser`` helpers and its input validation.

    Each test gets a fresh temporary directory in ``self.dir_tmp``, which is
    deleted again in ``tearDown``.
    """

    def setUp(self):
        """Create the parser under test and a scratch directory."""
        self.options_parser = OptionsParser('-1')
        self.dir_tmp = tempfile.mkdtemp(prefix='gtdbtk_tmp_')

    def tearDown(self):
        """Delete the scratch directory."""
        shutil.rmtree(self.dir_tmp)

    # ------------------------------------------------------------------
    # Fixture helpers
    # ------------------------------------------------------------------

    def _touch(self, file_name):
        """Create an empty file called ``file_name`` in the scratch directory and return its path."""
        path = os.path.join(self.dir_tmp, file_name)
        with open(path, 'a'):
            pass
        return path

    def _make_genomes(self):
        """Create empty ``genome_1.fna`` and ``genome_2.fna`` and return both paths."""
        return self._touch('genome_1.fna'), self._touch('genome_2.fna')

    def _write_batchfile(self, path_genome_1, last_row,
                         file_name='batchfile.txt'):
        """Write a three-line batchfile and return its path.

        The file contains a valid row for ``genome_1``, an empty line and
        finally ``last_row`` (given without its trailing newline), which is
        the row each test varies.
        """
        path = os.path.join(self.dir_tmp, file_name)
        with open(path, 'a') as f:
            f.write('%s\tgenome_1\n' % path_genome_1)
            f.write('\n')
            f.write('%s\n' % last_row)
        return path

    def _trim_two_genome_msa(self, seq_1, seq_2, mask_file, reference_mask):
        """Write a two-genome MSA, run ``trim_msa`` on it and parse the output.

        Returns a dict mapping each genome id found in the trimmed FASTA to
        its sequence line.
        """
        path_untrimmed_msa = os.path.join(self.dir_tmp, 'untrimmed_msa.fasta')
        path_output = os.path.join(self.dir_tmp, 'trimmed_msa.fasta')

        with open(path_untrimmed_msa, 'w') as f:
            f.write('>genome_1\n')
            f.write('%s\n' % seq_1)
            f.write('>genome_2\n')
            f.write('%s\n' % seq_2)

        options = argparse.ArgumentParser()
        # Required arguments
        options.untrimmed_msa = path_untrimmed_msa
        options.output = path_output
        # Mutex arguments
        options.mask_file = mask_file
        options.reference_mask = reference_mask

        self.options_parser.trim_msa(options)

        with open(path_output, 'r') as f:
            return dict(re.findall(r'>(.+)\n(.+)\n', f.read()))

    def _trim_with_reference_mask(self, reference_mask, msa_length):
        """Trim a synthetic MSA of ``msa_length`` columns with a reference mask.

        The first genome is ``'ALGPVW'`` repeated and cut to ``msa_length``;
        the second genome is the same string reversed. Returns a dict of
        genome id to the SHA-256 hex digest of its trimmed sequence.
        """
        motif = 'ALGPVW'
        msa_str = (motif * (msa_length // len(motif) + 1))[:msa_length]
        trimmed = self._trim_two_genome_msa(
            msa_str, msa_str[::-1], None, reference_mask)
        return {
            gid: hashlib.sha256(seq.encode('utf-8')).hexdigest()
            for gid, seq in trimmed.items()
        }

    def _export_msa_hash(self, domain):
        """Run ``export_msa`` for ``domain`` and return the SHA-256 of the written file."""
        path_out = os.path.join(self.dir_tmp, 'output.fasta')

        options = argparse.ArgumentParser()
        options.domain = domain
        options.output = path_out

        self.options_parser.export_msa(options)

        with open(path_out, 'rb') as f:
            return hashlib.sha256(f.read()).hexdigest()

    # ------------------------------------------------------------------
    # _verify_genome_id
    # ------------------------------------------------------------------

    def test__verify_genome_id__valid(self):
        """ Test that a valid genome id returns True. """
        self.assertTrue(self.options_parser._verify_genome_id('genome_1'))

    def test__verify_genome_id__invalid(self):
        """ Test that invalid genome ids throw an exception. """
        for c in '()[],;=':
            # subTest reports which character failed instead of stopping
            # silently at the first one.
            with self.subTest(character=c):
                self.assertRaises(GenomeNameInvalid,
                                  self.options_parser._verify_genome_id,
                                  'genome%s1' % c)

    # ------------------------------------------------------------------
    # _genomes_to_process
    # ------------------------------------------------------------------

    def test__genomes_to_process__genome_dir__valid(self):
        """ Test that the expected results are returned when using genome_dir. """
        path_genome_1, path_genome_2 = self._make_genomes()
        # A file with a different extension must not show up in the results.
        self._touch('other_file.txt')
        results = self.options_parser._genomes_to_process(
            self.dir_tmp, '', 'fna')
        expected = {'genome_1': path_genome_1, 'genome_2': path_genome_2}
        self.assertDictEqual(results, expected)

    def test__genomes_to_process__batchfile__valid(self):
        """ Test that the expected results are returned when using batchfile """
        path_genome_1, path_genome_2 = self._make_genomes()
        path_batchfile = self._write_batchfile(
            path_genome_1, '%s\tgenome_2' % path_genome_2)

        results = self.options_parser._genomes_to_process(
            '', path_batchfile, 'fna')
        expected = {'genome_1': path_genome_1, 'genome_2': path_genome_2}
        self.assertDictEqual(results, expected)

    def test__genomes_to_process__batchfile__invalid_columns(self):
        """ Test that a batchfile containing columns not equal to 2 throws an exception. """
        path_genome_1, path_genome_2 = self._make_genomes()
        path_batchfile = self._write_batchfile(
            path_genome_1, '%s\tgenome_2\tfoo' % path_genome_2)

        self.assertRaises(GenomeBatchfileMalformed,
                          self.options_parser._genomes_to_process, '',
                          path_batchfile, 'fna')

    def test__genomes_to_process__batchfile__blank_genome_path(self):
        """ Test that a batchfile containing a blank genome path throws an exception. """
        path_genome_1, _ = self._make_genomes()
        path_batchfile = self._write_batchfile(
            path_genome_1, '%s\tgenome_2' % '')

        self.assertRaises(GenomeBatchfileMalformed,
                          self.options_parser._genomes_to_process, '',
                          path_batchfile, 'fna')

    def test__genomes_to_process__batchfile__blank_genome_id(self):
        """ Test that a batchfile containing a blank genome id throws an exception. """
        path_genome_1, path_genome_2 = self._make_genomes()
        path_batchfile = self._write_batchfile(
            path_genome_1, '%s\t' % path_genome_2)

        self.assertRaises(GenomeBatchfileMalformed,
                          self.options_parser._genomes_to_process, '',
                          path_batchfile, 'fna')

    def test__genomes_to_process__batchfile__duplicate_genome_id(self):
        """ Test that a batchfile containing duplicate genome ids throws an exception. """
        path_genome_1, path_genome_2 = self._make_genomes()
        # Both rows use the id 'genome_1', pointing at different files.
        path_batchfile = self._write_batchfile(
            path_genome_1, '%s\tgenome_1' % path_genome_2)

        self.assertRaises(GTDBTkExit, self.options_parser._genomes_to_process,
                          '', path_batchfile, 'fna')

    def test__genomes_to_process__batchfile__invalid_genome_id(self):
        """ Test that genome ids starting with GB_, RS_ or UBA throw an exception. """
        path_genome_1, path_genome_2 = self._make_genomes()
        cases = (
            ('batchfile_1.txt', 'GB_genome_2'),
            ('batchfile_2.txt', 'RS_genome_2'),
            ('batchfile_3.txt', 'UBAgenome_2'),
        )
        for file_name, genome_id in cases:
            with self.subTest(genome_id=genome_id):
                path_batchfile = self._write_batchfile(
                    path_genome_1,
                    '%s\t%s' % (path_genome_2, genome_id),
                    file_name=file_name)
                self.assertRaises(GTDBTkExit,
                                  self.options_parser._genomes_to_process,
                                  '', path_batchfile, 'fna')

    def test__genomes_to_process__no_files(self):
        """ Test that an exception is thrown if no files are found to process """
        # Branch 1 : genome_dir is specified
        with tempfile.TemporaryDirectory() as tmp_genome_dir:
            self.assertRaises(GTDBTkExit,
                              self.options_parser._genomes_to_process,
                              tmp_genome_dir, '', 'fna')

        # Branch 2: batchfile is specified
        with tempfile.TemporaryDirectory() as tmp_genome_dir:
            path_batchfile = os.path.join(tmp_genome_dir, 'batchfile.txt')
            with open(path_batchfile, 'a'):
                pass
            self.assertRaises(GTDBTkExit,
                              self.options_parser._genomes_to_process, '',
                              path_batchfile, 'fna')

    # ------------------------------------------------------------------
    # _marker_set_id
    # ------------------------------------------------------------------

    def test__marker_set_id(self):
        """ Test that the correct marker set id is returned """
        cases = (
            ((True, False, False), 'bac120'),
            ((False, True, False), 'ar122'),
            ((False, False, True), 'rps23'),
        )
        for flags, expected in cases:
            with self.subTest(expected=expected):
                self.assertEqual(
                    self.options_parser._marker_set_id(*flags), expected)

    # ------------------------------------------------------------------
    # Missing input files / directories
    # ------------------------------------------------------------------

    def test_identify__genome_dir_raises_io_exception(self):
        """ Test that the identify method raises an exception on invalid genome_dir """
        options = argparse.ArgumentParser()
        options.genome_dir = os.path.join(tempfile.gettempdir(),
                                          'non-existent-dir')
        self.assertRaises(BioLibDirNotFound, self.options_parser.identify,
                          options)

    def test_identify__batchfile_raises_io_exception(self):
        """ Test that the identify method raises an exception on invalid batchfile """
        options = argparse.ArgumentParser()
        options.genome_dir = None
        options.batchfile = os.path.join(tempfile.gettempdir(),
                                         'non-existent-file.txt')
        self.assertRaises(BioLibFileNotFound, self.options_parser.identify,
                          options)

    def test_align__identify_dir_raises_io_exception(self):
        """ Test that the align method raises an exception on invalid identify dir """
        options = argparse.ArgumentParser()
        options.identify_dir = os.path.join(tempfile.gettempdir(),
                                            'non-existent-dir')
        self.assertRaises(BioLibDirNotFound, self.options_parser.align,
                          options)

    def test_infer__msa_raises_io_exception(self):
        """ Test that the infer method raises an exception on invalid MSA """
        options = argparse.ArgumentParser()
        options.msa_file = os.path.join(tempfile.gettempdir(),
                                        'non-existent-msa.txt')
        self.assertRaises(BioLibFileNotFound, self.options_parser.infer,
                          options)

    def test_run_test(self):
        """Test that the user-test method runs correctly"""
        options = argparse.ArgumentParser()
        options.out_dir = self.dir_tmp
        options.cpus = 3
        self.assertTrue(self.options_parser.run_test(options))

    # NOTE: the failure-path counterpart of test_run_test is disabled.
    # def test_run_test__throws_exception(self):
    #     """Test that the user-test method fails correctly"""
    #     options = argparse.ArgumentParser()
    #     options.out_dir = self.dir_tmp
    #     os.mkdir(os.path.join(self.dir_tmp, 'genomes'))
    #     options.cpus = 3
    #     self.assertRaises(GTDBTkTestFailure, self.options_parser.run_test, options)

    def test_classify__align_dir_raises_io_exception(self):
        """ Test that the classify method raises an exception on invalid align dir """
        options = argparse.ArgumentParser()
        options.align_dir = os.path.join(tempfile.gettempdir(),
                                         'non-existent-dir')
        self.assertRaises(BioLibDirNotFound, self.options_parser.classify,
                          options)

    def test_root__no_tree_raises_io_exception(self):
        """ Test that the root method raises an exception on invalid tree """
        options = argparse.ArgumentParser()
        options.input_tree = os.path.join(tempfile.gettempdir(),
                                          'non-existent-tree.tree')
        self.assertRaises(BioLibFileNotFound, self.options_parser.root,
                          options)

    def test_decorate__no_tree_raises_io_exception(self):
        """ Test that the decorate method raises an exception on invalid tree """
        options = argparse.ArgumentParser()
        options.input_tree = os.path.join(tempfile.gettempdir(),
                                          'non-existent-tree.tree')
        self.assertRaises(BioLibFileNotFound, self.options_parser.decorate,
                          options)

    # ------------------------------------------------------------------
    # trim_msa
    # ------------------------------------------------------------------

    def test_trim_msa__mask_file(self):
        """ Test that the expected result is returned when running trim_msa with mask_file """
        path_mask_file = os.path.join(self.dir_tmp, 'mask_file.txt')
        with open(path_mask_file, 'w') as f:
            f.write('010010\n')

        results = self._trim_two_genome_msa(
            'ALGPVW', 'WVPGLA', path_mask_file, None)

        expected = {'genome_1': 'LV', 'genome_2': 'VL'}
        self.assertDictEqual(results, expected)

    def test_trim_msa__reference_mask_arc(self):
        """ Test that the expected result is returned when running trim_msa with archaeal reference_mask """
        results = self._trim_with_reference_mask('arc', 32675)

        expected = {
            'genome_1':
            '4975c04d640415de4c715552f6f6b460a8996226239440faa6539ac777622515',
            'genome_2':
            '7b53881aecb13bbe54612962e22736db7ab83271ffe4685d63c16e962e3561d9'
        }
        self.assertDictEqual(results, expected)

    def test_trim_msa__reference_mask_bac(self):
        """ Test that the expected result is returned when running trim_msa with bacterial reference_mask """
        results = self._trim_with_reference_mask('bac', 41155)

        expected = {
            'genome_1':
            '32798bdc3245b2ac5ecd8a15ea2cfb21011b22b6021baa51066864b1c02d72b4',
            'genome_2':
            '0b63d416c72e9641011f80fcf64fa41eb3f0e8e85dbaa4bd8feba12cf3b64c62'
        }
        self.assertDictEqual(results, expected)

    # ------------------------------------------------------------------
    # export_msa
    # ------------------------------------------------------------------

    def test_export_msa__arc(self):
        """ Test that the untrimmed archaeal MSA is exported correctly """
        self.assertEqual(
            self._export_msa_hash('arc'),
            'e84edf65511002b73f110ff44c9acee3ae44220448dfc971a2778d43c966bbba')

    def test_export_msa__bac(self):
        """ Test that the untrimmed bacterial MSA is exported correctly """
        self.assertEqual(
            self._export_msa_hash('bac'),
            '5e37bc123819061490681068b49450fc43587d09b87df90ef62452bd73f961cc')
示例#10
0
class TestOptionsParser(unittest.TestCase):
    """ Tests for OptionsParser genome-input handling and missing-input errors.

    Every test runs against a fresh scratch directory (``self.dir_tmp``)
    that is created in ``setUp`` and deleted again in ``tearDown``.
    """

    def setUp(self):
        """ Create the parser under test and a scratch directory. """
        self.options_parser = OptionsParser('-1')
        self.dir_tmp = tempfile.mkdtemp(prefix='gtdbtk_tmp_')

    def tearDown(self):
        """ Remove the scratch directory and everything written into it. """
        shutil.rmtree(self.dir_tmp)

    # ------------------------------------------------------------------
    # Fixture helpers (not collected as tests: no ``test`` prefix)
    # ------------------------------------------------------------------

    def _touch(self, file_name):
        """ Create an empty file in the scratch directory and return its path. """
        path = os.path.join(self.dir_tmp, file_name)
        with open(path, 'a'):
            pass
        return path

    def _touch_genomes(self):
        """ Create empty genome_1.fna / genome_2.fna and return both paths. """
        return self._touch('genome_1.fna'), self._touch('genome_2.fna')

    def _write_batchfile(self, *lines):
        """ Write the given raw lines to batchfile.txt and return its path.

        Each element of ``lines`` is written verbatim, so callers must
        include the trailing newline themselves.
        """
        path_batchfile = os.path.join(self.dir_tmp, 'batchfile.txt')
        with open(path_batchfile, 'a') as f:
            f.writelines(lines)
        return path_batchfile

    def _assert_batchfile_rejected(self, path_batchfile):
        """ Assert that processing the batchfile raises GTDBTkExit. """
        self.assertRaises(GTDBTkExit, self.options_parser._genomes_to_process,
                          '', path_batchfile, 'fna')

    # ------------------------------------------------------------------
    # _verify_genome_id
    # ------------------------------------------------------------------

    def test__verify_genome_id__valid(self):
        """ Test that a valid genome id returns True. """
        self.assertTrue(self.options_parser._verify_genome_id('genome_1'))

    def test__verify_genome_id__invalid(self):
        """ Test that invalid genome ids throw an exception. """
        for c in '()[],;=':
            # subTest reports which character failed and keeps checking
            # the remaining ones instead of stopping at the first failure.
            with self.subTest(char=c):
                self.assertRaises(GTDBTkExit,
                                  self.options_parser._verify_genome_id,
                                  'genome%s1' % c)

    # ------------------------------------------------------------------
    # _genomes_to_process
    # ------------------------------------------------------------------

    def test__genomes_to_process__genome_dir__valid(self):
        """ Test that the expected results are returned when using genome_dir. """
        path_genome_1, path_genome_2 = self._touch_genomes()
        # A file with a different extension must not show up in the results.
        self._touch('other_file.txt')

        results, _ = self.options_parser._genomes_to_process(
            self.dir_tmp, '', 'fna')

        expected = {'genome_1': path_genome_1, 'genome_2': path_genome_2}
        self.assertDictEqual(results, expected)

    def test__genomes_to_process__batchfile__valid(self):
        """ Test that the expected results are returned when using batchfile """
        path_genome_1, path_genome_2 = self._touch_genomes()
        # Second row carries the optional third column (translation table).
        path_batchfile = self._write_batchfile(
            f'{path_genome_1}\tgenome_1\n',
            '\n',
            f'{path_genome_2}\tgenome_2\t4\n')

        results, tln_table = self.options_parser._genomes_to_process(
            '', path_batchfile, 'fna')

        expected = {'genome_1': path_genome_1, 'genome_2': path_genome_2}
        expected_tln = {'genome_2': 4}
        self.assertDictEqual(results, expected)
        self.assertDictEqual(tln_table, expected_tln)

    def test__genomes_to_process__batchfile__invalid_columns(self):
        """ Test that a batchfile with an invalid third column throws an exception. """
        path_genome_1, path_genome_2 = self._touch_genomes()
        path_batchfile = self._write_batchfile(
            '%s\tgenome_1\n' % path_genome_1,
            '\n',
            '%s\tgenome_2\tfoo\n' % path_genome_2)

        self._assert_batchfile_rejected(path_batchfile)

    def test__genomes_to_process__batchfile__blank_genome_path(self):
        """ Test that a batchfile containing a blank genome path throws an exception. """
        path_genome_1, _ = self._touch_genomes()
        path_batchfile = self._write_batchfile(
            '%s\tgenome_1\n' % path_genome_1,
            '\n',
            '%s\tgenome_2\n' % '')

        self._assert_batchfile_rejected(path_batchfile)

    def test__genomes_to_process__batchfile__blank_genome_id(self):
        """ Test that a batchfile containing a blank genome id throws an exception. """
        path_genome_1, path_genome_2 = self._touch_genomes()
        path_batchfile = self._write_batchfile(
            '%s\tgenome_1\n' % path_genome_1,
            '\n',
            '%s\t\n' % path_genome_2)

        self._assert_batchfile_rejected(path_batchfile)

    def test__genomes_to_process__batchfile__duplicate_genome_id(self):
        """ Test that a batchfile containing duplicate genome ids throws an exception. """
        path_genome_1, path_genome_2 = self._touch_genomes()
        # Two different genome files are mapped to the same id.
        path_batchfile = self._write_batchfile(
            '%s\tgenome_1\n' % path_genome_1,
            '\n',
            '%s\tgenome_1\n' % path_genome_2)

        self._assert_batchfile_rejected(path_batchfile)

    # NOTE(review): the test below is disabled; the reason is not recorded
    # in this file. It is kept verbatim for reference.
    # def test__genomes_to_process__batchfile__invalid_genome_id(self):
    #     """ Test that a batchfile containing duplicate genome ids throws an exception. """
    #     # Branch 1: The number of columns are not equal to 2.
    #     path_batchfile_1 = os.path.join(self.dir_tmp, 'batchfile_1.txt')
    #     path_batchfile_2 = os.path.join(self.dir_tmp, 'batchfile_2.txt')
    #     path_batchfile_3 = os.path.join(self.dir_tmp, 'batchfile_3.txt')
    #     path_genome_1 = os.path.join(self.dir_tmp, 'genome_1.fna')
    #     path_genome_2 = os.path.join(self.dir_tmp, 'genome_2.fna')
    #     open(path_genome_1, 'a').close()
    #     open(path_genome_2, 'a').close()
    #
    #     with open(path_batchfile_1, 'a') as f:
    #         f.write('%s\tgenome_1\n' % path_genome_1)
    #         f.write('\n')
    #         f.write('%s\tGB_genome_2\n' % path_genome_2)
    #
    #     with open(path_batchfile_2, 'a') as f:
    #         f.write('%s\tgenome_1\n' % path_genome_1)
    #         f.write('\n')
    #         f.write('%s\tRS_genome_2\n' % path_genome_2)
    #
    #     with open(path_batchfile_3, 'a') as f:
    #         f.write('%s\tgenome_1\n' % path_genome_1)
    #         f.write('\n')
    #         f.write('%s\tUBAgenome_2\n' % path_genome_2)
    #
    #     self.assertRaises(GTDBTkExit, self.options_parser._genomes_to_process, '', path_batchfile_1, 'fna')
    #     self.assertRaises(GTDBTkExit, self.options_parser._genomes_to_process, '', path_batchfile_2, 'fna')
    #     self.assertRaises(GTDBTkExit, self.options_parser._genomes_to_process, '', path_batchfile_3, 'fna')

    def test__genomes_to_process__no_files(self):
        """ Test that an exception is thrown if no files are found to process """
        # Branch 1: genome_dir is specified but holds no genome files
        with tempfile.TemporaryDirectory() as tmp_genome_dir:
            self.assertRaises(GTDBTkExit,
                              self.options_parser._genomes_to_process,
                              tmp_genome_dir, '', 'fna')

        # Branch 2: batchfile is specified but is empty
        with tempfile.TemporaryDirectory() as tmp_genome_dir:
            path_batchfile = os.path.join(tmp_genome_dir, 'batchfile.txt')
            with open(path_batchfile, 'a'):
                pass
            self.assertRaises(GTDBTkExit,
                              self.options_parser._genomes_to_process, '',
                              path_batchfile, 'fna')

    # ------------------------------------------------------------------
    # Missing-input handling of the public commands
    # ------------------------------------------------------------------

    def test_identify__genome_dir_raises_io_exception(self):
        """ Test that the identify method raises an exception on invalid genome_dir """
        options = argparse.ArgumentParser()
        options.genome_dir = os.path.join(tempfile.gettempdir(),
                                          'non-existent-dir')
        self.assertRaises(BioLibDirNotFound, self.options_parser.identify,
                          options)

    def test_identify__batchfile_raises_io_exception(self):
        """ Test that the identify method raises an exception on invalid batchfile """
        options = argparse.ArgumentParser()
        options.genome_dir = None
        options.batchfile = os.path.join(tempfile.gettempdir(),
                                         'non-existent-file.txt')
        self.assertRaises(BioLibFileNotFound, self.options_parser.identify,
                          options)

    def test_align__identify_dir_raises_io_exception(self):
        """ Test that the align method raises an exception on invalid identify dir """
        options = argparse.ArgumentParser()
        options.identify_dir = os.path.join(tempfile.gettempdir(),
                                            'non-existent-dir')
        self.assertRaises(BioLibDirNotFound, self.options_parser.align,
                          options)

    def test_infer__msa_raises_io_exception(self):
        """ Test that the infer method raises an exception on invalid MSA """
        options = argparse.ArgumentParser()
        options.msa_file = os.path.join(tempfile.gettempdir(),
                                        'non-existent-msa.txt')
        self.assertRaises(BioLibFileNotFound, self.options_parser.infer,
                          options)

    def test_run_test(self):
        """Test that the user-test method runs correctly"""
        options = argparse.ArgumentParser()
        options.out_dir = self.dir_tmp
        options.cpus = 3
        self.assertTrue(self.options_parser.run_test(options))

    # NOTE(review): the test below is disabled; the reason is not recorded
    # in this file. It is kept verbatim for reference.
    # def test_run_test__throws_exception(self):
    #     """Test that the user-test method fails correctly"""
    #     options = argparse.ArgumentParser()
    #     options.out_dir = self.dir_tmp
    #     os.mkdir(os.path.join(self.dir_tmp, 'genomes'))
    #     options.cpus = 3
    #     self.assertRaises(GTDBTkTestFailure, self.options_parser.run_test, options)

    def test_classify__align_dir_raises_io_exception(self):
        """ Test that the classify method raises an exception on invalid align dir """
        options = argparse.ArgumentParser()
        options.align_dir = os.path.join(tempfile.gettempdir(),
                                         'non-existent-dir')
        self.assertRaises(BioLibDirNotFound, self.options_parser.classify,
                          options)

    def test_root__no_tree_raises_io_exception(self):
        """ Test that the root method raises an exception on invalid tree """
        options = argparse.ArgumentParser()
        options.input_tree = os.path.join(tempfile.gettempdir(),
                                          'non-existent-tree.tree')
        self.assertRaises(BioLibFileNotFound, self.options_parser.root,
                          options)

    def test_decorate__no_tree_raises_io_exception(self):
        """ Test that the decorate method raises an exception on invalid tree """
        options = argparse.ArgumentParser()
        options.input_tree = os.path.join(tempfile.gettempdir(),
                                          'non-existent-tree.tree')
        self.assertRaises(BioLibFileNotFound, self.options_parser.decorate,
                          options)
示例#11
0
def main():
    parser = argparse.ArgumentParser(prog='gtdbtk',
                                     add_help=False,
                                     conflict_handler='resolve')
    parser.add_argument('-f',
                        '--force',
                        action="store_true",
                        default=False,
                        help="overwrite existing files without prompting.")

    subparsers = parser.add_subparsers(help="--", dest='subparser_name')

    # de novo workflow
    denovo_wf_parser = subparsers.add_parser(
        'de_novo_wf',
        conflict_handler='resolve',
        formatter_class=CustomHelpFormatter,
        help='Infer de novo tree and decorate with GTDB taxonomy.')

    mutual_genome_denovo_wf = denovo_wf_parser.add_argument_group(
        'mutually exclusive required arguments')
    mutex_group = mutual_genome_denovo_wf.add_mutually_exclusive_group(
        required=True)
    mutex_group.add_argument(
        '--genome_dir',
        help="directory containing genome files in FASTA format")
    mutex_group.add_argument(
        '--batchfile',
        help=
        "file describing genomes - tab separated in 2 columns (FASTA file, genome ID)"
    )

    mutual_ms_denovo_wf = denovo_wf_parser.add_argument_group(
        'mutually exclusive required arguments')
    mutex_group = mutual_ms_denovo_wf.add_mutually_exclusive_group(
        required=True)
    mutex_group.add_argument('--bacteria',
                             action='store_true',
                             help='process bacterial genomes')
    mutex_group.add_argument('--archaea',
                             action='store_true',
                             help='process archaeal genomes')

    required_denovo_wf = denovo_wf_parser.add_argument_group(
        'required named arguments')
    required_denovo_wf.add_argument(
        '--outgroup_taxon',
        required=True,
        help=
        "taxon to use as outgroup (e.g., p__Patescibacteria or p__Altiarchaeota)"
    )
    required_denovo_wf.add_argument('--out_dir',
                                    required=True,
                                    help="directory to output files")

    optional_denovo_wf = denovo_wf_parser.add_argument_group(
        'optional arguments')
    optional_denovo_wf.add_argument(
        '-x',
        '--extension',
        default='fna',
        help='extension of files to process, gz = gzipped')

    optional_denovo_wf.add_argument(
        '--skip_gtdb_refs',
        action="store_true",
        help=
        'do not include GTDB reference genomes in multiple sequence alignment')
    optional_denovo_wf.add_argument(
        '--taxa_filter',
        help=('filter GTDB genomes to taxa (comma separated) within ' +
              'specific taxonomic groups (e.g., d__Bacteria ' +
              'or p__Proteobacteria, p__Actinobacteria)'))
    optional_denovo_wf.add_argument(
        '--min_perc_aa',
        type=float,
        default=10,
        help=
        'filter genomes with an insufficient percentage of AA in the MSA (inclusive bound)'
    )
    optional_denovo_wf.add_argument(
        '--custom_msa_filters',
        action="store_true",
        help=
        ('perform custom filtering of MSA with cols_per_gene, min_consensus ' +
         'max_consensus, and min_perc_taxa parameters instead of using canonical mask'
         ))
    optional_denovo_wf.add_argument(
        '--cols_per_gene',
        type=int,
        default=42,
        help='maximum number of columns to retain per gene')
    optional_denovo_wf.add_argument(
        '--min_consensus',
        type=float,
        default=25,
        help=
        'minimum percentage of the same amino acid required to retain column (inclusive bound)'
    )
    optional_denovo_wf.add_argument(
        '--max_consensus',
        type=float,
        default=95,
        help=
        'maximum percentage of the same amino acid required to retain column (exclusive bound)'
    )
    optional_denovo_wf.add_argument(
        '--min_perc_taxa',
        type=float,
        default=50,
        help=
        'minimum percentage of taxa required to retain column (inclusive bound)'
    )
    optional_denovo_wf.add_argument(
        '--rnd_seed',
        type=int,
        default=None,
        help='random seed to use for selecting columns')

    optional_denovo_wf.add_argument(
        '--prot_model',
        choices=['JTT', 'WAG', 'LG'],
        help='protein substitution model for tree inference',
        default='WAG')
    optional_denovo_wf.add_argument(
        '--no_support',
        action="store_true",
        help=
        "do not compute local support values using the Shimodaira-Hasegawa test"
    )
    optional_denovo_wf.add_argument(
        '--gamma',
        action="store_true",
        help="rescale branch lengths to optimize the Gamma20 likelihood")

    optional_denovo_wf.add_argument(
        '--gtdbtk_classification_file',
        help=
        "file with GTDB-Tk classifications produced by the `classify` command")
    optional_denovo_wf.add_argument(
        '--custom_taxonomy_file',
        help=
        "file indicating custom taxonomy string for at least the genomes belonging to the outgroup"
    )

    optional_denovo_wf.add_argument('--prefix',
                                    default='gtdbtk',
                                    help='desired prefix for output files')
    optional_denovo_wf.add_argument('--cpus',
                                    default=1,
                                    type=int,
                                    help='number of CPUs to use')
    optional_denovo_wf.add_argument(
        '--force',
        action='store_const',
        const=True,
        default=False,
        help='continue processing if an error occurs on a single genome')
    optional_denovo_wf.add_argument(
        '--debug',
        action="store_true",
        help='create intermediate files for debugging purposes')
    optional_denovo_wf.add_argument('-h',
                                    '--help',
                                    action="help",
                                    help="show help message")

    # classify workflow
    classify_wf_parser = subparsers.add_parser(
        'classify_wf',
        conflict_handler='resolve',
        formatter_class=CustomHelpFormatter,
        help='Classify genomes by placement in GTDB reference tree.')

    mutual_genome_classify_wf = classify_wf_parser.add_argument_group(
        'mutually exclusive required arguments')
    mutex_group = mutual_genome_classify_wf.add_mutually_exclusive_group(
        required=True)
    mutex_group.add_argument(
        '--genome_dir',
        help="directory containing genome files in FASTA format")
    mutex_group.add_argument(
        '--batchfile',
        help=
        "file describing genomes - tab separated in 3 columns (FASTA file, genome ID, translation table [optional])"
    )

    required_classify_wf = classify_wf_parser.add_argument_group(
        'required named arguments')
    required_classify_wf.add_argument('--out_dir',
                                      required=True,
                                      help="directory to output files")

    optional_classify_wf = classify_wf_parser.add_argument_group(
        'optional arguments')
    optional_classify_wf.add_argument(
        '-x',
        '--extension',
        default='fna',
        help='extension of files to process, gz = gzipped')
    optional_classify_wf.add_argument(
        '--min_perc_aa',
        type=float,
        default=10,
        help='filter genomes with an insufficient percentage of AA in the MSA')
    optional_classify_wf.add_argument('--prefix',
                                      required=False,
                                      default='gtdbtk',
                                      help='desired prefix for output files')
    optional_classify_wf.add_argument('--cpus',
                                      default=1,
                                      type=int,
                                      help='number of CPUs to use')
    optional_classify_wf.add_argument(
        '--pplacer_cpus',
        type=int,
        default=None,
        help='use PPLACER_CPUS during placement (default: CPUS)')
    optional_classify_wf.add_argument(
        '--force',
        action='store_const',
        const=True,
        default=False,
        help='continue processing if an error occurs on a single genome')
    optional_classify_wf.add_argument(
        '--scratch_dir',
        help='Reduce memory usage by writing to disk (slower).')
    optional_classify_wf.add_argument(
        '-r',
        '--recalculate_red',
        action='store_true',
        help=
        'recalculate RED values based on the reference tree and all added user genomes'
    )
    # optional_classify_wf.add_argument('-s', '--split_tree', action='store_true',
    #                                   help='Use shards of the reference tree (for Bacteria only). reduce memory usage (slower).')
    optional_classify_wf.add_argument(
        '-d',
        '--debug',
        action="store_true",
        help='create intermediate files for debugging purposes')
    optional_classify_wf.add_argument('-h',
                                      '--help',
                                      action="help",
                                      help="show help message")

    # identify marker genes in genomes
    identify_parser = subparsers.add_parser(
        'identify',
        conflict_handler='resolve',
        formatter_class=CustomHelpFormatter,
        help='Identify marker genes in genome.')

    mutex_identify = identify_parser.add_argument_group(
        'mutually exclusive required arguments')
    mutex_group = mutex_identify.add_mutually_exclusive_group(required=True)
    mutex_group.add_argument(
        '--genome_dir',
        help="directory containing genome files in FASTA format")
    mutex_group.add_argument(
        '--batchfile',
        help=
        "file describing genomes - tab separated in 3 columns (FASTA file, genome ID, translation table [optional])"
    )

    required_identify = identify_parser.add_argument_group(
        'required named arguments')
    required_identify.add_argument('--out_dir',
                                   required=True,
                                   help="directory to output files")

    optional_identify = identify_parser.add_argument_group(
        'optional arguments')
    optional_identify.add_argument(
        '-x',
        '--extension',
        default='fna',
        help='extension of files to process, gz = gzipped')
    optional_identify.add_argument('--prefix',
                                   default='gtdbtk',
                                   help='desired prefix for output files')
    optional_identify.add_argument('--cpus',
                                   default=1,
                                   type=int,
                                   help='number of CPUs to use')
    optional_identify.add_argument(
        '--force',
        action='store_const',
        const=True,
        default=False,
        help='continue processing if an error occurs on a single genome')
    optional_identify.add_argument('-h',
                                   '--help',
                                   action="help",
                                   help="show help message")

    # create multiple sequence alignment
    align_parser = subparsers.add_parser(
        'align',
        conflict_handler='resolve',
        formatter_class=CustomHelpFormatter,
        help='Create multiple sequence alignment.',
    )

    required_align = align_parser.add_argument_group(
        'required named arguments')
    required_align.add_argument('--identify_dir',
                                required=True,
                                help="output directory of 'identify' command")
    required_align.add_argument('--out_dir',
                                required=True,
                                help='directory to output files')

    optional_align = align_parser.add_argument_group('optional arguments')
    optional_align.add_argument(
        '--skip_gtdb_refs',
        action="store_true",
        help=
        'do not include GTDB reference genomes in multiple sequence alignment')
    optional_align.add_argument(
        '--taxa_filter',
        help=('filter GTDB genomes to taxa (comma separated) within ' +
              'specific taxonomic groups (e.g., d__Bacteria ' +
              'or p__Proteobacteria, p__Actinobacteria)'))
    optional_align.add_argument(
        '--min_perc_aa',
        type=float,
        default=10,
        help=
        'filter genomes with an insufficient percentage of AA in the MSA (inclusive bound)'
    )

    mutual_genome_align = align_parser.add_argument_group(
        'mutually exclusive optional arguments')
    mutex_align_group = mutual_genome_align.add_mutually_exclusive_group()
    mutex_align_group.add_argument(
        '--custom_msa_filters',
        action="store_true",
        help=
        ('perform custom filtering of MSA with cols_per_gene, min_consensus ' +
         'max_consensus, and min_perc_taxa parameters instead of using canonical mask'
         ))
    mutex_align_group.add_argument(
        '--skip_trimming',
        action="store_true",
        default=False,
        help='skip trimming step and return the full MSAs')

    optional_align.add_argument(
        '--cols_per_gene',
        type=int,
        default=42,
        help='maximum number of columns to retain per gene')
    optional_align.add_argument(
        '--min_consensus',
        type=float,
        default=25,
        help=
        'minimum percentage of the same amino acid required to retain column (inclusive bound)'
    )
    optional_align.add_argument(
        '--max_consensus',
        type=float,
        default=95,
        help=
        'maximum percentage of the same amino acid required to retain column (exclusive bound)'
    )
    optional_align.add_argument(
        '--min_perc_taxa',
        type=float,
        default=50,
        help=
        'minimum percentage of taxa required to retain column (inclusive bound)'
    )
    optional_align.add_argument(
        '--rnd_seed',
        type=int,
        default=None,
        help='random seed to use for selecting columns')
    optional_align.add_argument('--prefix',
                                required=False,
                                default='gtdbtk',
                                help='desired prefix for output files')
    optional_align.add_argument('--cpus',
                                default=1,
                                type=int,
                                help='number of CPUs to use')
    optional_align.add_argument(
        '--debug',
        action="store_true",
        help='create intermediate files for debugging purposes')
    optional_align.add_argument('-h',
                                '--help',
                                action="help",
                                help="show help message")

    # infer tree
    infer_parser = subparsers.add_parser(
        'infer',
        conflict_handler='resolve',
        formatter_class=CustomHelpFormatter,
        help='Infer tree from multiple sequence alignment.',
    )

    required_infer = infer_parser.add_argument_group(
        'required named arguments')
    required_infer.add_argument(
        '--msa_file',
        required=True,
        help="multiple sequence alignment in FASTA format")
    required_infer.add_argument('--out_dir',
                                required=True,
                                help='directory to output files')

    optional_infer = infer_parser.add_argument_group('optional arguments')
    optional_infer.add_argument(
        '--prot_model',
        choices=['JTT', 'WAG', 'LG'],
        help='protein substitution model for tree inference',
        default='WAG')
    optional_infer.add_argument(
        '--no_support',
        action="store_true",
        help=
        "do not compute local support values using the Shimodaira-Hasegawa test"
    )
    optional_infer.add_argument(
        '--gamma',
        action="store_true",
        help="rescale branch lengths to optimize the Gamma20 likelihood")
    optional_infer.add_argument('--prefix',
                                required=False,
                                default='gtdbtk',
                                help='desired prefix for output files')
    optional_infer.add_argument('--cpus',
                                default=1,
                                type=int,
                                help='number of CPUs to use')
    optional_infer.add_argument('-h',
                                '--help',
                                action="help",
                                help="show help message")

    # classify genomes via placement with pplacer
    classify_parser = subparsers.add_parser(
        'classify',
        conflict_handler='resolve',
        formatter_class=CustomHelpFormatter,
        help='Determine taxonomic classification of genomes.',
    )

    mutual_genome_classify = classify_parser.add_argument_group(
        'mutually exclusive required arguments')
    mutex_group = mutual_genome_classify.add_mutually_exclusive_group(
        required=True)
    mutex_group.add_argument(
        '--genome_dir',
        help="directory containing genome files in FASTA format")
    mutex_group.add_argument(
        '--batchfile',
        help=
        "file describing genomes - tab separated in 2 columns (FASTA file, genome ID)"
    )

    required_classify = classify_parser.add_argument_group(
        'required named arguments')
    required_classify.add_argument('--align_dir',
                                   required=True,
                                   help="output directory of 'align' command")
    required_classify.add_argument('--out_dir',
                                   required=True,
                                   help='directory to output files')

    optional_classify = classify_parser.add_argument_group(
        'optional arguments')
    optional_classify.add_argument(
        '-x',
        '--extension',
        default='fna',
        help='extension of files to process, gz = gzipped')
    optional_classify.add_argument('--prefix',
                                   required=False,
                                   default='gtdbtk',
                                   help='desired prefix for output files')
    optional_classify.add_argument('--cpus',
                                   default=1,
                                   type=int,
                                   help='number of CPUs to use')
    optional_classify.add_argument(
        '--pplacer_cpus',
        type=int,
        default=None,
        help='use PPLACER_CPUS during placement (default: CPUS)')
    optional_classify.add_argument(
        '--scratch_dir',
        help='reduce memory usage by writing to disk (slower)')
    # optional_classify.add_argument('-s', '--split_tree', action='store_true',
    #                                help='Use shards of the reference tree (for Bacteria only). reduce memory usage (slower).')
    optional_classify.add_argument(
        '-r',
        '--recalculate_red',
        action='store_true',
        help=
        'recalculate RED values based on the reference tree and all added user genomes'
    )

    optional_classify.add_argument(
        '--debug',
        action="store_true",
        help='create intermediate files for debugging purposes')
    optional_classify.add_argument('-h',
                                   '--help',
                                   action="help",
                                   help="show help message")

    # root tree using outgroup
    root_parser = subparsers.add_parser(
        'root',
        conflict_handler='resolve',
        formatter_class=CustomHelpFormatter,
        help='Root tree using an outgroup.',
    )

    required_root = root_parser.add_argument_group('required named arguments')
    required_root.add_argument('--input_tree',
                               required=True,
                               help="tree to root in Newick format")
    required_root.add_argument(
        '--outgroup_taxon',
        required=True,
        help=
        "taxon to use as outgroup (e.g., p__Patescibacteria or p__Altiarchaeota)"
    )
    required_root.add_argument('--output_tree',
                               required=True,
                               help='output tree')

    optional_root = root_parser.add_argument_group('optional arguments')
    optional_root.add_argument(
        '--gtdbtk_classification_file',
        help=
        "file with GTDB-Tk classifications produced by the `classify` command")
    optional_root.add_argument(
        '--custom_taxonomy_file',
        help=
        "file indicating custom taxonomy strings for user genomes, which should contain any genomes belonging to the outgroup"
    )
    optional_root.add_argument('-h',
                               '--help',
                               action="help",
                               help="show help message")

    # decorate tree
    decorate_parser = subparsers.add_parser(
        'decorate',
        conflict_handler='resolve',
        formatter_class=CustomHelpFormatter,
        help='Decorate tree with GTDB taxonomy.',
    )

    required_decorate = decorate_parser.add_argument_group(
        'required named arguments')
    required_decorate.add_argument('--input_tree',
                                   required=True,
                                   help="tree to root in Newick format")
    required_decorate.add_argument('--output_tree',
                                   required=True,
                                   help='output tree')

    optional_decorate = decorate_parser.add_argument_group(
        'optional arguments')
    optional_decorate.add_argument(
        '--gtdbtk_classification_file',
        help=
        "file with GTDB-Tk classifications produced by the `classify` command")
    optional_decorate.add_argument(
        '--custom_taxonomy_file',
        help="file indicating custom taxonomy strings for user genomes")
    optional_decorate.add_argument('-h',
                                   '--help',
                                   action="help",
                                   help="show help message")

    # establish taxonomic ranks of internal nodes using RED
    infer_ranks_parser = subparsers.add_parser(
        'infer_ranks',
        conflict_handler='resolve',
        formatter_class=CustomHelpFormatter,
        help='Establish taxonomic ranks of internal nodes using RED.',
    )

    infer_ranks_req = infer_ranks_parser.add_argument_group(
        'required named arguments')
    infer_ranks_req.add_argument(
        '--input_tree',
        required=True,
        help="rooted input tree with labelled ingroup taxon")
    infer_ranks_req.add_argument(
        '--ingroup_taxon',
        required=True,
        help=
        "labelled ingroup taxon to use as root for establish RED values (e.g., c__Bacilli or f__Lactobacillaceae"
    )
    infer_ranks_req.add_argument('--output_tree',
                                 required=True,
                                 help="output tree")

    infer_ranks_opt = infer_ranks_parser.add_argument_group(
        'optional arguments')
    infer_ranks_opt.add_argument('-h',
                                 '--help',
                                 action="help",
                                 help="show help message")

    # ani_rep
    ani_rep_parser = subparsers.add_parser(
        'ani_rep',
        conflict_handler='resolve',
        formatter_class=CustomHelpFormatter,
        help='Calculates ANI to GTDB representative genomes.',
    )

    # ani_rep mutex required input genomes
    ani_rep_mutex_genome = ani_rep_parser.add_argument_group(
        'mutually exclusive required arguments')
    ani_rep_mutex_in = ani_rep_mutex_genome.add_mutually_exclusive_group(
        required=True)
    ani_rep_mutex_in.add_argument(
        '--genome_dir',
        help="directory containing genome files in FASTA format")
    ani_rep_mutex_in.add_argument(
        '--batchfile',
        help=
        "file describing genomes - tab separated in 2 columns (FASTA file, genome ID)"
    )

    # ani_rep required arguments
    ani_rep_req = ani_rep_parser.add_argument_group('required named arguments')
    ani_rep_req.add_argument('--out_dir',
                             required=True,
                             help="directory to output files")

    # ani_rep mash arguments
    ani_rep_mash = ani_rep_parser.add_argument_group('optional Mash arguments')
    ani_rep_mash.add_argument('--no_mash',
                              action='store_const',
                              const=True,
                              default=False,
                              help='skip pre-filtering using MASH')
    ani_rep_mash.add_argument('--mash_k',
                              default=16,
                              type=int,
                              help='k-mer size [1-32]')
    ani_rep_mash.add_argument('--mash_s',
                              default=5000,
                              type=int,
                              help='maximum number of non-redundant hashes')
    ani_rep_mash.add_argument('--mash_d',
                              default=0.1,
                              type=float,
                              help='maximum distance to keep [0-1]')
    ani_rep_mash.add_argument('--mash_v',
                              default=1.0,
                              type=float,
                              help='maximum p-value to keep [0-1]')

    ani_rep_fastani_opt = ani_rep_parser.add_argument_group(
        'optional FastANI arguments')
    ani_rep_fastani_opt.add_argument(
        '--min_af',
        default=AF_THRESHOLD,
        type=float,
        help='alignment fraction to consider closest genome')

    # ani_rep optional arguments
    ani_rep_opt = ani_rep_parser.add_argument_group('optional arguments')

    ani_rep_opt.add_argument(
        '-x',
        '--extension',
        default='fna',
        help='extension of files to process, gz = gzipped')
    ani_rep_opt.add_argument('--prefix',
                             default='gtdbtk',
                             help='desired prefix for output files')
    ani_rep_opt.add_argument('--cpus',
                             default=1,
                             type=int,
                             help='number of CPUs to use')
    ani_rep_opt.add_argument('-h',
                             '--help',
                             action="help",
                             help="show help message")

    # test
    test_parser = subparsers.add_parser(
        'test',
        conflict_handler='resolve',
        formatter_class=CustomHelpFormatter,
        help='Test the classify_wf pipeline with 3 archaeal genomes.')
    required_test = test_parser.add_argument_group('required named arguments')
    required_test.add_argument('--out_dir',
                               required=True,
                               help='directory to output files')
    optional_test = test_parser.add_argument_group('optional arguments')
    optional_test.add_argument('--cpus',
                               default=1,
                               type=int,
                               help='number of CPUs to use')
    optional_test.add_argument('-h',
                               '--help',
                               action="help",
                               help="show help message")

    # trim MSA
    msa_parser = subparsers.add_parser(
        'trim_msa',
        conflict_handler='resolve',
        formatter_class=CustomHelpFormatter,
        help='Trim an untrimmed MSA file based on a mask.',
    )

    required_msa = msa_parser.add_argument_group('required named arguments')
    required_msa.add_argument('--untrimmed_msa',
                              required=True,
                              help="untrimmed MSA file")
    required_msa.add_argument('--output', required=True, help='output file')
    mutual_trim_msa = msa_parser.add_argument_group(
        'mutually exclusive required arguments')
    mutex_msa_group = mutual_trim_msa.add_mutually_exclusive_group(
        required=True)
    mutex_msa_group.add_argument('--mask_file',
                                 help="mask file to use for trimming the MSA")
    mutex_msa_group.add_argument(
        '--reference_mask',
        choices=['arc', 'bac'],
        help="reference mask already present in GTDB-Tk")

    optional_msa = msa_parser.add_argument_group('optional arguments')
    optional_msa.add_argument('-h',
                              '--help',
                              action="help",
                              help="show help message")

    # export msa
    export_msa_parser = subparsers.add_parser(
        'export_msa',
        conflict_handler='resolve',
        formatter_class=CustomHelpFormatter,
        help='Export the untrimmed archaeal or bacterial MSA file.',
    )
    required_export_msa = export_msa_parser.add_argument_group(
        'required named arguments')
    required_export_msa.add_argument('--domain',
                                     required=True,
                                     choices=['arc', 'bac'],
                                     help="select domain to download")
    required_export_msa.add_argument('--output',
                                     required=True,
                                     help='output file')
    optional_export_msa = export_msa_parser.add_argument_group(
        'optional arguments')
    optional_export_msa.add_argument('-h',
                                     '--help',
                                     action="help",
                                     help="show help message")

    # verify install
    check_install_parser = subparsers.add_parser(
        'check_install',
        conflict_handler='resolve',
        formatter_class=CustomHelpFormatter,
        help='Verify if all gtdb data files are present to run GTDB-Tk.',
    )
    optional_check_install = check_install_parser.add_argument_group(
        'optional arguments')
    optional_check_install.add_argument('-h',
                                        '--help',
                                        action="help",
                                        help="show help message")

    # -------------------------------------------------
    # get and check options
    args = None
    if len(sys.argv) == 1:
        print_help()
        sys.exit(0)
    elif sys.argv[1] in {'-v', '--v', '-version', '--version'}:
        print("gtdbtk: version %s %s %s" %
              (__version__, __copyright__, __author__))
        sys.exit(0)
    elif sys.argv[1] in {'-h', '--h', '-help', '--help'}:
        print_help()
        sys.exit(0)
    else:
        args = parser.parse_args()

    # setup logger
    logger_setup(args.out_dir if hasattr(args, 'out_dir') else None,
                 "gtdbtk.log", "GTDB-Tk", __version__, False,
                 hasattr(args, 'debug') and args.debug)
    logger = logging.getLogger('timestamp')

    # -------------------------------------------------
    # do what we came here to do
    try:
        gt_parser = OptionsParser(__version__)
        if False:
            import cProfile

            cProfile.run('gt_parser.parseOptions(args)', 'prof')
        else:
            gt_parser.parse_options(args)
    except SystemExit:
        sys.stdout.write('\n')
        sys.stdout.flush()
        logger.error('Controlled exit resulting from early termination.')
        sys.exit(1)
    except KeyboardInterrupt:
        sys.stdout.write('\n')
        sys.stdout.flush()
        logger.error('Controlled exit resulting from interrupt signal.')
        sys.exit(1)
    except GTDBTkExit as e:
        sys.stdout.write('\n')
        sys.stdout.flush()
        if len(str(e)) > 0:
            logger.error('{}'.format(e))
        logger.error(
            'Controlled exit resulting from an unrecoverable error or warning.'
        )
        sys.exit(1)
    except (GTDBTkException, BioLibError) as e:
        sys.stdout.write('\n')
        sys.stdout.flush()
        msg = 'Controlled exit resulting from an unrecoverable error or warning.\n\n'
        msg += '=' * 80 + '\n'
        msg += 'EXCEPTION: {}\n'.format(type(e).__name__)
        msg += '  MESSAGE: {}\n'.format(e)
        msg += '_' * 80 + '\n\n'
        msg += traceback.format_exc()
        msg += '=' * 80
        logger.error(msg)
        sys.exit(1)
    except Exception as e:
        sys.stdout.write('\n')
        sys.stdout.flush()
        msg = 'Uncontrolled exit resulting from an unexpected error.\n\n'
        msg += '=' * 80 + '\n'
        msg += 'EXCEPTION: {}\n'.format(type(e).__name__)
        msg += '  MESSAGE: {}\n'.format(e)
        msg += '_' * 80 + '\n\n'
        msg += traceback.format_exc()
        msg += '=' * 80
        logger.error(msg)
        sys.exit(1)
示例#12
0
文件: test_cli.py 项目: fplaza/GTDBTk
class TestCli(unittest.TestCase):
    """End-to-end tests that drive the GTDB-Tk workflows via ``OptionsParser``.

    ``setUp`` builds one shared options object and a fresh temporary output
    directory; ``tearDown`` removes that directory again.  Every test writes
    its results below ``self.generic_out_path``.

    NOTE(review): the input paths mix ``tests/data/...`` and
    ``gtdbtk/tests/data/...`` prefixes, so they appear to depend on the
    working directory the tests are launched from -- confirm.
    """

    def setUp(self):
        """Create the shared options object and a temporary output directory."""
        self.identify_dir_reference = os.path.join(
            os.path.dirname(__file__), 'data/identify_dir_reference/')
        self.align_dir_reference = 'tests/data/align_dir_reference/'
        self.genome_dir = 'gtdbtk/tests/data/genomes/'

        # The ArgumentParser instance is used purely as an attribute bag that
        # is handed to the OptionsParser methods; nothing is parsed with it.
        self.options = argparse.ArgumentParser()
        self.options.batchfile = None
        self.options.prefix = 'gtdbtk'
        self.options.cpus = 1
        self.options.extension = 'fna'
        self.options.debug = False
        self.options.force = False
        self.options.genes = False
        self.options.write_single_copy_genes = False

        # align options
        self.options.skip_gtdb_refs = False
        self.options.taxa_filter = None
        self.options.custom_msa_filters = False
        self.options.skip_trimming = False
        self.options.min_consensus = None
        self.options.min_perc_taxa = None
        self.options.cols_per_gene = None
        self.options.max_consensus = None
        self.options.min_perc_aa = 50
        self.options.rnd_seed = 42

        # classify options
        self.options.scratch_dir = None
        self.options.keep_ref_red = None
        self.options.pplacer_cpus = None

        # infer options
        self.options.prot_model = 'WAG'
        self.options.no_support = False
        self.options.no_gamma = True

        self.version = ' unittest'
        self.optionparser = OptionsParser(self.version)
        logger_setup(None, "gtdbtk.log", "GTDB-Tk", self.version, True)
        self.generic_out_path = tempfile.mkdtemp(prefix='gtdbtk_tmp_')

    # ------------------------------------------------------------------
    # helpers (leading underscore keeps them out of test collection)
    # ------------------------------------------------------------------
    @staticmethod
    def _random_folder_name(length=10):
        """Return a random name made of upper-case letters and digits."""
        alphabet = string.ascii_uppercase + string.digits
        return ''.join(random.choice(alphabet) for _ in range(length))

    def _step_dir(self, run_folder, step):
        """Return ``<generic_out_path>/<run_folder>/<step>``."""
        return os.path.join(self.generic_out_path, run_folder, step)

    @staticmethod
    def _read_last_line(path):
        """Return the last line of the text file at ``path``."""
        with open(path, 'r') as f:
            return f.read().splitlines()[-1]

    def _assert_ar122_user_msa(self, options):
        """Check the archaeal user MSA written below ``options.out_dir``.

        The file must exist and its last line must be between 4500 and 5500
        characters long (exclusive), contain at least one gap character and
        contain no digits.
        """
        path_user_msa = os.path.join(
            options.out_dir,
            PATH_AR122_USER_MSA.format(prefix=options.prefix))
        self.assertTrue(os.path.isfile(path_user_msa))
        last_line = self._read_last_line(path_user_msa)
        self.assertGreater(len(last_line), 4500)
        self.assertLess(len(last_line), 5500)
        self.assertIn('-', last_line)
        self.assertFalse(any(char.isdigit() for char in last_line))

    def _assert_ar122_summary(self, options):
        """Check the archaeal classify summary written below ``options.out_dir``.

        The file must exist, and its last row must have 19 tab-separated
        columns with a second column starting with ``d__Archaea``.
        """
        summary_out = os.path.join(
            options.out_dir,
            PATH_AR122_SUMMARY_OUT.format(prefix=options.prefix))
        self.assertTrue(os.path.isfile(summary_out))
        infos = self._read_last_line(summary_out).split('\t')
        self.assertEqual(len(infos), 19)
        self.assertTrue(infos[1].startswith('d__Archaea'))

    # ------------------------------------------------------------------
    # tests
    # ------------------------------------------------------------------
    def test_identify(self):
        """``identify`` writes both marker summaries with the expected row."""
        tmp_folder = self._random_folder_name()
        # NOTE: this is an alias of self.options, not a copy.
        identify_options = self.options
        identify_options.genome_dir = self.genome_dir
        identify_options.out_dir = self._step_dir(tmp_folder, 'identify')
        self.optionparser.identify(identify_options)

        bac120_marker_path = os.path.join(
            identify_options.out_dir,
            PATH_BAC120_MARKER_SUMMARY.format(prefix=identify_options.prefix))
        ar122_marker_path = os.path.join(
            identify_options.out_dir,
            PATH_AR122_MARKER_SUMMARY.format(prefix=identify_options.prefix))

        self.assertTrue(os.path.isfile(bac120_marker_path))
        self.assertTrue(os.path.isfile(ar122_marker_path))

        results = {}
        with open(ar122_marker_path, 'r') as f:
            f.readline()  # skip the first line (treated as a header)
            for line in f:
                genome_id, remainder = line.split('\t', 1)
                results[genome_id] = remainder
        # Fail cleanly (instead of an AttributeError on None) when the row
        # for genome_1 is missing.
        self.assertIn('genome_1', results)
        self.assertTrue(results['genome_1'].startswith('120\t2\t0\t'))

    def test_align(self):
        """``align`` on the reference identify directory yields a valid MSA."""
        tmp_folder = self._random_folder_name()
        align_options = self.options
        align_options.identify_dir = self.identify_dir_reference
        align_options.out_dir = self._step_dir(tmp_folder, 'align')
        self.optionparser.align(align_options)
        self._assert_ar122_user_msa(align_options)

    def test_classify(self):
        """``classify`` on the reference alignment gives the expected taxonomy."""
        tmp_folder = self._random_folder_name()
        classify_options = self.options
        classify_options.genome_dir = self.genome_dir
        classify_options.align_dir = self.align_dir_reference
        classify_options.out_dir = self._step_dir(tmp_folder, 'classify')
        classify_options.recalculate_red = False
        classify_options.split_tree = False
        self.optionparser.classify(classify_options)
        summary_fh = ClassifySummaryFileAR122(classify_options.out_dir,
                                              classify_options.prefix)
        summary_fh.read()
        self.assertEqual(
            'd__Archaea;p__Euryarchaeota;c__Methanobacteria;o__Methanobacteriales;f__Methanobacteriaceae;g__Methanobrevibacter;s__Methanobrevibacter ruminantium',
            summary_fh.rows['genome_1'].classification)
        self.assertEqual(
            'd__Archaea;p__Thermoplasmatota;c__Thermoplasmata;o__Methanomassiliicoccales;f__Methanomethylophilaceae;g__VadinCA11;s__VadinCA11 sp002498365',
            summary_fh.rows['genome_2'].classification)
        self.assertEqual(
            'd__Archaea;p__Thermoplasmatota;c__Thermoplasmata;o__Methanomassiliicoccales;f__Methanomethylophilaceae;g__VadinCA11;s__VadinCA11 sp002498365',
            summary_fh.rows['genome_3'].classification)

    def test_identify_align(self):
        """``identify`` followed by ``align`` yields a valid MSA."""
        tmp_folder = self._random_folder_name()

        identify_options = self.options
        identify_options.genome_dir = self.genome_dir
        identify_options.out_dir = self._step_dir(tmp_folder, 'identify')
        self.optionparser.identify(identify_options)

        align_options = self.options
        align_options.identify_dir = self._step_dir(tmp_folder, 'identify')
        align_options.out_dir = self._step_dir(tmp_folder, 'align')
        self.optionparser.align(align_options)
        self._assert_ar122_user_msa(align_options)

    def test_identify_align_classify(self):
        """Run ``identify``, ``align`` and ``classify`` back to back."""
        tmp_folder = self._random_folder_name()

        identify_options = self.options
        identify_options.genome_dir = self.genome_dir
        identify_options.out_dir = self._step_dir(tmp_folder, 'identify')
        self.optionparser.identify(identify_options)

        align_options = self.options
        align_options.identify_dir = self._step_dir(tmp_folder, 'identify')
        align_options.out_dir = self._step_dir(tmp_folder, 'align')
        self.optionparser.align(align_options)
        self._assert_ar122_user_msa(align_options)

        classify_options = self.options
        classify_options.genome_dir = self.genome_dir
        classify_options.split_tree = False
        # All *_options names alias the same object, so align_dir must be
        # captured before out_dir is re-pointed at the classify directory.
        classify_options.align_dir = align_options.out_dir
        classify_options.out_dir = self._step_dir(tmp_folder, 'classify')
        classify_options.recalculate_red = False
        self.optionparser.classify(classify_options)
        # Checks that the summary file really exists (a bare assertTrue on
        # the path string would always pass) and validates its last row.
        self._assert_ar122_summary(classify_options)

    def test_classify_wf(self):
        """Run the three classify_wf steps into a single output directory."""
        tmp_folder = self._random_folder_name()
        classify_wf_options = self.options
        classify_wf_options.genome_dir = self.genome_dir
        classify_wf_options.out_dir = self._step_dir(tmp_folder,
                                                     'classify_wf')
        self.optionparser.identify(classify_wf_options)
        classify_wf_options.identify_dir = classify_wf_options.out_dir
        classify_wf_options.align_dir = classify_wf_options.out_dir
        classify_wf_options.taxa_filter = None
        classify_wf_options.custom_msa_filters = False
        classify_wf_options.min_consensus = None
        classify_wf_options.min_perc_taxa = None
        classify_wf_options.skip_gtdb_refs = False
        classify_wf_options.cols_per_gene = None
        classify_wf_options.max_consensus = None
        classify_wf_options.recalculate_red = False
        classify_wf_options.split_tree = False
        self.optionparser.align(classify_wf_options)
        self.optionparser.classify(classify_wf_options)
        self._assert_ar122_summary(classify_wf_options)

    def test_infer(self):
        """``infer`` on the reference MSA completes and keeps all genomes."""
        tmp_folder = self._random_folder_name()
        infer_options = self.options
        path_user_msa = PATH_AR122_USER_MSA.format(prefix=self.options.prefix)
        infer_options.msa_file = os.path.join(self.align_dir_reference,
                                              path_user_msa)
        infer_options.out_dir = self._step_dir(tmp_folder, 'infer')
        infer_options.gamma = False
        self.optionparser.infer(infer_options)

        tree_log = os.path.join(
            infer_options.out_dir,
            PATH_TREE_LOG.format(prefix=self.options.prefix))
        self.assertEqual(
            self._read_last_line(tree_log).strip(), 'TreeCompleted')

        unrooted_tree = os.path.join(
            infer_options.out_dir,
            PATH_UNROOTED_TREE.format(prefix=self.options.prefix))
        last_line = self._read_last_line(unrooted_tree)
        self.assertIn('genome_1', last_line)
        self.assertIn('genome_2', last_line)
        self.assertIn('genome_3', last_line)

    def test_de_novo_wf(self):
        """Smoke test: ``identify``, ``align`` and ``infer`` run without raising."""
        tmp_folder = self._random_folder_name()
        de_novo_wf_options = self.options
        de_novo_wf_options.genome_dir = self.genome_dir
        de_novo_wf_options.suffix = ".ar122"
        de_novo_wf_options.gamma = False
        de_novo_wf_options.out_dir = self._step_dir(tmp_folder, 'de_novo_wf')
        de_novo_wf_options.identify_dir = de_novo_wf_options.out_dir
        de_novo_wf_options.msa_file = os.path.join(
            de_novo_wf_options.out_dir, de_novo_wf_options.prefix +
            de_novo_wf_options.suffix + ".user_msa.fasta")
        self.optionparser.identify(de_novo_wf_options)
        self.optionparser.align(de_novo_wf_options)
        self.optionparser.infer(de_novo_wf_options)

    def test_identify_gzipped_genomes(self):
        """ Test that gene calling is successful when using gzipped genomes """
        options = argparse.ArgumentParser()
        options.genome_dir = 'tests/data/genomes_gz/'
        options.cpus = 5
        options.batchfile = None
        options.extension = 'fna.gz'
        options.write_single_copy_genes = False
        options.prefix = 'gtdbtk'
        options.force = None
        options.genes = False
        options.out_dir = self.generic_out_path
        self.optionparser.identify(options)

        # Each generated summary must match its reference counterpart,
        # ignoring line order.
        for summary_template in (PATH_BAC120_MARKER_SUMMARY,
                                 PATH_AR122_MARKER_SUMMARY,
                                 PATH_TLN_TABLE_SUMMARY):
            relative_path = summary_template.format(prefix='gtdbtk')
            self.assertTrue(
                are_files_equal(
                    os.path.join(self.identify_dir_reference, relative_path),
                    os.path.join(self.generic_out_path, relative_path),
                    ignore_order=True))

    def test_root(self):
        """Test that rooting is successful when called through the CLI"""
        options = argparse.ArgumentParser()
        options.input_tree = 'tests/data/pplacer_dir_reference/gtdbtk.ar122.classify.tree'
        options.outgroup_taxon = 'p__Altarchaeota'
        options.output_tree = os.path.join(self.generic_out_path,
                                           'test.rooted.tree')
        options.custom_taxonomy_file = None
        options.gtdbtk_classification_file = None
        self.optionparser.root(options)
        self.assertTrue(os.path.isfile(options.output_tree))

    def tearDown(self):
        """Remove the temporary output directory created in ``setUp``."""
        shutil.rmtree(self.generic_out_path)
示例#13
0
class TestCli(unittest.TestCase):
    """Workflow tests that drive identify/align/classify/infer via OptionsParser.

    Each test writes into its own randomly named folder below
    ``self.generic_out_path``; that root is removed again in ``tearDown``.
    All steps share the single ``self.options`` object built in ``setUp``.
    """

    def setUp(self):
        """Build the shared options object and the OptionsParser under test."""
        self.identify_dir_reference = os.path.join(
            os.path.dirname(__file__), 'data/identify_dir_reference/')
        self.align_dir_reference = 'tests/data/align_dir_reference/'
        self.genome_dir = 'gtdbtk/tests/data/genomes/'

        # An ArgumentParser instance is used purely as an attribute holder.
        self.options = argparse.ArgumentParser()
        self.options.batchfile = None
        self.options.prefix = 'gtdbtk'
        self.options.cpus = 1
        self.options.extension = 'fna'
        self.options.debug = False
        self.options.force = False

        # align option
        self.options.skip_gtdb_refs = False
        self.options.taxa_filter = None
        self.options.custom_msa_filters = False
        self.options.skip_trimming = False
        self.options.min_consensus = None
        self.options.min_perc_taxa = None
        self.options.cols_per_gene = None
        self.options.max_consensus = None
        self.options.min_perc_aa = 50
        self.options.rnd_seed = 42

        # classify options
        self.options.scratch_dir = None
        self.options.keep_ref_red = None

        # infer options
        self.options.prot_model = 'WAG'
        self.options.no_support = False
        self.options.no_gamma = True

        self.version = ' unittest'
        self.optionparser = OptionsParser(self.version)
        logger_setup(None, "gtdbtk.log", "GTDB-Tk", self.version, True)
        # self.generic_out_path = 'tests/data/results'
        self.generic_out_path = '/tmp/GTDBTk/tests'

    # ------------------------------------------------------------------
    # Helpers (leading underscore keeps them out of test discovery)
    # ------------------------------------------------------------------

    @staticmethod
    def _random_folder_name(length=10):
        """Return a random folder name made of uppercase letters and digits."""
        alphabet = string.ascii_uppercase + string.digits
        return ''.join(random.choice(alphabet) for _ in range(length))

    def _out_dir(self, tmp_folder, step):
        """Return the output directory of ``step`` inside ``tmp_folder``."""
        return os.path.join(self.generic_out_path, tmp_folder, step)

    def _run_identify(self, out_dir):
        """Run identify on the test genomes, writing into ``out_dir``."""
        self.options.genome_dir = self.genome_dir
        self.options.out_dir = out_dir
        self.optionparser.identify(self.options)

    def _run_align(self, identify_dir, out_dir):
        """Run align on ``identify_dir``; return the expected AR122 user MSA path."""
        self.options.identify_dir = identify_dir
        self.options.out_dir = out_dir
        self.optionparser.align(self.options)
        return os.path.join(
            out_dir, PATH_AR122_USER_MSA.format(prefix=self.options.prefix))

    def _run_classify(self, align_dir, out_dir):
        """Run classify on ``align_dir``; return the expected AR122 summary path."""
        self.options.genome_dir = self.genome_dir
        self.options.align_dir = align_dir
        self.options.out_dir = out_dir
        self.optionparser.classify(self.options)
        return os.path.join(
            out_dir, PATH_AR122_SUMMARY_OUT.format(prefix=self.options.prefix))

    def _read_last_line(self, path):
        """Return the last line of ``path``; fail the test if the file is empty."""
        with open(path, 'r') as f:
            lines = f.read().splitlines()
        # Fail with a readable message instead of an IndexError on lines[-1].
        self.assertTrue(lines, '{} is empty'.format(path))
        return lines[-1]

    def _assert_user_msa(self, path_user_msa):
        """Check the user MSA exists and that its last line looks like a sequence."""
        self.assertTrue(os.path.isfile(path_user_msa))
        last_line = self._read_last_line(path_user_msa)
        self.assertGreater(len(last_line), 4500)
        self.assertLess(len(last_line), 5500)
        self.assertIn('-', last_line)
        self.assertFalse(any(char.isdigit() for char in last_line))

    def _assert_archaeal_summary(self, summary_out):
        """Check the summary exists and its last row is an 18-column archaeal hit."""
        self.assertTrue(os.path.isfile(summary_out))
        infos = self._read_last_line(summary_out).split('\t')
        self.assertEqual(len(infos), 18)
        self.assertTrue(infos[1].startswith('d__Archaea'))

    # ------------------------------------------------------------------
    # Tests
    # ------------------------------------------------------------------

    def test_identify(self):
        """identify writes both marker summaries with the expected genome_1 row."""
        out_dir = self._out_dir(self._random_folder_name(), 'identify')
        self._run_identify(out_dir)

        prefix = self.options.prefix
        bac120_marker_path = os.path.join(
            out_dir, PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix))
        ar122_marker_path = os.path.join(
            out_dir, PATH_AR122_MARKER_SUMMARY.format(prefix=prefix))

        self.assertTrue(os.path.isfile(bac120_marker_path))
        self.assertTrue(os.path.isfile(ar122_marker_path))

        # Map the first column of every row to the remainder of that row.
        results = {}
        with open(ar122_marker_path, 'r') as f:
            f.readline()  # skip the first line (presumably the column header)
            for line in f:
                infos = line.split('\t', 1)
                results[infos[0]] = infos[1]
        # Report a missing row as a test failure rather than an AttributeError.
        self.assertIn('genome_1', results)
        self.assertTrue(results['genome_1'].startswith('120\t2\t0\t'))

    def test_align(self):
        """align on the reference identify directory yields a plausible user MSA."""
        out_dir = self._out_dir(self._random_folder_name(), 'align')
        path_user_msa = self._run_align(self.identify_dir_reference, out_dir)
        self._assert_user_msa(path_user_msa)

    def test_classify(self):
        """classify on the reference align directory yields an archaeal summary."""
        out_dir = self._out_dir(self._random_folder_name(), 'classify')
        summary_out = self._run_classify(self.align_dir_reference, out_dir)
        self._assert_archaeal_summary(summary_out)

    def test_identify_align(self):
        """identify followed by align on its output yields a plausible user MSA."""
        tmp_folder = self._random_folder_name()
        identify_dir = self._out_dir(tmp_folder, 'identify')
        align_dir = self._out_dir(tmp_folder, 'align')

        self._run_identify(identify_dir)
        path_user_msa = self._run_align(identify_dir, align_dir)
        self._assert_user_msa(path_user_msa)

    def test_identify_align_classify(self):
        """identify, align and classify chained through separate directories."""
        tmp_folder = self._random_folder_name()
        identify_dir = self._out_dir(tmp_folder, 'identify')
        align_dir = self._out_dir(tmp_folder, 'align')
        classify_dir = self._out_dir(tmp_folder, 'classify')

        self._run_identify(identify_dir)

        path_user_msa = self._run_align(identify_dir, align_dir)
        self._assert_user_msa(path_user_msa)

        summary_out = self._run_classify(align_dir, classify_dir)
        # Checks that the file exists; asserting on the path string itself
        # would always pass.
        self._assert_archaeal_summary(summary_out)

    def test_classify_wf(self):
        """identify, align and classify all sharing one output directory."""
        out_dir = self._out_dir(self._random_folder_name(), 'classify_wf')
        classify_wf_options = self.options  # alias of the shared options
        classify_wf_options.genome_dir = self.genome_dir
        classify_wf_options.out_dir = out_dir
        self.optionparser.identify(classify_wf_options)

        # Every step reads from and writes to the same directory.
        classify_wf_options.identify_dir = out_dir
        classify_wf_options.align_dir = out_dir
        # Same values setUp already assigned; restated before the align step.
        classify_wf_options.taxa_filter = None
        classify_wf_options.custom_msa_filters = False
        classify_wf_options.min_consensus = None
        classify_wf_options.min_perc_taxa = None
        classify_wf_options.skip_gtdb_refs = False
        classify_wf_options.cols_per_gene = None
        classify_wf_options.max_consensus = None
        self.optionparser.align(classify_wf_options)
        self.optionparser.classify(classify_wf_options)

        summary_out = os.path.join(
            out_dir,
            PATH_AR122_SUMMARY_OUT.format(prefix=classify_wf_options.prefix))
        self._assert_archaeal_summary(summary_out)

    def test_infer(self):
        """infer on the reference user MSA completes and the tree names all genomes."""
        infer_options = self.options  # alias of the shared options
        prefix = self.options.prefix
        infer_options.msa_file = os.path.join(
            self.align_dir_reference, PATH_AR122_USER_MSA.format(prefix=prefix))
        infer_options.out_dir = self._out_dir(self._random_folder_name(),
                                              'infer')
        # if not os.path.isdir(infer_options.out_dir):
        #     os.makedirs(infer_options.out_dir)
        self.optionparser.infer(infer_options)

        log_last_line = self._read_last_line(
            os.path.join(infer_options.out_dir,
                         PATH_TREE_LOG.format(prefix=prefix)))
        self.assertEqual(log_last_line.strip(), 'TreeCompleted')

        tree_last_line = self._read_last_line(
            os.path.join(infer_options.out_dir,
                         PATH_UNROOTED_TREE.format(prefix=prefix)))
        for genome_id in ('genome_1', 'genome_2', 'genome_3'):
            self.assertIn(genome_id, tree_last_line)

    def test_de_novo_wf(self):
        """Run identify, align and infer back to back in one output directory.

        No output is inspected: the test passes as long as no step raises.
        """
        de_novo_wf_options = self.options  # alias of the shared options
        de_novo_wf_options.genome_dir = self.genome_dir
        de_novo_wf_options.suffix = ".ar122"
        de_novo_wf_options.out_dir = self._out_dir(self._random_folder_name(),
                                                   'de_novo_wf')
        de_novo_wf_options.identify_dir = de_novo_wf_options.out_dir
        de_novo_wf_options.msa_file = os.path.join(
            de_novo_wf_options.out_dir, de_novo_wf_options.prefix +
            de_novo_wf_options.suffix + ".user_msa.fasta")
        self.optionparser.identify(de_novo_wf_options)
        self.optionparser.align(de_novo_wf_options)
        self.optionparser.infer(de_novo_wf_options)

    def tearDown(self):
        """Remove the shared output root, if any test output was written.

        The root only exists once a step has written output, so a test that
        fails early must not trigger a second error here.
        """
        if os.path.isdir(self.generic_out_path):
            shutil.rmtree(self.generic_out_path)
示例#14
0
class TestOptionsParser(unittest.TestCase):
    """Tests for OptionsParser helpers and its handling of bad or missing inputs."""

    def setUp(self):
        """Create the parser under test and a scratch directory for the test."""
        self.options_parser = OptionsParser('-1')
        self.dir_tmp = tempfile.mkdtemp(prefix='gtdbtk_tmp_')

    def tearDown(self):
        """Remove the scratch directory created in setUp."""
        shutil.rmtree(self.dir_tmp)

    # ------------------------------------------------------------------
    # Helpers (leading underscore keeps them out of test discovery)
    # ------------------------------------------------------------------

    def _touch_genomes(self):
        """Create empty genome_1.fna and genome_2.fna in dir_tmp; return both paths."""
        paths = []
        for file_name in ('genome_1.fna', 'genome_2.fna'):
            path = os.path.join(self.dir_tmp, file_name)
            open(path, 'a').close()
            paths.append(path)
        return tuple(paths)

    def _write_batchfile(self, rows):
        """Write ``rows`` (newline-terminated strings) to batchfile.txt in dir_tmp.

        Returns the path of the batchfile.
        """
        path_batchfile = os.path.join(self.dir_tmp, 'batchfile.txt')
        with open(path_batchfile, 'a') as f:
            f.writelines(rows)
        return path_batchfile

    def _assert_batchfile_rejected(self, rows):
        """Assert that a batchfile made of ``rows`` makes the parser raise GTDBTkExit."""
        path_batchfile = self._write_batchfile(rows)
        self.assertRaises(GTDBTkExit, self.options_parser._genomes_to_process,
                          '', path_batchfile, 'fna')

    def _trim_with_reference_mask(self, path_concat_msa, reference_mask):
        """Run trim_msa on a copy of ``path_concat_msa``; return the output digest.

        The digest is whatever the project's ``sha256`` helper returns for the
        trimmed output file.
        """
        path_untrimmed_msa = os.path.join(self.dir_tmp, 'untrimmed_msa.fasta')
        path_output = os.path.join(self.dir_tmp, 'trimmed_msa.fasta')
        shutil.copyfile(path_concat_msa, path_untrimmed_msa)

        options = argparse.ArgumentParser()
        # Required arguments
        options.untrimmed_msa = path_untrimmed_msa
        options.output = path_output
        # Mutex arguments
        options.mask_file = None
        options.reference_mask = reference_mask

        self.options_parser.trim_msa(options)
        return sha256(path_output)

    def _export_msa_digest(self, domain):
        """Run export_msa for ``domain``; return the SHA-256 hex digest of the output."""
        path_out = os.path.join(self.dir_tmp, 'output.fasta')

        options = argparse.ArgumentParser()
        options.domain = domain
        options.output = path_out

        self.options_parser.export_msa(options)

        with open(path_out, 'rb') as f:
            return hashlib.sha256(f.read()).hexdigest()

    # ------------------------------------------------------------------
    # _verify_genome_id
    # ------------------------------------------------------------------

    def test__verify_genome_id__valid(self):
        """ Test that a valid genome id returns True. """
        self.assertTrue(self.options_parser._verify_genome_id('genome_1'))

    def test__verify_genome_id__invalid(self):
        """ Test that invalid genome ids throw an exception. """
        for c in '()[],;=':
            self.assertRaises(GenomeNameInvalid,
                              self.options_parser._verify_genome_id,
                              f'genome{c}1')

    # ------------------------------------------------------------------
    # _genomes_to_process
    # ------------------------------------------------------------------

    def test__genomes_to_process__genome_dir__valid(self):
        """ Test that the expected results are returned when using genome_dir. """
        path_genome_1, path_genome_2 = self._touch_genomes()
        # A file with another extension must not be picked up.
        open(os.path.join(self.dir_tmp, 'other_file.txt'), 'a').close()

        results, _ = self.options_parser._genomes_to_process(
            self.dir_tmp, '', 'fna')
        expected = {'genome_1': path_genome_1, 'genome_2': path_genome_2}
        self.assertDictEqual(results, expected)

    def test__genomes_to_process__batchfile__valid(self):
        """ Test that the expected results are returned when using batchfile """
        path_genome_1, path_genome_2 = self._touch_genomes()
        path_batchfile = self._write_batchfile([
            f'{path_genome_1}\tgenome_1\n',
            '\n',
            # Optional third column: integer reported back in tln_table.
            f'{path_genome_2}\tgenome_2\t4\n',
        ])

        results, tln_table = self.options_parser._genomes_to_process(
            '', path_batchfile, 'fna')
        expected = {'genome_1': path_genome_1, 'genome_2': path_genome_2}
        expected_tln = {'genome_2': 4}
        self.assertDictEqual(results, expected)
        self.assertDictEqual(tln_table, expected_tln)

    def test__genomes_to_process__batchfile__invalid_columns(self):
        """ Test that a batchfile row with a non-integer third column throws an exception.

        NOTE(review): the valid-batchfile test accepts an integer third
        column, so presumably the non-integer 'foo' is what triggers the
        error; confirm against _genomes_to_process.
        """
        path_genome_1, path_genome_2 = self._touch_genomes()
        self._assert_batchfile_rejected([
            f'{path_genome_1}\tgenome_1\n',
            '\n',
            f'{path_genome_2}\tgenome_2\tfoo\n',
        ])

    def test__genomes_to_process__batchfile__blank_genome_path(self):
        """ Test that a batchfile containing a blank genome path throws an exception. """
        path_genome_1, _ = self._touch_genomes()
        self._assert_batchfile_rejected([
            f'{path_genome_1}\tgenome_1\n',
            '\n',
            '\tgenome_2\n',
        ])

    def test__genomes_to_process__batchfile__blank_genome_id(self):
        """ Test that a batchfile containing a blank genome id throws an exception. """
        path_genome_1, path_genome_2 = self._touch_genomes()
        self._assert_batchfile_rejected([
            f'{path_genome_1}\tgenome_1\n',
            '\n',
            f'{path_genome_2}\t\n',
        ])

    def test__genomes_to_process__batchfile__duplicate_genome_id(self):
        """ Test that a batchfile containing duplicate genome ids throws an exception. """
        path_genome_1, path_genome_2 = self._touch_genomes()
        # Two different genome files registered under the same id.
        self._assert_batchfile_rejected([
            f'{path_genome_1}\tgenome_1\n',
            '\n',
            f'{path_genome_2}\tgenome_1\n',
        ])

    # def test__genomes_to_process__batchfile__invalid_genome_id(self):
    #     """ Test that a batchfile containing duplicate genome ids throws an exception. """
    #     # Branch 1: The number of columns are not equal to 2.
    #     path_batchfile_1 = os.path.join(self.dir_tmp, 'batchfile_1.txt')
    #     path_batchfile_2 = os.path.join(self.dir_tmp, 'batchfile_2.txt')
    #     path_batchfile_3 = os.path.join(self.dir_tmp, 'batchfile_3.txt')
    #     path_genome_1 = os.path.join(self.dir_tmp, 'genome_1.fna')
    #     path_genome_2 = os.path.join(self.dir_tmp, 'genome_2.fna')
    #     open(path_genome_1, 'a').close()
    #     open(path_genome_2, 'a').close()
    #
    #     with open(path_batchfile_1, 'a') as f:
    #         f.write('%s\tgenome_1\n' % path_genome_1)
    #         f.write('\n')
    #         f.write('%s\tGB_genome_2\n' % path_genome_2)
    #
    #     with open(path_batchfile_2, 'a') as f:
    #         f.write('%s\tgenome_1\n' % path_genome_1)
    #         f.write('\n')
    #         f.write('%s\tRS_genome_2\n' % path_genome_2)
    #
    #     with open(path_batchfile_3, 'a') as f:
    #         f.write('%s\tgenome_1\n' % path_genome_1)
    #         f.write('\n')
    #         f.write('%s\tUBAgenome_2\n' % path_genome_2)
    #
    #     self.assertRaises(GTDBTkExit, self.options_parser._genomes_to_process, '', path_batchfile_1, 'fna')
    #     self.assertRaises(GTDBTkExit, self.options_parser._genomes_to_process, '', path_batchfile_2, 'fna')
    #     self.assertRaises(GTDBTkExit, self.options_parser._genomes_to_process, '', path_batchfile_3, 'fna')

    def test__genomes_to_process__no_files(self):
        """ Test that an exception is thrown if no files are found to process """
        # Branch 1 : genome_dir is specified
        tmp_genome_dir = tempfile.mkdtemp()
        try:
            self.assertRaises(GTDBTkExit,
                              self.options_parser._genomes_to_process,
                              tmp_genome_dir, '', 'fna')
        finally:
            shutil.rmtree(tmp_genome_dir)

        # Branch 2: batchfile is specified
        tmp_genome_dir = tempfile.mkdtemp()
        try:
            path_batchfile = os.path.join(tmp_genome_dir, 'batchfile.txt')
            open(path_batchfile, 'a').close()  # empty batchfile
            self.assertRaises(GTDBTkExit,
                              self.options_parser._genomes_to_process, '',
                              path_batchfile, 'fna')
        finally:
            shutil.rmtree(tmp_genome_dir)

    # ------------------------------------------------------------------
    # Missing input files / directories
    # ------------------------------------------------------------------

    def test_identify__genome_dir_raises_io_exception(self):
        """ Test that the identify method raises an exception on invalid genome_dir """
        options = argparse.ArgumentParser()
        options.genome_dir = os.path.join(tempfile.gettempdir(),
                                          'non-existent-dir')
        self.assertRaises(BioLibDirNotFound, self.options_parser.identify,
                          options)

    def test_identify__batchfile_raises_io_exception(self):
        """ Test that the identify method raises an exception on invalid batchfile """
        options = argparse.ArgumentParser()
        options.genome_dir = None
        options.batchfile = os.path.join(tempfile.gettempdir(),
                                         'non-existent-file.txt')
        self.assertRaises(BioLibFileNotFound, self.options_parser.identify,
                          options)

    def test_align__identify_dir_raises_io_exception(self):
        """ Test that the align method raises an exception on invalid identify dir """
        options = argparse.ArgumentParser()
        options.identify_dir = os.path.join(tempfile.gettempdir(),
                                            'non-existent-dir')
        self.assertRaises(BioLibDirNotFound, self.options_parser.align,
                          options)

    def test_infer__msa_raises_io_exception(self):
        """ Test that the infer method raises an exception on invalid MSA """
        options = argparse.ArgumentParser()
        options.msa_file = os.path.join(tempfile.gettempdir(),
                                        'non-existent-msa.txt')
        self.assertRaises(BioLibFileNotFound, self.options_parser.infer,
                          options)

    def test_run_test(self):
        """Test that the user-test method runs correctly"""
        options = argparse.ArgumentParser()
        options.out_dir = self.dir_tmp
        options.cpus = 3
        self.assertTrue(self.options_parser.run_test(options))

    # def test_run_test__throws_exception(self):
    #     """Test that the user-test method fails correctly"""
    #     options = argparse.ArgumentParser()
    #     options.out_dir = self.dir_tmp
    #     os.mkdir(os.path.join(self.dir_tmp, 'genomes'))
    #     options.cpus = 3
    #     self.assertRaises(GTDBTkTestFailure, self.options_parser.run_test, options)

    def test_classify__align_dir_raises_io_exception(self):
        """ Test that the classify method raises an exception on invalid align dir """
        options = argparse.ArgumentParser()
        options.align_dir = os.path.join(tempfile.gettempdir(),
                                         'non-existent-dir')
        self.assertRaises(BioLibDirNotFound, self.options_parser.classify,
                          options)

    def test_root__no_tree_raises_io_exception(self):
        """ Test that the root method raises an exception on invalid tree """
        options = argparse.ArgumentParser()
        options.input_tree = os.path.join(tempfile.gettempdir(),
                                          'non-existent-tree.tree')
        self.assertRaises(BioLibFileNotFound, self.options_parser.root,
                          options)

    def test_decorate__no_tree_raises_io_exception(self):
        """ Test that the decorate method raises an exception on invalid tree """
        options = argparse.ArgumentParser()
        options.input_tree = os.path.join(tempfile.gettempdir(),
                                          'non-existent-tree.tree')
        self.assertRaises(BioLibFileNotFound, self.options_parser.decorate,
                          options)

    # ------------------------------------------------------------------
    # trim_msa
    # ------------------------------------------------------------------

    def test_trim_msa__mask_file(self):
        """ Test that the expected result is returned when running trim_msa with mask_file """
        path_untrimmed_msa = os.path.join(self.dir_tmp, 'untrimmed_msa.fasta')
        path_mask_file = os.path.join(self.dir_tmp, 'mask_file.txt')
        path_output = os.path.join(self.dir_tmp, 'trimmed_msa.fasta')

        with open(path_untrimmed_msa, 'w') as f:
            f.write('>genome_1\n'
                    'ALGPVW\n'
                    '>genome_2\n'
                    'WVPGLA\n')

        # The expected output below keeps exactly the columns marked '1'
        # (positions 2 and 5 of each sequence).
        with open(path_mask_file, 'w') as f:
            f.write('010010\n')

        options = argparse.ArgumentParser()
        # Required arguments
        options.untrimmed_msa = path_untrimmed_msa
        options.output = path_output
        # Mutex arguments
        options.mask_file = path_mask_file
        options.reference_mask = None

        self.options_parser.trim_msa(options)

        # Parse the output as consecutive ">id" / sequence line pairs.
        with open(path_output, 'r') as f:
            results = dict(re.findall(r'>(.+)\n(.+)\n', f.read()))

        expected = {'genome_1': 'LV', 'genome_2': 'VL'}

        self.assertDictEqual(results, expected)

    def test_trim_msa__reference_mask_arc(self):
        """ Test that the expected result is returned when running trim_msa with archaeal reference_mask """
        actual = self._trim_with_reference_mask(Config.CONCAT_AR122, 'arc')
        # NOTE(review): this digest is 40 hex characters, the length of a
        # SHA-1 digest rather than SHA-256 -- confirm what the project's
        # sha256 helper actually computes.
        expected = '1146351be59ae8d27668256c5b2c425a6f38c37c'

        self.assertEqual(actual, expected)

    def test_trim_msa__reference_mask_bac(self):
        """ Test that the expected result is returned when running trim_msa with bacterial reference_mask """
        actual = self._trim_with_reference_mask(Config.CONCAT_BAC120, 'bac')
        # NOTE(review): this digest is 40 hex characters, the length of a
        # SHA-1 digest rather than SHA-256 -- confirm what the project's
        # sha256 helper actually computes.
        expected = 'ae6e24e89540fed03b81436147f99bcd120d059a'

        self.assertEqual(actual, expected)

    # ------------------------------------------------------------------
    # export_msa
    # ------------------------------------------------------------------

    def test_export_msa__arc(self):
        """ Test that the untrimmed archaeal MSA is exported correctly """
        self.assertEqual(
            self._export_msa_digest('arc'),
            '8706b42a3f4b2445273058e7e876f0d8332bd8dec95c0fc8bc024d76a5a5aade')

    def test_export_msa__bac(self):
        """ Test that the untrimmed bacterial MSA is exported correctly """
        self.assertEqual(
            self._export_msa_digest('bac'),
            '3c5dfa4dc5ef943459e6d0ed4da1e5a5858332c824739630beffb57fab303486')