def setUp(self):
    """Prepare the reference paths, option values, parser and logger for a test.

    An ``argparse.ArgumentParser`` instance is used as a plain attribute
    container for the option values handed to ``OptionsParser`` methods.
    """
    self.identify_dir_reference = 'tests/data/identify_dir_reference/'
    self.align_dir_reference = 'tests/data/align_dir_reference/'
    self.genome_dir = 'gtdbtk/tests/data/genomes/'
    self.generic_out_path = 'tests/data/results'
    self.version = ' unittest'

    option_defaults = {
        # general options
        'batchfile': None,
        'prefix': 'gtdbtk',
        'cpus': 1,
        'extension': 'fna',
        'debug': False,
        # align options
        'skip_gtdb_refs': False,
        'taxa_filter': None,
        'custom_msa_filters': False,
        'min_consensus': None,
        'min_perc_taxa': None,
        'cols_per_gene': None,
        'max_consensus': None,
        'min_perc_aa': 50,
        # classify options
        'scratch_dir': None,
        # infer options
        'prot_model': 'WAG',
        'no_support': False,
        'no_gamma': True,
    }
    self.options = argparse.ArgumentParser()
    for option_name, option_value in option_defaults.items():
        setattr(self.options, option_name, option_value)

    self.optionparser = OptionsParser(self.version)
    logger_setup(None, "gtdbtk.log", "GTDB-Tk", self.version, True)
def setUp(self):
    """Prepare the reference paths, option values, parser, logger and output dir.

    An ``argparse.ArgumentParser`` instance is used as a plain attribute
    container for the option values handed to ``OptionsParser`` methods.
    Results are written beneath a freshly created temporary directory.
    """
    test_dir = os.path.dirname(__file__)
    self.identify_dir_reference = os.path.join(
        test_dir, 'data/identify_dir_reference/')
    self.align_dir_reference = 'tests/data/align_dir_reference/'
    self.genome_dir = 'gtdbtk/tests/data/genomes/'
    self.version = ' unittest'

    option_defaults = {
        # general options
        'batchfile': None,
        'prefix': 'gtdbtk',
        'cpus': 1,
        'extension': 'fna',
        'debug': False,
        'force': False,
        'genes': False,
        'write_single_copy_genes': False,
        # align options
        'skip_gtdb_refs': False,
        'taxa_filter': None,
        'custom_msa_filters': False,
        'skip_trimming': False,
        'min_consensus': None,
        'min_perc_taxa': None,
        'cols_per_gene': None,
        'max_consensus': None,
        'min_perc_aa': 50,
        'rnd_seed': 42,
        'outgroup_taxon': None,
        # classify options
        'scratch_dir': None,
        'keep_ref_red': None,
        'pplacer_cpus': None,
        'min_af': None,
        # infer options
        'prot_model': 'WAG',
        'no_support': False,
        'no_gamma': True,
    }
    self.options = argparse.ArgumentParser()
    for option_name, option_value in option_defaults.items():
        setattr(self.options, option_name, option_value)

    self.optionparser = OptionsParser(self.version)
    logger_setup(None, "gtdbtk.log", "GTDB-Tk", self.version, True)

    # Created last so that no directory is left behind if the setup above fails.
    self.generic_out_path = tempfile.mkdtemp(prefix='gtdbtk_tmp_')
def setUp(self):
    """Create the ``OptionsParser`` under test and a scratch directory."""
    self.options_parser = OptionsParser('-1')
    self.dir_tmp = tempfile.mkdtemp(prefix='gtdbtk_tmp_')
class TestOptionsParser(unittest.TestCase):
    """Unit tests for ``OptionsParser`` helpers and sub-command entry points."""

    def setUp(self):
        """Create the parser under test and a per-test scratch directory."""
        self.options_parser = OptionsParser('-1')
        self.dir_tmp = tempfile.mkdtemp(prefix='gtdbtk_tmp_')

    def tearDown(self):
        """Remove the per-test scratch directory."""
        shutil.rmtree(self.dir_tmp)

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _touch(self, file_name):
        """Create an empty file in the scratch directory and return its path."""
        path = os.path.join(self.dir_tmp, file_name)
        with open(path, 'a'):
            pass
        return path

    def _create_genomes(self):
        """Create two empty genome files; return ``(path_genome_1, path_genome_2)``."""
        return self._touch('genome_1.fna'), self._touch('genome_2.fna')

    def _write_batchfile(self, file_name, path_genome_1, last_row):
        """Write a batchfile and return its path.

        The file holds a valid row for ``genome_1``, a blank line, and then
        ``last_row`` verbatim (the row that each test varies).
        """
        path_batchfile = os.path.join(self.dir_tmp, file_name)
        with open(path_batchfile, 'w') as f:
            f.write('%s\tgenome_1\n' % path_genome_1)
            f.write('\n')
            f.write(last_row)
        return path_batchfile

    def _assert_batchfile_raises(self, exception, path_batchfile):
        """Assert that processing ``path_batchfile`` raises ``exception``."""
        self.assertRaises(exception,
                          self.options_parser._genomes_to_process,
                          '', path_batchfile, 'fna')

    @staticmethod
    def _read_fasta(path):
        """Return ``{genome_id: sequence}`` for a FASTA file with one-line records."""
        with open(path, 'r') as f:
            return dict(re.findall(r'>(.+)\n(.+)\n', f.read()))

    @staticmethod
    def _sha256(text):
        """Return the SHA-256 hex digest of ``text``.

        The text is encoded first because ``hashlib`` only accepts bytes on
        Python 3 and raises ``TypeError`` when handed a ``str``.
        """
        return hashlib.sha256(text.encode('utf-8')).hexdigest()

    def _trim_with_reference_mask(self, reference_mask, msa_len):
        """Run ``trim_msa`` with a reference mask on a synthetic two-genome MSA.

        The MSA is the motif ``ALGPVW`` repeated and cut to ``msa_len``
        columns for ``genome_1``, and the reverse of that for ``genome_2``.
        Returns ``{genome_id: sha256 of the trimmed sequence}``.
        """
        path_untrimmed_msa = os.path.join(self.dir_tmp, 'untrimmed_msa.fasta')
        path_output = os.path.join(self.dir_tmp, 'trimmed_msa.fasta')

        motif = 'ALGPVW'
        msa_str = (motif * (msa_len // len(motif) + 1))[:msa_len]
        with open(path_untrimmed_msa, 'w') as f:
            f.write('>genome_1\n')
            f.write('%s\n' % msa_str)
            f.write('>genome_2\n')
            f.write('%s\n' % msa_str[::-1])

        options = argparse.ArgumentParser()
        # Required arguments
        options.untrimmed_msa = path_untrimmed_msa
        options.output = path_output
        # Mutex arguments
        options.mask_file = None
        options.reference_mask = reference_mask

        self.options_parser.trim_msa(options)

        trimmed = self._read_fasta(path_output)
        return {gid: self._sha256(seq) for gid, seq in trimmed.items()}

    def _export_msa_hash(self, domain):
        """Export the untrimmed MSA for ``domain``; return the file's SHA-256."""
        path_out = os.path.join(self.dir_tmp, 'output.fasta')
        options = argparse.ArgumentParser()
        options.domain = domain
        options.output = path_out
        self.options_parser.export_msa(options)
        with open(path_out, 'r') as f:
            return self._sha256(f.read())

    @staticmethod
    def _missing_path(name):
        """Return a path under the system temp directory for ``name``."""
        return os.path.join(tempfile.gettempdir(), name)

    # ------------------------------------------------------------------
    # _assert_genome_id_valid
    # ------------------------------------------------------------------

    def test__verify_genome_id__valid(self):
        """ Test that a valid genome id returns True. """
        self.assertTrue(
            self.options_parser._assert_genome_id_valid('genome_1'))

    def test__verify_genome_id__invalid(self):
        """ Test that invalid genome ids throw an exception. """
        for c in '()[],;=':
            self.assertRaises(GenomeNameInvalid,
                              self.options_parser._assert_genome_id_valid,
                              'genome%s1' % c)

    # ------------------------------------------------------------------
    # _genomes_to_process
    # ------------------------------------------------------------------

    def test__genomes_to_process__genome_dir__valid(self):
        """ Test that the expected results are returned when using genome_dir. """
        path_genome_1, path_genome_2 = self._create_genomes()
        # A file with another extension must not be reported.
        self._touch('other_file.txt')

        results = self.options_parser._genomes_to_process(
            self.dir_tmp, '', 'fna')
        expected = {'genome_1': path_genome_1, 'genome_2': path_genome_2}
        self.assertDictEqual(results, expected)

    def test__genomes_to_process__batchfile__valid(self):
        """ Test that the expected results are returned when using batchfile """
        path_genome_1, path_genome_2 = self._create_genomes()
        path_batchfile = self._write_batchfile(
            'batchfile.txt', path_genome_1,
            '%s\tgenome_2\n' % path_genome_2)

        results = self.options_parser._genomes_to_process(
            '', path_batchfile, 'fna')
        expected = {'genome_1': path_genome_1, 'genome_2': path_genome_2}
        self.assertDictEqual(results, expected)

    def test__genomes_to_process__batchfile__invalid_columns(self):
        """ Test that a batchfile row with a column count other than 2 throws an exception. """
        path_genome_1, path_genome_2 = self._create_genomes()
        path_batchfile = self._write_batchfile(
            'batchfile.txt', path_genome_1,
            '%s\tgenome_2\tfoo\n' % path_genome_2)
        self._assert_batchfile_raises(GenomeBatchfileMalformed, path_batchfile)

    def test__genomes_to_process__batchfile__blank_genome_path(self):
        """ Test that a batchfile containing a blank genome path throws an exception. """
        path_genome_1, _ = self._create_genomes()
        path_batchfile = self._write_batchfile(
            'batchfile.txt', path_genome_1, '\tgenome_2\n')
        self._assert_batchfile_raises(GenomeBatchfileMalformed, path_batchfile)

    def test__genomes_to_process__batchfile__blank_genome_id(self):
        """ Test that a batchfile containing a blank genome id throws an exception. """
        path_genome_1, path_genome_2 = self._create_genomes()
        path_batchfile = self._write_batchfile(
            'batchfile.txt', path_genome_1, '%s\t\n' % path_genome_2)
        self._assert_batchfile_raises(GenomeBatchfileMalformed, path_batchfile)

    def test__genomes_to_process__batchfile__duplicate_genome_id(self):
        """ Test that a batchfile containing duplicate genome ids throws an exception. """
        path_genome_1, path_genome_2 = self._create_genomes()
        path_batchfile = self._write_batchfile(
            'batchfile.txt', path_genome_1,
            '%s\tgenome_1\n' % path_genome_2)
        self._assert_batchfile_raises(GenomeBatchfileMalformed, path_batchfile)

    def test__genomes_to_process__batchfile__invalid_genome_id(self):
        """ Test that genome ids starting with GB_, RS_ or UBA throw an exception. """
        path_genome_1, path_genome_2 = self._create_genomes()
        invalid_ids = ('GB_genome_2', 'RS_genome_2', 'UBAgenome_2')
        for idx, genome_id in enumerate(invalid_ids, start=1):
            path_batchfile = self._write_batchfile(
                'batchfile_%d.txt' % idx, path_genome_1,
                '%s\t%s\n' % (path_genome_2, genome_id))
            self._assert_batchfile_raises(GenomeNameInvalid, path_batchfile)

    def test__genomes_to_process__no_files(self):
        """ Test that an exception is thrown if no files are found to process """
        # Branch 1: genome_dir is specified but contains no genomes.
        tmp_genome_dir = tempfile.mkdtemp()
        try:
            self.assertRaises(NoGenomesFound,
                              self.options_parser._genomes_to_process,
                              tmp_genome_dir, '', 'fna')
        finally:
            shutil.rmtree(tmp_genome_dir)

        # Branch 2: batchfile is specified but is empty.
        tmp_genome_dir = tempfile.mkdtemp()
        try:
            path_batchfile = os.path.join(tmp_genome_dir, 'batchfile.txt')
            with open(path_batchfile, 'a'):
                pass
            self.assertRaises(NoGenomesFound,
                              self.options_parser._genomes_to_process,
                              '', path_batchfile, 'fna')
        finally:
            shutil.rmtree(tmp_genome_dir)

    # ------------------------------------------------------------------
    # _marker_set_id
    # ------------------------------------------------------------------

    def test__marker_set_id(self):
        """ Test that the correct marker set id is returned """
        cases = (
            ((True, False, False), 'bac120'),
            ((False, True, False), 'ar122'),
            ((False, False, True), 'rps23'),
        )
        for flags, marker_set_id in cases:
            self.assertEqual(
                self.options_parser._marker_set_id(*flags), marker_set_id)

    # ------------------------------------------------------------------
    # Missing-input handling of the sub-commands
    # ------------------------------------------------------------------

    def test_identify__genome_dir_raises_io_exception(self):
        """ Test that the identify method raises an exception on invalid genome_dir """
        options = argparse.ArgumentParser()
        options.genome_dir = self._missing_path('non-existent-dir')
        self.assertRaises(BioLibDirNotFound,
                          self.options_parser.identify, options)

    def test_identify__batchfile_raises_io_exception(self):
        """ Test that the identify method raises an exception on invalid batchfile """
        options = argparse.ArgumentParser()
        options.genome_dir = None
        options.batchfile = self._missing_path('non-existent-file.txt')
        self.assertRaises(BioLibFileNotFound,
                          self.options_parser.identify, options)

    def test_align__identify_dir_raises_io_exception(self):
        """ Test that the align method raises an exception on invalid identify dir """
        options = argparse.ArgumentParser()
        options.identify_dir = self._missing_path('non-existent-dir')
        self.assertRaises(BioLibDirNotFound,
                          self.options_parser.align, options)

    def test_infer__msa_raises_io_exception(self):
        """ Test that the infer method raises an exception on invalid MSA """
        options = argparse.ArgumentParser()
        options.msa_file = self._missing_path('non-existent-msa.txt')
        self.assertRaises(BioLibFileNotFound,
                          self.options_parser.infer, options)

    def test_run_test(self):
        """Test that the user-test method runs correctly"""
        options = argparse.ArgumentParser()
        options.out_dir = self.dir_tmp
        options.cpus = 3
        self.assertTrue(self.options_parser.run_test(options))

    def test_run_test__throws_exception(self):
        """Test that the user-test method fails correctly"""
        options = argparse.ArgumentParser()
        options.out_dir = self.dir_tmp
        # Pre-create the 'genomes' directory inside out_dir; presumably
        # run_test cannot then set it up itself -- TODO confirm.
        os.mkdir(os.path.join(self.dir_tmp, 'genomes'))
        options.cpus = 3
        self.assertRaises(GTDBTkTestFailure,
                          self.options_parser.run_test, options)

    def test_classify__align_dir_raises_io_exception(self):
        """ Test that the classify method raises an exception on invalid align dir """
        options = argparse.ArgumentParser()
        options.align_dir = self._missing_path('non-existent-dir')
        self.assertRaises(BioLibDirNotFound,
                          self.options_parser.classify, options)

    def test_root__no_tree_raises_io_exception(self):
        """ Test that the root method raises an exception on invalid tree """
        options = argparse.ArgumentParser()
        options.input_tree = self._missing_path('non-existent-tree.tree')
        self.assertRaises(BioLibFileNotFound,
                          self.options_parser.root, options)

    def test_decorate__no_tree_raises_io_exception(self):
        """ Test that the decorate method raises an exception on invalid tree """
        options = argparse.ArgumentParser()
        options.input_tree = self._missing_path('non-existent-tree.tree')
        self.assertRaises(BioLibFileNotFound,
                          self.options_parser.decorate, options)

    # ------------------------------------------------------------------
    # trim_msa
    # ------------------------------------------------------------------

    def test_trim_msa__mask_file(self):
        """ Test that the expected result is returned when running trim_msa with mask_file """
        path_untrimmed_msa = os.path.join(self.dir_tmp, 'untrimmed_msa.fasta')
        path_mask_file = os.path.join(self.dir_tmp, 'mask_file.txt')
        path_output = os.path.join(self.dir_tmp, 'trimmed_msa.fasta')

        with open(path_untrimmed_msa, 'w') as f:
            f.write('>genome_1\n')
            f.write('ALGPVW\n')
            f.write('>genome_2\n')
            f.write('WVPGLA\n')

        with open(path_mask_file, 'w') as f:
            f.write('010010\n')

        options = argparse.ArgumentParser()
        # Required arguments
        options.untrimmed_msa = path_untrimmed_msa
        options.output = path_output
        # Mutex arguments
        options.mask_file = path_mask_file
        options.reference_mask = None

        self.options_parser.trim_msa(options)

        results = self._read_fasta(path_output)
        expected = {'genome_1': 'LV', 'genome_2': 'VL'}
        self.assertDictEqual(results, expected)

    def test_trim_msa__reference_mask_arc(self):
        """ Test that the expected result is returned when running trim_msa with archaeal reference_mask """
        results = self._trim_with_reference_mask('arc', 32675)
        expected = {
            'genome_1': '332b8cd125a36c375196064e136efab78db38e41bbd8bd8484243531bc57df6d',
            'genome_2': '84e91b9f5fa1ec0bedc0097233044e6dd0e79557bb6df3625928dc9573795989'
        }
        self.assertDictEqual(results, expected)

    def test_trim_msa__reference_mask_bac(self):
        """ Test that the expected result is returned when running trim_msa with bacterial reference_mask """
        results = self._trim_with_reference_mask('bac', 41155)
        expected = {
            'genome_1': '35e080f9ab7d318e8f4a7cef46ce6044bd9c538e6fbe8a69b17431df44bd5a81',
            'genome_2': 'bb4beed69063dad0092a809ee5854ff124da0b55c651edd50c47b1d8fdff0d7b'
        }
        self.assertDictEqual(results, expected)

    # ------------------------------------------------------------------
    # export_msa
    # ------------------------------------------------------------------

    def test_export_msa__arc(self):
        """ Test that the untrimmed archaeal MSA is exported correctly """
        self.assertEqual(
            self._export_msa_hash('arc'),
            '11eb12b91ab20c43824abafb909ccc20bed84a8609a9bf82748b2cdbdd8b7aad')

    def test_export_msa__bac(self):
        """ Test that the untrimmed bacterial MSA is exported correctly """
        self.assertEqual(
            self._export_msa_hash('bac'),
            '50dde1e96df9533def7c7047a1e8627d4ad566db10f8ab3de72751e62c4ac10a')
def setUp(self):
    """Create the ``OptionsParser`` instance exercised by the tests."""
    # NOTE(review): the version is passed as the integer -1 rather than a
    # string -- confirm OptionsParser accepts a non-string version.
    self.options_parser = OptionsParser(-1)
def _format_error_report(summary, exc):
    """Build the multi-line report that is logged when an exception ends the run.

    Must be called while ``exc`` is being handled (inside the ``except``
    block), because the traceback is taken from the active exception.

    :param summary: First line of the report describing the kind of exit.
    :param exc: The exception that was caught.
    :return: The report as a single string.
    """
    rule = '=' * 80
    return ''.join((
        summary + '\n\n',
        rule + '\n',
        'EXCEPTION: {}\n'.format(type(exc).__name__),
        ' MESSAGE: {}\n'.format(exc),
        '_' * 80 + '\n\n',
        traceback.format_exc(),
        rule,
    ))


def main():
    """Command-line entry point.

    Handles the no-argument, version and help invocations directly (each
    exits with status 0); otherwise parses the arguments, sets up logging and
    dispatches to ``OptionsParser.parse_options``. Every exception raised
    while dispatching, including ``SystemExit`` and ``KeyboardInterrupt``, is
    logged as an error and turned into exit status 1.
    """
    # -------------------------------------------------
    # get and check options
    args = None
    if len(sys.argv) == 1:
        print_help()
        sys.exit(0)
    elif sys.argv[1] in {'-v', '--v', '-version', '--version'}:
        print(f"gtdbtk: version {__version__} {__copyright__} {__author__}")

        # Warn the user they are not using the latest version (if possible)
        latest_ver = get_gtdbtk_latest_version()
        if latest_ver and latest_ver != __version__:
            print(f'Note: There is a newer version of GTDB-Tk available: v{latest_ver}')
        sys.exit(0)
    elif sys.argv[1] in {'-h', '--h', '-help', '--help'}:
        print_help()
        sys.exit(0)
    else:
        args = get_main_parser().parse_args()

    # setup logger; the parsed arguments may lack out_dir and/or debug
    logger_setup(getattr(args, 'out_dir', None),
                 "gtdbtk.log", "GTDB-Tk", __version__, False,
                 getattr(args, 'debug', False))
    logger = logging.getLogger('timestamp')

    # -------------------------------------------------
    # do what we came here to do
    try:
        gt_parser = OptionsParser(__version__)
        gt_parser.parse_options(args)
    except SystemExit:
        logger.error('Controlled exit resulting from early termination.')
        sys.exit(1)
    except KeyboardInterrupt:
        logger.error('Controlled exit resulting from interrupt signal.')
        sys.exit(1)
    except GTDBTkExit as e:
        # Only the message is logged for this exception type (no traceback).
        message = str(e)
        if message:
            logger.error(message)
        logger.error('Controlled exit resulting from an unrecoverable error or warning.')
        sys.exit(1)
    except (GTDBTkException, BioLibError) as e:
        logger.error(_format_error_report(
            'Controlled exit resulting from an unrecoverable error or warning.', e))
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report anything unexpected with its traceback.
        logger.error(_format_error_report(
            'Uncontrolled exit resulting from an unexpected error.', e))
        sys.exit(1)
class TestCli(unittest.TestCase):
    """Tests that drive the GTDB-Tk workflow steps through ``OptionsParser``."""

    def setUp(self):
        """Prepare reference paths, option values, parser, logger and output dir.

        An ``argparse.ArgumentParser`` instance is used as a plain attribute
        container for the option values handed to ``OptionsParser`` methods.
        All tests share this single object, so options set for one step are
        still present when the next step of the same test runs.
        """
        self.identify_dir_reference = os.path.join(
            os.path.dirname(__file__), 'data/identify_dir_reference/')
        self.align_dir_reference = 'tests/data/align_dir_reference/'
        self.genome_dir = 'gtdbtk/tests/data/genomes/'

        self.options = argparse.ArgumentParser()
        self.options.batchfile = None
        self.options.prefix = 'gtdbtk'
        self.options.cpus = 1
        self.options.extension = 'fna'
        self.options.debug = False
        self.options.force = False
        self.options.genes = False
        self.options.write_single_copy_genes = False

        # align options
        self.options.skip_gtdb_refs = False
        self.options.taxa_filter = None
        self.options.custom_msa_filters = False
        self.options.skip_trimming = False
        self.options.min_consensus = None
        self.options.min_perc_taxa = None
        self.options.cols_per_gene = None
        self.options.max_consensus = None
        self.options.min_perc_aa = 50
        self.options.rnd_seed = 42
        self.options.outgroup_taxon = None

        # classify options
        self.options.scratch_dir = None
        self.options.keep_ref_red = None
        self.options.pplacer_cpus = None
        self.options.min_af = None

        # infer options
        self.options.prot_model = 'WAG'
        self.options.no_support = False
        self.options.no_gamma = True

        self.version = ' unittest'
        self.optionparser = OptionsParser(self.version)
        logger_setup(None, "gtdbtk.log", "GTDB-Tk", self.version, True)

        self.generic_out_path = tempfile.mkdtemp(prefix='gtdbtk_tmp_')

    def tearDown(self):
        """Remove the per-test output directory."""
        shutil.rmtree(self.generic_out_path)

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _random_folder_name():
        """Return a random 10-character name of uppercase letters and digits."""
        alphabet = string.ascii_uppercase + string.digits
        return ''.join(random.choice(alphabet) for _ in range(10))

    def _out_dir(self, tmp_folder, step):
        """Return the output directory for ``step`` under ``tmp_folder``."""
        return os.path.join(self.generic_out_path, tmp_folder, step)

    def _assert_user_msa_is_plausible(self, options):
        """Check the AR53 user MSA written to ``options.out_dir``.

        The file must exist and its last line must be between 4500 and 5500
        characters long (exclusive), contain at least one gap character and
        contain no digits.
        """
        path_user_msa = os.path.join(
            options.out_dir, PATH_AR53_USER_MSA.format(prefix=options.prefix))
        self.assertTrue(os.path.isfile(path_user_msa))
        with open(path_user_msa, 'r') as f:
            last_line = f.read().splitlines()[-1]
        self.assertGreater(len(last_line), 4500)
        self.assertLess(len(last_line), 5500)
        self.assertIn('-', last_line)
        self.assertFalse(any(char.isdigit() for char in last_line))

    def _assert_archaeal_summary(self, options):
        """Check the AR53 summary file written to ``options.out_dir``.

        The file must exist, and its last line must have 20 tab-separated
        fields with the second one starting with ``d__Archaea``.
        """
        summary_out = os.path.join(
            options.out_dir,
            PATH_AR53_SUMMARY_OUT.format(prefix=options.prefix))
        self.assertTrue(os.path.isfile(summary_out))
        with open(summary_out, 'r') as f:
            last_line = f.read().splitlines()[-1]
        infos = last_line.split('\t')
        self.assertEqual(len(infos), 20)
        self.assertTrue(infos[1].startswith('d__Archaea'))

    # ------------------------------------------------------------------
    # Tests
    # ------------------------------------------------------------------

    def test_identify(self):
        """Test that identify writes both marker summaries with the expected row."""
        tmp_folder = self._random_folder_name()
        identify_options = self.options
        identify_options.genome_dir = self.genome_dir
        identify_options.out_dir = self._out_dir(tmp_folder, 'identify')
        self.optionparser.identify(identify_options)

        bac120_marker_path = os.path.join(
            identify_options.out_dir,
            PATH_BAC120_MARKER_SUMMARY.format(prefix=identify_options.prefix))
        ar53_marker_path = os.path.join(
            identify_options.out_dir,
            PATH_AR53_MARKER_SUMMARY.format(prefix=identify_options.prefix))
        self.assertTrue(os.path.isfile(bac120_marker_path))
        self.assertTrue(os.path.isfile(ar53_marker_path))

        results = {}
        with open(ar53_marker_path, 'r') as f:
            f.readline()  # skip the first line (presumably the header row)
            for line in f:
                infos = line.split('\t', 1)
                results[infos[0]] = infos[1]
        # Fail with a clear message instead of an AttributeError on None.
        self.assertIn('genome_1', results)
        self.assertTrue(results['genome_1'].startswith('120\t2\t0\t'))

    def test_align(self):
        """Test that align produces a plausible MSA from the reference identify dir."""
        tmp_folder = self._random_folder_name()
        align_options = self.options
        align_options.identify_dir = self.identify_dir_reference
        align_options.out_dir = self._out_dir(tmp_folder, 'align')
        self.optionparser.align(align_options)
        self._assert_user_msa_is_plausible(align_options)

    def test_identify_align(self):
        """Test that align produces a plausible MSA from a fresh identify run."""
        tmp_folder = self._random_folder_name()
        identify_options = self.options
        identify_options.genome_dir = self.genome_dir
        identify_options.out_dir = self._out_dir(tmp_folder, 'identify')
        self.optionparser.identify(identify_options)

        align_options = self.options
        align_options.identify_dir = self._out_dir(tmp_folder, 'identify')
        align_options.out_dir = self._out_dir(tmp_folder, 'align')
        self.optionparser.align(align_options)
        self._assert_user_msa_is_plausible(align_options)

    def test_identify_align_classify(self):
        """Test identify, align and classify in sequence, then intermediate cleanup."""
        tmp_folder = self._random_folder_name()
        identify_options = self.options
        identify_options.genome_dir = self.genome_dir
        identify_options.out_dir = self._out_dir(tmp_folder, 'identify')
        self.optionparser.identify(identify_options)

        align_options = self.options
        align_options.identify_dir = self._out_dir(tmp_folder, 'identify')
        align_options.out_dir = self._out_dir(tmp_folder, 'align')
        self.optionparser.align(align_options)
        self._assert_user_msa_is_plausible(align_options)

        classify_options = self.options
        classify_options.genome_dir = self.genome_dir
        classify_options.full_tree = True
        # Read before out_dir is reassigned: all option names alias one object.
        classify_options.align_dir = align_options.out_dir
        classify_options.out_dir = self._out_dir(tmp_folder, 'classify')
        classify_options.recalculate_red = False
        self.optionparser.classify(classify_options)
        self._assert_archaeal_summary(classify_options)

        intermediate_dirs = [
            os.path.join(classify_options.out_dir, name)
            for name in (DIR_IDENTIFY_INTERMEDIATE,
                         DIR_ALIGN_INTERMEDIATE,
                         DIR_CLASSIFY_INTERMEDIATE)
        ]
        for path in intermediate_dirs:
            self.assertTrue(os.path.isdir(path))
        self.optionparser.remove_intermediate_files(
            classify_options.out_dir, 'classify_wf')
        for path in intermediate_dirs:
            self.assertFalse(os.path.exists(path))

    def test_classify_wf(self):
        """Test identify, align and classify sharing a single output directory."""
        tmp_folder = self._random_folder_name()
        classify_wf_options = self.options
        classify_wf_options.genome_dir = self.genome_dir
        classify_wf_options.out_dir = self._out_dir(tmp_folder, 'classify_wf')
        self.optionparser.identify(classify_wf_options)

        classify_wf_options.identify_dir = classify_wf_options.out_dir
        classify_wf_options.align_dir = classify_wf_options.out_dir
        classify_wf_options.taxa_filter = None
        classify_wf_options.custom_msa_filters = False
        classify_wf_options.min_consensus = None
        classify_wf_options.min_perc_taxa = None
        classify_wf_options.skip_gtdb_refs = False
        classify_wf_options.cols_per_gene = None
        classify_wf_options.max_consensus = None
        classify_wf_options.recalculate_red = False
        classify_wf_options.full_tree = True
        self.optionparser.align(classify_wf_options)
        self.optionparser.classify(classify_wf_options)
        self._assert_archaeal_summary(classify_wf_options)

    def test_infer(self):
        """Test that infer completes and the unrooted tree contains all genomes."""
        tmp_folder = self._random_folder_name()
        infer_options = self.options
        path_user_msa = PATH_AR53_USER_MSA.format(prefix=self.options.prefix)
        infer_options.msa_file = os.path.join(
            self.align_dir_reference, path_user_msa)
        infer_options.out_dir = self._out_dir(tmp_folder, 'infer')
        infer_options.gamma = False
        self.optionparser.infer(infer_options)

        path_tree_log = os.path.join(
            infer_options.out_dir,
            PATH_TREE_LOG.format(prefix=self.options.prefix))
        with open(path_tree_log, 'r') as f:
            last_line = f.read().splitlines()[-1]
        self.assertEqual(last_line.strip(), 'TreeCompleted')

        path_unrooted_tree = os.path.join(
            infer_options.out_dir,
            PATH_UNROOTED_TREE.format(prefix=self.options.prefix))
        with open(path_unrooted_tree, 'r') as f:
            last_line = f.read().splitlines()[-1]
        for genome_id in ('genome_1', 'genome_2', 'genome_3'):
            self.assertIn(genome_id, last_line)

    def test_de_novo_wf(self):
        """Run identify, align and infer in one output directory.

        There are no explicit assertions; the test passes when none of the
        three steps raises.
        """
        tmp_folder = self._random_folder_name()
        de_novo_wf_options = self.options
        de_novo_wf_options.genome_dir = self.genome_dir
        de_novo_wf_options.suffix = ".ar53"
        de_novo_wf_options.gamma = False
        de_novo_wf_options.out_dir = self._out_dir(tmp_folder, 'de_novo_wf')
        de_novo_wf_options.identify_dir = de_novo_wf_options.out_dir
        de_novo_wf_options.msa_file = os.path.join(
            de_novo_wf_options.out_dir,
            de_novo_wf_options.prefix + de_novo_wf_options.suffix + ".user_msa.fasta")
        self.optionparser.identify(de_novo_wf_options)
        self.optionparser.align(de_novo_wf_options)
        self.optionparser.infer(de_novo_wf_options)

    def test_root(self):
        """Test that rooting is successful when called through the CLI"""
        options = argparse.ArgumentParser()
        options.input_tree = 'tests/data/pplacer_dir_reference/gtdbtk.ar53.classify.tree'
        options.outgroup_taxon = 'p__Altiarchaeota'
        options.output_tree = os.path.join(self.generic_out_path, 'test.rooted.tree')
        options.custom_taxonomy_file = None
        options.gtdbtk_classification_file = None
        self.optionparser.root(options)
        self.assertTrue(os.path.isfile(options.output_tree))
class TestCli(unittest.TestCase):
    """Workflow tests that call identify/align/classify/infer on OptionsParser.

    Every test writes below ``self.generic_out_path``, a scratch directory
    that is created in ``setUp`` and removed again in ``tearDown``.
    """

    def setUp(self):
        """Build the shared options object, the parser under test and a scratch dir."""
        self.identify_dir_reference = 'tests/data/identify_dir_reference/'
        self.align_dir_reference = 'tests/data/align_dir_reference/'
        self.genome_dir = 'gtdbtk/tests/data/genomes/'

        # The ArgumentParser instance is only used as an attribute container
        # that is handed to the OptionsParser methods.
        self.options = argparse.ArgumentParser()
        self.options.batchfile = None
        self.options.prefix = 'gtdbtk'
        self.options.cpus = 1
        self.options.extension = 'fna'
        self.options.debug = False

        # align option
        self.options.skip_gtdb_refs = False
        self.options.taxa_filter = None
        self.options.custom_msa_filters = False
        self.options.min_consensus = None
        self.options.min_perc_taxa = None
        self.options.cols_per_gene = None
        self.options.max_consensus = None
        self.options.min_perc_aa = 50

        # classify options
        self.options.scratch_dir = None

        # infer options
        self.options.prot_model = 'WAG'
        self.options.no_support = False
        self.options.no_gamma = True

        self.version = ' unittest'
        self.optionparser = OptionsParser(self.version)
        logger_setup(None, "gtdbtk.log", "GTDB-Tk", self.version, True)

        # Use a private scratch directory rather than a fixed path inside the
        # repository: tearDown can then delete it unconditionally, and a test
        # that fails before writing any output no longer triggers a secondary
        # FileNotFoundError during clean-up.
        self.generic_out_path = tempfile.mkdtemp(prefix='gtdbtk_tmp_')

    def tearDown(self):
        """Remove the scratch directory created in setUp."""
        shutil.rmtree(self.generic_out_path)

    @staticmethod
    def _random_folder_name(length=10):
        """Return a random name made of upper-case letters and digits."""
        alphabet = string.ascii_uppercase + string.digits
        return ''.join(random.choice(alphabet) for _ in range(length))

    def _out_dir(self, tmp_folder, step):
        """Return the output directory of ``step`` below the scratch dir."""
        return os.path.join(self.generic_out_path, tmp_folder, step)

    def _assert_user_msa(self, out_dir):
        """Check that the ar122 user MSA exists and its last line looks like an alignment row."""
        msa_path = os.path.join(out_dir, 'gtdbtk.ar122.user_msa.fasta')
        self.assertTrue(os.path.isfile(msa_path))
        with open(msa_path, 'r') as f:
            lines = f.read().splitlines()
        last_line = lines[-1]
        self.assertGreater(len(last_line), 4500)
        self.assertLess(len(last_line), 5500)
        self.assertIn('-', last_line)
        self.assertFalse(any(char.isdigit() for char in last_line))

    def _assert_summary(self, out_dir):
        """Check that the ar122 summary exists and its last row has the expected shape."""
        summary_path = os.path.join(out_dir, 'gtdbtk.ar122.summary.tsv')
        self.assertTrue(os.path.isfile(summary_path))
        with open(summary_path, 'r') as f:
            lines = f.read().splitlines()
        infos = lines[-1].split('\t')
        # assertEquals is a deprecated alias that was removed in Python 3.12.
        self.assertEqual(len(infos), 17)
        self.assertTrue(infos[1].startswith('d__Archaea'))

    def test_identify(self):
        """identify writes both marker summaries and the expected genome_1 row."""
        tmp_folder = self._random_folder_name()
        identify_options = self.options
        identify_options.genome_dir = self.genome_dir
        identify_options.out_dir = self._out_dir(tmp_folder, 'identify')
        self.optionparser.identify(identify_options)

        bac_summary = os.path.join(identify_options.out_dir,
                                   'gtdbtk_bac120_markers_summary.tsv')
        ar_summary = os.path.join(identify_options.out_dir,
                                  'gtdbtk_ar122_markers_summary.tsv')
        self.assertTrue(os.path.isfile(bac_summary))
        self.assertTrue(os.path.isfile(ar_summary))

        results = {}
        with open(ar_summary, 'r') as f:
            f.readline()  # skip the first line of the summary
            for line in f:
                infos = line.split('\t', 1)
                results[infos[0]] = infos[1]
        # Fail with an assertion (not an AttributeError on None) when the
        # genome is missing from the summary.
        self.assertIn('genome_1', results)
        self.assertTrue(results['genome_1'].startswith('120\t2\t0\t'))

    def test_align(self):
        """align on the reference identify directory produces a plausible MSA."""
        tmp_folder = self._random_folder_name()
        align_options = self.options
        align_options.identify_dir = self.identify_dir_reference
        align_options.out_dir = self._out_dir(tmp_folder, 'align')
        self.optionparser.align(align_options)
        self._assert_user_msa(align_options.out_dir)

    def test_classify(self):
        """classify on the reference align directory produces the summary file."""
        tmp_folder = self._random_folder_name()
        classify_options = self.options
        classify_options.genome_dir = self.genome_dir
        classify_options.align_dir = self.align_dir_reference
        classify_options.out_dir = self._out_dir(tmp_folder, 'classify')
        self.optionparser.classify(classify_options)
        self._assert_summary(classify_options.out_dir)

    def test_identify_align(self):
        """identify followed by align produces a plausible MSA."""
        tmp_folder = self._random_folder_name()
        identify_options = self.options
        identify_options.genome_dir = self.genome_dir
        identify_options.out_dir = self._out_dir(tmp_folder, 'identify')
        self.optionparser.identify(identify_options)

        align_options = self.options
        align_options.identify_dir = self._out_dir(tmp_folder, 'identify')
        align_options.out_dir = self._out_dir(tmp_folder, 'align')
        self.optionparser.align(align_options)
        self._assert_user_msa(align_options.out_dir)

    def test_identify_align_classify(self):
        """identify, align and classify chained through separate output directories."""
        tmp_folder = self._random_folder_name()
        identify_options = self.options
        identify_options.genome_dir = self.genome_dir
        identify_options.out_dir = self._out_dir(tmp_folder, 'identify')
        self.optionparser.identify(identify_options)

        align_options = self.options
        align_options.identify_dir = self._out_dir(tmp_folder, 'identify')
        align_options.out_dir = self._out_dir(tmp_folder, 'align')
        self.optionparser.align(align_options)
        self._assert_user_msa(align_options.out_dir)

        # All *_options names alias self.options, so align_options.out_dir
        # must be read before out_dir is pointed at the classify directory.
        classify_options = self.options
        classify_options.genome_dir = self.genome_dir
        classify_options.align_dir = align_options.out_dir
        classify_options.out_dir = self._out_dir(tmp_folder, 'classify')
        self.optionparser.classify(classify_options)
        self._assert_summary(classify_options.out_dir)

    def test_classify_wf(self):
        """identify, align and classify sharing a single output directory."""
        tmp_folder = self._random_folder_name()
        classify_wf_options = self.options
        classify_wf_options.genome_dir = self.genome_dir
        classify_wf_options.out_dir = self._out_dir(tmp_folder, 'classify_wf')
        self.optionparser.identify(classify_wf_options)

        classify_wf_options.identify_dir = classify_wf_options.out_dir
        classify_wf_options.align_dir = classify_wf_options.out_dir
        classify_wf_options.taxa_filter = None
        classify_wf_options.custom_msa_filters = False
        classify_wf_options.min_consensus = None
        classify_wf_options.min_perc_taxa = None
        classify_wf_options.skip_gtdb_refs = False
        classify_wf_options.cols_per_gene = None
        classify_wf_options.max_consensus = None
        self.optionparser.align(classify_wf_options)
        self.optionparser.classify(classify_wf_options)
        self._assert_summary(classify_wf_options.out_dir)

    def test_infer(self):
        """infer on the reference MSA completes and the tree names all three genomes."""
        tmp_folder = self._random_folder_name()
        infer_options = self.options
        infer_options.msa_file = os.path.join(self.align_dir_reference,
                                              'gtdbtk.ar122.user_msa.fasta')
        infer_options.out_dir = self._out_dir(tmp_folder, 'infer')
        self.optionparser.infer(infer_options)

        with open(os.path.join(infer_options.out_dir, 'gtdbtk.tree.log'), 'r') as f:
            lines = f.read().splitlines()
        self.assertEqual(lines[-1].strip(), 'TreeCompleted')

        with open(os.path.join(infer_options.out_dir, 'gtdbtk.unrooted.tree'), 'r') as f:
            lines = f.read().splitlines()
        last_line = lines[-1]
        self.assertIn('genome_1', last_line)
        self.assertIn('genome_2', last_line)
        self.assertIn('genome_3', last_line)

    def test_de_novo_wf(self):
        """identify, align and infer run back to back without raising."""
        tmp_folder = self._random_folder_name()
        de_novo_wf_options = self.options
        de_novo_wf_options.genome_dir = self.genome_dir
        de_novo_wf_options.suffix = ".ar122"
        de_novo_wf_options.out_dir = self._out_dir(tmp_folder, 'de_novo_wf')
        de_novo_wf_options.identify_dir = de_novo_wf_options.out_dir
        de_novo_wf_options.msa_file = os.path.join(
            de_novo_wf_options.out_dir,
            de_novo_wf_options.prefix + de_novo_wf_options.suffix + ".user_msa.fasta")
        self.optionparser.identify(de_novo_wf_options)
        self.optionparser.align(de_novo_wf_options)
        self.optionparser.infer(de_novo_wf_options)
class TestOptionsParser(unittest.TestCase):
    """Unit tests for OptionsParser helpers and its handling of invalid input."""

    def setUp(self):
        """Create the parser under test and a private scratch directory."""
        self.options_parser = OptionsParser('-1')
        self.dir_tmp = tempfile.mkdtemp(prefix='gtdbtk_tmp_')

    def tearDown(self):
        """Delete the scratch directory created in setUp."""
        shutil.rmtree(self.dir_tmp)

    # ------------------------------------------------------------------
    # Fixture helpers
    # ------------------------------------------------------------------
    def _touch(self, file_name):
        """Create an empty file named ``file_name`` in the scratch dir and return its path."""
        path = os.path.join(self.dir_tmp, file_name)
        open(path, 'a').close()
        return path

    def _touch_genomes(self):
        """Create empty genome_1.fna / genome_2.fna files and return both paths."""
        return self._touch('genome_1.fna'), self._touch('genome_2.fna')

    @staticmethod
    def _repeat_to_length(unit, length):
        """Return ``unit`` repeated and cut to exactly ``length`` characters."""
        return (unit * (length // len(unit) + 1))[0:length]

    @staticmethod
    def _write_two_genome_msa(path, sequence):
        """Write genome_1 with ``sequence`` and genome_2 with its reverse."""
        with open(path, 'w') as fh:
            fh.write('>genome_1\n')
            fh.write('%s\n' % sequence)
            fh.write('>genome_2\n')
            fh.write('%s\n' % sequence[::-1])

    @staticmethod
    def _fasta_records(path):
        """Return the (genome id, sequence) pairs found in a FASTA file."""
        with open(path, 'r') as fh:
            return re.findall(r'>(.+)\n(.+)\n', fh.read())

    @staticmethod
    def _sha256_of_file(path):
        """Return the hex SHA-256 digest of the bytes stored at ``path``."""
        with open(path, 'rb') as fh:
            return hashlib.sha256(fh.read()).hexdigest()

    # ------------------------------------------------------------------
    # _verify_genome_id
    # ------------------------------------------------------------------
    def test__verify_genome_id__valid(self):
        """A well-formed genome id is accepted."""
        verdict = self.options_parser._verify_genome_id('genome_1')
        self.assertTrue(verdict)

    def test__verify_genome_id__invalid(self):
        """Each forbidden character in a genome id raises GenomeNameInvalid."""
        for bad_char in '()[],;=':
            with self.assertRaises(GenomeNameInvalid):
                self.options_parser._verify_genome_id('genome%s1' % bad_char)

    # ------------------------------------------------------------------
    # _genomes_to_process
    # ------------------------------------------------------------------
    def test__genomes_to_process__genome_dir__valid(self):
        """Only files with the requested extension are picked up from genome_dir."""
        genome_1, genome_2 = self._touch_genomes()
        self._touch('other_file.txt')

        observed = self.options_parser._genomes_to_process(
            self.dir_tmp, '', 'fna')

        self.assertDictEqual(observed,
                             {'genome_1': genome_1, 'genome_2': genome_2})

    def test__genomes_to_process__batchfile__valid(self):
        """A two-column batchfile (with a blank line) maps ids to paths."""
        batchfile = os.path.join(self.dir_tmp, 'batchfile.txt')
        genome_1, genome_2 = self._touch_genomes()
        with open(batchfile, 'a') as fh:
            fh.write('%s\tgenome_1\n' % genome_1)
            fh.write('\n')
            fh.write('%s\tgenome_2\n' % genome_2)

        observed = self.options_parser._genomes_to_process(
            '', batchfile, 'fna')

        self.assertDictEqual(observed,
                             {'genome_1': genome_1, 'genome_2': genome_2})

    def test__genomes_to_process__batchfile__invalid_columns(self):
        """A batchfile row with three columns raises GenomeBatchfileMalformed."""
        batchfile = os.path.join(self.dir_tmp, 'batchfile.txt')
        genome_1, genome_2 = self._touch_genomes()
        with open(batchfile, 'a') as fh:
            fh.write('%s\tgenome_1\n' % genome_1)
            fh.write('\n')
            fh.write('%s\tgenome_2\tfoo\n' % genome_2)

        with self.assertRaises(GenomeBatchfileMalformed):
            self.options_parser._genomes_to_process('', batchfile, 'fna')

    def test__genomes_to_process__batchfile__blank_genome_path(self):
        """A batchfile row with an empty path raises GenomeBatchfileMalformed."""
        batchfile = os.path.join(self.dir_tmp, 'batchfile.txt')
        genome_1, _ = self._touch_genomes()
        with open(batchfile, 'a') as fh:
            fh.write('%s\tgenome_1\n' % genome_1)
            fh.write('\n')
            fh.write('%s\tgenome_2\n' % '')

        with self.assertRaises(GenomeBatchfileMalformed):
            self.options_parser._genomes_to_process('', batchfile, 'fna')

    def test__genomes_to_process__batchfile__blank_genome_id(self):
        """A batchfile row with an empty genome id raises GenomeBatchfileMalformed."""
        batchfile = os.path.join(self.dir_tmp, 'batchfile.txt')
        genome_1, genome_2 = self._touch_genomes()
        with open(batchfile, 'a') as fh:
            fh.write('%s\tgenome_1\n' % genome_1)
            fh.write('\n')
            fh.write('%s\t\n' % genome_2)

        with self.assertRaises(GenomeBatchfileMalformed):
            self.options_parser._genomes_to_process('', batchfile, 'fna')

    def test__genomes_to_process__batchfile__duplicate_genome_id(self):
        """Two batchfile rows sharing one genome id raise GTDBTkExit."""
        batchfile = os.path.join(self.dir_tmp, 'batchfile.txt')
        genome_1, genome_2 = self._touch_genomes()
        with open(batchfile, 'a') as fh:
            fh.write('%s\tgenome_1\n' % genome_1)
            fh.write('\n')
            fh.write('%s\tgenome_1\n' % genome_2)

        with self.assertRaises(GTDBTkExit):
            self.options_parser._genomes_to_process('', batchfile, 'fna')

    def test__genomes_to_process__batchfile__invalid_genome_id(self):
        """Genome ids starting with GB_, RS_ or UBA raise GTDBTkExit."""
        batchfile_1 = os.path.join(self.dir_tmp, 'batchfile_1.txt')
        batchfile_2 = os.path.join(self.dir_tmp, 'batchfile_2.txt')
        batchfile_3 = os.path.join(self.dir_tmp, 'batchfile_3.txt')
        genome_1, genome_2 = self._touch_genomes()

        with open(batchfile_1, 'a') as fh:
            fh.write('%s\tgenome_1\n' % genome_1)
            fh.write('\n')
            fh.write('%s\tGB_genome_2\n' % genome_2)

        with open(batchfile_2, 'a') as fh:
            fh.write('%s\tgenome_1\n' % genome_1)
            fh.write('\n')
            fh.write('%s\tRS_genome_2\n' % genome_2)

        with open(batchfile_3, 'a') as fh:
            fh.write('%s\tgenome_1\n' % genome_1)
            fh.write('\n')
            fh.write('%s\tUBAgenome_2\n' % genome_2)

        for batchfile in (batchfile_1, batchfile_2, batchfile_3):
            with self.assertRaises(GTDBTkExit):
                self.options_parser._genomes_to_process('', batchfile, 'fna')

    def test__genomes_to_process__no_files(self):
        """GTDBTkExit is raised when nothing is found to process."""
        # Branch 1: an empty genome_dir is specified
        empty_dir = tempfile.mkdtemp()
        try:
            with self.assertRaises(GTDBTkExit):
                self.options_parser._genomes_to_process(empty_dir, '', 'fna')
        finally:
            shutil.rmtree(empty_dir)

        # Branch 2: an empty batchfile is specified
        batch_dir = tempfile.mkdtemp()
        try:
            empty_batchfile = os.path.join(batch_dir, 'batchfile.txt')
            open(empty_batchfile, 'a').close()
            with self.assertRaises(GTDBTkExit):
                self.options_parser._genomes_to_process(
                    '', empty_batchfile, 'fna')
        finally:
            shutil.rmtree(batch_dir)

    # ------------------------------------------------------------------
    # _marker_set_id
    # ------------------------------------------------------------------
    def test__marker_set_id(self):
        """Each flag position selects its own marker set id."""
        marker_set_id = self.options_parser._marker_set_id
        self.assertEqual(marker_set_id(True, False, False), 'bac120')
        self.assertEqual(marker_set_id(False, True, False), 'ar122')
        self.assertEqual(marker_set_id(False, False, True), 'rps23')

    # ------------------------------------------------------------------
    # Missing input files / directories
    # ------------------------------------------------------------------
    def test_identify__genome_dir_raises_io_exception(self):
        """identify raises BioLibDirNotFound for a missing genome_dir."""
        options = argparse.ArgumentParser()
        options.genome_dir = os.path.join(tempfile.gettempdir(),
                                          'non-existent-dir')
        with self.assertRaises(BioLibDirNotFound):
            self.options_parser.identify(options)

    def test_identify__batchfile_raises_io_exception(self):
        """identify raises BioLibFileNotFound for a missing batchfile."""
        options = argparse.ArgumentParser()
        options.genome_dir = None
        options.batchfile = os.path.join(tempfile.gettempdir(),
                                         'non-existent-file.txt')
        with self.assertRaises(BioLibFileNotFound):
            self.options_parser.identify(options)

    def test_align__identify_dir_raises_io_exception(self):
        """align raises BioLibDirNotFound for a missing identify_dir."""
        options = argparse.ArgumentParser()
        options.identify_dir = os.path.join(tempfile.gettempdir(),
                                            'non-existent-dir')
        with self.assertRaises(BioLibDirNotFound):
            self.options_parser.align(options)

    def test_infer__msa_raises_io_exception(self):
        """infer raises BioLibFileNotFound for a missing MSA file."""
        options = argparse.ArgumentParser()
        options.msa_file = os.path.join(tempfile.gettempdir(),
                                        'non-existent-msa.txt')
        with self.assertRaises(BioLibFileNotFound):
            self.options_parser.infer(options)

    def test_run_test(self):
        """run_test returns a truthy value when writing to the scratch dir."""
        options = argparse.ArgumentParser()
        options.out_dir = self.dir_tmp
        options.cpus = 3
        outcome = self.options_parser.run_test(options)
        self.assertTrue(outcome)

    # def test_run_test__throws_exception(self):
    #     """Test that the user-test method fails correctly"""
    #     options = argparse.ArgumentParser()
    #     options.out_dir = self.dir_tmp
    #     os.mkdir(os.path.join(self.dir_tmp, 'genomes'))
    #     options.cpus = 3
    #     self.assertRaises(GTDBTkTestFailure, self.options_parser.run_test, options)

    def test_classify__align_dir_raises_io_exception(self):
        """classify raises BioLibDirNotFound for a missing align_dir."""
        options = argparse.ArgumentParser()
        options.align_dir = os.path.join(tempfile.gettempdir(),
                                         'non-existent-dir')
        with self.assertRaises(BioLibDirNotFound):
            self.options_parser.classify(options)

    def test_root__no_tree_raises_io_exception(self):
        """root raises BioLibFileNotFound for a missing input tree."""
        options = argparse.ArgumentParser()
        options.input_tree = os.path.join(tempfile.gettempdir(),
                                          'non-existent-tree.tree')
        with self.assertRaises(BioLibFileNotFound):
            self.options_parser.root(options)

    def test_decorate__no_tree_raises_io_exception(self):
        """decorate raises BioLibFileNotFound for a missing input tree."""
        options = argparse.ArgumentParser()
        options.input_tree = os.path.join(tempfile.gettempdir(),
                                          'non-existent-tree.tree')
        with self.assertRaises(BioLibFileNotFound):
            self.options_parser.decorate(options)

    # ------------------------------------------------------------------
    # trim_msa
    # ------------------------------------------------------------------
    def test_trim_msa__mask_file(self):
        """trim_msa keeps only the columns flagged with 1 in the mask file."""
        untrimmed = os.path.join(self.dir_tmp, 'untrimmed_msa.fasta')
        mask = os.path.join(self.dir_tmp, 'mask_file.txt')
        trimmed = os.path.join(self.dir_tmp, 'trimmed_msa.fasta')

        with open(untrimmed, 'w') as fh:
            fh.write('>genome_1\n')
            fh.write('ALGPVW\n')
            fh.write('>genome_2\n')
            fh.write('WVPGLA\n')

        with open(mask, 'w') as fh:
            fh.write('010010\n')

        options = argparse.ArgumentParser()
        # Required arguments
        options.untrimmed_msa = untrimmed
        options.output = trimmed
        # Mutex arguments
        options.mask_file = mask
        options.reference_mask = None

        self.options_parser.trim_msa(options)

        observed = dict(self._fasta_records(trimmed))
        self.assertDictEqual(observed, {'genome_1': 'LV', 'genome_2': 'VL'})

    def test_trim_msa__reference_mask_arc(self):
        """trim_msa with reference_mask 'arc' yields the pinned sequence hashes."""
        untrimmed = os.path.join(self.dir_tmp, 'untrimmed_msa.fasta')
        trimmed = os.path.join(self.dir_tmp, 'trimmed_msa.fasta')
        self._write_two_genome_msa(
            untrimmed, self._repeat_to_length('ALGPVW', 32675))

        options = argparse.ArgumentParser()
        # Required arguments
        options.untrimmed_msa = untrimmed
        options.output = trimmed
        # Mutex arguments
        options.mask_file = None
        options.reference_mask = 'arc'

        self.options_parser.trim_msa(options)

        observed = {
            gid: hashlib.sha256(seq.encode('utf-8')).hexdigest()
            for gid, seq in self._fasta_records(trimmed)
        }
        self.assertDictEqual(observed, {
            'genome_1':
            '4975c04d640415de4c715552f6f6b460a8996226239440faa6539ac777622515',
            'genome_2':
            '7b53881aecb13bbe54612962e22736db7ab83271ffe4685d63c16e962e3561d9',
        })

    def test_trim_msa__reference_mask_bac(self):
        """trim_msa with reference_mask 'bac' yields the pinned sequence hashes."""
        untrimmed = os.path.join(self.dir_tmp, 'untrimmed_msa.fasta')
        trimmed = os.path.join(self.dir_tmp, 'trimmed_msa.fasta')
        self._write_two_genome_msa(
            untrimmed, self._repeat_to_length('ALGPVW', 41155))

        options = argparse.ArgumentParser()
        # Required arguments
        options.untrimmed_msa = untrimmed
        options.output = trimmed
        # Mutex arguments
        options.mask_file = None
        options.reference_mask = 'bac'

        self.options_parser.trim_msa(options)

        observed = {
            gid: hashlib.sha256(seq.encode('utf-8')).hexdigest()
            for gid, seq in self._fasta_records(trimmed)
        }
        self.assertDictEqual(observed, {
            'genome_1':
            '32798bdc3245b2ac5ecd8a15ea2cfb21011b22b6021baa51066864b1c02d72b4',
            'genome_2':
            '0b63d416c72e9641011f80fcf64fa41eb3f0e8e85dbaa4bd8feba12cf3b64c62',
        })

    # ------------------------------------------------------------------
    # export_msa
    # ------------------------------------------------------------------
    def test_export_msa__arc(self):
        """export_msa for domain 'arc' writes a file with the pinned digest."""
        exported = os.path.join(self.dir_tmp, 'output.fasta')
        options = argparse.ArgumentParser()
        options.domain = 'arc'
        options.output = exported

        self.options_parser.export_msa(options)

        self.assertEqual(
            self._sha256_of_file(exported),
            'e84edf65511002b73f110ff44c9acee3ae44220448dfc971a2778d43c966bbba')

    def test_export_msa__bac(self):
        """export_msa for domain 'bac' writes a file with the pinned digest."""
        exported = os.path.join(self.dir_tmp, 'output.fasta')
        options = argparse.ArgumentParser()
        options.domain = 'bac'
        options.output = exported

        self.options_parser.export_msa(options)

        self.assertEqual(
            self._sha256_of_file(exported),
            '5e37bc123819061490681068b49450fc43587d09b87df90ef62452bd73f961cc')
class TestOptionsParser(unittest.TestCase):
    """Unit tests for OptionsParser helpers and its handling of invalid input."""

    def setUp(self):
        """Create the parser under test and a private scratch directory."""
        self.options_parser = OptionsParser('-1')
        self.dir_tmp = tempfile.mkdtemp(prefix='gtdbtk_tmp_')

    def tearDown(self):
        """Delete the scratch directory created in setUp."""
        shutil.rmtree(self.dir_tmp)

    # ------------------------------------------------------------------
    # Fixture helpers
    # ------------------------------------------------------------------
    def _touch(self, file_name):
        """Create an empty file named ``file_name`` in the scratch dir and return its path."""
        path = os.path.join(self.dir_tmp, file_name)
        open(path, 'a').close()
        return path

    def _touch_genomes(self):
        """Create empty genome_1.fna / genome_2.fna files and return both paths."""
        return self._touch('genome_1.fna'), self._touch('genome_2.fna')

    # ------------------------------------------------------------------
    # _verify_genome_id
    # ------------------------------------------------------------------
    def test__verify_genome_id__valid(self):
        """A well-formed genome id is accepted."""
        verdict = self.options_parser._verify_genome_id('genome_1')
        self.assertTrue(verdict)

    def test__verify_genome_id__invalid(self):
        """Each forbidden character in a genome id raises GTDBTkExit."""
        for bad_char in '()[],;=':
            with self.assertRaises(GTDBTkExit):
                self.options_parser._verify_genome_id('genome%s1' % bad_char)

    # ------------------------------------------------------------------
    # _genomes_to_process
    # ------------------------------------------------------------------
    def test__genomes_to_process__genome_dir__valid(self):
        """Only files with the requested extension are picked up from genome_dir."""
        genome_1, genome_2 = self._touch_genomes()
        self._touch('other_file.txt')

        observed, _ = self.options_parser._genomes_to_process(
            self.dir_tmp, '', 'fna')

        self.assertDictEqual(observed,
                             {'genome_1': genome_1, 'genome_2': genome_2})

    def test__genomes_to_process__batchfile__valid(self):
        """A batchfile with an optional third column yields genomes and a translation table map."""
        path_batchfile = os.path.join(self.dir_tmp, 'batchfile.txt')
        path_genome_1, path_genome_2 = self._touch_genomes()
        with open(path_batchfile, 'a') as fh:
            fh.write(f'{path_genome_1}\tgenome_1\n')
            fh.write('\n')
            fh.write(f'{path_genome_2}\tgenome_2\t4\n')

        observed, observed_tln = self.options_parser._genomes_to_process(
            '', path_batchfile, 'fna')

        self.assertDictEqual(
            observed, {'genome_1': path_genome_1, 'genome_2': path_genome_2})
        self.assertDictEqual(observed_tln, {'genome_2': 4})

    def test__genomes_to_process__batchfile__invalid_columns(self):
        """A batchfile row whose third column is 'foo' raises GTDBTkExit."""
        batchfile = os.path.join(self.dir_tmp, 'batchfile.txt')
        genome_1, genome_2 = self._touch_genomes()
        with open(batchfile, 'a') as fh:
            fh.write('%s\tgenome_1\n' % genome_1)
            fh.write('\n')
            fh.write('%s\tgenome_2\tfoo\n' % genome_2)

        with self.assertRaises(GTDBTkExit):
            self.options_parser._genomes_to_process('', batchfile, 'fna')

    def test__genomes_to_process__batchfile__blank_genome_path(self):
        """A batchfile row with an empty path raises GTDBTkExit."""
        batchfile = os.path.join(self.dir_tmp, 'batchfile.txt')
        genome_1, _ = self._touch_genomes()
        with open(batchfile, 'a') as fh:
            fh.write('%s\tgenome_1\n' % genome_1)
            fh.write('\n')
            fh.write('%s\tgenome_2\n' % '')

        with self.assertRaises(GTDBTkExit):
            self.options_parser._genomes_to_process('', batchfile, 'fna')

    def test__genomes_to_process__batchfile__blank_genome_id(self):
        """A batchfile row with an empty genome id raises GTDBTkExit."""
        batchfile = os.path.join(self.dir_tmp, 'batchfile.txt')
        genome_1, genome_2 = self._touch_genomes()
        with open(batchfile, 'a') as fh:
            fh.write('%s\tgenome_1\n' % genome_1)
            fh.write('\n')
            fh.write('%s\t\n' % genome_2)

        with self.assertRaises(GTDBTkExit):
            self.options_parser._genomes_to_process('', batchfile, 'fna')

    def test__genomes_to_process__batchfile__duplicate_genome_id(self):
        """Two batchfile rows sharing one genome id raise GTDBTkExit."""
        batchfile = os.path.join(self.dir_tmp, 'batchfile.txt')
        genome_1, genome_2 = self._touch_genomes()
        with open(batchfile, 'a') as fh:
            fh.write('%s\tgenome_1\n' % genome_1)
            fh.write('\n')
            fh.write('%s\tgenome_1\n' % genome_2)

        with self.assertRaises(GTDBTkExit):
            self.options_parser._genomes_to_process('', batchfile, 'fna')

    # Disabled test, kept for reference.
    # def test__genomes_to_process__batchfile__invalid_genome_id(self):
    #     """ Test that a batchfile containing duplicate genome ids throws an exception. """
    #     # Branch 1: The number of columns are not equal to 2.
    #     path_batchfile_1 = os.path.join(self.dir_tmp, 'batchfile_1.txt')
    #     path_batchfile_2 = os.path.join(self.dir_tmp, 'batchfile_2.txt')
    #     path_batchfile_3 = os.path.join(self.dir_tmp, 'batchfile_3.txt')
    #     path_genome_1 = os.path.join(self.dir_tmp, 'genome_1.fna')
    #     path_genome_2 = os.path.join(self.dir_tmp, 'genome_2.fna')
    #     open(path_genome_1, 'a').close()
    #     open(path_genome_2, 'a').close()
    #
    #     with open(path_batchfile_1, 'a') as f:
    #         f.write('%s\tgenome_1\n' % path_genome_1)
    #         f.write('\n')
    #         f.write('%s\tGB_genome_2\n' % path_genome_2)
    #
    #     with open(path_batchfile_2, 'a') as f:
    #         f.write('%s\tgenome_1\n' % path_genome_1)
    #         f.write('\n')
    #         f.write('%s\tRS_genome_2\n' % path_genome_2)
    #
    #     with open(path_batchfile_3, 'a') as f:
    #         f.write('%s\tgenome_1\n' % path_genome_1)
    #         f.write('\n')
    #         f.write('%s\tUBAgenome_2\n' % path_genome_2)
    #
    #     self.assertRaises(GTDBTkExit, self.options_parser._genomes_to_process, '', path_batchfile_1, 'fna')
    #     self.assertRaises(GTDBTkExit, self.options_parser._genomes_to_process, '', path_batchfile_2, 'fna')
    #     self.assertRaises(GTDBTkExit, self.options_parser._genomes_to_process, '', path_batchfile_3, 'fna')

    def test__genomes_to_process__no_files(self):
        """GTDBTkExit is raised when nothing is found to process."""
        # Branch 1: an empty genome_dir is specified
        empty_dir = tempfile.mkdtemp()
        try:
            with self.assertRaises(GTDBTkExit):
                self.options_parser._genomes_to_process(empty_dir, '', 'fna')
        finally:
            shutil.rmtree(empty_dir)

        # Branch 2: an empty batchfile is specified
        batch_dir = tempfile.mkdtemp()
        try:
            empty_batchfile = os.path.join(batch_dir, 'batchfile.txt')
            open(empty_batchfile, 'a').close()
            with self.assertRaises(GTDBTkExit):
                self.options_parser._genomes_to_process(
                    '', empty_batchfile, 'fna')
        finally:
            shutil.rmtree(batch_dir)

    # ------------------------------------------------------------------
    # Missing input files / directories
    # ------------------------------------------------------------------
    def test_identify__genome_dir_raises_io_exception(self):
        """identify raises BioLibDirNotFound for a missing genome_dir."""
        options = argparse.ArgumentParser()
        options.genome_dir = os.path.join(tempfile.gettempdir(),
                                          'non-existent-dir')
        with self.assertRaises(BioLibDirNotFound):
            self.options_parser.identify(options)

    def test_identify__batchfile_raises_io_exception(self):
        """identify raises BioLibFileNotFound for a missing batchfile."""
        options = argparse.ArgumentParser()
        options.genome_dir = None
        options.batchfile = os.path.join(tempfile.gettempdir(),
                                         'non-existent-file.txt')
        with self.assertRaises(BioLibFileNotFound):
            self.options_parser.identify(options)

    def test_align__identify_dir_raises_io_exception(self):
        """align raises BioLibDirNotFound for a missing identify_dir."""
        options = argparse.ArgumentParser()
        options.identify_dir = os.path.join(tempfile.gettempdir(),
                                            'non-existent-dir')
        with self.assertRaises(BioLibDirNotFound):
            self.options_parser.align(options)

    def test_infer__msa_raises_io_exception(self):
        """infer raises BioLibFileNotFound for a missing MSA file."""
        options = argparse.ArgumentParser()
        options.msa_file = os.path.join(tempfile.gettempdir(),
                                        'non-existent-msa.txt')
        with self.assertRaises(BioLibFileNotFound):
            self.options_parser.infer(options)

    def test_run_test(self):
        """run_test returns a truthy value when writing to the scratch dir."""
        options = argparse.ArgumentParser()
        options.out_dir = self.dir_tmp
        options.cpus = 3
        outcome = self.options_parser.run_test(options)
        self.assertTrue(outcome)

    # Disabled test, kept for reference.
    # def test_run_test__throws_exception(self):
    #     """Test that the user-test method fails correctly"""
    #     options = argparse.ArgumentParser()
    #     options.out_dir = self.dir_tmp
    #     os.mkdir(os.path.join(self.dir_tmp, 'genomes'))
    #     options.cpus = 3
    #     self.assertRaises(GTDBTkTestFailure, self.options_parser.run_test, options)

    def test_classify__align_dir_raises_io_exception(self):
        """classify raises BioLibDirNotFound for a missing align_dir."""
        options = argparse.ArgumentParser()
        options.align_dir = os.path.join(tempfile.gettempdir(),
                                         'non-existent-dir')
        with self.assertRaises(BioLibDirNotFound):
            self.options_parser.classify(options)

    def test_root__no_tree_raises_io_exception(self):
        """root raises BioLibFileNotFound for a missing input tree."""
        options = argparse.ArgumentParser()
        options.input_tree = os.path.join(tempfile.gettempdir(),
                                          'non-existent-tree.tree')
        with self.assertRaises(BioLibFileNotFound):
            self.options_parser.root(options)

    def test_decorate__no_tree_raises_io_exception(self):
        """decorate raises BioLibFileNotFound for a missing input tree."""
        options = argparse.ArgumentParser()
        options.input_tree = os.path.join(tempfile.gettempdir(),
                                          'non-existent-tree.tree')
        with self.assertRaises(BioLibFileNotFound):
            self.options_parser.decorate(options)
def _add_command(subparsers, name, summary):
    """Create the sub-command parser `name` with the settings shared by all commands."""
    return subparsers.add_parser(name,
                                 conflict_handler='resolve',
                                 formatter_class=CustomHelpFormatter,
                                 help=summary)


def _add_genome_inputs(command_parser, batchfile_help):
    """Add the required, mutually exclusive --genome_dir / --batchfile pair."""
    group = command_parser.add_argument_group(
        'mutually exclusive required arguments')
    mutex = group.add_mutually_exclusive_group(required=True)
    mutex.add_argument(
        '--genome_dir',
        help="directory containing genome files in FASTA format")
    mutex.add_argument('--batchfile', help=batchfile_help)


def _add_out_dir(group):
    """Add the required --out_dir argument."""
    group.add_argument('--out_dir', required=True,
                       help='directory to output files')


def _add_extension(group):
    """Add the -x/--extension argument."""
    group.add_argument('-x', '--extension', default='fna',
                       help='extension of files to process, gz = gzipped')


def _add_prefix(group):
    """Add the --prefix argument."""
    group.add_argument('--prefix', default='gtdbtk',
                       help='desired prefix for output files')


def _add_cpus(group):
    """Add the --cpus argument."""
    group.add_argument('--cpus', default=1, type=int,
                       help='number of CPUs to use')


def _add_force(group):
    """Add the per-command --force argument."""
    group.add_argument(
        '--force', action='store_const', const=True, default=False,
        help='continue processing if an error occurs on a single genome')


def _add_debug(group, *flags):
    """Add the debug switch; `flags` defaults to just ``--debug``."""
    group.add_argument(
        *(flags or ('--debug',)), action="store_true",
        help='create intermediate files for debugging purposes')


def _add_help(group):
    """Add the -h/--help argument so it is listed with `group`."""
    group.add_argument('-h', '--help', action="help",
                       help="show help message")


def _add_reference_filters(group):
    """Add --skip_gtdb_refs, --taxa_filter and --min_perc_aa (MSA inputs)."""
    group.add_argument(
        '--skip_gtdb_refs', action="store_true",
        help='do not include GTDB reference genomes in multiple sequence '
             'alignment')
    group.add_argument(
        '--taxa_filter',
        help=('filter GTDB genomes to taxa (comma separated) within ' +
              'specific taxonomic groups (e.g., d__Bacteria ' +
              'or p__Proteobacteria, p__Actinobacteria)'))
    group.add_argument(
        '--min_perc_aa', type=float, default=10,
        help='filter genomes with an insufficient percentage of AA in the '
             'MSA (inclusive bound)')


def _add_custom_msa_filters(group):
    """Add the --custom_msa_filters switch."""
    group.add_argument(
        '--custom_msa_filters', action="store_true",
        help=('perform custom filtering of MSA with cols_per_gene, min_consensus ' +
              'max_consensus, and min_perc_taxa parameters instead of using canonical mask'))


def _add_column_filters(group):
    """Add the column-selection parameters used with custom MSA filtering."""
    group.add_argument(
        '--cols_per_gene', type=int, default=42,
        help='maximum number of columns to retain per gene')
    group.add_argument(
        '--min_consensus', type=float, default=25,
        help='minimum percentage of the same amino acid required to retain '
             'column (inclusive bound)')
    group.add_argument(
        '--max_consensus', type=float, default=95,
        help='maximum percentage of the same amino acid required to retain '
             'column (exclusive bound)')
    group.add_argument(
        '--min_perc_taxa', type=float, default=50,
        help='minimum percentage of taxa required to retain column '
             '(inclusive bound)')
    group.add_argument(
        '--rnd_seed', type=int, default=None,
        help='random seed to use for selecting columns')


def _add_tree_inference(group):
    """Add --prot_model, --no_support and --gamma."""
    group.add_argument(
        '--prot_model', choices=['JTT', 'WAG', 'LG'], default='WAG',
        help='protein substitution model for tree inference')
    group.add_argument(
        '--no_support', action="store_true",
        help="do not compute local support values using the "
             "Shimodaira-Hasegawa test")
    group.add_argument(
        '--gamma', action="store_true",
        help="rescale branch lengths to optimize the Gamma20 likelihood")


def _add_pplacer_cpus(group):
    """Add the --pplacer_cpus argument."""
    group.add_argument(
        '--pplacer_cpus', type=int, default=None,
        help='use PPLACER_CPUS during placement (default: CPUS)')


def _add_recalculate_red(group):
    """Add the -r/--recalculate_red switch."""
    group.add_argument(
        '-r', '--recalculate_red', action='store_true',
        help='recalculate RED values based on the reference tree and all '
             'added user genomes')


def _format_error_report(headline, exc):
    """Return the log message for `exc`: headline, type, message and traceback.

    Must be called from inside the ``except`` block handling `exc`, because
    the traceback is taken from ``traceback.format_exc()``.
    """
    rule = '=' * 80
    return ''.join([
        headline + '\n\n',
        rule + '\n',
        'EXCEPTION: {}\n'.format(type(exc).__name__),
        ' MESSAGE: {}\n'.format(exc),
        '_' * 80 + '\n\n',
        traceback.format_exc(),
        rule,
    ])


def _end_output_line():
    """Terminate any partially written stdout line before logging an error."""
    sys.stdout.write('\n')
    sys.stdout.flush()


def main():
    """Command-line entry point: build the ``gtdbtk`` parser and run a command.

    Bare invocations and top-level ``-h``/``-v`` style flags are answered
    directly (custom help screen / version string) and exit with status 0.
    Otherwise the arguments are parsed, the logger is configured, and the
    parsed options are handed to ``OptionsParser.parse_options``. Any
    ``SystemExit``, ``KeyboardInterrupt`` or exception raised while running
    the command is logged and converted into exit status 1.
    """
    batchfile_2_col = ("file describing genomes - tab separated in 2 columns "
                       "(FASTA file, genome ID)")
    batchfile_3_col = ("file describing genomes - tab separated in 3 columns "
                       "(FASTA file, genome ID, translation table [optional])")
    outgroup_help = ("taxon to use as outgroup "
                     "(e.g., p__Patescibacteria or p__Altiarchaeota)")
    classification_help = ("file with GTDB-Tk classifications produced by "
                           "the `classify` command")

    parser = argparse.ArgumentParser(prog='gtdbtk',
                                     add_help=False,
                                     conflict_handler='resolve')
    parser.add_argument('-f', '--force', action="store_true", default=False,
                        help="overwrite existing files without prompting.")
    subparsers = parser.add_subparsers(help="--", dest='subparser_name')

    # de novo workflow
    denovo_wf_parser = _add_command(
        subparsers, 'de_novo_wf',
        'Infer de novo tree and decorate with GTDB taxonomy.')
    _add_genome_inputs(denovo_wf_parser, batchfile_2_col)
    mutual_ms_denovo_wf = denovo_wf_parser.add_argument_group(
        'mutually exclusive required arguments')
    domain_mutex = mutual_ms_denovo_wf.add_mutually_exclusive_group(
        required=True)
    domain_mutex.add_argument('--bacteria', action='store_true',
                              help='process bacterial genomes')
    domain_mutex.add_argument('--archaea', action='store_true',
                              help='process archaeal genomes')
    required_denovo_wf = denovo_wf_parser.add_argument_group(
        'required named arguments')
    required_denovo_wf.add_argument('--outgroup_taxon', required=True,
                                    help=outgroup_help)
    _add_out_dir(required_denovo_wf)
    optional_denovo_wf = denovo_wf_parser.add_argument_group(
        'optional arguments')
    _add_extension(optional_denovo_wf)
    _add_reference_filters(optional_denovo_wf)
    _add_custom_msa_filters(optional_denovo_wf)
    _add_column_filters(optional_denovo_wf)
    _add_tree_inference(optional_denovo_wf)
    optional_denovo_wf.add_argument('--gtdbtk_classification_file',
                                    help=classification_help)
    optional_denovo_wf.add_argument(
        '--custom_taxonomy_file',
        help="file indicating custom taxonomy string for at least the "
             "genomes belonging to the outgroup")
    _add_prefix(optional_denovo_wf)
    _add_cpus(optional_denovo_wf)
    _add_force(optional_denovo_wf)
    _add_debug(optional_denovo_wf)
    _add_help(optional_denovo_wf)

    # classify workflow
    classify_wf_parser = _add_command(
        subparsers, 'classify_wf',
        'Classify genomes by placement in GTDB reference tree.')
    _add_genome_inputs(classify_wf_parser, batchfile_3_col)
    required_classify_wf = classify_wf_parser.add_argument_group(
        'required named arguments')
    _add_out_dir(required_classify_wf)
    optional_classify_wf = classify_wf_parser.add_argument_group(
        'optional arguments')
    _add_extension(optional_classify_wf)
    optional_classify_wf.add_argument(
        '--min_perc_aa', type=float, default=10,
        help='filter genomes with an insufficient percentage of AA in the MSA')
    _add_prefix(optional_classify_wf)
    _add_cpus(optional_classify_wf)
    _add_pplacer_cpus(optional_classify_wf)
    _add_force(optional_classify_wf)
    optional_classify_wf.add_argument(
        '--scratch_dir',
        help='Reduce memory usage by writing to disk (slower).')
    _add_recalculate_red(optional_classify_wf)
    # NOTE: a -s/--split_tree option is deliberately not exposed here.
    _add_debug(optional_classify_wf, '-d', '--debug')
    _add_help(optional_classify_wf)

    # identify marker genes in genomes
    identify_parser = _add_command(subparsers, 'identify',
                                   'Identify marker genes in genome.')
    _add_genome_inputs(identify_parser, batchfile_3_col)
    required_identify = identify_parser.add_argument_group(
        'required named arguments')
    _add_out_dir(required_identify)
    optional_identify = identify_parser.add_argument_group(
        'optional arguments')
    _add_extension(optional_identify)
    _add_prefix(optional_identify)
    _add_cpus(optional_identify)
    _add_force(optional_identify)
    _add_help(optional_identify)

    # create multiple sequence alignment
    align_parser = _add_command(subparsers, 'align',
                                'Create multiple sequence alignment.')
    required_align = align_parser.add_argument_group(
        'required named arguments')
    required_align.add_argument('--identify_dir', required=True,
                                help="output directory of 'identify' command")
    _add_out_dir(required_align)
    optional_align = align_parser.add_argument_group('optional arguments')
    _add_reference_filters(optional_align)
    mutual_genome_align = align_parser.add_argument_group(
        'mutually exclusive optional arguments')
    mutex_align_group = mutual_genome_align.add_mutually_exclusive_group()
    _add_custom_msa_filters(mutex_align_group)
    mutex_align_group.add_argument(
        '--skip_trimming', action="store_true", default=False,
        help='skip trimming step and return the full MSAs')
    _add_column_filters(optional_align)
    _add_prefix(optional_align)
    _add_cpus(optional_align)
    _add_debug(optional_align)
    _add_help(optional_align)

    # infer tree
    infer_parser = _add_command(subparsers, 'infer',
                                'Infer tree from multiple sequence alignment.')
    required_infer = infer_parser.add_argument_group(
        'required named arguments')
    required_infer.add_argument(
        '--msa_file', required=True,
        help="multiple sequence alignment in FASTA format")
    _add_out_dir(required_infer)
    optional_infer = infer_parser.add_argument_group('optional arguments')
    _add_tree_inference(optional_infer)
    _add_prefix(optional_infer)
    _add_cpus(optional_infer)
    _add_help(optional_infer)

    # classify genomes via placement with pplacer
    classify_parser = _add_command(
        subparsers, 'classify',
        'Determine taxonomic classification of genomes.')
    _add_genome_inputs(classify_parser, batchfile_2_col)
    required_classify = classify_parser.add_argument_group(
        'required named arguments')
    required_classify.add_argument('--align_dir', required=True,
                                   help="output directory of 'align' command")
    _add_out_dir(required_classify)
    optional_classify = classify_parser.add_argument_group(
        'optional arguments')
    _add_extension(optional_classify)
    _add_prefix(optional_classify)
    _add_cpus(optional_classify)
    _add_pplacer_cpus(optional_classify)
    optional_classify.add_argument(
        '--scratch_dir',
        help='reduce memory usage by writing to disk (slower)')
    # NOTE: a -s/--split_tree option is deliberately not exposed here.
    _add_recalculate_red(optional_classify)
    _add_debug(optional_classify)
    _add_help(optional_classify)

    # root tree using outgroup
    root_parser = _add_command(subparsers, 'root',
                               'Root tree using an outgroup.')
    required_root = root_parser.add_argument_group('required named arguments')
    required_root.add_argument('--input_tree', required=True,
                               help="tree to root in Newick format")
    required_root.add_argument('--outgroup_taxon', required=True,
                               help=outgroup_help)
    required_root.add_argument('--output_tree', required=True,
                               help='output tree')
    optional_root = root_parser.add_argument_group('optional arguments')
    optional_root.add_argument('--gtdbtk_classification_file',
                               help=classification_help)
    optional_root.add_argument(
        '--custom_taxonomy_file',
        help="file indicating custom taxonomy strings for user genomes, "
             "which should contain any genomes belonging to the outgroup")
    _add_help(optional_root)

    # decorate tree
    decorate_parser = _add_command(subparsers, 'decorate',
                                   'Decorate tree with GTDB taxonomy.')
    required_decorate = decorate_parser.add_argument_group(
        'required named arguments')
    # Help text previously said "tree to root" (copied from the root command).
    required_decorate.add_argument('--input_tree', required=True,
                                   help="tree to decorate in Newick format")
    required_decorate.add_argument('--output_tree', required=True,
                                   help='output tree')
    optional_decorate = decorate_parser.add_argument_group(
        'optional arguments')
    optional_decorate.add_argument('--gtdbtk_classification_file',
                                   help=classification_help)
    optional_decorate.add_argument(
        '--custom_taxonomy_file',
        help="file indicating custom taxonomy strings for user genomes")
    _add_help(optional_decorate)

    # establish taxonomic ranks of internal nodes using RED
    infer_ranks_parser = _add_command(
        subparsers, 'infer_ranks',
        'Establish taxonomic ranks of internal nodes using RED.')
    infer_ranks_req = infer_ranks_parser.add_argument_group(
        'required named arguments')
    infer_ranks_req.add_argument(
        '--input_tree', required=True,
        help="rooted input tree with labelled ingroup taxon")
    # Closing parenthesis was missing from this help text.
    infer_ranks_req.add_argument(
        '--ingroup_taxon', required=True,
        help="labelled ingroup taxon to use as root for establish RED values "
             "(e.g., c__Bacilli or f__Lactobacillaceae)")
    infer_ranks_req.add_argument('--output_tree', required=True,
                                 help="output tree")
    infer_ranks_opt = infer_ranks_parser.add_argument_group(
        'optional arguments')
    _add_help(infer_ranks_opt)

    # ani_rep
    ani_rep_parser = _add_command(
        subparsers, 'ani_rep',
        'Calculates ANI to GTDB representative genomes.')
    _add_genome_inputs(ani_rep_parser, batchfile_2_col)
    ani_rep_req = ani_rep_parser.add_argument_group('required named arguments')
    _add_out_dir(ani_rep_req)
    ani_rep_mash = ani_rep_parser.add_argument_group('optional Mash arguments')
    ani_rep_mash.add_argument('--no_mash', action='store_const', const=True,
                              default=False,
                              help='skip pre-filtering using MASH')
    ani_rep_mash.add_argument('--mash_k', default=16, type=int,
                              help='k-mer size [1-32]')
    ani_rep_mash.add_argument('--mash_s', default=5000, type=int,
                              help='maximum number of non-redundant hashes')
    ani_rep_mash.add_argument('--mash_d', default=0.1, type=float,
                              help='maximum distance to keep [0-1]')
    ani_rep_mash.add_argument('--mash_v', default=1.0, type=float,
                              help='maximum p-value to keep [0-1]')
    ani_rep_fastani_opt = ani_rep_parser.add_argument_group(
        'optional FastANI arguments')
    ani_rep_fastani_opt.add_argument(
        '--min_af', default=AF_THRESHOLD, type=float,
        help='alignment fraction to consider closest genome')
    ani_rep_opt = ani_rep_parser.add_argument_group('optional arguments')
    _add_extension(ani_rep_opt)
    _add_prefix(ani_rep_opt)
    _add_cpus(ani_rep_opt)
    _add_help(ani_rep_opt)

    # test
    test_parser = _add_command(
        subparsers, 'test',
        'Test the classify_wf pipeline with 3 archaeal genomes.')
    required_test = test_parser.add_argument_group('required named arguments')
    _add_out_dir(required_test)
    optional_test = test_parser.add_argument_group('optional arguments')
    _add_cpus(optional_test)
    _add_help(optional_test)

    # trim MSA
    msa_parser = _add_command(subparsers, 'trim_msa',
                              'Trim an untrimmed MSA file based on a mask.')
    required_msa = msa_parser.add_argument_group('required named arguments')
    required_msa.add_argument('--untrimmed_msa', required=True,
                              help="untrimmed MSA file")
    required_msa.add_argument('--output', required=True, help='output file')
    mutual_trim_msa = msa_parser.add_argument_group(
        'mutually exclusive required arguments')
    mutex_msa_group = mutual_trim_msa.add_mutually_exclusive_group(
        required=True)
    mutex_msa_group.add_argument(
        '--mask_file', help="mask file to use for trimming the MSA")
    mutex_msa_group.add_argument(
        '--reference_mask', choices=['arc', 'bac'],
        help="reference mask already present in GTDB-Tk")
    optional_msa = msa_parser.add_argument_group('optional arguments')
    _add_help(optional_msa)

    # export msa
    export_msa_parser = _add_command(
        subparsers, 'export_msa',
        'Export the untrimmed archaeal or bacterial MSA file.')
    required_export_msa = export_msa_parser.add_argument_group(
        'required named arguments')
    required_export_msa.add_argument('--domain', required=True,
                                     choices=['arc', 'bac'],
                                     help="select domain to download")
    required_export_msa.add_argument('--output', required=True,
                                     help='output file')
    optional_export_msa = export_msa_parser.add_argument_group(
        'optional arguments')
    _add_help(optional_export_msa)

    # verify install
    check_install_parser = _add_command(
        subparsers, 'check_install',
        'Verify if all gtdb data files are present to run GTDB-Tk.')
    optional_check_install = check_install_parser.add_argument_group(
        'optional arguments')
    _add_help(optional_check_install)

    # -------------------------------------------------
    # get and check options
    args = None
    if len(sys.argv) == 1:
        print_help()
        sys.exit(0)
    elif sys.argv[1] in {'-v', '--v', '-version', '--version'}:
        print("gtdbtk: version %s %s %s" %
              (__version__, __copyright__, __author__))
        sys.exit(0)
    elif sys.argv[1] in {'-h', '--h', '-help', '--help'}:
        print_help()
        sys.exit(0)
    else:
        args = parser.parse_args()

    # setup logger (not every command defines out_dir / debug)
    logger_setup(args.out_dir if hasattr(args, 'out_dir') else None,
                 "gtdbtk.log", "GTDB-Tk", __version__, False,
                 hasattr(args, 'debug') and args.debug)
    logger = logging.getLogger('timestamp')

    # -------------------------------------------------
    # do what we came here to do
    try:
        gt_parser = OptionsParser(__version__)
        gt_parser.parse_options(args)
    except SystemExit:
        _end_output_line()
        logger.error('Controlled exit resulting from early termination.')
        sys.exit(1)
    except KeyboardInterrupt:
        _end_output_line()
        logger.error('Controlled exit resulting from interrupt signal.')
        sys.exit(1)
    except GTDBTkExit as e:
        _end_output_line()
        if len(str(e)) > 0:
            logger.error('{}'.format(e))
        logger.error(
            'Controlled exit resulting from an unrecoverable error or warning.')
        sys.exit(1)
    except (GTDBTkException, BioLibError) as e:
        _end_output_line()
        logger.error(_format_error_report(
            'Controlled exit resulting from an unrecoverable error or warning.',
            e))
        sys.exit(1)
    except Exception as e:
        # Last-resort boundary: log the full traceback and exit non-zero.
        _end_output_line()
        logger.error(_format_error_report(
            'Uncontrolled exit resulting from an unexpected error.', e))
        sys.exit(1)
class TestCli(unittest.TestCase):
    """End-to-end tests that drive OptionsParser the way the CLI would.

    Each test writes into a fresh temporary directory created in setUp and
    removed in tearDown.
    """

    def setUp(self):
        """Build the shared options object, the OptionsParser and a temp output dir."""
        self.identify_dir_reference = os.path.join(
            os.path.dirname(__file__), 'data/identify_dir_reference/')
        self.align_dir_reference = 'tests/data/align_dir_reference/'
        self.genome_dir = 'gtdbtk/tests/data/genomes/'

        # An ArgumentParser instance is used purely as an attribute bag that
        # stands in for the parsed command-line options.
        self.options = argparse.ArgumentParser()
        self.options.batchfile = None
        self.options.prefix = 'gtdbtk'
        self.options.cpus = 1
        self.options.extension = 'fna'
        self.options.debug = False
        self.options.force = False
        self.options.genes = False
        self.options.write_single_copy_genes = False

        # align option
        self.options.skip_gtdb_refs = False
        self.options.taxa_filter = None
        self.options.custom_msa_filters = False
        self.options.skip_trimming = False
        self.options.min_consensus = None
        self.options.min_perc_taxa = None
        self.options.cols_per_gene = None
        self.options.max_consensus = None
        self.options.min_perc_aa = 50
        self.options.rnd_seed = 42

        # classify options
        self.options.scratch_dir = None
        self.options.keep_ref_red = None
        self.options.pplacer_cpus = None

        # infer options
        self.options.prot_model = 'WAG'
        self.options.no_support = False
        self.options.no_gamma = True

        self.version = ' unittest'
        self.optionparser = OptionsParser(self.version)
        logger_setup(None, "gtdbtk.log", "GTDB-Tk", self.version, True)
        self.generic_out_path = tempfile.mkdtemp(prefix='gtdbtk_tmp_')

    def tearDown(self):
        """Remove the temporary output directory created in setUp."""
        shutil.rmtree(self.generic_out_path)

    @staticmethod
    def _random_folder_name():
        """Return a random 10-character name (upper-case letters and digits)."""
        alphabet = string.ascii_uppercase + string.digits
        return ''.join(random.choice(alphabet) for _ in range(10))

    def _assert_user_msa_is_plausible(self, path_user_msa):
        """Assert the user MSA exists and its last line looks like an aligned sequence."""
        self.assertTrue(os.path.isfile(path_user_msa))
        with open(path_user_msa, 'r') as f:
            last_line = f.read().splitlines()[-1]
        self.assertTrue(len(last_line) > 4500)
        self.assertTrue(len(last_line) < 5500)
        self.assertTrue('-' in last_line)
        self.assertFalse(any(char.isdigit() for char in last_line))

    def _assert_summary_is_plausible(self, summary_out):
        """Assert the summary exists and its last row has 19 columns, domain Archaea."""
        self.assertTrue(os.path.isfile(summary_out))
        with open(summary_out, 'r') as f:
            last_line = f.read().splitlines()[-1]
        infos = last_line.split('\t')
        self.assertEqual(len(infos), 19)
        self.assertTrue(infos[1].startswith('d__Archaea'))

    def test_identify(self):
        """identify writes both marker summaries with the expected genome_1 row."""
        tmp_folder = self._random_folder_name()
        identify_options = self.options
        identify_options.genome_dir = self.genome_dir
        identify_options.out_dir = os.path.join(self.generic_out_path,
                                                tmp_folder, 'identify')
        self.optionparser.identify(identify_options)

        ar122_marker_path = os.path.join(
            self.options.out_dir,
            PATH_AR122_MARKER_SUMMARY.format(prefix=self.options.prefix))
        bac120_marker_path = os.path.join(
            self.options.out_dir,
            PATH_BAC120_MARKER_SUMMARY.format(prefix=self.options.prefix))
        self.assertTrue(os.path.isfile(bac120_marker_path))
        self.assertTrue(os.path.isfile(ar122_marker_path))

        results = {}
        with open(ar122_marker_path, 'r') as f:
            f.readline()  # skip the header row
            for line in f:
                genome_id, remainder = line.split('\t', 1)
                results[genome_id] = remainder
        self.assertTrue(results.get('genome_1').startswith('120\t2\t0\t'))

    def test_align(self):
        """align on the reference identify output produces a plausible user MSA."""
        tmp_folder = self._random_folder_name()
        align_options = self.options
        align_options.identify_dir = self.identify_dir_reference
        align_options.out_dir = os.path.join(self.generic_out_path,
                                             tmp_folder, 'align')
        self.optionparser.align(align_options)
        path_user_msa = os.path.join(
            align_options.out_dir,
            PATH_AR122_USER_MSA.format(prefix=align_options.prefix))
        self._assert_user_msa_is_plausible(path_user_msa)

    def test_classify(self):
        """classify on the reference alignment yields the expected classifications."""
        tmp_folder = self._random_folder_name()
        classify_options = self.options
        classify_options.genome_dir = self.genome_dir
        classify_options.align_dir = self.align_dir_reference
        classify_options.out_dir = os.path.join(self.generic_out_path,
                                                tmp_folder, 'classify')
        classify_options.recalculate_red = False
        classify_options.split_tree = False
        self.optionparser.classify(classify_options)

        summary_fh = ClassifySummaryFileAR122(classify_options.out_dir,
                                              classify_options.prefix)
        summary_fh.read()
        self.assertEqual(
            'd__Archaea;p__Euryarchaeota;c__Methanobacteria;o__Methanobacteriales;f__Methanobacteriaceae;g__Methanobrevibacter;s__Methanobrevibacter ruminantium',
            summary_fh.rows['genome_1'].classification)
        self.assertEqual(
            'd__Archaea;p__Thermoplasmatota;c__Thermoplasmata;o__Methanomassiliicoccales;f__Methanomethylophilaceae;g__VadinCA11;s__VadinCA11 sp002498365',
            summary_fh.rows['genome_2'].classification)
        self.assertEqual(
            'd__Archaea;p__Thermoplasmatota;c__Thermoplasmata;o__Methanomassiliicoccales;f__Methanomethylophilaceae;g__VadinCA11;s__VadinCA11 sp002498365',
            summary_fh.rows['genome_3'].classification)

    def test_identify_align(self):
        """identify followed by align produces a plausible user MSA."""
        tmp_folder = self._random_folder_name()
        identify_options = self.options
        identify_options.genome_dir = self.genome_dir
        identify_options.out_dir = os.path.join(self.generic_out_path,
                                                tmp_folder, 'identify')
        self.optionparser.identify(identify_options)

        align_options = self.options
        align_options.identify_dir = os.path.join(self.generic_out_path,
                                                  tmp_folder, 'identify')
        align_options.out_dir = os.path.join(self.generic_out_path,
                                             tmp_folder, 'align')
        self.optionparser.align(align_options)
        path_user_msa = os.path.join(
            align_options.out_dir,
            PATH_AR122_USER_MSA.format(prefix=align_options.prefix))
        self._assert_user_msa_is_plausible(path_user_msa)

    def test_identify_align_classify(self):
        """identify, align and classify chained through separate output dirs."""
        tmp_folder = self._random_folder_name()
        identify_options = self.options
        identify_options.genome_dir = self.genome_dir
        identify_options.out_dir = os.path.join(self.generic_out_path,
                                                tmp_folder, 'identify')
        self.optionparser.identify(identify_options)

        align_options = self.options
        align_options.identify_dir = os.path.join(self.generic_out_path,
                                                  tmp_folder, 'identify')
        align_options.out_dir = os.path.join(self.generic_out_path,
                                             tmp_folder, 'align')
        self.optionparser.align(align_options)
        path_user_msa = os.path.join(
            align_options.out_dir,
            PATH_AR122_USER_MSA.format(prefix=align_options.prefix))
        self._assert_user_msa_is_plausible(path_user_msa)

        classify_options = self.options
        classify_options.genome_dir = self.genome_dir
        classify_options.split_tree = False
        classify_options.align_dir = align_options.out_dir
        classify_options.out_dir = os.path.join(self.generic_out_path,
                                                tmp_folder, 'classify')
        classify_options.recalculate_red = False
        self.optionparser.classify(classify_options)
        summary_out = os.path.join(
            classify_options.out_dir,
            PATH_AR122_SUMMARY_OUT.format(prefix=classify_options.prefix))
        # Previously asserted on the path string itself, which is always truthy.
        self._assert_summary_is_plausible(summary_out)

    def test_classify_wf(self):
        """identify, align and classify run against one shared output dir."""
        tmp_folder = self._random_folder_name()
        classify_wf_options = self.options
        classify_wf_options.genome_dir = self.genome_dir
        classify_wf_options.out_dir = os.path.join(self.generic_out_path,
                                                   tmp_folder, 'classify_wf')
        self.optionparser.identify(classify_wf_options)

        classify_wf_options.identify_dir = classify_wf_options.out_dir
        classify_wf_options.align_dir = classify_wf_options.out_dir
        classify_wf_options.taxa_filter = None
        classify_wf_options.custom_msa_filters = False
        classify_wf_options.min_consensus = None
        classify_wf_options.min_perc_taxa = None
        classify_wf_options.skip_gtdb_refs = False
        classify_wf_options.cols_per_gene = None
        classify_wf_options.max_consensus = None
        classify_wf_options.recalculate_red = False
        classify_wf_options.split_tree = False
        self.optionparser.align(classify_wf_options)
        self.optionparser.classify(classify_wf_options)

        summary_out = os.path.join(
            classify_wf_options.out_dir,
            PATH_AR122_SUMMARY_OUT.format(prefix=classify_wf_options.prefix))
        self._assert_summary_is_plausible(summary_out)

    def test_infer(self):
        """infer on the reference MSA completes and the tree has all three genomes."""
        tmp_folder = self._random_folder_name()
        infer_options = self.options
        path_user_msa = PATH_AR122_USER_MSA.format(prefix=self.options.prefix)
        infer_options.msa_file = os.path.join(self.align_dir_reference,
                                              path_user_msa)
        infer_options.out_dir = os.path.join(self.generic_out_path,
                                             tmp_folder, 'infer')
        infer_options.gamma = False
        self.optionparser.infer(infer_options)

        tree_log = os.path.join(
            infer_options.out_dir,
            PATH_TREE_LOG.format(prefix=self.options.prefix))
        with open(tree_log, 'r') as f:
            last_line = f.read().splitlines()[-1]
        self.assertEqual(last_line.strip(), 'TreeCompleted')

        unrooted_tree = os.path.join(
            infer_options.out_dir,
            PATH_UNROOTED_TREE.format(prefix=self.options.prefix))
        with open(unrooted_tree, 'r') as f:
            last_line = f.read().splitlines()[-1]
        self.assertTrue('genome_1' in last_line)
        self.assertTrue('genome_2' in last_line)
        self.assertTrue('genome_3' in last_line)

    def test_de_novo_wf(self):
        """identify, align and infer run in sequence without raising."""
        tmp_folder = self._random_folder_name()
        de_novo_wf_options = self.options
        de_novo_wf_options.genome_dir = self.genome_dir
        de_novo_wf_options.suffix = ".ar122"
        de_novo_wf_options.gamma = False
        de_novo_wf_options.out_dir = os.path.join(self.generic_out_path,
                                                  tmp_folder, 'de_novo_wf')
        de_novo_wf_options.identify_dir = de_novo_wf_options.out_dir
        de_novo_wf_options.msa_file = os.path.join(
            de_novo_wf_options.out_dir,
            de_novo_wf_options.prefix + de_novo_wf_options.suffix +
            ".user_msa.fasta")
        self.optionparser.identify(de_novo_wf_options)
        self.optionparser.align(de_novo_wf_options)
        self.optionparser.infer(de_novo_wf_options)

    def test_identify_gzipped_genomes(self):
        """Test that gene calling is successful when using gzipped genomes."""
        options = argparse.ArgumentParser()
        options.genome_dir = 'tests/data/genomes_gz/'
        options.cpus = 5
        options.batchfile = None
        options.extension = 'fna.gz'
        options.write_single_copy_genes = False
        options.prefix = 'gtdbtk'
        options.force = None
        options.genes = False
        options.out_dir = self.generic_out_path
        self.optionparser.identify(options)

        # Each summary must match the stored reference, ignoring row order.
        for summary_template in (PATH_BAC120_MARKER_SUMMARY,
                                 PATH_AR122_MARKER_SUMMARY,
                                 PATH_TLN_TABLE_SUMMARY):
            relative_path = summary_template.format(prefix='gtdbtk')
            self.assertTrue(
                are_files_equal(
                    os.path.join(self.identify_dir_reference, relative_path),
                    os.path.join(self.generic_out_path, relative_path),
                    ignore_order=True))

    def test_root(self):
        """Test that rooting is successful when called through the CLI."""
        options = argparse.ArgumentParser()
        options.input_tree = 'tests/data/pplacer_dir_reference/gtdbtk.ar122.classify.tree'
        # NOTE(review): other help text in this file spells the phylum
        # 'p__Altiarchaeota' - confirm this spelling is intended.
        options.outgroup_taxon = 'p__Altarchaeota'
        options.output_tree = os.path.join(self.generic_out_path,
                                           'test.rooted.tree')
        options.custom_taxonomy_file = None
        options.gtdbtk_classification_file = None
        self.optionparser.root(options)
        self.assertTrue(os.path.isfile(options.output_tree))
class TestCli(unittest.TestCase):
    """Drive the GTDB-Tk workflow steps through ``OptionsParser``.

    Every test writes below a private temporary directory that is created
    in ``setUp`` and removed again in ``tearDown``.
    """

    def setUp(self):
        """Create the shared options object, the parser and the output dir."""
        self.identify_dir_reference = os.path.join(
            os.path.dirname(__file__), 'data/identify_dir_reference/')
        self.align_dir_reference = 'tests/data/align_dir_reference/'
        self.genome_dir = 'gtdbtk/tests/data/genomes/'

        # The ArgumentParser instance is only used as an attribute container
        # that is handed to the OptionsParser methods.
        self.options = argparse.ArgumentParser()
        self.options.batchfile = None
        self.options.prefix = 'gtdbtk'
        self.options.cpus = 1
        self.options.extension = 'fna'
        self.options.debug = False
        self.options.force = False

        # align options
        self.options.skip_gtdb_refs = False
        self.options.taxa_filter = None
        self.options.custom_msa_filters = False
        self.options.skip_trimming = False
        self.options.min_consensus = None
        self.options.min_perc_taxa = None
        self.options.cols_per_gene = None
        self.options.max_consensus = None
        self.options.min_perc_aa = 50
        self.options.rnd_seed = 42

        # classify options
        self.options.scratch_dir = None
        self.options.keep_ref_red = None

        # infer options
        self.options.prot_model = 'WAG'
        self.options.no_support = False
        self.options.no_gamma = True

        self.version = ' unittest'
        self.optionparser = OptionsParser(self.version)
        logger_setup(None, "gtdbtk.log", "GTDB-Tk", self.version, True)

        # A fresh directory per test: no collisions between runs and
        # tearDown always has an existing directory to remove.
        self.generic_out_path = tempfile.mkdtemp(prefix='gtdbtk_tmp_')

    def tearDown(self):
        """Remove everything the test wrote."""
        shutil.rmtree(self.generic_out_path)

    # ------------------------------------------------------------------
    # Helpers (leading underscore so unittest does not collect them)
    # ------------------------------------------------------------------

    def _new_run_dir(self):
        """Return the path of a randomly named folder below the output dir.

        The folder itself is not created here.
        """
        tmp_folder = ''.join(
            random.choice(string.ascii_uppercase + string.digits)
            for _ in range(10))
        return os.path.join(self.generic_out_path, tmp_folder)

    @staticmethod
    def _last_line(path):
        """Return the last line of the text file at ``path``."""
        with open(path, 'r') as f:
            return f.read().splitlines()[-1]

    def _check_user_msa(self, path_user_msa):
        """Assert that the user MSA exists and its last line looks aligned."""
        self.assertTrue(os.path.isfile(path_user_msa))
        last_line = self._last_line(path_user_msa)
        self.assertGreater(len(last_line), 4500)
        self.assertLess(len(last_line), 5500)
        self.assertIn('-', last_line)
        self.assertFalse(any(char.isdigit() for char in last_line))

    def _check_summary(self, summary_out):
        """Assert that the summary exists and its last row is archaeal."""
        self.assertTrue(os.path.isfile(summary_out))
        infos = self._last_line(summary_out).split('\t')
        self.assertEqual(len(infos), 18)
        self.assertTrue(infos[1].startswith('d__Archaea'))

    # ------------------------------------------------------------------
    # Tests
    # ------------------------------------------------------------------

    def test_identify(self):
        """Run identify and check the marker summaries that it writes."""
        options = self.options
        options.genome_dir = self.genome_dir
        options.out_dir = os.path.join(self._new_run_dir(), 'identify')
        self.optionparser.identify(options)

        bac120_marker_path = os.path.join(
            options.out_dir,
            PATH_BAC120_MARKER_SUMMARY.format(prefix=options.prefix))
        ar122_marker_path = os.path.join(
            options.out_dir,
            PATH_AR122_MARKER_SUMMARY.format(prefix=options.prefix))
        self.assertTrue(os.path.isfile(bac120_marker_path))
        self.assertTrue(os.path.isfile(ar122_marker_path))

        results = {}
        with open(ar122_marker_path, 'r') as f:
            f.readline()  # first line is not parsed (presumably a header)
            for line in f:
                genome_id, remainder = line.split('\t', 1)
                results[genome_id] = remainder

        # Fail with a clear message rather than an AttributeError on None.
        self.assertIn('genome_1', results)
        self.assertTrue(results['genome_1'].startswith('120\t2\t0\t'))

    def test_align(self):
        """Run align on the reference identify directory."""
        options = self.options
        options.identify_dir = self.identify_dir_reference
        options.out_dir = os.path.join(self._new_run_dir(), 'align')
        self.optionparser.align(options)

        self._check_user_msa(os.path.join(
            options.out_dir,
            PATH_AR122_USER_MSA.format(prefix=options.prefix)))

    def test_classify(self):
        """Run classify on the reference align directory."""
        options = self.options
        options.genome_dir = self.genome_dir
        options.align_dir = self.align_dir_reference
        options.out_dir = os.path.join(self._new_run_dir(), 'classify')
        self.optionparser.classify(options)

        self._check_summary(os.path.join(
            options.out_dir,
            PATH_AR122_SUMMARY_OUT.format(prefix=options.prefix)))

    def test_identify_align(self):
        """Run identify followed by align on its output."""
        run_dir = self._new_run_dir()
        options = self.options

        options.genome_dir = self.genome_dir
        options.out_dir = os.path.join(run_dir, 'identify')
        self.optionparser.identify(options)

        options.identify_dir = os.path.join(run_dir, 'identify')
        options.out_dir = os.path.join(run_dir, 'align')
        self.optionparser.align(options)

        self._check_user_msa(os.path.join(
            options.out_dir,
            PATH_AR122_USER_MSA.format(prefix=options.prefix)))

    def test_identify_align_classify(self):
        """Run identify, align and classify, each on the previous output."""
        run_dir = self._new_run_dir()
        options = self.options

        options.genome_dir = self.genome_dir
        options.out_dir = os.path.join(run_dir, 'identify')
        self.optionparser.identify(options)

        options.identify_dir = os.path.join(run_dir, 'identify')
        options.out_dir = os.path.join(run_dir, 'align')
        self.optionparser.align(options)

        align_out_dir = options.out_dir
        self._check_user_msa(os.path.join(
            align_out_dir,
            PATH_AR122_USER_MSA.format(prefix=options.prefix)))

        options.genome_dir = self.genome_dir
        options.align_dir = align_out_dir
        options.out_dir = os.path.join(run_dir, 'classify')
        self.optionparser.classify(options)

        # _check_summary verifies that the file exists; asserting on the
        # path string itself would always pass.
        self._check_summary(os.path.join(
            options.out_dir,
            PATH_AR122_SUMMARY_OUT.format(prefix=options.prefix)))

    def test_classify_wf(self):
        """Run identify, align and classify sharing one output directory."""
        options = self.options
        options.genome_dir = self.genome_dir
        options.out_dir = os.path.join(self._new_run_dir(), 'classify_wf')
        self.optionparser.identify(options)

        options.identify_dir = options.out_dir
        options.align_dir = options.out_dir
        options.taxa_filter = None
        options.custom_msa_filters = False
        options.min_consensus = None
        options.min_perc_taxa = None
        options.skip_gtdb_refs = False
        options.cols_per_gene = None
        options.max_consensus = None
        self.optionparser.align(options)
        self.optionparser.classify(options)

        self._check_summary(os.path.join(
            options.out_dir,
            PATH_AR122_SUMMARY_OUT.format(prefix=options.prefix)))

    def test_infer(self):
        """Run infer on the reference MSA and check the log and the tree."""
        options = self.options
        options.msa_file = os.path.join(
            self.align_dir_reference,
            PATH_AR122_USER_MSA.format(prefix=options.prefix))
        options.out_dir = os.path.join(self._new_run_dir(), 'infer')
        self.optionparser.infer(options)

        log_last_line = self._last_line(os.path.join(
            options.out_dir, PATH_TREE_LOG.format(prefix=options.prefix)))
        self.assertEqual(log_last_line.strip(), 'TreeCompleted')

        tree_last_line = self._last_line(os.path.join(
            options.out_dir,
            PATH_UNROOTED_TREE.format(prefix=options.prefix)))
        for genome_id in ('genome_1', 'genome_2', 'genome_3'):
            self.assertIn(genome_id, tree_last_line)

    def test_de_novo_wf(self):
        """Run identify, align and infer in one shared output directory."""
        options = self.options
        options.genome_dir = self.genome_dir
        options.suffix = ".ar122"
        options.out_dir = os.path.join(self._new_run_dir(), 'de_novo_wf')
        options.identify_dir = options.out_dir
        options.msa_file = os.path.join(
            options.out_dir,
            options.prefix + options.suffix + ".user_msa.fasta")
        self.optionparser.identify(options)
        self.optionparser.align(options)
        self.optionparser.infer(options)
class TestOptionsParser(unittest.TestCase):
    """Unit tests for ``OptionsParser`` helper methods and input validation.

    Each test gets its own temporary directory (``self.dir_tmp``) that is
    removed in ``tearDown``.
    """

    def setUp(self):
        """Create the parser under test and a scratch directory."""
        self.options_parser = OptionsParser('-1')
        self.dir_tmp = tempfile.mkdtemp(prefix='gtdbtk_tmp_')

    def tearDown(self):
        """Remove the scratch directory."""
        shutil.rmtree(self.dir_tmp)

    # ------------------------------------------------------------------
    # Helpers (leading underscore so unittest does not collect them)
    # ------------------------------------------------------------------

    def _make_genomes(self):
        """Create two empty genome files and return their paths."""
        path_genome_1 = os.path.join(self.dir_tmp, 'genome_1.fna')
        path_genome_2 = os.path.join(self.dir_tmp, 'genome_2.fna')
        for path in (path_genome_1, path_genome_2):
            open(path, 'a').close()
        return path_genome_1, path_genome_2

    def _write_batchfile(self, *lines):
        """Write ``lines`` verbatim to a batchfile and return its path."""
        path_batchfile = os.path.join(self.dir_tmp, 'batchfile.txt')
        with open(path_batchfile, 'w') as f:
            f.writelines(lines)
        return path_batchfile

    def _assert_batchfile_rejected(self, path_batchfile):
        """Assert that processing ``path_batchfile`` raises GTDBTkExit."""
        self.assertRaises(GTDBTkExit,
                          self.options_parser._genomes_to_process,
                          '', path_batchfile, 'fna')

    # ------------------------------------------------------------------
    # _verify_genome_id
    # ------------------------------------------------------------------

    def test__verify_genome_id__valid(self):
        """ Test that a valid genome id returns True. """
        self.assertTrue(self.options_parser._verify_genome_id('genome_1'))

    def test__verify_genome_id__invalid(self):
        """ Test that invalid genome ids throw an exception. """
        for c in '()[],;=':
            # subTest reports which character failed instead of stopping
            # at the first one.
            with self.subTest(char=c):
                self.assertRaises(GenomeNameInvalid,
                                  self.options_parser._verify_genome_id,
                                  'genome%s1' % c)

    # ------------------------------------------------------------------
    # _genomes_to_process
    # ------------------------------------------------------------------

    def test__genomes_to_process__genome_dir__valid(self):
        """ Test that the expected results are returned when using genome_dir. """
        path_genome_1, path_genome_2 = self._make_genomes()
        # A file with a different extension is expected to be ignored.
        open(os.path.join(self.dir_tmp, 'other_file.txt'), 'a').close()

        results, _ = self.options_parser._genomes_to_process(
            self.dir_tmp, '', 'fna')

        expected = {'genome_1': path_genome_1, 'genome_2': path_genome_2}
        self.assertDictEqual(results, expected)

    def test__genomes_to_process__batchfile__valid(self):
        """ Test that the expected results are returned when using batchfile """
        path_genome_1, path_genome_2 = self._make_genomes()
        path_batchfile = self._write_batchfile(
            f'{path_genome_1}\tgenome_1\n',
            '\n',
            f'{path_genome_2}\tgenome_2\t4\n')

        results, tln_table = self.options_parser._genomes_to_process(
            '', path_batchfile, 'fna')

        expected = {'genome_1': path_genome_1, 'genome_2': path_genome_2}
        expected_tln = {'genome_2': 4}
        self.assertDictEqual(results, expected)
        self.assertDictEqual(tln_table, expected_tln)

    def test__genomes_to_process__batchfile__invalid_columns(self):
        """ Test that a batchfile row with a non-numeric third column ('foo') throws an exception. """
        path_genome_1, path_genome_2 = self._make_genomes()
        path_batchfile = self._write_batchfile(
            f'{path_genome_1}\tgenome_1\n',
            '\n',
            f'{path_genome_2}\tgenome_2\tfoo\n')
        self._assert_batchfile_rejected(path_batchfile)

    def test__genomes_to_process__batchfile__blank_genome_path(self):
        """ Test that a batchfile containing a blank genome path throws an exception. """
        path_genome_1, _ = self._make_genomes()
        path_batchfile = self._write_batchfile(
            f'{path_genome_1}\tgenome_1\n',
            '\n',
            '\tgenome_2\n')
        self._assert_batchfile_rejected(path_batchfile)

    def test__genomes_to_process__batchfile__blank_genome_id(self):
        """ Test that a batchfile containing a blank genome id throws an exception. """
        path_genome_1, path_genome_2 = self._make_genomes()
        path_batchfile = self._write_batchfile(
            f'{path_genome_1}\tgenome_1\n',
            '\n',
            f'{path_genome_2}\t\n')
        self._assert_batchfile_rejected(path_batchfile)

    def test__genomes_to_process__batchfile__duplicate_genome_id(self):
        """ Test that a batchfile containing duplicate genome ids throws an exception. """
        path_genome_1, path_genome_2 = self._make_genomes()
        # Two different genome files share the id 'genome_1'.
        path_batchfile = self._write_batchfile(
            f'{path_genome_1}\tgenome_1\n',
            '\n',
            f'{path_genome_2}\tgenome_1\n')
        self._assert_batchfile_rejected(path_batchfile)

    # def test__genomes_to_process__batchfile__invalid_genome_id(self):
    #     """ Test that a batchfile containing duplicate genome ids throws an exception. """
    #     # Branch 1: The number of columns are not equal to 2.
    #     path_batchfile_1 = os.path.join(self.dir_tmp, 'batchfile_1.txt')
    #     path_batchfile_2 = os.path.join(self.dir_tmp, 'batchfile_2.txt')
    #     path_batchfile_3 = os.path.join(self.dir_tmp, 'batchfile_3.txt')
    #     path_genome_1 = os.path.join(self.dir_tmp, 'genome_1.fna')
    #     path_genome_2 = os.path.join(self.dir_tmp, 'genome_2.fna')
    #     open(path_genome_1, 'a').close()
    #     open(path_genome_2, 'a').close()
    #
    #     with open(path_batchfile_1, 'a') as f:
    #         f.write('%s\tgenome_1\n' % path_genome_1)
    #         f.write('\n')
    #         f.write('%s\tGB_genome_2\n' % path_genome_2)
    #
    #     with open(path_batchfile_2, 'a') as f:
    #         f.write('%s\tgenome_1\n' % path_genome_1)
    #         f.write('\n')
    #         f.write('%s\tRS_genome_2\n' % path_genome_2)
    #
    #     with open(path_batchfile_3, 'a') as f:
    #         f.write('%s\tgenome_1\n' % path_genome_1)
    #         f.write('\n')
    #         f.write('%s\tUBAgenome_2\n' % path_genome_2)
    #
    #     self.assertRaises(GTDBTkExit, self.options_parser._genomes_to_process, '', path_batchfile_1, 'fna')
    #     self.assertRaises(GTDBTkExit, self.options_parser._genomes_to_process, '', path_batchfile_2, 'fna')
    #     self.assertRaises(GTDBTkExit, self.options_parser._genomes_to_process, '', path_batchfile_3, 'fna')

    def test__genomes_to_process__no_files(self):
        """ Test that an exception is thrown if no files are found to process """
        # Branch 1: genome_dir is specified but empty
        tmp_genome_dir = tempfile.mkdtemp()
        try:
            self.assertRaises(GTDBTkExit,
                              self.options_parser._genomes_to_process,
                              tmp_genome_dir, '', 'fna')
        finally:
            shutil.rmtree(tmp_genome_dir)

        # Branch 2: batchfile is specified but empty
        tmp_genome_dir = tempfile.mkdtemp()
        try:
            path_batchfile = os.path.join(tmp_genome_dir, 'batchfile.txt')
            open(path_batchfile, 'a').close()
            self.assertRaises(GTDBTkExit,
                              self.options_parser._genomes_to_process,
                              '', path_batchfile, 'fna')
        finally:
            shutil.rmtree(tmp_genome_dir)

    # ------------------------------------------------------------------
    # Missing-input handling of the public workflow methods
    # ------------------------------------------------------------------

    def test_identify__genome_dir_raises_io_exception(self):
        """ Test that the identify method raises an exception on invalid genome_dir """
        options = argparse.ArgumentParser()
        options.genome_dir = os.path.join(tempfile.gettempdir(),
                                          'non-existent-dir')
        self.assertRaises(BioLibDirNotFound,
                          self.options_parser.identify, options)

    def test_identify__batchfile_raises_io_exception(self):
        """ Test that the identify method raises an exception on invalid batchfile """
        options = argparse.ArgumentParser()
        options.genome_dir = None
        options.batchfile = os.path.join(tempfile.gettempdir(),
                                         'non-existent-file.txt')
        self.assertRaises(BioLibFileNotFound,
                          self.options_parser.identify, options)

    def test_align__identify_dir_raises_io_exception(self):
        """ Test that the align method raises an exception on invalid identify dir """
        options = argparse.ArgumentParser()
        options.identify_dir = os.path.join(tempfile.gettempdir(),
                                            'non-existent-dir')
        self.assertRaises(BioLibDirNotFound,
                          self.options_parser.align, options)

    def test_infer__msa_raises_io_exception(self):
        """ Test that the infer method raises an exception on invalid MSA """
        options = argparse.ArgumentParser()
        options.msa_file = os.path.join(tempfile.gettempdir(),
                                        'non-existent-msa.txt')
        self.assertRaises(BioLibFileNotFound,
                          self.options_parser.infer, options)

    def test_run_test(self):
        """Test that the user-test method runs correctly"""
        options = argparse.ArgumentParser()
        options.out_dir = self.dir_tmp
        options.cpus = 3
        self.assertTrue(self.options_parser.run_test(options))

    # def test_run_test__throws_exception(self):
    #     """Test that the user-test method fails correctly"""
    #     options = argparse.ArgumentParser()
    #     options.out_dir = self.dir_tmp
    #     os.mkdir(os.path.join(self.dir_tmp, 'genomes'))
    #     options.cpus = 3
    #     self.assertRaises(GTDBTkTestFailure, self.options_parser.run_test, options)

    def test_classify__align_dir_raises_io_exception(self):
        """ Test that the classify method raises an exception on invalid align dir """
        options = argparse.ArgumentParser()
        options.align_dir = os.path.join(tempfile.gettempdir(),
                                         'non-existent-dir')
        self.assertRaises(BioLibDirNotFound,
                          self.options_parser.classify, options)

    def test_root__no_tree_raises_io_exception(self):
        """ Test that the root method raises an exception on invalid tree """
        options = argparse.ArgumentParser()
        options.input_tree = os.path.join(tempfile.gettempdir(),
                                          'non-existent-tree.tree')
        self.assertRaises(BioLibFileNotFound,
                          self.options_parser.root, options)

    def test_decorate__no_tree_raises_io_exception(self):
        """ Test that the decorate method raises an exception on invalid tree """
        options = argparse.ArgumentParser()
        options.input_tree = os.path.join(tempfile.gettempdir(),
                                          'non-existent-tree.tree')
        self.assertRaises(BioLibFileNotFound,
                          self.options_parser.decorate, options)

    # ------------------------------------------------------------------
    # trim_msa
    # ------------------------------------------------------------------

    def test_trim_msa__mask_file(self):
        """ Test that the expected result is returned when running trim_msa with mask_file """
        path_untrimmed_msa = os.path.join(self.dir_tmp, 'untrimmed_msa.fasta')
        path_mask_file = os.path.join(self.dir_tmp, 'mask_file.txt')
        path_output = os.path.join(self.dir_tmp, 'trimmed_msa.fasta')

        with open(path_untrimmed_msa, 'w') as f:
            f.write('>genome_1\n')
            f.write('ALGPVW\n')
            f.write('>genome_2\n')
            f.write('WVPGLA\n')

        # The expected output below keeps exactly the columns flagged '1'.
        with open(path_mask_file, 'w') as f:
            f.write('010010\n')

        options = argparse.ArgumentParser()
        # Required arguments
        options.untrimmed_msa = path_untrimmed_msa
        options.output = path_output
        # Mutex arguments
        options.mask_file = path_mask_file
        options.reference_mask = None

        self.options_parser.trim_msa(options)

        with open(path_output, 'r') as f:
            results = dict(re.findall(r'>(.+)\n(.+)\n', f.read()))

        expected = {'genome_1': 'LV', 'genome_2': 'VL'}
        self.assertDictEqual(results, expected)

    def test_trim_msa__reference_mask_arc(self):
        """ Test that the expected result is returned when running trim_msa with archaeal reference_mask """
        path_untrimmed_msa = os.path.join(self.dir_tmp, 'untrimmed_msa.fasta')
        path_output = os.path.join(self.dir_tmp, 'trimmed_msa.fasta')
        shutil.copyfile(Config.CONCAT_AR122, path_untrimmed_msa)

        options = argparse.ArgumentParser()
        # Required arguments
        options.untrimmed_msa = path_untrimmed_msa
        options.output = path_output
        # Mutex arguments
        options.mask_file = None
        options.reference_mask = 'arc'

        self.options_parser.trim_msa(options)

        # NOTE(review): the expected digest is 40 hex characters, the length
        # of a SHA-1 digest - confirm what the project's `sha256` helper
        # actually computes.
        actual = sha256(path_output)
        expected = '1146351be59ae8d27668256c5b2c425a6f38c37c'
        self.assertEqual(actual, expected)

    def test_trim_msa__reference_mask_bac(self):
        """ Test that the expected result is returned when running trim_msa with bacterial reference_mask """
        path_untrimmed_msa = os.path.join(self.dir_tmp, 'untrimmed_msa.fasta')
        path_output = os.path.join(self.dir_tmp, 'trimmed_msa.fasta')
        shutil.copyfile(Config.CONCAT_BAC120, path_untrimmed_msa)

        options = argparse.ArgumentParser()
        # Required arguments
        options.untrimmed_msa = path_untrimmed_msa
        options.output = path_output
        # Mutex arguments
        options.mask_file = None
        options.reference_mask = 'bac'

        self.options_parser.trim_msa(options)

        # NOTE(review): 40-hex-character digest, see the archaeal test.
        actual = sha256(path_output)
        expected = 'ae6e24e89540fed03b81436147f99bcd120d059a'
        self.assertEqual(actual, expected)

    # ------------------------------------------------------------------
    # export_msa
    # ------------------------------------------------------------------

    def test_export_msa__arc(self):
        """ Test that the untrimmed archaeal MSA is exported correctly """
        path_out = os.path.join(self.dir_tmp, 'output.fasta')
        options = argparse.ArgumentParser()
        options.domain = 'arc'
        options.output = path_out

        self.options_parser.export_msa(options)

        with open(path_out, 'rb') as f:
            out_hash = hashlib.sha256(f.read()).hexdigest()
        self.assertEqual(
            out_hash,
            '8706b42a3f4b2445273058e7e876f0d8332bd8dec95c0fc8bc024d76a5a5aade')

    def test_export_msa__bac(self):
        """ Test that the untrimmed bacterial MSA is exported correctly """
        path_out = os.path.join(self.dir_tmp, 'output.fasta')
        options = argparse.ArgumentParser()
        options.domain = 'bac'
        options.output = path_out

        self.options_parser.export_msa(options)

        with open(path_out, 'rb') as f:
            out_hash = hashlib.sha256(f.read()).hexdigest()
        self.assertEqual(
            out_hash,
            '3c5dfa4dc5ef943459e6d0ed4da1e5a5858332c824739630beffb57fab303486')