def __init__(self, threads, pfam_top_hit_suffix, tigrfam_top_hit_suffix,
             protein_file_suffix, pfam_hmm_dir, tigrfam_hmm_dir,
             bac120_markers, ar122_markers):
    """Store the supplied settings and derive the per-database HMM paths.

    Every argument is stored unchanged on the instance under an attribute
    of the same name. ``check_dependencies`` is called for ``hmmalign``
    before anything is stored.
    """
    check_dependencies(['hmmalign'])
    self.logger = logging.getLogger('timestamp')

    self.threads = threads
    (self.pfam_top_hit_suffix,
     self.tigrfam_top_hit_suffix,
     self.protein_file_suffix) = (pfam_top_hit_suffix,
                                  tigrfam_top_hit_suffix,
                                  protein_file_suffix)
    self.pfam_hmm_dir, self.tigrfam_hmm_dir = pfam_hmm_dir, tigrfam_hmm_dir
    self.bac120_markers, self.ar122_markers = bac120_markers, ar122_markers

    # The Pfam location is joined directly, whereas the TIGRFAM location
    # is taken relative to the parent of ``tigrfam_hmm_dir``.
    hmm_subdir = 'individual_hmms'
    self.marker_path_prefix = {
        "PFAM": os.path.join(pfam_hmm_dir, hmm_subdir),
        "TIGRFAM": os.path.join(os.path.dirname(tigrfam_hmm_dir),
                                hmm_subdir),
    }

    # Marker sizes start unset; nothing in this constructor fills them.
    self.ar122_marker_sizes = None
    self.bac120_marker_sizes = None

    self.version = self.get_version()
def identify(self, genomes, tln_tables, out_dir, prefix, force):
    """Call genes with Prodigal, annotate them and report the markers.

    Parameters
    ----------
    genomes : dict
        Genome IDs as the key, path to genome file as value.
    tln_tables: Dict[str, int]
        Genome ID -> translation table mapping for those user-specified.
    out_dir : str
        Path to the output directory.
    prefix : str
        Prefix to append to generated files.
    force : bool
        Overwrite any existing files.

    Raises
    ------
    GTDBTkException
        If an exception is encountered during the identify step.
    """
    check_dependencies(['prodigal', 'hmmsearch'])

    start_msg = 'Identifying markers in %d genomes with %d threads.' % (
        len(genomes), self.cpus)
    self.logger.info(start_msg)

    # Gene calling; the marker gene directory is kept on the instance.
    self.marker_gene_dir = os.path.join(out_dir, DIR_MARKER_GENE)
    prodigal = Prodigal(self.cpus,
                        self.marker_gene_dir,
                        self.protein_file_suffix,
                        self.nt_gene_file_suffix,
                        self.gff_file_suffix,
                        force)
    self.logger.info(
        "Running Prodigal {} to identify genes.".format(prodigal.version))
    genome_dictionary = prodigal.run(genomes, tln_tables)

    # Annotate the called genes against the TIGRFAM and Pfam databases.
    self.logger.info("Identifying TIGRFAM protein families.")
    gene_files = [entry['aa_gene_path']
                  for entry in genome_dictionary.values()]
    tigr_search = TigrfamSearch(self.cpus,
                                self.tigrfam_hmms,
                                self.protein_file_suffix,
                                self.tigrfam_suffix,
                                self.tigrfam_top_hit_suffix,
                                self.checksum_suffix,
                                self.marker_gene_dir)
    tigr_search.run(gene_files)

    self.logger.info("Identifying Pfam protein families.")
    pfam_search = PfamSearch(self.cpus,
                             self.pfam_hmm_dir,
                             self.protein_file_suffix,
                             self.pfam_suffix,
                             self.pfam_top_hit_suffix,
                             self.checksum_suffix,
                             self.marker_gene_dir)
    pfam_search.run(gene_files)

    # The HMMER version is read from the TIGRFAM search object.
    self.logger.info(
        "Annotations done using HMMER {}.".format(tigr_search.version))

    self._report_identified_marker_genes(genome_dictionary, out_dir, prefix)
def infer(self, options):
    """Infer a tree from a user specified MSA.

    The output tree and the two log paths are built inside
    ``options.out_dir``. Marker-specific path templates are used when
    ``options`` carries a ``suffix`` attribute, and the generic templates
    otherwise. When invoked as the stand-alone ``infer`` command, a
    symlink to the tree is also created directly in ``options.out_dir``.

    Parameters
    ----------
    options : argparse.Namespace
        The CLI arguments input by the user.
    """
    check_file_exists(options.msa_file)
    make_sure_path_exists(options.out_dir)

    # The multi-threaded binary is only required when more than one CPU
    # has been requested.
    if options.cpus > 1:
        check_dependencies(['FastTreeMP'])
    else:
        check_dependencies(['FastTree'])

    if hasattr(options, 'suffix'):
        templates = (PATH_MARKER_UNROOTED_TREE,
                     PATH_MARKER_TREE_LOG,
                     PATH_MARKER_FASTTREE_LOG)
        template_args = {'prefix': options.prefix,
                         'marker': options.suffix}
    else:
        templates = (PATH_UNROOTED_TREE,
                     PATH_TREE_LOG,
                     PATH_FASTTREE_LOG)
        template_args = {'prefix': options.prefix}

    output_tree, tree_log, fasttree_log = (
        os.path.join(options.out_dir, template.format(**template_args))
        for template in templates)

    fasttree = FastTree()
    fasttree.run(output_tree, tree_log, fasttree_log, options.prot_model,
                 options.no_support, options.no_gamma, options.msa_file,
                 options.cpus)
    self.logger.info(f'FastTree version: {fasttree.version}')

    if hasattr(options, 'subparser_name') \
            and options.subparser_name == 'infer':
        # The link target must be relative to the directory holding the
        # link. Slicing by len(out_dir) + 1 dropped a character whenever
        # out_dir ended with a path separator (or was empty), so compute
        # the relative path explicitly instead.
        symlink_f(
            os.path.relpath(output_tree, options.out_dir),
            os.path.join(options.out_dir, os.path.basename(output_tree)))

    self.logger.info('Done.')
def check_dependencies(no_mash):
    """Exits the system if the required programs are not on the path.

    fastANI is always checked; Mash is added to the check unless
    ``no_mash`` is set.

    Parameters
    ----------
    no_mash : bool
        True if Mash will not be used (only fastANI is checked), False
        if Mash is required as well.
    """
    required = ['fastANI'] if no_mash else ['fastANI', 'mash']
    # NOTE(review): presumably this resolves to the module-level
    # check_dependencies helper rather than to this function - confirm.
    check_dependencies(required)
def __init__(self, threads, pfam_top_hit_suffix, tigrfam_top_hit_suffix,
             protein_file_suffix, pfam_hmm_dir, tigrfam_hmm_dir,
             bac120_markers, ar122_markers, rps23_markers):
    """Store the supplied settings on the instance.

    ``check_dependencies`` is called for ``hmmalign`` first; afterwards
    every argument is stored unchanged under an attribute of the same
    name.
    """
    check_dependencies(['hmmalign'])

    self.threads = threads

    # File suffix settings.
    self.pfam_top_hit_suffix, self.tigrfam_top_hit_suffix = (
        pfam_top_hit_suffix, tigrfam_top_hit_suffix)
    self.protein_file_suffix = protein_file_suffix

    # HMM locations.
    self.pfam_hmm_dir, self.tigrfam_hmm_dir = pfam_hmm_dir, tigrfam_hmm_dir

    # Marker sets.
    (self.bac120_markers,
     self.ar122_markers,
     self.rps23_markers) = bac120_markers, ar122_markers, rps23_markers
def check_install(self):
    """Report on third-party programs and verify the reference data.

    Each third-party program is looked up with ``check_dependencies`` and
    logged as ``OK`` or ``NOT FOUND``; a missing program is only reported
    and does not affect the outcome. Every path in ``Config.REF_HASHES``
    is then hashed with ``sha1_dir`` and compared with its expected
    value.

    Returns
    -------
    None
        Nothing is returned; success is signalled by the absence of an
        exception.

    Raises
    ------
    GTDBTkExit
        If the hash of any reference path differs from the expected
        hash.
    """
    # Check that all programs are on the system path.
    self.logger.info(
        'Checking that all third-party software are on the system path:')
    names = {
        'prodigal', 'hmmsearch', 'fastANI', 'mash', 'pplacer', 'guppy',
        'FastTree', 'FastTreeMP', 'hmmalign'
    }
    for name in sorted(names):
        try:
            on_path = check_dependencies([name], exit_on_fail=False)
        except Exception:
            # Best effort: a failing lookup is reported as "not found".
            # Only Exception is caught so that Ctrl-C and interpreter
            # exit requests are no longer silently swallowed.
            on_path = False

        if on_path:
            status = colour('OK', ['bright'], fg='green')
        else:
            status = colour('NOT FOUND', ['bright'], fg='yellow')
        self.logger.info(" |-- {:16} {}".format(name, status))

    # Assume this was successful unless otherwise observed.
    ok = True

    # Compute the hash for each directory
    self.logger.info(
        f'Checking integrity of reference package: {Config.GENERIC_PATH}')
    for obj_path, expected_hash in Config.REF_HASHES.items():
        # Display name: last path component, ignoring one trailing slash.
        base_name = obj_path[:-1] if obj_path.endswith('/') else obj_path
        base_name = base_name.split('/')[-1]

        user_hash = sha1_dir(obj_path, progress=True)
        if user_hash != expected_hash:
            status = colour(f'HASH MISMATCH {user_hash}', ['bright'],
                            fg='yellow')
            ok = False
        else:
            status = colour('OK', ['bright'], fg='green')
        self.logger.info(" |-- {:16} {}".format(base_name, status))

    if not ok:
        raise GTDBTkExit(
            'Unexpected files were seen, or the reference package is corrupt.'
        )
def __init__(self, threads, pfam_top_hit_suffix, tigrfam_top_hit_suffix,
             protein_file_suffix, pfam_hmm_dir, tigrfam_hmm_dir,
             bac120_markers, ar122_markers):
    """Store the supplied settings and record the tool version.

    ``check_dependencies`` is called for ``hmmalign`` first. Every
    argument is then stored unchanged under an attribute of the same
    name, and ``self.version`` is set from ``self.get_version()``.
    """
    check_dependencies(['hmmalign'])
    self.logger = logging.getLogger('timestamp')

    # Attribute names mirror the parameter names one-to-one.
    settings = (
        ('threads', threads),
        ('pfam_top_hit_suffix', pfam_top_hit_suffix),
        ('tigrfam_top_hit_suffix', tigrfam_top_hit_suffix),
        ('protein_file_suffix', protein_file_suffix),
        ('pfam_hmm_dir', pfam_hmm_dir),
        ('tigrfam_hmm_dir', tigrfam_hmm_dir),
        ('bac120_markers', bac120_markers),
        ('ar122_markers', ar122_markers),
    )
    for attr_name, attr_value in settings:
        setattr(self, attr_name, attr_value)

    self.version = self.get_version()
def parse_options(self, options):
    """Normalise the common options and run the requested command.

    Parameters
    ----------
    options : argparse.Namespace
        The CLI arguments input by the user.

    Returns
    -------
    int
        Always 0 when the selected command completes.

    Raises
    ------
    GTDBTkExit
        If run under Python 2, or if the ``classify`` options are found
        to be mutually exclusive.
    GenomeMarkerSetUnknown
        If the marker set of the de novo workflow is not recognised.
    SystemExit
        If ``options.subparser_name`` is not a known command.
    """

    def out_path(template):
        # Fill a path template with the prefix and place it in out_dir.
        return os.path.join(options.out_dir,
                            template.format(prefix=options.prefix))

    def disable_split_tree():
        # TODO: Remove this helper once the split_tree function is
        # implemented.
        if hasattr(options, 'split_tree') and options.split_tree:
            self.logger.warning('The split tree option is not yet '
                                ' supported, overriding value to False.')
            options.split_tree = False

    # Stop processing if python 2 is being used.
    if sys.version_info.major < 3:
        raise GTDBTkExit('Python 2 is no longer supported.')

    # Correct user paths
    if hasattr(options, 'out_dir') and options.out_dir:
        options.out_dir = os.path.expanduser(options.out_dir)

    # Assert that the number of CPUs is a positive integer.
    if hasattr(options, 'cpus') and options.cpus < 1:
        self.logger.warning(
            'You cannot use less than 1 CPU, defaulting to 1.')
        options.cpus = 1

    command = options.subparser_name

    # Commands that simply forward the options to the method that shares
    # the command's name.
    pass_through = ('identify', 'align', 'infer', 'root', 'decorate',
                    'infer_ranks', 'ani_rep', 'trim_msa', 'export_msa')

    if command == 'de_novo_wf':
        check_dependencies(['prodigal', 'hmmalign'])
        fasttree_bin = 'FastTreeMP' if options.cpus > 1 else 'FastTree'
        check_dependencies([fasttree_bin])

        options.write_single_copy_genes = False
        self.identify(options)

        options.identify_dir = options.out_dir
        options.skip_trimming = False
        self.align(options)

        options.suffix = "bac120" if options.bacteria else "ar122"

        # Pick the MSA to infer from: the user-only MSA when the GTDB
        # references are skipped, the full MSA otherwise.
        if options.suffix == 'bac120':
            msa_template = (PATH_BAC120_USER_MSA if options.skip_gtdb_refs
                            else PATH_BAC120_MSA)
        elif options.suffix == 'ar122':
            msa_template = (PATH_AR122_USER_MSA if options.skip_gtdb_refs
                            else PATH_AR122_MSA)
        else:
            self.logger.error(
                'There was an error determining the marker set.')
            raise GenomeMarkerSetUnknown(
                'Unknown marker set: {}'.format(options.suffix))
        options.msa_file = out_path(msa_template)
        self.infer(options)

        # Rooting: unrooted tree -> rooted tree.
        if options.suffix == 'bac120':
            options.input_tree = out_path(PATH_BAC120_UNROOTED_TREE)
            options.output_tree = out_path(PATH_BAC120_ROOTED_TREE)
        elif options.suffix == 'ar122':
            options.input_tree = out_path(PATH_AR122_UNROOTED_TREE)
            options.output_tree = out_path(PATH_AR122_ROOTED_TREE)
        self.root(options)

        # Decoration: rooted tree -> decorated tree.
        if options.suffix == 'bac120':
            options.input_tree = out_path(PATH_BAC120_ROOTED_TREE)
            options.output_tree = out_path(PATH_BAC120_DECORATED_TREE)
        elif options.suffix == 'ar122':
            options.input_tree = out_path(PATH_AR122_ROOTED_TREE)
            options.output_tree = out_path(PATH_AR122_DECORATED_TREE)
        self.decorate(options)

    elif command == 'classify_wf':
        disable_split_tree()
        check_dependencies(
            ['prodigal', 'hmmalign', 'pplacer', 'guppy', 'fastANI'])

        options.write_single_copy_genes = False
        self.identify(options)

        # The align and classify steps read their input from the
        # workflow's own output directory.
        options.identify_dir = options.out_dir
        options.align_dir = options.out_dir

        # Settings the workflow fixes before the align/classify steps.
        fixed_settings = (
            ('taxa_filter', None),
            ('custom_msa_filters', False),
            ('skip_trimming', False),
            ('min_consensus', None),
            ('min_perc_taxa', None),
            ('skip_gtdb_refs', False),
            ('cols_per_gene', None),
            ('max_consensus', None),
            ('rnd_seed', None),
            ('scratch_dir', None),
            ('recalculate_red', False),
        )
        for attr_name, attr_value in fixed_settings:
            setattr(options, attr_name, attr_value)

        self.align(options)
        self.classify(options)

    elif command == 'classify':
        disable_split_tree()
        if options.recalculate_red and options.split_tree:
            raise GTDBTkExit(
                '--split_tree and --recalculate_red are mutually exclusive.')
        self.classify(options)

    elif command == 'test':
        check_dependencies(
            ['prodigal', 'hmmalign', 'pplacer', 'guppy', 'fastANI'])
        self.run_test(options)

    elif command == 'check_install':
        self.check_install()

    elif command in pass_through:
        getattr(self, command)(options)

    else:
        self.logger.error('Unknown GTDB-Tk command: "' +
                          options.subparser_name + '"\n')
        sys.exit(1)

    return 0
def identify(self, genomes, tln_tables, out_dir, prefix, force, genes,
             write_single_copy_genes):
    """Find marker genes in the genomes and summarise the hits.

    Genes are called with Prodigal unless ``genes`` is set, in which case
    the supplied files are used directly as protein files. The protein
    files are then searched with TIGRFAM and Pfam, and the identified
    markers are reported.

    Parameters
    ----------
    genomes : dict
        Genome IDs as the key, path to genome file as value.
    tln_tables: Dict[str, int]
        Genome ID -> translation table mapping for those user-specified.
    out_dir : str
        Path to the output directory.
    prefix : str
        Prefix to append to generated files.
    force : bool
        Overwrite any existing files.
    genes : bool
        True if the supplied genomes are called genes, False otherwise.
    write_single_copy_genes : bool
        Write unique AR53/BAC120 marker files to disk.

    Raises
    ------
    GTDBTkException
        If an exception is encountered during the identify step.
    """
    check_dependencies(['prodigal', 'hmmsearch'])

    self.logger.info(
        f'Identifying markers in {len(genomes):,} genomes with '
        f'{self.cpus} threads.')

    self.marker_gene_dir = os.path.join(out_dir, DIR_MARKER_GENE)
    self.failed_genomes = os.path.join(out_dir,
                                       PATH_FAILS.format(prefix=prefix))

    if genes:
        # The inputs are already protein files: describe them with the
        # same record layout that is read from the Prodigal results.
        self.logger.info(
            'Using supplied genomes as called genes, skipping Prodigal.')
        genome_dictionary = {
            gid: {
                'aa_gene_path': gpath,
                'translation_table_path': None,
                'nt_gene_path': None,
                'best_translation_table': 'user_supplied',
                'gff_path': None,
            }
            for gid, gpath in genomes.items()
        }
    else:
        prodigal = Prodigal(self.cpus,
                            self.failed_genomes,
                            self.marker_gene_dir,
                            self.protein_file_suffix,
                            self.nt_gene_file_suffix,
                            self.gff_file_suffix,
                            force)
        self.logger.log(
            Config.LOG_TASK,
            f'Running Prodigal {prodigal.version} to identify genes.')
        genome_dictionary = prodigal.run(genomes, tln_tables)

    # Annotate the genes against the TIGRFAM and Pfam databases.
    self.logger.log(Config.LOG_TASK,
                    'Identifying TIGRFAM protein families.')
    gene_files = [entry['aa_gene_path']
                  for entry in genome_dictionary.values()]
    tigr_search = TigrfamSearch(self.cpus,
                                self.tigrfam_hmms,
                                self.protein_file_suffix,
                                self.tigrfam_suffix,
                                self.tigrfam_top_hit_suffix,
                                self.checksum_suffix,
                                self.marker_gene_dir)
    tigr_search.run(gene_files)

    self.logger.log(Config.LOG_TASK, 'Identifying Pfam protein families.')
    pfam_search = PfamSearch(self.cpus,
                             self.pfam_hmm_dir,
                             self.protein_file_suffix,
                             self.pfam_suffix,
                             self.pfam_top_hit_suffix,
                             self.checksum_suffix,
                             self.marker_gene_dir)
    pfam_search.run(gene_files)

    # The HMMER version is read from the TIGRFAM search object.
    self.logger.info(
        f'Annotations done using HMMER {tigr_search.version}.')

    self.logger.log(Config.LOG_TASK,
                    'Summarising identified marker genes.')
    self._report_identified_marker_genes(genome_dictionary, out_dir,
                                         prefix, write_single_copy_genes)
def run(self, output_tree, tree_log, fasttree_log, prot_model, no_support,
        gamma, msa_file, cpus=1):
    """Run FastTree on an MSA and check that a tree was produced.

    A gzipped MSA (file name ending in ``.gz``) is decompressed into a
    temporary directory first. FastTree's standard output is written to
    ``output_tree`` and its standard error to ``fasttree_log``.

    Parameters
    ----------
    output_tree : str
        The path where the resulting tree should be written to.
    tree_log : str
        The path where the FastTree stats should be written to.
    fasttree_log : str
        The path where the FastTree log should be written to.
    prot_model : str
        Either 'JTT', 'WAG', or 'LG'.
    no_support : bool
        True if no support should be used, False otherwise.
    gamma : bool
        True if Gamma20 should be used, False otherwise.
    msa_file : str
        The path to the input MSA.
    cpus : int
        The maximum number of CPUs for FastTree to use.

    Raises
    ------
    FastTreeException
        If an error is encountered while running FastTree.
    """

    def fail(reason):
        # Log the generic failure message, then raise with the details.
        self.logger.error(
            'An error was encountered while running FastTree.')
        raise FastTreeException(reason)

    # The multi-threaded binary takes its thread count from the
    # OMP_NUM_THREADS environment variable.
    env = os.environ.copy()
    use_mp = cpus > 1
    cmd = 'FastTreeMP' if use_mp else 'FastTree'
    if use_mp:
        env['OMP_NUM_THREADS'] = str(cpus)
    check_dependencies([cmd])

    for target in (output_tree, tree_log, fasttree_log):
        make_sure_path_exists(os.path.dirname(target))

    # Setup arguments; any other model value adds no model flag.
    model_flags = {'WAG': '-wag', 'LG': '-lg'}
    args = [cmd]
    model_out = [prot_model]
    if prot_model in model_flags:
        args.append(model_flags[prot_model])
    if gamma:
        args.append('-gamma')
        model_out.append('+G')
    if no_support:
        args.append('-nosupport')
    else:
        model_out.append('SH support values')
    args.extend(['-log', tree_log])

    self.logger.info(
        'Inferring FastTree ({}) using a maximum of {} CPUs.'.format(
            ', '.join(model_out), cpus))

    # The temporary directory only receives a file when the input is
    # gzipped; it is removed once FastTree has finished.
    with tempfile.TemporaryDirectory(prefix='gtdbtk_') as tmp_dir:
        msa_path = msa_file
        if msa_file.endswith('.gz'):
            # Uncompress into the temporary directory, dropping '.gz'.
            msa_path = os.path.join(tmp_dir,
                                    os.path.basename(msa_file[0:-3]))
            with gzip.open(msa_file, 'rb') as f_in, \
                    open(msa_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        args.append(msa_path)

        with open(output_tree, 'w') as f_out_tree, \
                open(fasttree_log, 'w') as f_out_err:
            proc = subprocess.Popen(args,
                                    stdout=f_out_tree,
                                    stderr=f_out_err,
                                    env=env)
            proc.communicate()

    # Validate results
    if proc.returncode != 0:
        fail('FastTree returned a non-zero exit code.')
    if not os.path.isfile(output_tree):
        fail('Tree output file is missing: {}'.format(output_tree))
    elif os.path.getsize(output_tree) < 1:
        fail('Tree output file is empty: {}'.format(output_tree))
def parse_options(self, options):
    """Run the pipeline step(s) selected by ``options.subparser_name``.

    Parameters
    ----------
    options : argparse.Namespace
        The CLI arguments input by the user.

    Returns
    -------
    int
        Always 0 when the selected command completes.

    Raises
    ------
    GenomeMarkerSetUnknown
        If the marker set of the de novo workflow is not recognised.
    SystemExit
        If ``options.subparser_name`` is not a known command.
    """

    def out_path(template):
        # Fill a path template with the prefix and place it in out_dir.
        return os.path.join(options.out_dir,
                            template.format(prefix=options.prefix))

    def unknown_marker_set():
        # Log the problem, then raise with the offending marker set.
        self.logger.error('There was an error determining the marker set.')
        raise GenomeMarkerSetUnknown(
            'Unknown marker set: {}'.format(options.suffix))

    command = options.subparser_name

    # Commands that simply forward the options to the method that shares
    # the command's name.
    pass_through = ('identify', 'align', 'infer', 'classify', 'root',
                    'decorate', 'trim_msa', 'export_msa')

    if command == 'de_novo_wf':
        check_dependencies(['prodigal', 'hmmalign'])
        fasttree_bin = 'FastTreeMP' if options.cpus > 1 else 'FastTree'
        check_dependencies([fasttree_bin])

        self.identify(options)

        options.identify_dir = options.out_dir
        options.skip_trimming = False
        self.align(options)

        options.suffix = "bac120" if options.bac120_ms else "ar122"

        # Pick the MSA to infer from: the user-only MSA when the GTDB
        # references are skipped, the full MSA otherwise.
        if options.suffix == 'bac120':
            msa_template = (PATH_BAC120_USER_MSA if options.skip_gtdb_refs
                            else PATH_BAC120_MSA)
        elif options.suffix == 'ar122':
            msa_template = (PATH_AR122_USER_MSA if options.skip_gtdb_refs
                            else PATH_AR122_MSA)
        else:
            unknown_marker_set()
        options.msa_file = out_path(msa_template)
        self.infer(options)

        # Rooting: unrooted tree -> rooted tree.
        if options.suffix == 'bac120':
            options.input_tree = out_path(PATH_BAC120_UNROOTED_TREE)
            options.output_tree = out_path(PATH_BAC120_ROOTED_TREE)
        elif options.suffix == 'ar122':
            options.input_tree = out_path(PATH_AR122_UNROOTED_TREE)
            options.output_tree = out_path(PATH_AR122_ROOTED_TREE)
        else:
            unknown_marker_set()

        # NOTE(review): the tree paths are not updated between the two
        # calls below, so decorate() receives the same input/output
        # paths as root() - confirm this is intended.
        self.root(options)
        self.decorate(options)

    elif command == 'classify_wf':
        check_dependencies(
            ['prodigal', 'hmmalign', 'pplacer', 'guppy', 'fastANI'])
        self.identify(options)

        # The align and classify steps read their input from the
        # workflow's own output directory.
        options.identify_dir = options.out_dir
        options.align_dir = options.out_dir

        # Settings the workflow fixes before the align/classify steps.
        fixed_settings = (
            ('taxa_filter', None),
            ('custom_msa_filters', False),
            ('skip_trimming', False),
            ('min_consensus', None),
            ('min_perc_taxa', None),
            ('skip_gtdb_refs', False),
            ('cols_per_gene', None),
            ('max_consensus', None),
            ('rnd_seed', None),
        )
        for attr_name, attr_value in fixed_settings:
            setattr(options, attr_name, attr_value)

        self.align(options)
        self.classify(options)

    elif command == 'test':
        check_dependencies(
            ['prodigal', 'hmmalign', 'pplacer', 'guppy', 'fastANI'])
        self.run_test(options)

    elif command == 'check_install':
        self.check_install()

    elif command in pass_through:
        getattr(self, command)(options)

    else:
        self.logger.error('Unknown GTDB-Tk command: "' +
                          options.subparser_name + '"\n')
        sys.exit(1)

    return 0
def check_dependencies(no_mash):
    """Exits the system if the required programs are not on the path.

    fastANI is always required; Mash is required as well unless
    ``no_mash`` is true.
    """
    programs = ['fastANI'] if no_mash else ['fastANI', 'mash']
    # NOTE(review): presumably this resolves to the module-level
    # check_dependencies helper rather than to this function - confirm.
    check_dependencies(programs)
def identify(self, genomes, out_dir, prefix, force, genes):
    """Find marker genes in the genomes and report the hits.

    Genes are called with Prodigal unless ``genes`` is set, in which case
    the supplied files are used directly as protein files. The protein
    files are then searched with TIGRFAM and Pfam.

    Parameters
    ----------
    genomes : dict
        Genome IDs as the key, path to genome file as value.
    out_dir : str
        Path to the output directory.
    prefix : str
        Prefix to append to generated files.
    force : bool
        Overwrite any existing files.
    genes : bool
        True if the supplied genomes are called genes, False otherwise.

    Raises
    ------
    GTDBTkException
        If an exception is encountered during the identify step.
    """
    check_dependencies(['prodigal', 'hmmsearch'])

    start_msg = 'Identifying markers in %d genomes with %d threads.' % (
        len(genomes), self.cpus)
    self.logger.info(start_msg)

    marker_gene_dir = os.path.join(out_dir, DIR_MARKER_GENE)

    if genes:
        # The inputs are already protein files: describe them with the
        # same record layout that is read from the Prodigal results.
        self.logger.info(
            'Using supplied genomes as called genes, skipping Prodigal.')
        genome_dictionary = {
            gid: {
                'aa_gene_path': gpath,
                'translation_table_path': None,
                'nt_gene_path': None,
                'best_translation_table': 'user_supplied',
                'gff_path': None,
            }
            for gid, gpath in genomes.items()
        }
    else:
        self.logger.info("Running Prodigal to identify genes.")
        prodigal = Prodigal(self.cpus,
                            False,
                            marker_gene_dir,
                            self.protein_file_suffix,
                            self.nt_gene_file_suffix,
                            self.gff_file_suffix,
                            force)
        genome_dictionary = prodigal.run(genomes)

    # Each entry pairs a genome ID with the path of its protein file.
    gene_files = [(gid, entry['aa_gene_path'])
                  for gid, entry in genome_dictionary.items()]

    # Annotate the genes against the TIGRFAM and Pfam databases.
    self.logger.info("Identifying TIGRFAM protein families.")
    tigr_search = TigrfamSearch(self.cpus,
                                self.tigrfam_hmms,
                                self.protein_file_suffix,
                                self.tigrfam_suffix,
                                self.tigrfam_top_hit_suffix,
                                self.checksum_suffix,
                                marker_gene_dir)
    tigr_search.run(gene_files)

    self.logger.info("Identifying Pfam protein families.")
    pfam_search = PfamSearch(self.cpus,
                             self.pfam_hmm_dir,
                             self.protein_file_suffix,
                             self.pfam_suffix,
                             self.pfam_top_hit_suffix,
                             self.checksum_suffix,
                             marker_gene_dir)
    pfam_search.run(gene_files)

    self._report_identified_marker_genes(genome_dictionary, out_dir, prefix)
def run(self, output_tree, tree_log, fasttree_log, prot_model, no_support,
        no_gamma, msa_file, cpus=1):
    """Run FastTree on an MSA and check that a tree was produced.

    FastTree's standard output is written to ``output_tree`` and its
    standard error to ``fasttree_log``.

    Parameters
    ----------
    output_tree : str
        The path where the resulting tree should be written to.
    tree_log : str
        The path where the FastTree stats should be written to.
    fasttree_log : str
        The path where the FastTree log should be written to.
    prot_model : str
        Either 'JTT', 'WAG', or 'LG'.
    no_support : bool
        True if no support should be used, False otherwise.
    no_gamma : bool
        True if no gamma should be used, False otherwise.
    msa_file : str
        The path to the input MSA.
    cpus : int
        The maximum number of CPUs for FastTree to use.

    Raises
    ------
    FastTreeException
        If an error is encountered while running FastTree.
    """
    # The multi-threaded binary takes its thread count from the
    # OMP_NUM_THREADS environment variable.
    env = os.environ.copy()
    use_mp = cpus > 1
    cmd = 'FastTreeMP' if use_mp else 'FastTree'
    if use_mp:
        env['OMP_NUM_THREADS'] = str(cpus)
    check_dependencies([cmd])

    for target in (output_tree, tree_log, fasttree_log):
        make_sure_path_exists(os.path.dirname(target))

    # Setup arguments; any other model value adds no model flag.
    model_flags = {'WAG': '-wag', 'LG': '-lg'}
    args = [cmd]
    if prot_model in model_flags:
        args.append(model_flags[prot_model])
    if no_support:
        args.append('-nosupport')
    if not no_gamma:
        args.append('-gamma')
    args.extend(['-log', tree_log, msa_file])

    # Human-readable summary of the chosen settings for the log.
    gamma_label = ('-' if no_gamma else '+') + 'gamma'
    support_label = ('no' if no_support else '') + 'support'
    model_out = [prot_model, gamma_label, support_label]
    self.logger.info(
        'Inferring FastTree ({}) using a maximum of {} CPUs.'.format(
            ', '.join(model_out), cpus))
    self.logger.info('FastTree version: {}'.format(self.version))

    with open(output_tree, 'w') as f_out_tree, \
            open(fasttree_log, 'w') as f_out_err:
        proc = subprocess.Popen(args,
                                stdout=f_out_tree,
                                stderr=f_out_err,
                                env=env)
        proc.communicate()

    # Validate results: every failure is logged with the same generic
    # message and raised with a specific reason.
    failure = None
    if proc.returncode != 0:
        failure = 'FastTree returned a non-zero exit code.'
    elif not os.path.isfile(output_tree):
        failure = 'Tree output file is missing: {}'.format(output_tree)
    elif os.path.getsize(output_tree) < 1:
        failure = 'Tree output file is empty: {}'.format(output_tree)

    if failure is not None:
        self.logger.error(
            'An error was encountered while running FastTree.')
        raise FastTreeException(failure)