def __init__(self, genomes, root, prefix, cpus, k, s, mash_db=None):
    """Create a query file for a given set of genomes.

    Parameters
    ----------
    genomes : dict[str, str]
        The genomes to create a sketch file from (genome_id, fasta_path).
    root : str
        The directory where the sketch file will be saved.
    prefix : str
        The prefix to use for this file.
    cpus : int
        The maximum number of CPUs available for Mash.
    k : int
        The k-mer size.
    s : int
        Maximum number of non-redundant hashes.
    mash_db : Optional[str]
        The path to read/write the pre-computed Mash reference sketch database.
    """
    if mash_db is not None:
        # Strip trailing path separators (both POSIX and Windows) so the
        # ".msh" suffix and directory checks below behave as intended.
        export_msh = mash_db.rstrip('/\\')
        if not export_msh.endswith(".msh"):
            export_msh = export_msh + ".msh"
        if os.path.isdir(export_msh):
            raise GTDBTkExit(f"{export_msh} is a directory")
        make_sure_path_exists(os.path.dirname(export_msh))
        path = export_msh
    else:
        path = os.path.join(root, f'{prefix}.{self.name}')
    super().__init__(genomes, path, cpus, k, s)

def write(self):
    """Write the file to disk."""
    make_sure_path_exists(os.path.dirname(self.path))
    if len(self.data) > 0:
        with open(self.path, 'w') as fh:
            for gid, tax_str in sorted(self.data.items()):
                fh.write(f'{gid}\t{tax_str}\n')

def classify(self, options):
    """Determine taxonomic classification of genomes.

    Parameters
    ----------
    options : argparse.Namespace
        The CLI arguments input by the user.
    """
    # See ticket #255... perhaps an upstream version/OS issue?
    if not hasattr(options, 'pplacer_cpus'):
        options.pplacer_cpus = None

    check_dir_exists(options.align_dir)
    make_sure_path_exists(options.out_dir)
    if options.scratch_dir:
        make_sure_path_exists(options.scratch_dir)

    genomes, _ = self._genomes_to_process(options.genome_dir,
                                          options.batchfile,
                                          options.extension)

    classify = Classify(options.cpus, options.pplacer_cpus)
    classify.run(genomes,
                 options.align_dir,
                 options.out_dir,
                 options.prefix,
                 options.scratch_dir,
                 options.recalculate_red,
                 options.debug,
                 options.split_tree)

    self.logger.info('Done.')

def _write_individual_markers(self, user_msa, marker_set_id, marker_list,
                              out_dir, prefix):
    marker_dir = join(out_dir, DIR_ALIGN_MARKERS)
    make_sure_path_exists(marker_dir)

    markers, total_msa_len = self._parse_marker_info_file(marker_list)

    # Each marker occupies a contiguous block of columns in the concatenated
    # MSA, so slice each genome's alignment by the running marker offset.
    marker_to_msa = dict()
    offset = 0
    for marker_id, marker_desc, marker_len in sorted(markers, key=lambda x: x[0]):
        path_msa = os.path.join(marker_dir,
                                f'{prefix}.{marker_set_id}.{marker_id}.faa')
        marker_to_msa[path_msa] = defaultdict(str)
        for gid, msa in user_msa.items():
            marker_to_msa[path_msa][gid] += msa[offset:marker_len + offset]
        offset += marker_len

    if total_msa_len != offset:
        self.logger.warning('Internal error: the total MSA length is not '
                            'equal to the offset.')

    for path_marker, gid_dict in marker_to_msa.items():
        with open(path_marker, 'w') as fh:
            for genome_id, genome_msa in gid_dict.items():
                fh.write(f'>{genome_id}\n{genome_msa}\n')

    self.logger.debug(f'Successfully wrote all markers to: {marker_dir}')

def identify(self, options):
    """Identify marker genes in genomes.

    Parameters
    ----------
    options : argparse.Namespace
        The CLI arguments input by the user.
    """
    if options.genome_dir:
        check_dir_exists(options.genome_dir)
    if options.batchfile:
        check_file_exists(options.batchfile)
    make_sure_path_exists(options.out_dir)

    genomes, tln_tables = self._genomes_to_process(options.genome_dir,
                                                   options.batchfile,
                                                   options.extension)
    self.genomes_to_process = genomes

    markers = Markers(options.cpus)
    markers.identify(genomes,
                     tln_tables,
                     options.out_dir,
                     options.prefix,
                     options.force,
                     options.write_single_copy_genes)

    self.logger.info('Done.')

def classify(self, options):
    """Determine taxonomic classification of genomes.

    Parameters
    ----------
    options : argparse.Namespace
        The CLI arguments input by the user.
    """
    check_dir_exists(options.align_dir)
    make_sure_path_exists(options.out_dir)
    if options.scratch_dir:
        make_sure_path_exists(options.scratch_dir)

    genomes, _ = self._genomes_to_process(options.genome_dir,
                                          options.batchfile,
                                          options.extension)

    classify = Classify(options.cpus, options.pplacer_cpus, options.min_af)
    classify.run(genomes=genomes,
                 align_dir=options.align_dir,
                 out_dir=options.out_dir,
                 prefix=options.prefix,
                 scratch_dir=options.scratch_dir,
                 debugopt=options.debug,
                 fulltreeopt=options.full_tree,
                 recalculate_red=False)

    self.logger.info('Note that Tk classification mode is insufficient for '
                     'publication of new taxonomic designations. New '
                     'designations should be based on one or more de novo '
                     'trees, an example of which can be produced by Tk in '
                     'de novo mode.')
    self.logger.info('Done.')

def align(self, options):
    """Create MSA from marker genes.

    Parameters
    ----------
    options : argparse.Namespace
        The CLI arguments input by the user.
    """
    check_dir_exists(options.identify_dir)
    make_sure_path_exists(options.out_dir)

    markers = Markers(options.cpus, options.debug)
    markers.align(options.identify_dir,
                  options.skip_gtdb_refs,
                  options.taxa_filter,
                  options.min_perc_aa,
                  options.custom_msa_filters,
                  options.skip_trimming,
                  options.rnd_seed,
                  options.cols_per_gene,
                  options.min_consensus,
                  options.max_consensus,
                  options.min_perc_taxa,
                  options.out_dir,
                  options.prefix,
                  options.outgroup_taxon if hasattr(options, 'outgroup_taxon') else None,
                  self.genomes_to_process)

    self.logger.info('Done.')

def classify(self, options):
    """Determine taxonomic classification of genomes.

    Parameters
    ----------
    options : argparse.Namespace
        The CLI arguments input by the user.
    """
    check_dir_exists(options.align_dir)
    make_sure_path_exists(options.out_dir)
    if options.scratch_dir:
        make_sure_path_exists(options.scratch_dir)

    genomes, _ = self._genomes_to_process(options.genome_dir,
                                          options.batchfile,
                                          options.extension)

    classify = Classify(options.cpus, options.pplacer_cpus, options.min_af)
    classify.run(genomes,
                 options.align_dir,
                 options.out_dir,
                 options.prefix,
                 options.scratch_dir,
                 options.recalculate_red,
                 options.debug,
                 options.split_tree)

    self.logger.info('Done.')

def write(self):
    """Write the file to disk."""
    make_sure_path_exists(os.path.dirname(self.path))
    with open(self.path, 'w') as fh:
        fh.write('genome_id\tdomain\ttree_index\n')
        for seqid, infos in self.data.items():
            fh.write(f'{seqid}\t{self.domain}\t{infos}\n')

def write(self):
    """Write the file to disk."""
    make_sure_path_exists(os.path.dirname(self.path))
    with open(self.path, 'w') as fh:
        fh.write('Marker ID\tName\tDescription\tLength (bp)\n')
        for marker_id, marker_d in sorted(self.markers.items()):
            row = [marker_id,
                   marker_d['name'],
                   marker_d['desc'],
                   str(marker_d['size'])]
            fh.write('\t'.join(row) + '\n')

def write(self):
    """Write the file to disk."""
    make_sure_path_exists(os.path.dirname(self.path))
    with open(self.path, 'w') as fh:
        fh.write(f'Phylum\t{self.data["p__"]}\n')
        fh.write(f'Class\t{self.data["c__"]}\n')
        fh.write(f'Order\t{self.data["o__"]}\n')
        fh.write(f'Family\t{self.data["f__"]}\n')
        fh.write(f'Genus\t{self.data["g__"]}\n')

def write(self):
    """Write the file to disk."""
    make_sure_path_exists(os.path.dirname(self.path))
    with open(self.path, 'w') as fh:
        fh.write('\t'.join(self.get_col_order()[0]) + '\n')
        for gid, row in sorted(self.rows.items()):
            buf = list()
            for data in self.get_col_order(row)[1]:
                buf.append(self.none_value if data is None else str(data))
            fh.write('\t'.join(buf) + '\n')

def infer(self, options):
    """Infer a tree from a user specified MSA.

    Parameters
    ----------
    options : argparse.Namespace
        The CLI arguments input by the user.
    """
    check_file_exists(options.msa_file)
    make_sure_path_exists(options.out_dir)

    if options.cpus > 1:
        check_dependencies(['FastTreeMP'])
    else:
        check_dependencies(['FastTree'])

    if hasattr(options, 'suffix'):
        output_tree = os.path.join(options.out_dir,
                                   PATH_MARKER_UNROOTED_TREE.format(prefix=options.prefix,
                                                                    marker=options.suffix))
        tree_log = os.path.join(options.out_dir,
                                PATH_MARKER_TREE_LOG.format(prefix=options.prefix,
                                                            marker=options.suffix))
        fasttree_log = os.path.join(options.out_dir,
                                    PATH_MARKER_FASTTREE_LOG.format(prefix=options.prefix,
                                                                    marker=options.suffix))
    else:
        output_tree = os.path.join(options.out_dir,
                                   PATH_UNROOTED_TREE.format(prefix=options.prefix))
        tree_log = os.path.join(options.out_dir,
                                PATH_TREE_LOG.format(prefix=options.prefix))
        fasttree_log = os.path.join(options.out_dir,
                                    PATH_FASTTREE_LOG.format(prefix=options.prefix))

    fasttree = FastTree()
    fasttree.run(output_tree, tree_log, fasttree_log, options.prot_model,
                 options.no_support, options.no_gamma, options.msa_file,
                 options.cpus)
    self.logger.info(f'FastTree version: {fasttree.version}')

    if hasattr(options, 'subparser_name') and options.subparser_name == 'infer':
        symlink_f(output_tree[len(options.out_dir) + 1:],
                  os.path.join(options.out_dir,
                               os.path.basename(output_tree)))

    self.logger.info('Done.')

def export_msa(domain: Domain, output_file: str):
    """Exports the GTDB MSA to the specified path.

    :param domain: The domain used to determine the marker set.
    :param output_file: The path to write the MSA.
    """
    if domain is Domain.ARCHAEA:
        file_to_export = CONCAT_AR53
    elif domain is Domain.BACTERIA:
        file_to_export = CONCAT_BAC120
    else:
        raise GTDBTkExit(f'Unknown domain: "{domain}"')
    make_sure_path_exists(os.path.dirname(output_file))
    copyfile(file_to_export, output_file)

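# Usage sketch (not from the source): exporting the archaeal MSA using the
# function above. `Domain` and `export_msa` are the objects defined above;
# the output path is purely illustrative, and any missing parent directories
# are created by make_sure_path_exists before the reference file is copied.
export_msa(Domain.ARCHAEA, '/tmp/gtdbtk/ar53_msa.faa')
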
def export_msa(self, domain, output_file):
    """Export the MSA to a file, create the path if it doesn't exist.

    Parameters
    ----------
    domain : str
        The domain used to determine the marker set.
    output_file : str
        The path where the MSA should be exported.
    """
    file_to_export = Config.CONCAT_BAC120
    if domain == 'arc':
        file_to_export = Config.CONCAT_AR122
    make_sure_path_exists(os.path.dirname(output_file))
    copyfile(file_to_export, output_file)

def write(self):
    """Writes the file to disk and creates a checksum."""
    # Write the top hit file.
    make_sure_path_exists(os.path.dirname(self.path))
    header = ['Gene Id', 'Top hits (Family id,e-value,bitscore)']
    with open(self.path, 'w') as fh:
        fh.write('\t'.join(header) + '\n')
        for gene_id, hits in sorted(self.hits.items()):
            out_hits = list()
            for cur_hit in sorted(hits.values(), reverse=True):
                out_hits.append(cur_hit.hmm_str())
            concat_hits = ';'.join(out_hits)
            fh.write(f'{gene_id}\t{concat_hits}\n')

    # Write the checksum.
    with open(f'{self.path}{CHECKSUM_SUFFIX}', 'w') as fh:
        fh.write(sha256(self.path))

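# Verification sketch (not from the source): the checksum written above is
# what lets a later run decide that a result file is complete and can be
# skipped; the codebase's `file_has_checksum` (used by the worker threads
# below) presumably implements this pattern. A minimal version, assuming the
# same `sha256` helper and CHECKSUM_SUFFIX constant used above:
def file_has_valid_checksum(path):
    """Return True if `path` exists and matches its recorded checksum."""
    chk_path = f'{path}{CHECKSUM_SUFFIX}'
    if not os.path.isfile(path) or not os.path.isfile(chk_path):
        return False
    with open(chk_path) as fh:
        return fh.read().strip() == sha256(path)
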
def _workerThread(self, queueIn, queueOut):
    """Process each data item in parallel."""
    while True:
        queue_next = queueIn.get(block=True, timeout=None)
        if queue_next is None:
            break
        genome_id, gene_file = queue_next

        output_hit_file = os.path.join(self.output_dir, genome_id,
                                       '{}{}'.format(genome_id, self.tigrfam_suffix))
        output_tophit_file = os.path.join(self.output_dir, genome_id,
                                          '{}{}'.format(genome_id, self.tigrfam_top_hit_suffix))

        # Genome has already been processed
        if file_has_checksum(output_hit_file) and file_has_checksum(output_tophit_file):
            self.logger.info('Skipping result from a previous run: {}'.format(genome_id))

        # Process this genome
        else:
            genome_dir = os.path.join(self.output_dir, genome_id)
            hmmsearch_out = os.path.join(genome_dir, '{}_tigrfam.out'.format(genome_id))
            make_sure_path_exists(genome_dir)
            cmd = 'hmmsearch -o %s --tblout %s --noali --notextw --cut_nc --cpu %d %s %s' % (
                hmmsearch_out, output_hit_file, self.cpus_per_genome,
                self.tigrfam_hmms, gene_file)
            os.system(cmd)

            # calculate checksum
            checksum = sha256(output_hit_file)
            with open(output_hit_file + self.checksum_suffix, 'w') as fout:
                fout.write(checksum)

            # identify top hit for each gene
            self._topHit(output_hit_file)

        # allow results to be processed or written to file
        queueOut.put(gene_file)

def ani_rep(self, options):
    """Calculates ANI to GTDB representative genomes.

    Parameters
    ----------
    options : argparse.Namespace
        The CLI arguments input by the user.
    """
    make_sure_path_exists(options.out_dir)

    genomes, _ = self._genomes_to_process(options.genome_dir,
                                          options.batchfile,
                                          options.extension)

    ani_rep = ANIRep(options.cpus)
    ani_rep.run(genomes, options.no_mash, options.mash_d, options.out_dir,
                options.prefix, options.mash_k, options.mash_v, options.mash_s,
                options.min_af, options.mash_db)

    self.logger.info('Done.')

def write(self):
    """Write the summary file to disk."""
    make_sure_path_exists(os.path.dirname(self.path))
    header = ['name', 'number_unique_genes', 'number_multiple_genes',
              'number_multiple_unique_genes', 'number_missing_genes',
              'list_unique_genes', 'list_multiple_genes',
              'list_multiple_unique_genes', 'list_missing_genes']
    with open(self.path, 'w') as fh:
        fh.write('\t'.join(header) + '\n')
        for genome_id, marker_dict in sorted(self.genomes.items()):
            fh.write(f'{genome_id}\t'
                     f'{len(marker_dict["unq"])}\t'
                     f'{len(marker_dict["mul"])}\t'
                     f'{len(marker_dict["muq"])}\t'
                     f'{len(marker_dict["mis"])}\t'
                     f'{",".join(sorted(marker_dict["unq"]))}\t'
                     f'{",".join(sorted(marker_dict["mul"]))}\t'
                     f'{",".join(sorted(marker_dict["muq"]))}\t'
                     f'{",".join(sorted(marker_dict["mis"]))}\n')

def _workerThread(self, queueIn, queueOut):
    """Process each data item in parallel."""
    while True:
        queue_next = queueIn.get(block=True, timeout=None)
        if queue_next is None:
            break
        genome_id, gene_file = queue_next

        output_hit_file = os.path.join(self.output_dir, genome_id,
                                       '{}{}'.format(genome_id, self.tigrfam_suffix))
        output_tophit_file = os.path.join(self.output_dir, genome_id,
                                          '{}{}'.format(genome_id, self.tigrfam_top_hit_suffix))

        # Genome has already been processed
        if file_has_checksum(output_hit_file) and file_has_checksum(output_tophit_file):
            self.logger.info('Skipping result from a previous run: {}'.format(genome_id))

        # Process this genome
        else:
            genome_dir = os.path.join(self.output_dir, genome_id)
            hmmsearch_out = os.path.join(genome_dir, '{}_tigrfam.out'.format(genome_id))
            make_sure_path_exists(genome_dir)
            args = ['hmmsearch', '-o', hmmsearch_out, '--tblout', output_hit_file,
                    '--noali', '--notextw', '--cut_nc',
                    '--cpu', str(self.cpus_per_genome),
                    self.tigrfam_hmms, gene_file]
            proc = subprocess.Popen(args, stdout=subprocess.PIPE,
                                    stderr=subprocess.STDOUT)
            proc_out, proc_err = proc.communicate()
            if proc.returncode != 0:
                queueOut.put((proc.returncode, genome_id, proc_out, proc_err))
                sys.exit(proc.returncode)

            # calculate checksum
            checksum = sha256(output_hit_file)
            with open(output_hit_file + self.checksum_suffix, 'w') as fout:
                fout.write(checksum)

            # identify top hit for each gene
            self._topHit(output_hit_file)

        # allow results to be processed or written to file
        queueOut.put((0, genome_id, None, None))

def __init__(self, genomes, path, cpus, k, s):
    """Create a sketch file for a given set of genomes.

    Parameters
    ----------
    genomes : dict[str, str]
        The genomes to create a sketch file from (genome_id, fasta_path).
    path : str
        The path to write the sketch file to.
    cpus : int
        The maximum number of CPUs available for Mash.
    k : int
        The k-mer size.
    s : int
        Maximum number of non-redundant hashes.
    """
    self.logger = logging.getLogger('timestamp')
    self.genomes = genomes
    self.path = path
    self.data = dict()
    self.args = dict()
    self.cpus = cpus
    self.k = k
    self.s = s

    make_sure_path_exists(os.path.dirname(self.path))

    # Use the pre-existing sketch file, otherwise generate it.
    if os.path.isfile(self.path):
        self.logger.info(f'Loading data from existing Mash sketch file: {self.path}')
        self._load_metadata()
        if not self._is_consistent():
            raise GTDBTkExit(f'The sketch file is not consistent with the '
                             f'input genomes. Remove the existing sketch '
                             f'file or specify a new output directory.')
    else:
        self.logger.info(f'Creating Mash sketch file: {self.path}')
        self._generate()

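# Sketch (not from the source) of the consistency check the constructor
# relies on. It assumes `_load_metadata` populates self.data keyed by the
# genome paths recorded in the cached sketch; a cached sketch is only safe
# to reuse when it covers exactly the requested input genomes.
def _is_consistent(self):
    """Return True if the cached sketch matches the requested genomes."""
    return set(self.data.keys()) == set(self.genomes.values())
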
def run_test(self, options):
    """Run test of classify workflow.

    Parameters
    ----------
    options : argparse.Namespace
        The CLI arguments input by the user.

    Returns
    -------
    bool
        True if the test succeeds.

    Raises
    ------
    GTDBTkTestFailure
        If the test fails.
    """
    # Use a temporary directory if none is supplied.
    if options.out_dir:
        out_dir_fh = None
        make_sure_path_exists(options.out_dir)
    else:
        out_dir_fh = tempfile.TemporaryDirectory(prefix='gtdbtk_tmp_')
        options.out_dir = out_dir_fh.name
        self.logger.info('Using a temporary directory as out_dir was not specified.')

    try:
        output_dir = os.path.join(options.out_dir, 'output')
        genome_test_dir = os.path.join(options.out_dir, 'genomes')
        if os.path.exists(genome_test_dir):
            self.logger.error(f'Test directory {genome_test_dir} already exists.')
            self.logger.error('Test must be run in a new directory.')
            sys.exit(1)

        current_path = os.path.dirname(os.path.realpath(__file__))
        input_dir = os.path.join(current_path, 'tests', 'data', 'genomes')
        shutil.copytree(input_dir, genome_test_dir)

        args = ['gtdbtk', 'classify_wf', '--genome_dir', genome_test_dir,
                '--out_dir', output_dir, '--cpus', str(options.cpus)]
        self.logger.info('Command: {}'.format(' '.join(args)))

        # Pipe the output and write to disk.
        path_stdout = os.path.join(options.out_dir, 'test_execution.log')
        with subprocess.Popen(args, stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              encoding='utf-8') as proc:
            with open(path_stdout, 'w') as fh_stdout:
                bar_fmt = ' <TEST OUTPUT> '.center(22) + '{desc}'
                with tqdm(bar_format=bar_fmt, leave=False) as p_bar:
                    while True:
                        line = proc.stdout.readline()
                        if not line:
                            break
                        fh_stdout.write(f'{line}')
                        p_bar.set_description_str(line.strip())
            proc.wait()
            exit_code = proc.returncode

        summary_fh = ClassifySummaryFileAR122(output_dir, 'gtdbtk')
        if exit_code != 0:
            self.logger.error('The test returned a non-zero exit code.')
            self.logger.error('A detailed summary of the execution log can be '
                              'found here: {}'.format(path_stdout))
            self.logger.error('The test has failed.')
            sys.exit(1)
        if not os.path.exists(summary_fh.path):
            self.logger.error(f"{summary_fh.path} is missing.")
            self.logger.error('A detailed summary of the execution log can be '
                              'found here: {}'.format(path_stdout))
            self.logger.error('The test has failed.')
            sys.exit(1)
    finally:
        if out_dir_fh:
            out_dir_fh.cleanup()

    self.logger.info('Test has successfully finished.')
    return True

def run_test(self, options):
    """Run test of classify workflow.

    Parameters
    ----------
    options : argparse.Namespace
        The CLI arguments input by the user.

    Returns
    -------
    bool
        True if the test succeeds.

    Raises
    ------
    GTDBTkTestFailure
        If the test fails.
    """
    make_sure_path_exists(options.out_dir)

    output_dir = os.path.join(options.out_dir, 'output')
    genome_test_dir = os.path.join(options.out_dir, 'genomes')
    if os.path.exists(genome_test_dir):
        self.logger.error('Test directory {} already exists'.format(genome_test_dir))
        self.logger.error('Test must be run in a new directory.')
        sys.exit(1)

    current_path = os.path.dirname(os.path.realpath(__file__))
    input_dir = os.path.join(current_path, 'tests', 'data', 'genomes')
    shutil.copytree(input_dir, genome_test_dir)

    args = ['gtdbtk', 'classify_wf', '--genome_dir', genome_test_dir,
            '--out_dir', output_dir, '--cpus', str(options.cpus)]
    self.logger.info('Command: {}'.format(' '.join(args)))

    path_stdout = os.path.join(options.out_dir, 'test_execution.log')
    with open(path_stdout, 'w') as fh_stdout:
        proc = subprocess.Popen(args, stdout=fh_stdout,
                                stderr=subprocess.PIPE)
        proc.communicate()

    summary_file = os.path.join(output_dir,
                                PATH_AR122_SUMMARY_OUT.format(prefix='gtdbtk'))
    if proc.returncode != 0:
        self.logger.error('The test returned a non-zero exit code.')
        self.logger.error('A detailed summary of the execution log can be '
                          'found here: {}'.format(path_stdout))
        self.logger.error('The test has failed.')
        sys.exit(1)

    if not os.path.exists(summary_file):
        self.logger.error("{} is missing.".format(summary_file))
        self.logger.error('A detailed summary of the execution log can be '
                          'found here: {}'.format(path_stdout))
        self.logger.error('The test has failed.')
        sys.exit(1)

    self.logger.info('Test has successfully finished.')
    return True

def run(self, output_tree, tree_log, fasttree_log, prot_model, no_support,
        no_gamma, msa_file, cpus=1):
    """Run FastTree.

    Parameters
    ----------
    output_tree : str
        The path where the resulting tree should be written to.
    tree_log : str
        The path where the FastTree stats should be written to.
    fasttree_log : str
        The path where the FastTree log should be written to.
    prot_model : str
        Either 'JTT', 'WAG', or 'LG'.
    no_support : bool
        True if no support should be used, False otherwise.
    no_gamma : bool
        True if no gamma should be used, False otherwise.
    msa_file : str
        The path to the input MSA.
    cpus : int
        The maximum number of CPUs for FastTree to use.

    Raises
    ------
    FastTreeException
        If an error is encountered while running FastTree.
    """
    env = os.environ.copy()
    if cpus > 1:
        cmd = 'FastTreeMP'
        env['OMP_NUM_THREADS'] = str(cpus)
    else:
        cmd = 'FastTree'
    check_dependencies([cmd])

    make_sure_path_exists(os.path.dirname(output_tree))
    make_sure_path_exists(os.path.dirname(tree_log))
    make_sure_path_exists(os.path.dirname(fasttree_log))

    # Setup arguments
    args = [cmd]
    if prot_model == 'WAG':
        args.append('-wag')
    elif prot_model == 'LG':
        args.append('-lg')
    if no_support:
        args.append('-nosupport')
    if not no_gamma:
        args.append('-gamma')
    args.append('-log')
    args.append(tree_log)
    args.append(msa_file)

    model_out = [prot_model,
                 ('-' if no_gamma else '+') + 'gamma',
                 ('no' if no_support else '') + 'support']
    self.logger.info('Inferring FastTree ({}) using a maximum of {} CPUs.'.format(
        ', '.join(model_out), cpus))
    self.logger.info('FastTree version: {}'.format(self.version))

    with open(output_tree, 'w') as f_out_tree:
        with open(fasttree_log, 'w') as f_out_err:
            proc = subprocess.Popen(args, stdout=f_out_tree,
                                    stderr=f_out_err, env=env)
            proc.communicate()

    # Validate results
    if proc.returncode != 0:
        self.logger.error('An error was encountered while running FastTree.')
        raise FastTreeException('FastTree returned a non-zero exit code.')
    if not os.path.isfile(output_tree):
        self.logger.error('An error was encountered while running FastTree.')
        raise FastTreeException('Tree output file is missing: {}'.format(output_tree))
    elif os.path.getsize(output_tree) < 1:
        self.logger.error('An error was encountered while running FastTree.')
        raise FastTreeException('Tree output file is empty: {}'.format(output_tree))

def _report_identified_marker_genes(self, gene_dict, outdir, prefix,
                                    write_single_copy_genes):
    """Report statistics for identified marker genes."""

    # Summarise the copy number of each AR53 and BAC120 marker.
    tln_summary_file = TlnTableSummaryFile(outdir, prefix)
    ar53_copy_number_file = CopyNumberFileAR53(outdir, prefix)
    bac120_copy_number_file = CopyNumberFileBAC120(outdir, prefix)

    # Process each genome.
    for db_genome_id, info in tqdm_log(sorted(gene_dict.items()), unit='genome'):
        cur_marker_dir = os.path.join(outdir, DIR_MARKER_GENE)
        pfam_tophit_file = TopHitPfamFile(cur_marker_dir, db_genome_id)
        tigr_tophit_file = TopHitTigrFile(cur_marker_dir, db_genome_id)
        pfam_tophit_file.read()
        tigr_tophit_file.read()

        # Summarise each of the markers for this genome.
        ar53_copy_number_file.add_genome(db_genome_id, info.get("aa_gene_path"),
                                         pfam_tophit_file, tigr_tophit_file)
        bac120_copy_number_file.add_genome(db_genome_id, info.get("aa_gene_path"),
                                           pfam_tophit_file, tigr_tophit_file)

        # Write the best translation table to disk for this genome.
        tln_summary_file.add_genome(db_genome_id,
                                    info.get("best_translation_table"))

    # Write each of the summary files to disk.
    ar53_copy_number_file.write()
    bac120_copy_number_file.write()
    tln_summary_file.write()

    # Create a symlink to store the summary files in the root.
    # symlink_f(PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix),
    #           os.path.join(outdir, os.path.basename(PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix))))
    # symlink_f(PATH_AR53_MARKER_SUMMARY.format(prefix=prefix),
    #           os.path.join(outdir, os.path.basename(PATH_AR53_MARKER_SUMMARY.format(prefix=prefix))))
    # symlink_f(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix),
    #           os.path.join(outdir, os.path.basename(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix))))
    symlink_f(PATH_FAILS.format(prefix=prefix),
              os.path.join(outdir,
                           os.path.basename(PATH_FAILS.format(prefix=prefix))))

    # Write the single copy AR53/BAC120 FASTA files to disk.
    if write_single_copy_genes:
        fasta_dir = os.path.join(outdir, DIR_IDENTIFY_FASTA)
        self.logger.info(f'Writing unaligned single-copy genes to: {fasta_dir}')

        # Iterate over each domain.
        marker_doms = list()
        marker_doms.append((Config.AR53_MARKERS['PFAM'] +
                            Config.AR53_MARKERS['TIGRFAM'],
                            ar53_copy_number_file, 'ar53'))
        marker_doms.append((Config.BAC120_MARKERS['PFAM'] +
                            Config.BAC120_MARKERS['TIGRFAM'],
                            bac120_copy_number_file, 'bac120'))
        for marker_names, marker_file, marker_d in marker_doms:

            # Create the domain-specific subdirectory.
            fasta_d_dir = os.path.join(fasta_dir, marker_d)
            make_sure_path_exists(fasta_d_dir)

            # Iterate over each marker.
            for marker_name in marker_names:
                # str.rstrip strips a character set, not a pattern, so the
                # original rstrip(r'\.[HMMhmm]') could mangle marker names
                # ending in those letters; strip the file extension
                # explicitly instead (assumes `re` is imported).
                marker_name = re.sub(r'\.(hmm|HMM)$', '', marker_name)
                marker_path = os.path.join(fasta_d_dir, f'{marker_name}.fa')

                to_write = list()
                for genome_id in sorted(gene_dict):
                    unq_hits = marker_file.get_single_copy_hits(genome_id)
                    if marker_name in unq_hits:
                        to_write.append(f'>{genome_id}')
                        to_write.append(unq_hits[marker_name]['seq'])

                if len(to_write) > 0:
                    with open(marker_path, 'w') as fh:
                        fh.write('\n'.join(to_write))

def align(self, identify_dir, skip_gtdb_refs, taxa_filter, min_perc_aa,
          custom_msa_filters, skip_trimming, rnd_seed, cols_per_gene,
          min_consensus, max_consensus, min_per_taxa, out_dir, prefix,
          outgroup_taxon, genomes_to_process=None):
    """Align marker genes in genomes."""

    # Read genomes that failed the identify step so they can be skipped.
    failed_genomes_file = os.path.join(identify_dir,
                                       PATH_FAILS.format(prefix=prefix))
    if os.path.isfile(failed_genomes_file):
        with open(failed_genomes_file) as fgf:
            failed_genomes = [row.split()[0] for row in fgf]
    else:
        failed_genomes = list()

    # If the user is re-running this step, check if the identify step is consistent.
    genomic_files = self._path_to_identify_data(identify_dir,
                                                identify_dir != out_dir)
    if genomes_to_process is not None and len(genomic_files) != len(genomes_to_process):
        # Note: list.sort() returns None, so the original comparison of two
        # .sort() calls was always False; compare sorted copies instead.
        extra_genomes = sorted(set(genomic_files.keys()) -
                               set(genomes_to_process.keys()))
        if extra_genomes != sorted(failed_genomes):
            self.logger.error('{} are not present in the input list of '
                              'genomes to process.'.format(extra_genomes))
            raise InconsistentGenomeBatch(
                'You are attempting to run GTDB-Tk on a non-empty directory that contains extra '
                'genomes not present in your initial identify directory. Remove them, or run '
                'GTDB-Tk on a new directory.')

    # If this is being run as a part of classify_wf, copy the required files.
    if identify_dir != out_dir:
        identify_path = os.path.join(out_dir, DIR_IDENTIFY)
        make_sure_path_exists(identify_path)
        copy(CopyNumberFileBAC120(identify_dir, prefix).path, identify_path)
        copy(CopyNumberFileAR53(identify_dir, prefix).path, identify_path)
        copy(TlnTableSummaryFile(identify_dir, prefix).path, identify_path)

    # Create the align intermediate directory.
    make_sure_path_exists(os.path.join(out_dir, DIR_ALIGN_INTERMEDIATE))

    # Write out files with marker information.
    ar53_marker_info_file = MarkerInfoFileAR53(out_dir, prefix)
    ar53_marker_info_file.write()
    bac120_marker_info_file = MarkerInfoFileBAC120(out_dir, prefix)
    bac120_marker_info_file.write()

    # Determine what domain each genome belongs to.
    bac_gids, ar_gids, _bac_ar_diff = self.genome_domain(identify_dir, prefix)
    if len(bac_gids) + len(ar_gids) == 0:
        raise GTDBTkExit(f'Unable to assign a domain to any genomes, '
                         f'please check the identify marker summary file, '
                         f'and verify genome quality.')

    # # Create a temporary directory that will be used to generate each of the alignments.
    # with tempfile.TemporaryDirectory(prefix='gtdbtk_tmp_') as dir_tmp_arc, \
    #         tempfile.TemporaryDirectory(prefix='gtdbtk_tmp_') as dir_tmp_bac:
    #
    #     cur_gid_dict = {x: genomic_files[x] for x in ar_gids}
    #     self.logger.info(f'Collecting marker sequences from {len(cur_gid_dict):,} '
    #                      f'genomes identified as archaeal.')
    #     align.concat_single_copy_hits(dir_tmp_arc,
    #                                   cur_gid_dict,
    #                                   ar53_marker_info_file)

    self.logger.info(f'Aligning markers in {len(genomic_files):,} genomes '
                     f'with {self.cpus} CPUs.')
    dom_iter = ((bac_gids, Config.CONCAT_BAC120, Config.MASK_BAC120,
                 "bac120", 'bacterial', CopyNumberFileBAC120),
                (ar_gids, Config.CONCAT_AR53, Config.MASK_AR53,
                 "ar53", 'archaeal', CopyNumberFileAR53))
    gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)
    for gids, msa_file, mask_file, marker_set_id, domain_str, copy_number_f in dom_iter:

        # No genomes identified as this domain.
        if len(gids) == 0:
            continue

        self.logger.info(f'Processing {len(gids):,} genomes identified as {domain_str}.')
        if marker_set_id == 'bac120':
            marker_info_file = bac120_marker_info_file
            marker_filtered_genomes = os.path.join(
                out_dir, PATH_BAC120_FILTERED_GENOMES.format(prefix=prefix))
            marker_msa_path = os.path.join(
                out_dir, PATH_BAC120_MSA.format(prefix=prefix))
            marker_user_msa_path = os.path.join(
                out_dir, PATH_BAC120_USER_MSA.format(prefix=prefix))
        else:
            marker_info_file = ar53_marker_info_file
            marker_filtered_genomes = os.path.join(
                out_dir, PATH_AR53_FILTERED_GENOMES.format(prefix=prefix))
            marker_msa_path = os.path.join(
                out_dir, PATH_AR53_MSA.format(prefix=prefix))
            marker_user_msa_path = os.path.join(
                out_dir, PATH_AR53_USER_MSA.format(prefix=prefix))

        cur_genome_files = {gid: f for gid, f in genomic_files.items()
                            if gid in gids}

        if skip_gtdb_refs:
            gtdb_msa = {}
        else:
            gtdb_msa = self._msa_filter_by_taxa(msa_file, gtdb_taxonomy,
                                                taxa_filter, outgroup_taxon)
        gtdb_msa_mask = os.path.join(Config.MASK_DIR, mask_file)

        # Generate the user MSA.
        user_msa = align.align_marker_set(cur_genome_files, marker_info_file,
                                          copy_number_f, self.cpus)
        if len(user_msa) == 0:
            self.logger.warning(f'Identified {len(user_msa):,} single copy '
                                f'{domain_str} hits.')
            continue

        # Write the individual marker alignments to disk.
        if self.debug:
            self._write_individual_markers(user_msa, marker_set_id,
                                           marker_info_file.path,
                                           out_dir, prefix)

        # Filter columns without sufficient representation across taxa.
        if skip_trimming:
            self.logger.info('Skipping custom filtering and selection of columns.')
            pruned_seqs = {}
            trimmed_seqs = merge_two_dicts(gtdb_msa, user_msa)
        elif custom_msa_filters:
            aligned_genomes = merge_two_dicts(gtdb_msa, user_msa)
            self.logger.info('Performing custom filtering and selection of columns.')

            trim_msa = TrimMSA(cols_per_gene,
                               min_perc_aa / 100.0,
                               min_consensus / 100.0,
                               max_consensus / 100.0,
                               min_per_taxa / 100.0,
                               rnd_seed,
                               os.path.join(out_dir, f'filter_{marker_set_id}'))
            trimmed_seqs, pruned_seqs = trim_msa.trim(aligned_genomes,
                                                      marker_info_file.path)

            if trimmed_seqs:
                self.logger.info('Filtered MSA from {:,} to {:,} AAs.'.format(
                    len(list(aligned_genomes.values())[0]),
                    len(list(trimmed_seqs.values())[0])))

            self.logger.info('Filtered {:,} genomes with amino acids in '
                             '<{:.1f}% of columns in filtered MSA.'.format(
                                 len(pruned_seqs), min_perc_aa))
            filtered_user_genomes = set(pruned_seqs).intersection(user_msa)
            if len(filtered_user_genomes):
                self.logger.info(f'Filtered genomes include '
                                 f'{len(filtered_user_genomes)} user '
                                 f'submitted genomes.')
        else:
            self.logger.log(Config.LOG_TASK,
                            f'Masking columns of {domain_str} multiple '
                            f'sequence alignment using canonical mask.')
            trimmed_seqs, pruned_seqs = self._apply_mask(gtdb_msa,
                                                         user_msa,
                                                         gtdb_msa_mask,
                                                         min_perc_aa / 100.0)
            self.logger.info('Masked {} alignment from {:,} to {:,} AAs.'.format(
                domain_str,
                len(list(user_msa.values())[0]),
                len(list(trimmed_seqs.values())[0])))

            if min_perc_aa > 0:
                self.logger.info('{:,} {} user genomes have amino acids in '
                                 '<{:.1f}% of columns in filtered MSA.'.format(
                                     len(pruned_seqs), domain_str, min_perc_aa))

        # Write out filtering information.
        with open(marker_filtered_genomes, 'w') as fout:
            for pruned_seq_id, pruned_seq in pruned_seqs.items():
                if len(pruned_seq) == 0:
                    perc_alignment = 0
                else:
                    valid_bases = sum([1 for c in pruned_seq if c.isalpha()])
                    perc_alignment = valid_bases * 100.0 / len(pruned_seq)
                fout.write(f'{pruned_seq_id}\tInsufficient number of amino '
                           f'acids in MSA ({perc_alignment:.1f}%)\n')

        # Write out MSAs.
        if not skip_gtdb_refs:
            self.logger.info(f'Creating concatenated alignment for '
                             f'{len(trimmed_seqs):,} {domain_str} GTDB '
                             f'and user genomes.')
            self._write_msa(trimmed_seqs, marker_msa_path, gtdb_taxonomy,
                            zip_output=True)

        trimmed_user_msa = {k: v for k, v in trimmed_seqs.items()
                            if k in user_msa}
        if len(trimmed_user_msa) > 0:
            self.logger.info(f'Creating concatenated alignment for '
                             f'{len(trimmed_user_msa):,} {domain_str} '
                             f'user genomes.')
            self._write_msa(trimmed_user_msa, marker_user_msa_path,
                            gtdb_taxonomy, zip_output=True)
        else:
            self.logger.info(f'All {domain_str} user genomes have been filtered out.')

def run(self, output_tree, tree_log, fasttree_log, prot_model, no_support,
        gamma, msa_file, cpus=1):
    """Run FastTree.

    Parameters
    ----------
    output_tree : str
        The path where the resulting tree should be written to.
    tree_log : str
        The path where the FastTree stats should be written to.
    fasttree_log : str
        The path where the FastTree log should be written to.
    prot_model : str
        Either 'JTT', 'WAG', or 'LG'.
    no_support : bool
        True if no support should be used, False otherwise.
    gamma : bool
        True if Gamma20 should be used, False otherwise.
    msa_file : str
        The path to the input MSA.
    cpus : int
        The maximum number of CPUs for FastTree to use.

    Raises
    ------
    FastTreeException
        If an error is encountered while running FastTree.
    """
    env = os.environ.copy()
    if cpus > 1:
        cmd = 'FastTreeMP'
        env['OMP_NUM_THREADS'] = str(cpus)
    else:
        cmd = 'FastTree'
    check_dependencies([cmd])

    make_sure_path_exists(os.path.dirname(output_tree))
    make_sure_path_exists(os.path.dirname(tree_log))
    make_sure_path_exists(os.path.dirname(fasttree_log))

    # Setup arguments
    args = [cmd]
    model_out = [prot_model]
    if prot_model == 'WAG':
        args.append('-wag')
    elif prot_model == 'LG':
        args.append('-lg')
    if gamma:
        args.append('-gamma')
        model_out.append('+G')
    if no_support:
        args.append('-nosupport')
    else:
        model_out.append('SH support values')
    args.append('-log')
    args.append(tree_log)

    self.logger.info('Inferring FastTree ({}) using a maximum of {} CPUs.'.format(
        ', '.join(model_out), cpus))

    # Use a temporary directory if the input file is gzipped
    with tempfile.TemporaryDirectory(prefix='gtdbtk_') as tmp_dir:
        # Uncompress the archive if it's compressed
        if msa_file.endswith('.gz'):
            msa_path = os.path.join(tmp_dir, os.path.basename(msa_file[0:-3]))
            with gzip.open(msa_file, 'rb') as f_in:
                with open(msa_path, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
        else:
            msa_path = msa_file
        args.append(msa_path)

        with open(output_tree, 'w') as f_out_tree:
            with open(fasttree_log, 'w') as f_out_err:
                proc = subprocess.Popen(args, stdout=f_out_tree,
                                        stderr=f_out_err, env=env)
                proc.communicate()

    # Validate results
    if proc.returncode != 0:
        self.logger.error('An error was encountered while running FastTree.')
        raise FastTreeException('FastTree returned a non-zero exit code.')
    if not os.path.isfile(output_tree):
        self.logger.error('An error was encountered while running FastTree.')
        raise FastTreeException('Tree output file is missing: {}'.format(output_tree))
    elif os.path.getsize(output_tree) < 1:
        self.logger.error('An error was encountered while running FastTree.')
        raise FastTreeException('Tree output file is empty: {}'.format(output_tree))

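# Usage sketch (not from the source): invoking the newer run() signature
# above. FastTree/FastTreeMP must be on PATH; every file path here is
# illustrative, and parent directories are created by run() itself.
ft = FastTree()
ft.run(output_tree='/tmp/gtdbtk/infer/unrooted.tree',
       tree_log='/tmp/gtdbtk/infer/tree.log',
       fasttree_log='/tmp/gtdbtk/infer/fasttree.log',
       prot_model='WAG',     # 'JTT', 'WAG', or 'LG'
       no_support=False,     # keep SH-like support values
       gamma=True,           # rescale branch lengths with Gamma20
       msa_file='/tmp/gtdbtk/align/user_msa.fasta.gz',  # .gz is handled
       cpus=4)
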
def _calculate_fastani_distance(self, user_leaf, list_leaf, genomes):
    """Calculate the FastANI distance between all user genomes and the
    reference to classify them at the species level.

    Parameters
    ----------
    user_leaf : User genome node
    list_leaf : Dictionary of nodes including one or many user genomes and one reference genome.
    genomes : Dictionary of user genomes d[genome_id] -> FASTA file

    Returns
    -------
    dictionary
        dict_results[user_g]={ref_genome1:{"af":af,"ani":ani},ref_genome2:{"af":af,"ani":ani}}
    """
    try:
        self.tmp_output_dir = tempfile.mkdtemp()
        make_sure_path_exists(self.tmp_output_dir)
        dict_parser_distance = {}

        # We first calculate the user genome vs the reference.
        # Write the two input files for fastANI: the query file and
        # the reference file.
        path_user_list = os.path.join(self.tmp_output_dir, 'query_list.txt')
        with open(path_user_list, 'w') as f:
            f.write('{0}\n'.format(genomes.get(user_leaf.taxon.label)))

        leafnodes = list_leaf.get("potential_g")
        for node in leafnodes:
            leafnode = node[0]
            shortleaf = leafnode.taxon.label
            path_ref_list = os.path.join(self.tmp_output_dir,
                                         'ref_{}.txt'.format(shortleaf))
            if leafnode.taxon.label.startswith('GB_') or leafnode.taxon.label.startswith('RS_'):
                shortleaf = leafnode.taxon.label[3:]
            with open(path_ref_list, 'w') as f:
                f.write('{}\n'.format(os.path.join(Config.FASTANI_GENOMES,
                                                   shortleaf + Config.FASTANI_GENOMES_EXT)))

            # Run fastANI.
            if not os.path.isfile(path_user_list) or not os.path.isfile(path_ref_list):
                raise FastANIException

            path_results = os.path.join(self.tmp_output_dir,
                                        'results_{}_UvsRef.tab'.format(shortleaf))
            path_error = os.path.join(self.tmp_output_dir,
                                      'error_{}.log'.format(shortleaf))

            cmd = 'fastANI --ql {0} --rl {1} -o {2} > /dev/null 2>{3}'.format(
                path_user_list, path_ref_list, path_results, path_error)
            os.system(cmd)

            if not os.path.isfile(path_results):
                errstr = 'FastANI has stopped:\n'
                if os.path.isfile(path_error):
                    with open(path_error) as debug:
                        for line in debug:
                            finalline = line
                        errstr += finalline
                raise ValueError(errstr)

            dict_parser_distance = self._parse_fastani_results(path_results,
                                                               dict_parser_distance)

            # We then calculate the reference vs user genome.
            path_results_reverse = os.path.join(self.tmp_output_dir,
                                                'results_{}_RefvsU.tab'.format(shortleaf))
            cmd_reverse = 'fastANI --ql {0} --rl {1} -o {2} > /dev/null 2>{3}'.format(
                path_ref_list, path_user_list, path_results_reverse, path_error)
            os.system(cmd_reverse)

            if not os.path.isfile(path_results_reverse):
                errstr = 'FastANI has stopped:\n'
                if os.path.isfile(path_error):
                    with open(path_error) as debug:
                        for line in debug:
                            finalline = line
                        errstr += finalline
                raise ValueError(errstr)

            dict_parser_distance = self._parse_fastani_results_reverse(path_results_reverse,
                                                                       dict_parser_distance)

        shutil.rmtree(self.tmp_output_dir)
        return dict_parser_distance

    except ValueError as error:
        if os.path.exists(self.tmp_output_dir):
            shutil.rmtree(self.tmp_output_dir)
        raise error
    except Exception as error:
        if os.path.exists(self.tmp_output_dir):
            shutil.rmtree(self.tmp_output_dir)
        raise error

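# Sketch (not from the source) of what a parser like _parse_fastani_results
# might look like. fastANI writes one tab-separated row per query/reference
# pair: query path, reference path, ANI, count of bidirectional fragment
# mappings, and total query fragments; the alignment fraction (AF) is the
# ratio of the last two. The function name and dict layout below mirror the
# docstring above but are assumptions, not the actual implementation.
def _parse_fastani_results_sketch(path_results, dict_results):
    """Accumulate {query: {reference: {"ani": ..., "af": ...}}}."""
    with open(path_results) as fh:
        for line in fh:
            query, ref, ani, frag_mapped, frag_total = line.strip().split('\t')
            af = round(int(frag_mapped) / int(frag_total), 2)
            dict_results.setdefault(query, {})[ref] = {"ani": float(ani),
                                                       "af": af}
    return dict_results
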
def run(self, genome_files, output_dir, called_genes=False,
        translation_table=None, meta=False, closed_ends=False):
    """Call genes with Prodigal.

    Call genes with prodigal and store the results in the specified output
    directory. For convenience, the called_genes flag can be used to
    indicate genes have previously been called and simply need to be copied
    to the specified output directory.

    Parameters
    ----------
    genome_files : list of str
        Nucleotide fasta files to call genes on.
    output_dir : str
        Directory to store called genes.
    called_genes : boolean
        Flag indicating if genes are already called.
    translation_table : int
        Specifies desired translation table, use None to automatically
        select between tables 4 and 11.
    meta : boolean
        Flag indicating if prodigal should call genes with the metagenomics procedure.
    closed_ends : boolean
        If True, do not allow genes to run off edges (throws -c flag).

    Returns
    -------
    d[genome_id] -> namedtuple(best_translation_table coding_density_4 coding_density_11)
        Summary statistics of called genes for each genome.
    """
    self.called_genes = called_genes
    self.translation_table = translation_table
    self.meta = meta
    self.closed_ends = closed_ends
    self.output_dir = output_dir

    make_sure_path_exists(self.output_dir)

    progress_func = None
    if self.verbose:
        file_type = 'genomes'
        self.progress_str = ' Finished processing %d of %d (%.2f%%) genomes.'
        if meta:
            file_type = 'scaffolds'
            if len(genome_files):
                file_type = ntpath.basename(genome_files[0])
            self.progress_str = ' Finished processing %d of %d (%.2f%%) files.'

        self.logger.info('Identifying genes within %s: ' % file_type)
        progress_func = self._progress

    parallel = Parallel(self.cpus)
    summary_stats = parallel.run(self._producer, self._consumer,
                                 genome_files, progress_func)

    # An error was encountered during Prodigal processing, clean up.
    if not summary_stats:
        shutil.rmtree(self.output_dir)

    return summary_stats

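# Usage sketch (not from the source): driving the Prodigal wrapper above.
# The class name `Prodigal` and its constructor arguments are assumptions
# inferred from how run() uses self.cpus / self.verbose; the input and
# output paths are illustrative only.
prodigal = Prodigal(cpus=4, verbose=True)
stats = prodigal.run(genome_files=['/tmp/genomes/g1.fna', '/tmp/genomes/g2.fna'],
                     output_dir='/tmp/gtdbtk/prodigal',
                     translation_table=None)  # auto-select table 4 vs 11
for genome_id, s in stats.items():
    print(genome_id, s.best_translation_table)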