def run(self, gene_files):
    """Annotate genes with Pfam HMMs.

    Parameters
    ----------
    gene_files : iterable
        Gene files in FASTA format to process.
    """
    self.cpus_per_genome = max(1, int(self.threads / len(gene_files)))

    # Populate the worker queue with data to process.
    workerQueue = mp.Queue()
    writerQueue = mp.Queue()

    for f in gene_files:
        workerQueue.put(f)

    for _ in range(self.threads):
        workerQueue.put(None)

    # Pre-declare the processes so the except block can terminate them even
    # if an exception is raised before they are all created.
    workerProc = list()
    writeProc = None
    try:
        workerProc = [mp.Process(target=self._workerThread,
                                 args=(workerQueue, writerQueue))
                      for _ in range(self.threads)]
        writeProc = mp.Process(target=self._writerThread,
                               args=(len(gene_files), writerQueue))

        writeProc.start()
        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()
            if p.exitcode != 0:
                raise GTDBTkExit('An error was encountered while running hmmsearch.')

        writerQueue.put(None)
        writeProc.join()
    except Exception:
        for p in workerProc:
            p.terminate()
        if writeProc is not None:
            writeProc.terminate()
        raise
def _parse_result_queue(self, q_results, path_to_gid):
    """Creates the output dictionary given the results from FastANI.

    Parameters
    ----------
    q_results : Queue
        A multiprocessing queue containing raw results.
    path_to_gid : Dict[str, str]
        A dictionary mapping each file path to its genome id.

    Returns
    -------
    Dict[str, Dict[str, Dict[str, float]]]
        The ANI/AF of the query genome to all reference genomes.
    """
    out = dict()
    while True:
        q_item = q_results.get(block=True)
        if q_item is None:
            break

        job, result = q_item
        qry_gid = job['qry']

        for path_a, dict_b in result.items():
            for path_b, (ani, af) in dict_b.items():
                gid_a, gid_b = path_to_gid[path_a], path_to_gid[path_b]

                # This comparison was done in the forward direction.
                if gid_a == qry_gid:
                    ref_gid = gid_b
                # This comparison was done in the reverse direction.
                elif gid_b == qry_gid:
                    ref_gid = gid_a
                else:
                    raise GTDBTkExit('FastANI results are malformed.')

                # Take the largest ANI / AF from either pass.
                if qry_gid not in out:
                    out[qry_gid] = {ref_gid: {'ani': ani, 'af': af}}
                elif ref_gid not in out[qry_gid]:
                    out[qry_gid][ref_gid] = {'ani': ani, 'af': af}
                else:
                    out[qry_gid][ref_gid]['ani'] = max(out[qry_gid][ref_gid]['ani'], ani)
                    out[qry_gid][ref_gid]['af'] = max(out[qry_gid][ref_gid]['af'], af)

    return out
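# A toy illustration of the forward/reverse merge performed above: the larger
# ANI and AF seen in either direction is kept for a (query, reference) pair.
# The values below are illustrative only.
def _demo_merge_fwd_rev():
    out = {}
    for qry, ref, ani, af in [('q1', 'r1', 97.2, 0.81),   # forward pass
                              ('q1', 'r1', 97.5, 0.79)]:  # reverse pass
        rec = out.setdefault(qry, {}).setdefault(ref, {'ani': ani, 'af': af})
        rec['ani'] = max(rec['ani'], ani)
        rec['af'] = max(rec['af'], af)
    return out  # {'q1': {'r1': {'ani': 97.5, 'af': 0.81}}}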
def add_genome(self, genome_id: str, path_faa: str,
               pfam_th: TopHitPfamFile, tigr_th: TopHitTigrFile):
    """Process the top hit files for a genome and store the copy info."""
    if genome_id in self.genomes:
        self.logger.warning(f'Genome already exists in copy number file: {genome_id}')
    self.genomes[genome_id] = {'unq': dict(), 'mul': dict(),
                               'muq': dict(), 'mis': dict()}

    # Pointers to unique, multiple hit, multiple-unique, and missing markers.
    cur_unq = self.genomes[genome_id]['unq']
    cur_mul = self.genomes[genome_id]['mul']
    cur_muq = self.genomes[genome_id]['muq']
    cur_mis = self.genomes[genome_id]['mis']

    # Load genes from the Prodigal faa file, stripping any trailing stop codon.
    d_genes = read_fasta(path_faa, False)
    for seq_id, seq in d_genes.items():
        if seq.endswith('*'):
            d_genes[seq_id] = seq[:-1]

    # Create a dictionary of marker names -> hits.
    d_hmm_hits = self._merge_hit_files(pfam_th, tigr_th)

    # For each expected marker, determine which category it falls into.
    for marker_id in self.marker_names:

        # Marker is missing.
        if marker_id not in d_hmm_hits:
            cur_mis[marker_id] = None

        # Multiple hits to the same marker.
        elif len(d_hmm_hits[marker_id]) > 1:

            # If all sequences are identical, take the most significant hit.
            unq_seqs = {d_genes[x.gene_id] for x in d_hmm_hits[marker_id]}
            if len(unq_seqs) == 1:
                cur_top_hit = sorted(d_hmm_hits[marker_id], reverse=True)[0]
                cur_muq[marker_id] = {'hit': cur_top_hit,
                                      'seq': d_genes[cur_top_hit.gene_id]}

            # Marker maps to multiple genes.
            else:
                cur_mul[marker_id] = None

        # This was a unique hit.
        else:
            cur_hit = d_hmm_hits[marker_id][0]
            cur_unq[marker_id] = {'hit': cur_hit,
                                  'seq': d_genes[cur_hit.gene_id]}

    # Sanity check: confirm that the total number of markers matches.
    if len(self.marker_names) != len(cur_unq) + len(cur_mul) + len(cur_muq) + len(cur_mis):
        raise GTDBTkExit('The marker set is inconsistent, please report this issue.')
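# A toy sketch of the four marker categories assigned above (unq/mul/muq/mis).
# Real hits are Hit objects ordered by significance; plain gene ids stand in
# for sequences here, so this is illustrative only.
def _demo_marker_categories():
    marker_names = ['PF00001', 'PF00002', 'PF00003', 'PF00004']
    hits = {'PF00001': ['gene_1'],             # one hit -> unq
            'PF00002': ['gene_2', 'gene_3'],   # two genes -> mul
            'PF00003': ['gene_4', 'gene_4']}   # two hits, same seq -> muq
    categories = {}
    for m in marker_names:
        if m not in hits:
            categories[m] = 'mis'
        elif len(hits[m]) > 1:
            categories[m] = 'muq' if len(set(hits[m])) == 1 else 'mul'
        else:
            categories[m] = 'unq'
    return categories  # PF00004 -> 'mis'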
def _calculate(self):
    self.logger.info('Calculating Mash distances.')
    args = ['mash', 'dist',
            '-p', self.cpus,
            '-d', self.max_d,
            '-v', self.mash_v,
            self.ref_sketch.path,
            self.qry_sketch.path]
    args = list(map(str, args))
    with open(self.path, 'w') as f_out:
        proc = subprocess.Popen(args, stdout=f_out,
                                stderr=subprocess.PIPE, encoding='utf-8')
        _, stderr = proc.communicate()
    if proc.returncode != 0:
        # Use the stderr captured by communicate(); the pipe is closed once
        # communicate() returns, so proc.stderr.read() would fail here.
        raise GTDBTkExit(f'Error running Mash dist: {stderr}')
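# Why the stderr fix above matters: communicate() consumes and closes the
# pipe, so a later proc.stderr.read() raises ValueError (closed file). A
# minimal stand-in process demonstrates capturing stderr correctly.
import subprocess
import sys

def _demo_capture_stderr():
    proc = subprocess.Popen([sys.executable, '-c', 'import sys; sys.exit("boom")'],
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                            encoding='utf-8')
    _, stderr = proc.communicate()
    return proc.returncode, stderr.strip()  # (1, 'boom')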
def export_msa(domain: Domain, output_file: str):
    """Exports the GTDB MSA to the specified path.

    :param domain: The domain used to determine the marker set.
    :param output_file: The path to write the MSA.
    """
    if domain is Domain.ARCHAEA:
        file_to_export = CONCAT_AR53
    elif domain is Domain.BACTERIA:
        file_to_export = CONCAT_BAC120
    else:
        raise GTDBTkExit(f'Unknown domain: "{domain}"')

    make_sure_path_exists(os.path.dirname(output_file))
    copyfile(file_to_export, output_file)
def _get_ingroup_domain(self, ingroup_taxon) -> str:
    """Get the domain of the ingroup taxon."""

    # Read the GTDB taxonomy in order to establish the domain of the ingroup taxon.
    gtdb_taxonomy = Taxonomy().read(TAXONOMY_FILE)
    ingroup_domain = None
    for taxa in gtdb_taxonomy.values():
        if ingroup_taxon in taxa:
            ingroup_domain = taxa[Taxonomy.DOMAIN_IDX]
            break  # all genomes with this taxon share the same domain

    if ingroup_domain is None:
        raise GTDBTkExit(f'Ingroup taxon {ingroup_taxon} was not found in '
                         f'the GTDB taxonomy.')

    return ingroup_domain
def read(self):
    """Reads the marker names from disk. No sequence information!"""
    with open(self.path) as fh:
        fh.readline()  # skip the header line
        for line in fh.readlines():
            genome_id, n_unq, n_mul, n_muq, n_mis, unq, mul, muq, mis = line.split('\t')
            n_unq, n_mul, n_muq, n_mis = int(n_unq), int(n_mul), int(n_muq), int(n_mis)
            self.genomes[genome_id] = {'unq': {x: None for x in unq.strip().split(',') if len(x) > 0},
                                       'mul': {x: None for x in mul.strip().split(',') if len(x) > 0},
                                       'muq': {x: None for x in muq.strip().split(',') if len(x) > 0},
                                       'mis': {x: None for x in mis.strip().split(',') if len(x) > 0}}
            cur_dict = self.genomes[genome_id]
            if len(cur_dict['unq']) != n_unq or len(cur_dict['mul']) != n_mul or \
                    len(cur_dict['muq']) != n_muq or len(cur_dict['mis']) != n_mis:
                raise GTDBTkExit(f'The marker file is inconsistent: {self.path}')
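# A toy line in the copy-number summary format parsed above; the field values
# are illustrative only.
def _demo_parse_summary_line():
    line = 'genome_a\t2\t1\t0\t1\tPF00001,PF00002\tPF00003\t\tPF00004\n'
    gid, n_unq, n_mul, n_muq, n_mis, unq, mul, muq, mis = line.split('\t')
    unq_markers = {x: None for x in unq.strip().split(',') if len(x) > 0}
    return gid, int(n_unq), unq_markers  # ('genome_a', 2, {'PF00001': None, 'PF00002': None})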
def _load_metadata(self):
    """Loads the metadata from an existing Mash sketch file."""
    args = ['mash', 'info', '-t', self.path]
    proc = subprocess.Popen(args, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE, encoding='utf-8')
    stdout, stderr = proc.communicate()
    if proc.returncode != 0:
        raise GTDBTkExit(f'Error reading Mash sketch file {self.path}:\n{stderr}')

    for hashes, length, path in re.findall(r'(\d+)\t(\d+)\t(.+)\t.+\n', stdout):
        self.data[path] = (int(hashes), int(length))
def run_proc(self, q, r, ql, rl, output):
    """Runs the FastANI process.

    Parameters
    ----------
    q : str
        The path to the query genome.
    r : str
        The path to the reference genome.
    ql : str
        The path to the query list file.
    rl : str
        The path to the reference list file.
    output : str
        The path to the output file.

    Returns
    -------
    dict[str, dict[str, tuple[float, float]]]
        The ANI/AF of the query genomes to the reference genomes.
    """
    args = ['fastANI']
    if self.minFrac:
        args.extend(['--minFraction', '0'])
    if q is not None:
        args.extend(['-q', q])
    if r is not None:
        args.extend(['-r', r])
    if ql is not None:
        args.extend(['--ql', ql])
    if rl is not None:
        args.extend(['--rl', rl])
    args.extend(['-o', output])
    self.logger.debug(' '.join(args))

    proc = subprocess.Popen(args, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE, encoding='utf-8')
    stdout, stderr = proc.communicate()

    if proc.returncode != 0:
        self.logger.error('STDOUT:\n' + stdout)
        self.logger.error('STDERR:\n' + stderr)
        raise GTDBTkExit('FastANI returned a non-zero exit code.')

    # Parse the output file.
    return self.parse_output_file(output)
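# A sketch of how the flags above compose (paths are hypothetical): single
# comparisons use -q/-r, batch comparisons use --ql/--rl list files that
# contain one genome path per line.
def _demo_fastani_args():
    single = ['fastANI', '--minFraction', '0',
              '-q', 'query.fna', '-r', 'ref.fna', '-o', 'out.tsv']
    batch = ['fastANI', '--minFraction', '0',
             '--ql', 'query_list.txt', '--rl', 'ref_list.txt', '-o', 'out.tsv']
    return ' '.join(single), ' '.join(batch)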
def read(self, taxonomy_file: str, canonical_ids: bool = False) -> Dict[str, List[str]]:
    """Read a Greengenes-style taxonomy file.

    Expected format is:
        <id>\t<taxonomy string>

    where the taxonomy string has the format:
        d__; p__; c__; o__; f__; g__; s__

    Parameters
    ----------
    taxonomy_file : str
        Path to a Greengenes-style taxonomy file.
    canonical_ids : bool
        True to use the canonical ID format, False otherwise.
    """
    try:
        d = {}
        row = 0
        with open(taxonomy_file, 'r') as f:
            for row, line in enumerate(f.readlines()):
                line_split = line.split('\t')
                if len(line_split) != 2:
                    raise GTDBTkExit(f'Not a tab-separated line: {line}')

                unique_id = line_split[0]
                if canonical_ids:
                    unique_id = canonical_gid(unique_id)

                tax_str = line_split[1].rstrip()
                if tax_str[-1] == ';':
                    # Remove the trailing semicolon which sometimes appears
                    # in Greengenes-style taxonomy files.
                    tax_str = tax_str[0:-1]

                d[unique_id] = [x.strip() for x in tax_str.split(';')]
    except Exception:
        self.logger.error('Failed to parse taxonomy file on line %d' % (row + 1))
        raise

    return d
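# A toy Greengenes-style row parsed as above; the id and lineage are
# illustrative only.
def _demo_parse_taxonomy_line():
    line = 'G000123\td__Bacteria; p__Firmicutes; c__Bacilli; o__; f__; g__; s__;\n'
    unique_id, tax_str = line.split('\t')
    tax_str = tax_str.rstrip()
    if tax_str[-1] == ';':
        tax_str = tax_str[:-1]
    return unique_id, [x.strip() for x in tax_str.split(';')]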
def parse_output_file(self, path_out):
    """Parses the resulting output file from FastANI.

    Parameters
    ----------
    path_out : str
        The path where the output file resides.

    Returns
    -------
    dict[str, dict[str, tuple[float, float]]]
        The ANI/AF of the query genomes to the reference genomes.
    """
    out = dict()
    if os.path.isfile(path_out):
        with open(path_out, 'r') as fh:
            for line in fh.readlines():
                # FastANI version >= 1.1 uses tabs instead of spaces to
                # separate columns. Preferentially try splitting on tabs
                # first, in case there are spaces in the file path.
                try:
                    try:
                        path_qry, path_ref, ani, frac1, frac2 = line.strip().split('\t')
                    except ValueError:
                        path_qry, path_ref, ani, frac1, frac2 = line.strip().split(' ')
                        if not self._suppress_v1_warning:
                            self.logger.warning('You are using FastANI v1.0, it is '
                                                'recommended that you update to a '
                                                'more recent version.')
                            self._suppress_v1_warning = True
                    af = round(float(frac1) / float(frac2), 2)
                    if path_qry not in out:
                        out[path_qry] = {path_ref: (float(ani), af)}
                    elif path_ref not in out[path_qry]:
                        out[path_qry][path_ref] = (float(ani), af)
                except Exception as e:
                    self.logger.error(f'Exception reading FastANI output: {repr(e)}')
                    raise GTDBTkExit(f'Unable to read line "{line}"')
    return out
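# The tab-then-space fallback above, applied to toy lines (values are
# illustrative): FastANI >= 1.1 emits tabs, v1.0 emits spaces.
def _demo_parse_fastani_lines():
    parsed = []
    for line in ['q.fna\tr.fna\t97.6\t450\t500\n',
                 'q.fna r.fna 97.6 450 500\n']:
        try:
            qry, ref, ani, frac1, frac2 = line.strip().split('\t')
        except ValueError:
            qry, ref, ani, frac1, frac2 = line.strip().split(' ')
        parsed.append((qry, ref, float(ani), round(float(frac1) / float(frac2), 2)))
    return parsed  # both lines -> ('q.fna', 'r.fna', 97.6, 0.9)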
def _run(self, ref_msh, qry_msh, max_d):
    args = ['mash', 'dist', '-p', self.cpus, '-d', max_d, ref_msh, qry_msh]
    args = list(map(str, args))
    proc = subprocess.Popen(args, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE, encoding='utf-8')
    stdout, stderr = proc.communicate()
    if proc.returncode != 0:
        # Use the stderr captured by communicate(); the pipe cannot be
        # read again after communicate() returns.
        raise GTDBTkExit(f'Error running Mash dist: {stderr}')

    out = defaultdict(dict)
    for ref_id, qry_id, dist, p_val, shared_n, shared_d in re.findall(
            r'(.+)\t(.+)\t(.+)\t(.+)\t(\d+)\/(\d+)\n', stdout):
        dist, p_val = float(dist), float(p_val)
        shared_num, shared_den = int(shared_n), int(shared_d)
        out[qry_id][ref_id] = (dist, p_val, shared_num, shared_den)
    return out
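# The regex above applied to a toy `mash dist` line (values illustrative):
# columns are reference, query, distance, p-value, shared/total hashes.
import re

def _demo_parse_mash_dist():
    stdout = 'ref.fna\tqry.fna\t0.0222766\t0\t456/1000\n'
    hits = re.findall(r'(.+)\t(.+)\t(.+)\t(.+)\t(\d+)\/(\d+)\n', stdout)
    r, q, d, p, n, m = hits[0]
    return q, r, float(d), float(p), int(n), int(m)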
def _get_median_reds(self, ingroup_domain: str):
    """Get median RED values for the domain of the ingroup taxon."""

    # Get the median RED values for the domain.
    if ingroup_domain == 'd__Bacteria':
        median_reds = RED_DIST_BAC_DICT
    elif ingroup_domain == 'd__Archaea':
        median_reds = RED_DIST_ARC_DICT
    else:
        raise GTDBTkExit(f'Unrecognized GTDB domain: {ingroup_domain}.')

    # Report the median values.
    domain = ingroup_domain.replace('d__', '')
    self.logger.info('Median RED values for {}:'.format(domain))
    for idx, rank_prefix in enumerate(Taxonomy.rank_prefixes):
        if idx != Taxonomy.DOMAIN_IDX and idx != Taxonomy.SPECIES_IDX:
            self.logger.info('  {}\t{:.3f}'.format(
                Taxonomy.rank_labels[idx].capitalize(),
                median_reds[rank_prefix]))

    return median_reds
def __init__(self, genomes, path, cpus, k, s):
    """Create a sketch file for a given set of genomes.

    Parameters
    ----------
    genomes : dict[str, str]
        The genomes to create a sketch file from (genome_id, fasta_path).
    path : str
        The path to write the sketch file to.
    cpus : int
        The maximum number of CPUs available for Mash.
    k : int
        The k-mer size.
    s : int
        Maximum number of non-redundant hashes.
    """
    self.logger = logging.getLogger('timestamp')
    self.genomes = genomes
    self.path = path
    self.data = dict()
    self.args = dict()
    self.cpus = cpus
    self.k = k
    self.s = s
    make_sure_path_exists(os.path.dirname(self.path))

    # Use the pre-existing sketch file, otherwise generate it.
    if os.path.isfile(self.path):
        self.logger.info(f'Loading data from existing Mash sketch file: {self.path}')
        self._load_metadata()
        if not self._is_consistent():
            raise GTDBTkExit(f'The sketch file is not consistent with the '
                             f'input genomes. Remove the existing sketch '
                             f'file or specify a new output directory.')
    else:
        self.logger.info(f'Creating Mash sketch file: {self.path}')
        self._generate()
def _generate(self):
    """Generate a new sketch file."""
    with tempfile.TemporaryDirectory(prefix='gtdbtk_mash_tmp_') as dir_tmp:
        path_genomes = os.path.join(dir_tmp, 'genomes.txt')
        with open(path_genomes, 'w') as fh:
            for path in self.genomes.values():
                fh.write(f'{path}\n')

        args = ['mash', 'sketch', '-l', '-p', self.cpus, path_genomes,
                '-o', self.path, '-k', self.k, '-s', self.s]
        args = list(map(str, args))
        proc = subprocess.Popen(args, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE, encoding='utf-8')

        # Record the stderr lines as they stream past; they cannot be
        # re-read from the pipe once it has been consumed.
        stderr_lines = list()
        bar_fmt = '==> Sketched {n_fmt}/{total_fmt} ({percentage:.0f}%) ' \
                  'genomes [{rate_fmt}, ETA {remaining}]'
        with tqdm(bar_format=bar_fmt, total=len(self.genomes),
                  mininterval=1, smoothing=0.1) as p_bar:
            for line in iter(proc.stderr.readline, ''):
                stderr_lines.append(line)
                if line.startswith('Sketching'):
                    p_bar.update()
        proc.wait()

        if proc.returncode != 0 or not os.path.isfile(self.path):
            raise GTDBTkExit(f'Error generating Mash sketch: {"".join(stderr_lines)}')
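# A sketch of the stderr-driven progress bar used above, with a stand-in
# process that emits 'Sketching ...' lines (assumes tqdm is installed).
import subprocess
import sys
from tqdm import tqdm

def _demo_stderr_progress(n=3):
    code = "import sys\nfor i in range({}): print('Sketching', i, file=sys.stderr)".format(n)
    proc = subprocess.Popen([sys.executable, '-c', code],
                            stderr=subprocess.PIPE, encoding='utf-8')
    with tqdm(total=n) as p_bar:
        for line in iter(proc.stderr.readline, ''):
            if line.startswith('Sketching'):
                p_bar.update()
    proc.wait()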
def align(self, identify_dir, skip_gtdb_refs, taxa_filter, min_perc_aa,
          custom_msa_filters, skip_trimming, rnd_seed, cols_per_gene,
          min_consensus, max_consensus, min_per_taxa, out_dir, prefix,
          outgroup_taxon, genomes_to_process=None):
    """Align marker genes in genomes."""

    # Read genomes that failed the identify step so they can be skipped.
    failed_genomes_file = os.path.join(identify_dir, PATH_FAILS.format(prefix=prefix))
    if os.path.isfile(failed_genomes_file):
        with open(failed_genomes_file) as fgf:
            failed_genomes = [row.split()[0] for row in fgf]
    else:
        failed_genomes = list()

    # If the user is re-running this step, check that the identify step is consistent.
    genomic_files = self._path_to_identify_data(identify_dir, identify_dir != out_dir)
    if genomes_to_process is not None and len(genomic_files) != len(genomes_to_process):
        # Note: list.sort() returns None, so the original comparison of two
        # .sort() calls was always False; sorted() is required here.
        extra_genomes = set(genomic_files.keys()) - set(genomes_to_process.keys())
        if sorted(extra_genomes) != sorted(failed_genomes):
            self.logger.error('{} are not present in the input list of genomes to process.'
                              .format(list(extra_genomes)))
            raise InconsistentGenomeBatch(
                'You are attempting to run GTDB-Tk on a non-empty directory that contains extra '
                'genomes not present in your initial identify directory. Remove them, or run '
                'GTDB-Tk on a new directory.')

    # If this is being run as a part of classify_wf, copy the required files.
    if identify_dir != out_dir:
        identify_path = os.path.join(out_dir, DIR_IDENTIFY)
        make_sure_path_exists(identify_path)
        copy(CopyNumberFileBAC120(identify_dir, prefix).path, identify_path)
        copy(CopyNumberFileAR53(identify_dir, prefix).path, identify_path)
        copy(TlnTableSummaryFile(identify_dir, prefix).path, identify_path)

    # Create the align intermediate directory.
    make_sure_path_exists(os.path.join(out_dir, DIR_ALIGN_INTERMEDIATE))

    # Write out files with marker information.
    ar53_marker_info_file = MarkerInfoFileAR53(out_dir, prefix)
    ar53_marker_info_file.write()
    bac120_marker_info_file = MarkerInfoFileBAC120(out_dir, prefix)
    bac120_marker_info_file.write()

    # Determine what domain each genome belongs to.
    bac_gids, ar_gids, _bac_ar_diff = self.genome_domain(identify_dir, prefix)
    if len(bac_gids) + len(ar_gids) == 0:
        raise GTDBTkExit(f'Unable to assign a domain to any genomes, '
                         f'please check the identify marker summary file, '
                         f'and verify genome quality.')

    self.logger.info(f'Aligning markers in {len(genomic_files):,} genomes '
                     f'with {self.cpus} CPUs.')
    dom_iter = ((bac_gids, Config.CONCAT_BAC120, Config.MASK_BAC120,
                 "bac120", 'bacterial', CopyNumberFileBAC120),
                (ar_gids, Config.CONCAT_AR53, Config.MASK_AR53,
                 "ar53", 'archaeal', CopyNumberFileAR53))
    gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)
    for gids, msa_file, mask_file, marker_set_id, domain_str, copy_number_f in dom_iter:

        # No genomes were identified as this domain.
        if len(gids) == 0:
            continue

        self.logger.info(f'Processing {len(gids):,} genomes identified as {domain_str}.')
        if marker_set_id == 'bac120':
            marker_info_file = bac120_marker_info_file
            marker_filtered_genomes = os.path.join(
                out_dir, PATH_BAC120_FILTERED_GENOMES.format(prefix=prefix))
            marker_msa_path = os.path.join(
                out_dir, PATH_BAC120_MSA.format(prefix=prefix))
            marker_user_msa_path = os.path.join(
                out_dir, PATH_BAC120_USER_MSA.format(prefix=prefix))
        else:
            marker_info_file = ar53_marker_info_file
            marker_filtered_genomes = os.path.join(
                out_dir, PATH_AR53_FILTERED_GENOMES.format(prefix=prefix))
            marker_msa_path = os.path.join(
                out_dir, PATH_AR53_MSA.format(prefix=prefix))
            marker_user_msa_path = os.path.join(
                out_dir, PATH_AR53_USER_MSA.format(prefix=prefix))

        cur_genome_files = {gid: f for gid, f in genomic_files.items() if gid in gids}

        if skip_gtdb_refs:
            gtdb_msa = {}
        else:
            gtdb_msa = self._msa_filter_by_taxa(msa_file, gtdb_taxonomy,
                                                taxa_filter, outgroup_taxon)
        gtdb_msa_mask = os.path.join(Config.MASK_DIR, mask_file)

        # Generate the user MSA.
        user_msa = align.align_marker_set(cur_genome_files, marker_info_file,
                                          copy_number_f, self.cpus)
        if len(user_msa) == 0:
            self.logger.warning(f'No single-copy {domain_str} hits were '
                                f'identified, skipping this domain.')
            continue

        # Write the individual marker alignments to disk.
        if self.debug:
            self._write_individual_markers(user_msa, marker_set_id,
                                           marker_info_file.path, out_dir, prefix)

        # Filter columns without sufficient representation across taxa.
        if skip_trimming:
            self.logger.info('Skipping custom filtering and selection of columns.')
            pruned_seqs = {}
            trimmed_seqs = merge_two_dicts(gtdb_msa, user_msa)

        elif custom_msa_filters:
            aligned_genomes = merge_two_dicts(gtdb_msa, user_msa)
            self.logger.info('Performing custom filtering and selection of columns.')

            trim_msa = TrimMSA(cols_per_gene,
                               min_perc_aa / 100.0,
                               min_consensus / 100.0,
                               max_consensus / 100.0,
                               min_per_taxa / 100.0,
                               rnd_seed,
                               os.path.join(out_dir, f'filter_{marker_set_id}'))

            trimmed_seqs, pruned_seqs = trim_msa.trim(aligned_genomes,
                                                      marker_info_file.path)

            if trimmed_seqs:
                self.logger.info('Filtered MSA from {:,} to {:,} AAs.'.format(
                    len(list(aligned_genomes.values())[0]),
                    len(list(trimmed_seqs.values())[0])))

            self.logger.info('Filtered {:,} genomes with amino acids in <{:.1f}% '
                             'of columns in filtered MSA.'.format(
                                 len(pruned_seqs), min_perc_aa))

            filtered_user_genomes = set(pruned_seqs).intersection(user_msa)
            if len(filtered_user_genomes):
                self.logger.info(f'Filtered genomes include {len(filtered_user_genomes)} '
                                 f'user submitted genomes.')

        else:
            self.logger.log(Config.LOG_TASK,
                            f'Masking columns of {domain_str} multiple sequence '
                            f'alignment using canonical mask.')
            trimmed_seqs, pruned_seqs = self._apply_mask(gtdb_msa, user_msa,
                                                         gtdb_msa_mask,
                                                         min_perc_aa / 100.0)
            self.logger.info('Masked {} alignment from {:,} to {:,} AAs.'.format(
                domain_str,
                len(list(user_msa.values())[0]),
                len(list(trimmed_seqs.values())[0])))

            if min_perc_aa > 0:
                self.logger.info('{:,} {} user genomes have amino acids in <{:.1f}% '
                                 'of columns in filtered MSA.'.format(
                                     len(pruned_seqs), domain_str, min_perc_aa))

        # Write out the filtering information.
        with open(marker_filtered_genomes, 'w') as fout:
            for pruned_seq_id, pruned_seq in pruned_seqs.items():
                if len(pruned_seq) == 0:
                    perc_alignment = 0
                else:
                    valid_bases = sum([1 for c in pruned_seq if c.isalpha()])
                    perc_alignment = valid_bases * 100.0 / len(pruned_seq)
                fout.write(f'{pruned_seq_id}\tInsufficient number of amino acids '
                           f'in MSA ({perc_alignment:.1f}%)\n')

        # Write out the MSAs.
        if not skip_gtdb_refs:
            self.logger.info(f'Creating concatenated alignment for {len(trimmed_seqs):,} '
                             f'{domain_str} GTDB and user genomes.')
            self._write_msa(trimmed_seqs, marker_msa_path, gtdb_taxonomy,
                            zip_output=True)

        trimmed_user_msa = {k: v for k, v in trimmed_seqs.items() if k in user_msa}
        if len(trimmed_user_msa) > 0:
            self.logger.info(f'Creating concatenated alignment for {len(trimmed_user_msa):,} '
                             f'{domain_str} user genomes.')
            self._write_msa(trimmed_user_msa, marker_user_msa_path, gtdb_taxonomy,
                            zip_output=True)
        else:
            self.logger.info(f'All {domain_str} user genomes have been filtered out.')
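# A minimal sketch of canonical-mask trimming as applied above (a hypothetical
# helper, not the module's _apply_mask): keep only the columns flagged '1'.
def _demo_apply_mask():
    def apply_mask(seqs, mask):
        keep = [i for i, m in enumerate(mask) if m == '1']
        return {gid: ''.join(seq[i] for i in keep) for gid, seq in seqs.items()}

    msa = {'genome_a': 'MKL-V', 'genome_b': 'MKIAV'}
    return apply_mask(msa, '11011')  # {'genome_a': 'MK-V', 'genome_b': 'MKAV'}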
def run(self, gtdbtk_output_dir, ar122_metadata_file, bac120_metadata_file,
        output_file, gtdbtk_prefix):
    """Translate GTDB to NCBI classification via majority vote."""

    # Set the output directories.
    if not (ar122_metadata_file or bac120_metadata_file):
        raise GTDBTkExit('You must specify at least one of '
                         '--ar122_metadata_file or --bac120_metadata_file')
    ar_summary = os.path.join(gtdbtk_output_dir,
                              PATH_AR122_SUMMARY_OUT.format(prefix=gtdbtk_prefix)) \
        if ar122_metadata_file else None
    ar_tree = os.path.join(gtdbtk_output_dir,
                           PATH_AR122_TREE_FILE.format(prefix=gtdbtk_prefix)) \
        if ar122_metadata_file else None
    bac_summary = os.path.join(gtdbtk_output_dir,
                               PATH_BAC120_SUMMARY_OUT.format(prefix=gtdbtk_prefix)) \
        if bac120_metadata_file else None
    bac_tree = os.path.join(gtdbtk_output_dir,
                            PATH_BAC120_TREE_FILE.format(prefix=gtdbtk_prefix)) \
        if bac120_metadata_file else None

    # Create the output file directory.
    output_dir = os.path.dirname(output_file)
    if output_dir and not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # Get the NCBI taxonomy string for GTDB genomes and GTDB species clusters.
    ncbi_taxa = {}
    ncbi_lineages = {}
    gtdb_sp_clusters = defaultdict(set)
    for domain, metadata_file in [('archaeal', ar122_metadata_file),
                                  ('bacterial', bac120_metadata_file)]:
        # Only process those domains which have been provided as an input.
        if metadata_file is None:
            continue
        self._logger.info(f'Processing {domain} metadata file.')
        if not os.path.exists(metadata_file):
            raise GTDBTkExit(f'File does not exist: {metadata_file}')

        with open(metadata_file, 'r', encoding='utf-8') as f:
            header = f.readline().strip().split('\t')
            ncbi_taxonomy_index = header.index('ncbi_taxonomy')
            gtdb_genome_rep_index = header.index('gtdb_genome_representative')

            for line in f.readlines():
                line_split = line.strip().split('\t')
                gid = line_split[0]
                ncbi_taxonomy = line_split[ncbi_taxonomy_index]
                if ncbi_taxonomy and ncbi_taxonomy != 'none':
                    ncbi_taxa[gid] = [t.strip() for t in ncbi_taxonomy.split(';')]
                    for idx, taxon in enumerate(ncbi_taxa[gid]):
                        ncbi_lineages[taxon] = ncbi_taxa[gid][0:idx + 1]
                        if idx < 6:
                            ncbi_lineages[taxon] += self.rank_prefix[idx + 1:]

                rep_id = line_split[gtdb_genome_rep_index]
                gtdb_sp_clusters[rep_id].add(gid)

    self._logger.info(f'Read NCBI taxonomy for {len(ncbi_taxa):,} genomes.')
    self._logger.info(f'Identified {len(gtdb_sp_clusters):,} GTDB species clusters.')

    # Get the majority vote NCBI classification for each GTDB species cluster.
    ncbi_sp_classification = defaultdict(list)
    for rep_id, cluster_ids in gtdb_sp_clusters.items():
        for rank in range(6, -1, -1):
            ncbi_taxon_list = []
            for cid in cluster_ids:
                if cid in ncbi_taxa:
                    ncbi_taxon_list.append(ncbi_taxa[cid][rank])

            if len(ncbi_taxon_list) > 0:
                counter = Counter(ncbi_taxon_list)
                mc_taxon, mc_count = counter.most_common(1)[0]

                if mc_count >= 0.5 * len(ncbi_taxon_list) and len(mc_taxon) > 3:
                    ncbi_sp_classification[rep_id] = ncbi_lineages[mc_taxon]
                    break

        if rep_id in ncbi_sp_classification and ncbi_sp_classification[rep_id][0] == 'd__':
            raise GTDBTkExit(f'Majority vote domain is undefined for {rep_id}')

    self._logger.info(f'Identified {len(ncbi_sp_classification):,} GTDB '
                      f'species clusters with an NCBI classification.')

    # Convert the GTDB classifications to NCBI classifications.
    with open(output_file, 'w') as fout:
        fout.write('user_genome\tGTDB classification\tNCBI classification\n')

        for domain, summary_file, tree_file in [('Archaea', ar_summary, ar_tree),
                                                ('Bacteria', bac_summary, bac_tree)]:
            if summary_file is None or tree_file is None:
                self._logger.warning(f'{domain} has been skipped as no metadata '
                                     f'file was provided.')
                continue
            if not os.path.exists(summary_file):
                self._logger.warning(f'{domain} has been skipped as the summary '
                                     f'file does not exist: {summary_file}')
                continue
            if not os.path.exists(tree_file):
                self._logger.warning(f'{domain} has been skipped as the tree '
                                     f'file does not exist: {tree_file}')
                continue

            self._logger.info(f'Parsing {tree_file}')
            tree = dendropy.Tree.get_from_path(tree_file,
                                               schema='newick',
                                               rooting='force-rooted',
                                               preserve_underscores=True)

            # Map genome IDs to leaf nodes.
            leaf_node_map = {}
            for leaf in tree.leaf_node_iter():
                leaf_node_map[leaf.taxon.label] = leaf

            # Get the majority vote NCBI classification for each user genome.
            self._logger.info(f'Reclassifying genomes in {summary_file}')
            with open(summary_file) as f:
                header = f.readline().strip().split('\t')
                gtdb_classification_index = header.index('classification')

                for line in f:
                    line_split = line.strip().split('\t')
                    user_gid = line_split[0]
                    gtdb_taxonomy = line_split[gtdb_classification_index]
                    gtdb_taxa = [t.strip() for t in gtdb_taxonomy.split(';')]
                    gtdb_species = gtdb_taxa[6]

                    ncbi_rep_ids = self.get_ncbi_descendants(user_gid,
                                                             tree,
                                                             leaf_node_map,
                                                             ncbi_sp_classification)

                    # Take a majority vote over species with an NCBI
                    # classification, and limit the taxonomic resolution to
                    # the most-specific rank reported by GTDB-Tk.
                    ncbi_classification = []
                    for rank in range(6, -1, -1):
                        if len(gtdb_taxa[rank]) == 3:
                            continue

                        ncbi_taxon_list = []
                        for rep_id in ncbi_rep_ids:
                            ncbi_taxon_list.append(ncbi_sp_classification[rep_id][rank])
                        # Guard against genomes with no classified descendants.
                        if len(ncbi_taxon_list) == 0:
                            continue

                        counter = Counter(ncbi_taxon_list)
                        mc_taxon, mc_count = counter.most_common(1)[0]

                        if mc_count >= 0.5 * len(ncbi_taxon_list) and len(mc_taxon) > 3:
                            ncbi_classification = ncbi_lineages[mc_taxon]
                            break

                    # Write out the results.
                    fout.write('%s\t%s\t%s\n' % (user_gid,
                                                 gtdb_taxonomy,
                                                 ';'.join(ncbi_classification)))

    self._logger.info(f'Results have been written to: {output_file}')
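# The >=50% majority vote rule used above, on toy votes: the most common
# taxon wins only if it holds at least half the votes and is more than a
# bare rank prefix such as 'g__' (hence the len > 3 check).
from collections import Counter

def _demo_majority_vote():
    votes = ['g__Bacillus', 'g__Bacillus', 'g__Priestia']
    mc_taxon, mc_count = Counter(votes).most_common(1)[0]
    accepted = mc_count >= 0.5 * len(votes) and len(mc_taxon) > 3
    return mc_taxon, accepted  # ('g__Bacillus', True)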
def root_with_outgroup(self, input_tree: str, output_tree: str, outgroup: Set[str]):
    """Reroot the tree using the given outgroup.

    Parameters
    ----------
    input_tree
        File containing the Newick tree to be rerooted.
    output_tree
        Name of the file for the rerooted tree.
    outgroup
        Labels of taxa in the outgroup.
    """
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    outgroup = set(outgroup)
    outgroup_in_tree = set()
    ingroup_leaves = set()
    for n in tree.leaf_node_iter():
        if n.taxon.label in outgroup:
            outgroup_in_tree.add(n.taxon)
        else:
            ingroup_leaves.add(n)

    self.logger.info(f'Identified {len(outgroup_in_tree):,} outgroup taxa in the tree.')
    self.logger.info(f'Identified {len(ingroup_leaves):,} ingroup taxa in the tree.')

    if len(outgroup_in_tree) == 0:
        self.logger.error('No outgroup taxa identified in the tree.')
        raise GTDBTkExit('Tree was not rerooted.')

    # Since finding the MRCA is a rooted tree operation, the tree is first
    # rerooted on an ingroup taxon. This ensures the MRCA of the outgroup
    # can be identified so long as the outgroup is monophyletic. If the
    # outgroup is polyphyletic, trying to root on it is ill-defined. To try
    # and pick a "good" root for polyphyletic outgroups, random ingroup
    # taxa are selected until two of them give the same size lineage. This
    # will, likely, be the smallest bipartition possible for the given
    # outgroup, though this is not guaranteed.
    mrca = tree.mrca(taxa=outgroup_in_tree)
    mrca_leaves = len(mrca.leaf_nodes())
    while True:
        # Note: random.sample no longer accepts a set (Python >= 3.11), so
        # the leaves are materialised as a list first.
        rnd_ingroup = random.choice(list(ingroup_leaves))
        tree.reroot_at_edge(rnd_ingroup.edge,
                            length1=0.5 * rnd_ingroup.edge_length,
                            length2=0.5 * rnd_ingroup.edge_length)

        mrca = tree.mrca(taxa=outgroup_in_tree)
        if len(mrca.leaf_nodes()) == mrca_leaves:
            break
        mrca_leaves = len(mrca.leaf_nodes())

    if len(mrca.leaf_nodes()) != len(outgroup_in_tree):
        self.logger.info('Outgroup is not monophyletic. Tree will be '
                         'rerooted at the MRCA of the outgroup.')
        self.logger.info(f'The outgroup consisted of '
                         f'{len(outgroup_in_tree):,} taxa, while the MRCA '
                         f'has {len(mrca.leaf_nodes()):,} leaf nodes.')
        if len(mrca.leaf_nodes()) == len(tree.leaf_nodes()):
            self.logger.warning('The MRCA spans all taxa in the tree.')
            self.logger.warning('This indicates the selected outgroup is '
                                'likely polyphyletic in the current tree.')
            self.logger.warning('Polyphyletic outgroups are not suitable '
                                'for rooting. Try another outgroup.')
    else:
        self.logger.info('Outgroup is monophyletic.')

    if mrca.edge_length is None:
        self.logger.info('Tree appears to already be rooted on this outgroup.')
    else:
        self.logger.info('Rerooting tree.')
        tree.reroot_at_edge(mrca.edge,
                            length1=0.5 * mrca.edge_length,
                            length2=0.5 * mrca.edge_length)

    tree.write_to_path(output_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)
    self.logger.info(f'Rerooted tree written to: {output_tree}')
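# A small dendropy sketch of the monophyly test above: the outgroup is
# monophyletic when its MRCA spans exactly the outgroup leaves. The tree
# below is illustrative only.
import dendropy

def _demo_outgroup_mrca():
    tree = dendropy.Tree.get(data='((A:1,B:1):1,(C:1,(D:1,E:1):1):1);',
                             schema='newick')
    outgroup = {'D', 'E'}
    taxa = [t for t in tree.taxon_namespace if t.label in outgroup]
    mrca = tree.mrca(taxa=taxa)
    return len(mrca.leaf_nodes()) == len(taxa)  # True -> monophyletic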
def coding_density_11(self, v):
    try:
        self._coding_density_11 = float(v)
    except ValueError:
        raise GTDBTkExit(f'Invalid coding density: {v} for {self.path}')
def add_genome(self, gid: str, tree_index: str):
    """Adds the pplacer classification of a given genome."""
    if gid in self.data:
        raise GTDBTkExit(f'Attempting to add duplicate genome: {gid}')
    self.data[gid] = tree_index
def best_tln_table(self, v):
    try:
        self._best_tln_table = int(v)
    except ValueError:
        raise GTDBTkExit(f'Invalid translation table: {v} for {self.path}')
def add_row(self, row: PplacerHighClassifyRow):
    if row.gid in self.rows:
        raise GTDBTkExit(f'Attempting to add duplicate row: {row.gid}')
    self.rows[row.gid] = row
def _run_prodigal(self, genome_id, fasta_path):
    """Run Prodigal.

    Parameters
    ----------
    genome_id : str
        The unique identifier of the genome.
    fasta_path : str
        Path to the FASTA file to process.

    Returns
    -------
    The paths to the called gene files, or None if an error occurred and
    --force was specified.
    """
    # Set up the output files.
    output_dir = os.path.join(self.marker_gene_dir, genome_id)
    aa_gene_file = os.path.join(output_dir, genome_id + self.protein_file_suffix)
    nt_gene_file = None
    gff_file = None
    translation_table_file = None
    if not self.proteins:
        nt_gene_file = os.path.join(output_dir, genome_id + self.nt_gene_file_suffix)
        gff_file = os.path.join(output_dir, genome_id + self.gff_file_suffix)
        translation_table_file = os.path.join(output_dir,
                                              'prodigal' + TRANSLATION_TABLE_SUFFIX)

    # Return early if the files from a previous run are already complete.
    if not self.proteins and file_has_checksum(aa_gene_file) and file_has_checksum(nt_gene_file) \
            and file_has_checksum(gff_file) and file_has_checksum(translation_table_file):
        best_tln_table = -1
        with open(translation_table_file, 'r') as tln_f:
            for line in tln_f.readlines():
                cols = line.strip().split('\t')
                if cols[0] == 'best_translation_table':
                    best_tln_table = int(cols[1])
                    break

        if best_tln_table > 0:
            self.logger.info('Skipping result from a previous run: {}'.format(genome_id))
            return aa_gene_file, nt_gene_file, gff_file, translation_table_file, best_tln_table

    # Did not meet the conditions to skip processing this genome, call genes.
    prodigal = BioLibProdigal(1, False)
    summary_stats = prodigal.run([fasta_path], output_dir, called_genes=self.proteins)

    # An error occurred in BioLib Prodigal.
    if not summary_stats:
        if self.force:
            return None
        else:
            raise GTDBTkExit("Prodigal failed to call genes for: {} "
                             "(to skip these genomes, re-run with --force)".format(genome_id))

    summary_stats = list(summary_stats.values())[0]

    # Rename the output files to adhere to GTDB conventions and the
    # desired genome ID.
    shutil.move(summary_stats.aa_gene_file, aa_gene_file)
    with open(aa_gene_file + CHECKSUM_SUFFIX, 'w') as f:
        f.write(sha256(aa_gene_file))

    if not self.proteins:
        shutil.move(summary_stats.nt_gene_file, nt_gene_file)
        with open(nt_gene_file + CHECKSUM_SUFFIX, 'w') as f:
            f.write(sha256(nt_gene_file))

        shutil.move(summary_stats.gff_file, gff_file)
        with open(gff_file + CHECKSUM_SUFFIX, 'w') as f:
            f.write(sha256(gff_file))

        # Save the translation table information.
        translation_table_file = os.path.join(output_dir,
                                              'prodigal_translation_table.tsv')
        with open(translation_table_file, 'w') as fout:
            fout.write('%s\t%d\n' % ('best_translation_table',
                                     summary_stats.best_translation_table))
            fout.write('%s\t%.2f\n' % ('coding_density_4',
                                       summary_stats.coding_density_4 * 100))
            fout.write('%s\t%.2f\n' % ('coding_density_11',
                                       summary_stats.coding_density_11 * 100))
            fout.write('%s\t%.2f\n' % ('probability_4',
                                       summary_stats.probability_4 * 100))
            fout.write('%s\t%.2f\n' % ('probability_11',
                                       summary_stats.probability_11 * 100))
        with open(translation_table_file + CHECKSUM_SUFFIX, 'w') as f:
            f.write(sha256(translation_table_file))

    return aa_gene_file, nt_gene_file, gff_file, translation_table_file, summary_stats.best_translation_table
def calculate_patristic_distance(qry_node, ref_nodes, tt=None):
    """Computes the patristic distance from the query node to all reference
    nodes. Note that all nodes must be leaf nodes in the same tree.

    Parameters
    ----------
    qry_node : dendropy.Node
        The query taxon node from which the distance to all ref nodes will be found.
    ref_nodes : List[dendropy.Node]
        A list of reference nodes that the qry_node will be calculated to.
    tt : Optional[TreeTraversal]
        A TreeTraversal index; if absent, a new one will be created.

    Returns
    -------
    Dict[dendropy.Node, float]
        A dictionary keyed by each reference taxon, valued by patristic distance.
    """
    tt = tt or TreeTraversal()

    # Iterate over each of the ref_nodes to find the MRCA with qry_node.
    d_ref_to_mrca = dict()
    for ref_node in ref_nodes:
        cur_dist_to_mrca = ref_node.edge_length

        # Go up the tree until the descendants include qry_node.
        parent_node = ref_node.parent_node
        while parent_node is not None:
            leaf_nodes = tt.get_leaf_nodes(parent_node)

            # Found the MRCA node.
            if qry_node in leaf_nodes:
                d_ref_to_mrca[ref_node] = (parent_node, cur_dist_to_mrca)
                break

            # Keep going up.
            cur_dist_to_mrca += parent_node.edge_length
            parent_node = parent_node.parent_node

        # The loop reached the root without breaking, i.e. no MRCA was found.
        else:
            raise GTDBTkExit(f'Unable to find MRCA: {qry_node.taxon.label} / '
                             f'{ref_node.taxon.label}')

    # Compute the distance from qry_node to each of the MRCAs.
    out = dict()
    for ref_node, (mrca_node, ref_mrca_dist) in d_ref_to_mrca.items():

        # Go up the tree until the MRCA is found again.
        cur_dist_to_mrca = qry_node.edge_length
        cur_node = qry_node.parent_node
        while cur_node is not None:

            # Found the MRCA node.
            if cur_node == mrca_node:
                out[ref_node] = cur_dist_to_mrca + ref_mrca_dist
                break

            # Keep going up.
            cur_dist_to_mrca += cur_node.edge_length
            cur_node = cur_node.parent_node

        # Impossible case, but raise an exception anyway.
        else:
            raise GTDBTkExit(f'Tree is inconsistent: {qry_node.taxon.label} / '
                             f'{ref_node.taxon.label}')

    return out
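# A dendropy cross-check for the manual walk above: the library's
# phylogenetic distance matrix computes the same patristic distances.
# The toy tree is illustrative only.
import dendropy

def _demo_patristic_distance():
    tree = dendropy.Tree.get(data='((A:1,B:2):1,C:4);', schema='newick')
    pdm = tree.phylogenetic_distance_matrix()
    a = tree.taxon_namespace.get_taxon('A')
    c = tree.taxon_namespace.get_taxon('C')
    return pdm.patristic_distance(a, c)  # 1 + 1 + 4 = 6.0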
def add_row(self, row: ClassifySummaryFileRow):
    if row.gid in self.rows:
        raise GTDBTkExit(f'Attempting to add duplicate row: {row.gid}')
    self.rows[row.gid] = row
def add_genome(self, genome_id: str, tln_table: int):
    """Record a translation table for a genome."""
    if genome_id in self.genomes:
        raise GTDBTkExit(f'Genome already exists in summary file: {genome_id}')
    self.genomes[genome_id] = tln_table
def _classify_on_internal_branch(self, leaf, child_taxons, current_rel_list,
                                 child_rel_dist, node_in_ref_tree, parent_rank,
                                 child_rk, taxa_str, taxa_str_terminal,
                                 is_on_terminal_branch, red_bac_dict):
    """Classification on an internal node is very similar to the 'normal'
    classification."""

    # Persist descendant information for efficient traversal.
    tt = TreeTraversal()

    closest_rank = None
    if len(child_taxons) == 0:
        list_leaves = [childnd.taxon.label.replace("'", '')
                       for childnd in tt.get_leaf_nodes(node_in_ref_tree)
                       if childnd.taxon.label in self.reference_ids]
        if len(list_leaves) != 1:
            list_subrank = []
            for leaf_subrank in list_leaves:
                list_subrank.append(self.gtdb_taxonomy.get(leaf_subrank)[
                    self.order_rank.index(parent_rank) + 1])
            if len(set(list_subrank)) == 1:
                # Log the offending placement rather than printing to stdout.
                self.logger.error(f'Leaf: {leaf.taxon.label}; '
                                  f'leaves: {list_leaves}; '
                                  f'sub-ranks: {list_subrank}')
                raise GTDBTkExit('There should be only one leaf.')
            else:
                closest_rank = parent_rank
                detection = "taxonomic classification fully defined by topology"

        # We remove the species name.
        list_leaf_ranks = self.gtdb_taxonomy.get(list_leaves[0])[
            self.order_rank.index(child_rk):-1]

        for leaf_taxon in reversed(list_leaf_ranks):
            leaf_taxon_rank = leaf_taxon[:3]
            if leaf_taxon == list_leaf_ranks[0]:
                if abs(current_rel_list - red_bac_dict.get(leaf_taxon_rank)) < \
                        abs(current_rel_list - red_bac_dict.get(parent_rank[:3])):
                    closest_rank = leaf_taxon
                    break
            else:
                pchildrank = list_leaf_ranks[list_leaf_ranks.index(leaf_taxon) - 1]
                if abs(current_rel_list - red_bac_dict.get(leaf_taxon_rank)) < \
                        abs(current_rel_list - red_bac_dict.get(pchildrank[:3])):
                    closest_rank = leaf_taxon
                    break

        if closest_rank is None:
            closest_rank = parent_rank

    # If there are multiple ranks on the child node (e.g., a genome between
    # p__Nitrospirae and c__Nitrospiria;o__Nitrospirales;f__Nitrospiraceae),
    # we loop through the list of ranks from f__ up to c__.
    for child_taxon in reversed(child_taxons):
        child_taxon_rank = child_taxon[:3]
        if child_taxon == child_taxons[0]:
            if (abs(current_rel_list - red_bac_dict.get(child_taxon_rank)) <
                    abs(child_rel_dist - red_bac_dict.get(child_taxon_rank)) and
                    abs(current_rel_list - red_bac_dict.get(child_taxon_rank)) <
                    abs(current_rel_list - red_bac_dict.get(parent_rank[:3]))):
                closest_rank = child_taxon
            elif closest_rank is None:
                closest_rank = parent_rank
        else:
            pchildrank = child_taxons[child_taxons.index(child_taxon) - 1]
            # Note: the original first comparison here tested an expression
            # against itself (always False) and left pchildrank unused;
            # comparing against the previous child rank appears to be the
            # intent.
            if (abs(current_rel_list - red_bac_dict.get(child_taxon_rank)) <
                    abs(current_rel_list - red_bac_dict.get(pchildrank[:3])) and
                    abs(current_rel_list - red_bac_dict.get(child_taxon_rank)) <
                    abs(child_rel_dist - red_bac_dict.get(child_taxon_rank))):
                closest_rank = child_taxon
                break

    if closest_rank is not None:
        # Once the closest rank has been found, locate it in the GTDB
        # taxonomy and take the higher ranks from there.
        for k, v in self.gtdb_taxonomy.items():
            if closest_rank in v:
                taxa_str = ';'.join(v[1:v.index(closest_rank) + 1])
                # All classifications should be at least to the order level
                # if a genome is placed on an internal branch with only one
                # order under it.
                if any(x.startswith('o__') for x in child_taxons) \
                        and self.order_rank.index(closest_rank[0:3]) < self.order_rank.index('o__') \
                        and ('o__' in taxa_str_terminal.split(';') or not is_on_terminal_branch):
                    taxa_str_terminal = ';'.join(v[1:self.order_rank.index('o__') + 1])
                break

    return taxa_str, taxa_str_terminal
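# A toy sketch of the RED comparison driving the rank choice above: pick
# whichever rank's median RED is closest to the placement's RED. The values
# below are illustrative, not the shipped RED_DIST dictionaries.
def _demo_closest_red_rank():
    median_red = {'p__': 0.32, 'c__': 0.46, 'o__': 0.61, 'f__': 0.75, 'g__': 0.90}
    placement_red = 0.58
    return min(median_red, key=lambda r: abs(placement_red - median_red[r]))  # 'o__'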
def run(self, dict_compare, dict_paths):
    """Runs FastANI in batch mode.

    Parameters
    ----------
    dict_compare : dict[str, set[str]]
        All query to reference comparisons to be made.
    dict_paths : dict[str, str]
        The path for each genome id being compared.

    Returns
    -------
    dict[str, dict[str, dict[str, float]]]
        A dictionary containing the ANI and AF for each comparison.
    """
    # Create the multiprocessing items.
    manager = mp.Manager()
    q_worker = manager.Queue()
    q_writer = manager.Queue()
    q_results = manager.Queue()

    # Populate the queue of comparisons in the forward and reverse directions.
    n_total = 0
    if self.force_single:
        for qry_gid, ref_set in dict_compare.items():
            qry_path = dict_paths[qry_gid]
            for ref_gid in ref_set:
                ref_path = dict_paths[ref_gid]
                fwd_dict = {'q': dict(), 'r': dict(), 'qry': qry_gid}
                rev_dict = {'q': dict(), 'r': dict(), 'qry': qry_gid}

                fwd_dict['q'][qry_gid] = qry_path
                fwd_dict['r'][ref_gid] = ref_path

                rev_dict['q'][ref_gid] = ref_path
                rev_dict['r'][qry_gid] = qry_path

                q_worker.put(fwd_dict)
                q_worker.put(rev_dict)
                n_total += 2
    else:
        for qry_gid, ref_set in dict_compare.items():
            fwd_dict = {'ql': dict(), 'rl': dict(), 'qry': qry_gid}
            rev_dict = {'ql': dict(), 'rl': dict(), 'qry': qry_gid}

            qry_path = dict_paths[qry_gid]
            fwd_dict['ql'][qry_gid] = qry_path
            rev_dict['rl'][qry_gid] = qry_path

            for ref_gid in ref_set:
                ref_path = dict_paths[ref_gid]
                fwd_dict['rl'][ref_gid] = ref_path
                rev_dict['ql'][ref_gid] = ref_path

            q_worker.put(fwd_dict)
            q_worker.put(rev_dict)
            n_total += 2

    # Set the terminate condition for each worker process.
    for _ in range(self.cpus):
        q_worker.put(None)

    # Create each of the processes.
    p_workers = [mp.Process(target=self._worker,
                            args=(q_worker, q_writer, q_results))
                 for _ in range(self.cpus)]
    p_writer = mp.Process(target=self._writer, args=(q_writer, n_total))

    try:
        # Start the writer and each worker process.
        p_writer.start()
        for p_worker in p_workers:
            p_worker.start()

        # Wait until each worker has finished.
        for p_worker in p_workers:
            p_worker.join()

            # Gracefully terminate the program.
            if p_worker.exitcode != 0:
                raise GTDBTkExit('FastANI returned a non-zero exit code.')

        # Stop the writer process.
        q_writer.put(None)
        p_writer.join()

    except Exception:
        for p in p_workers:
            p.terminate()
        p_writer.terminate()
        raise

    # Process and return each of the results obtained.
    path_to_gid = {v: k for k, v in dict_paths.items()}
    q_results.put(None)
    return self._parse_result_queue(q_results, path_to_gid)
def add_row(self, row: GenomeMappingFileRow):
    if row.gid in self.rows:
        raise GTDBTkExit(f'Attempting to add duplicate row: {row.gid}')
    self.rows[row.gid] = row