def get_single_copy_hits_worker(job):
    """Collect the single-copy marker sequences of one genome.

    The PFAM and TIGRFAM top hit files of the genome are read, fed into a
    freshly created copy number file, and the resulting single copy hits
    are returned keyed by marker.

    Parameters
    ----------
    job : Tuple[str, str, CopyNumberFile]
        The genome id, the path to the called genes, and the domain-specific
        copy number file. The third element is invoked as
        ``copy_number_file('/dev/null', None)`` to build the object used here.

    Returns
    -------
    Dict[str, Dict[str, str]]
        dict[marker id][genome id] = sequence
    """
    gid, aa_path, copy_number_file = job

    # The top hit files are looked up two directory levels above the
    # called genes file.
    genes_dir = os.path.dirname(aa_path)
    marker_genes_dir = os.path.dirname(genes_dir)

    # Build both top hit files first, then read them (PFAM, then TIGRFAM).
    tophit_files = (
        TopHitPfamFile(marker_genes_dir, gid),
        TopHitTigrFile(marker_genes_dir, gid),
    )
    for tophit_file in tophit_files:
        tophit_file.read()
    pfam_hits, tigr_hits = tophit_files

    # Register the genome so its single copy hits can be queried.
    copy_numbers = copy_number_file('/dev/null', None)
    copy_numbers.add_genome(gid, aa_path, pfam_hits, tigr_hits)

    # Re-key the hits as marker id -> genome id -> sequence.
    sequences = defaultdict(dict)
    for marker_id, hit in copy_numbers.get_single_copy_hits(gid).items():
        sequences[marker_id][gid] = hit['seq']
    return sequences
def _report_identified_marker_genes(self, gene_dict, outdir, prefix):
    """Write the marker gene summary files and link them into ``outdir``.

    For every genome in ``gene_dict`` (processed in sorted genome id order)
    the PFAM and TIGRFAM top hit files are read and added to the AR122 and
    BAC120 copy number files, and the genome's best translation table is
    added to the translation table summary. The three summary files are
    then written and symlinked by basename into ``outdir``.

    :param gene_dict: Mapping of genome id to a dict that is queried for the
        keys ``"aa_gene_path"`` and ``"best_translation_table"``.
    :param outdir: Output directory handed to the summary files and used as
        the location of the symlinks.
    :param prefix: Prefix handed to the summary files and substituted into
        the summary path templates.
    """
    tln_summary_file = TlnTableSummaryFile(outdir, prefix)
    ar122_copy_number_file = CopyNumberFileAR122(outdir, prefix)
    bac120_copy_number_file = CopyNumberFileBAC120(outdir, prefix)
    copy_number_files = (ar122_copy_number_file, bac120_copy_number_file)

    # Accumulate the per-genome statistics.
    for genome_id in sorted(gene_dict):
        genome_info = gene_dict[genome_id]
        marker_dir = os.path.join(outdir, DIR_MARKER_GENE)

        pfam_hits = TopHitPfamFile(marker_dir, genome_id)
        tigr_hits = TopHitTigrFile(marker_dir, genome_id)
        pfam_hits.read()
        tigr_hits.read()

        # AR122 first, then BAC120.
        aa_gene_path = genome_info.get("aa_gene_path")
        for copy_number_file in copy_number_files:
            copy_number_file.add_genome(genome_id, aa_gene_path,
                                        pfam_hits, tigr_hits)

        tln_summary_file.add_genome(
            genome_id, genome_info.get("best_translation_table"))

    # Flush the summaries to disk: copy number files, then translation table.
    for summary_file in copy_number_files + (tln_summary_file,):
        summary_file.write()

    # Expose each summary file in the root of the output directory.
    summary_templates = (
        PATH_BAC120_MARKER_SUMMARY,
        PATH_AR122_MARKER_SUMMARY,
        PATH_TLN_TABLE_SUMMARY,
    )
    for template in summary_templates:
        summary_path = template.format(prefix=prefix)
        link_path = os.path.join(outdir, os.path.basename(summary_path))
        symlink_f(summary_path, link_path)
def _report_identified_marker_genes(self, gene_dict, outdir, prefix,
                                    write_single_copy_genes):
    """Report statistics for identified marker genes.

    For every genome in ``gene_dict`` the PFAM and TIGRFAM top hit files are
    read and added to the AR53 and BAC120 copy number files, and the best
    translation table is added to the translation table summary. The three
    summary files are written to disk and the failed-genomes file is
    symlinked by basename into ``outdir``. Optionally, one unaligned FASTA
    file per marker is written for each domain.

    :param gene_dict: Mapping of genome id to a dict that is queried for the
        keys ``"aa_gene_path"`` and ``"best_translation_table"``.
    :param outdir: Output directory handed to the summary files; also the
        root under which symlinks and FASTA files are created.
    :param prefix: Prefix handed to the summary files and substituted into
        the path templates.
    :param write_single_copy_genes: When truthy, write the single-copy
        AR53/BAC120 marker sequences to ``<outdir>/<DIR_IDENTIFY_FASTA>``.
    """
    # Summarise the copy number of each AR53 and BAC120 markers.
    tln_summary_file = TlnTableSummaryFile(outdir, prefix)
    ar53_copy_number_file = CopyNumberFileAR53(outdir, prefix)
    bac120_copy_number_file = CopyNumberFileBAC120(outdir, prefix)

    # The marker directory does not depend on the genome; compute it once.
    cur_marker_dir = os.path.join(outdir, DIR_MARKER_GENE)

    # Process each genome.
    for db_genome_id, info in tqdm_log(sorted(gene_dict.items()), unit='genome'):
        pfam_tophit_file = TopHitPfamFile(cur_marker_dir, db_genome_id)
        tigr_tophit_file = TopHitTigrFile(cur_marker_dir, db_genome_id)
        pfam_tophit_file.read()
        tigr_tophit_file.read()

        # Summarise each of the markers for this genome.
        aa_gene_path = info.get("aa_gene_path")
        ar53_copy_number_file.add_genome(db_genome_id, aa_gene_path,
                                         pfam_tophit_file, tigr_tophit_file)
        bac120_copy_number_file.add_genome(db_genome_id, aa_gene_path,
                                           pfam_tophit_file, tigr_tophit_file)

        # Record the best translation table for this genome.
        tln_summary_file.add_genome(db_genome_id,
                                    info.get("best_translation_table"))

    # Write each of the summary files to disk.
    ar53_copy_number_file.write()
    bac120_copy_number_file.write()
    tln_summary_file.write()

    # Create a symlink to store the summary files in the root.
    # NOTE(review): the three summary-file symlinks below were already
    # commented out; only the fails file is linked. Confirm this is intended.
    # symlink_f(PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix),
    #           os.path.join(outdir, os.path.basename(PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix))))
    # symlink_f(PATH_AR53_MARKER_SUMMARY.format(prefix=prefix),
    #           os.path.join(outdir, os.path.basename(PATH_AR53_MARKER_SUMMARY.format(prefix=prefix))))
    # symlink_f(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix),
    #           os.path.join(outdir, os.path.basename(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix))))
    fails_path = PATH_FAILS.format(prefix=prefix)
    symlink_f(fails_path,
              os.path.join(outdir, os.path.basename(fails_path)))

    # Write the single copy AR53/BAC120 FASTA files to disk.
    if not write_single_copy_genes:
        return

    fasta_dir = os.path.join(outdir, DIR_IDENTIFY_FASTA)
    self.logger.info(
        f'Writing unaligned single-copy genes to: {fasta_dir}')

    # One entry per domain: (marker file names, copy number file, subdir).
    marker_doms = [
        (Config.AR53_MARKERS['PFAM'] + Config.AR53_MARKERS['TIGRFAM'],
         ar53_copy_number_file, 'ar53'),
        (Config.BAC120_MARKERS['PFAM'] + Config.BAC120_MARKERS['TIGRFAM'],
         bac120_copy_number_file, 'bac120'),
    ]
    genome_ids = sorted(gene_dict)
    hmm_suffix = '.hmm'

    for marker_names, marker_file, marker_d in marker_doms:
        # Create the domain-specific subdirectory.
        fasta_d_dir = os.path.join(fasta_dir, marker_d)
        make_sure_path_exists(fasta_d_dir)

        # Query each genome's single copy hits once per domain instead of
        # once per marker.
        hits_by_genome = [
            (genome_id, marker_file.get_single_copy_hits(genome_id))
            for genome_id in genome_ids
        ]

        # Iterate over each marker.
        for marker_name in marker_names:
            # Drop a trailing ".hmm"/".HMM" extension. str.rstrip() takes a
            # set of characters rather than a pattern, so it would also eat
            # trailing 'h'/'m'/'.' characters belonging to the marker id.
            if marker_name.lower().endswith(hmm_suffix):
                marker_name = marker_name[:-len(hmm_suffix)]
            marker_path = os.path.join(fasta_d_dir, f'{marker_name}.fa')

            to_write = []
            for genome_id, unq_hits in hits_by_genome:
                if marker_name in unq_hits:
                    to_write.append(f'>{genome_id}')
                    to_write.append(unq_hits[marker_name]['seq'])

            # Markers without any single copy hit produce no file.
            if to_write:
                with open(marker_path, 'w') as fh:
                    fh.write('\n'.join(to_write))
def _run_multi_align(self, db_genome_id, path, marker_set_id):
    """
    Returns the concatenated marker sequence for a specific genome

    The genome's PFAM and TIGRFAM top hit files are read from the directory
    two levels above ``path`` and used to determine its single copy hits.
    Every marker of the selected set with a single copy hit is passed to
    ``self._run_align``; every other marker is represented by a run of
    ``'-'`` as long as the value returned by ``self._get_hmm_size``. The
    per-marker strings are joined in sorted marker id order.

    :param db_genome_id: Selected genome
    :param path: Path to the gene file for the genome; it is forwarded to
        the copy number file as the genome's gene path.
        NOTE(review): previously documented as the genomic fasta file -
        confirm which file callers actually pass.
    :param marker_set_id: Unique ID of marker set to use for alignment;
        ``'bac120'`` or ``'ar122'``.
    :return: The concatenated per-marker alignment string.
    :raises GTDBTkException: If ``marker_set_id`` is not a known marker set.
    """
    cur_marker_dir = os.path.dirname(os.path.dirname(path))
    pfam_tophit_file = TopHitPfamFile(cur_marker_dir, db_genome_id)
    tigr_tophit_file = TopHitTigrFile(cur_marker_dir, db_genome_id)
    pfam_tophit_file.read()
    tigr_tophit_file.read()

    # Select the copy number file and marker definitions together. Unknown
    # marker sets are rejected here, so no later branch is needed for them.
    if marker_set_id == 'bac120':
        copy_number_file = CopyNumberFileBAC120('/dev/null', None)
        markers_by_db = self.bac120_markers
    elif marker_set_id == 'ar122':
        copy_number_file = CopyNumberFileAR122('/dev/null', None)
        markers_by_db = self.ar122_markers
    else:
        raise GTDBTkException('Unknown marker set.')

    copy_number_file.add_genome(db_genome_id, path,
                                pfam_tophit_file, tigr_tophit_file)
    single_copy_hits = copy_number_file.get_single_copy_hits(db_genome_id)

    # Directory holding the individual HMM files of each marker database.
    marker_paths = {
        "PFAM": os.path.join(self.pfam_hmm_dir, 'individual_hmms'),
        "TIGRFAM": os.path.join(os.path.dirname(self.tigrfam_hmm_dir),
                                'individual_hmms')
    }

    # Map marker id (HMM file name without its extension) -> HMM file path.
    marker_dict_original = {}
    for db_marker in sorted(markers_by_db):
        marker_dict_original.update({
            marker.replace(".HMM", "").replace(".hmm", ""):
                os.path.join(marker_paths[db_marker], marker)
            for marker in markers_by_db[db_marker]
        })

    # Iterate over each of the expected markers and store the gene sequence.
    gene_dict = {}
    result_align = {}
    for marker_id, marker_path in marker_dict_original.items():
        hit = single_copy_hits.get(marker_id)
        if hit:
            gene_dict[marker_id] = {
                "marker_path": marker_path,
                "gene": hit['hit'].gene_id,
                "gene_seq": hit['seq'],
                "bitscore": hit['hit'].bit_score
            }
        else:
            # No single copy hit: fill the marker with gaps.
            hmm_len = self._get_hmm_size(marker_path)
            result_align[marker_id] = '-' * hmm_len

    # Align the markers.
    result_align.update(self._run_align(gene_dict, db_genome_id))

    # Concatenate the aligned markers in sorted marker id order.
    return ''.join(seq for _, seq in sorted(result_align.items()))