def _read_taxonomy_files(self, options) -> Dict[str, Tuple[str, str, str, str, str, str, str]]:
    """Read the GTDB reference taxonomy and overlay user-supplied taxonomies.

    The taxonomy in ``Config.TAXONOMY_FILE`` is read first. Entries from the
    GTDB-Tk classification file and then from the custom taxonomy file (when
    the corresponding option is set) are added to it, replacing the entry of
    any genome that is already present.

    Parameters
    ----------
    options : argparse.Namespace
        Must provide ``gtdbtk_classification_file`` and
        ``custom_taxonomy_file``; either may be falsy to skip that file.

    Returns
    -------
    Dict[str, Tuple[str, str, str, str, str, str, str]]
        The merged taxonomy, keyed by genome identifier.

    Raises
    ------
    GTDBTkExit
        If both optional files are given and they share at least one genome.
    """
    self.logger.info('Reading GTDB taxonomy for representative genomes.')
    taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)

    if options.gtdbtk_classification_file:
        # add and overwrite taxonomy for genomes specified in the
        # GTDB-Tk classification file
        check_file_exists(options.gtdbtk_classification_file)

        self.logger.info('Reading GTDB-Tk classification file.')
        gtdbtk_taxonomy = Taxonomy().read(options.gtdbtk_classification_file)

        # 'user_genome' is presumably the header row of the classification
        # file picked up as an entry (TODO confirm against Taxonomy.read).
        # Drop it if present, but do not crash with a bare KeyError when the
        # file was supplied without that row.
        gtdbtk_taxonomy.pop('user_genome', None)

        num_reassigned = 0
        for gid, taxa in gtdbtk_taxonomy.items():
            if gid in taxonomy:
                num_reassigned += 1
            taxonomy[gid] = taxa

        self.logger.info(f'Read GTDB-Tk classifications for {len(gtdbtk_taxonomy):,} genomes.')
        self.logger.info(f'Reassigned taxonomy for {num_reassigned:,} GTDB representative genomes.')

    if options.custom_taxonomy_file:
        # add and overwrite taxonomy for genomes specified in the
        # custom taxonomy file
        check_file_exists(options.custom_taxonomy_file)

        self.logger.info('Reading custom taxonomy file.')
        custom_taxonomy = Taxonomy().read(options.custom_taxonomy_file)

        num_reassigned = 0
        for gid, taxa in custom_taxonomy.items():
            if gid in taxonomy:
                num_reassigned += 1
            taxonomy[gid] = taxa

        self.logger.info(f'Read custom taxonomy for {len(custom_taxonomy):,} genomes.')
        self.logger.info(f'Reassigned taxonomy for {num_reassigned:,} GTDB representative genomes.')

    if options.gtdbtk_classification_file and options.custom_taxonomy_file:
        # The two user files must be disjoint; otherwise the custom file
        # would silently override the GTDB-Tk classification.
        dup_genomes = set(gtdbtk_taxonomy).intersection(custom_taxonomy)
        if dup_genomes:
            self.logger.error('GTDB-Tk classification and custom taxonomy '
                              'files must not specify taxonomies for the '
                              'same genomes.')
            self.logger.error(f'These files have {len(dup_genomes):,} genomes in common.')
            # min() keeps the reported example stable between runs.
            self.logger.error(f'Example duplicate genome: {min(dup_genomes)}')
            raise GTDBTkExit('Duplicated taxonomy information.')

    self.logger.info(f'Read taxonomy for {len(taxonomy):,} genomes.')

    return taxonomy
def root(self, options):
    """Root tree using outgroup.

    Genomes whose taxonomy contains ``options.outgroup_taxon`` form the
    outgroup used to root ``options.input_tree``; the result is written to
    ``options.output_tree``. The taxonomy is read from the custom taxonomy
    file when one is given, otherwise from ``Config.TAXONOMY_FILE``.

    Parameters
    ----------
    options : argparse.Namespace
        The CLI arguments input by the user.

    Raises
    ------
    GenomeMarkerSetUnknown
        If ``options.suffix`` exists but is neither 'bac120' nor 'ar122'.
    """
    self.logger.warning("Tree rooting is still under development!")

    check_file_exists(options.input_tree)

    if options.custom_taxonomy_file:
        check_file_exists(options.custom_taxonomy_file)
        taxonomy = Taxonomy().read(options.custom_taxonomy_file)
    else:
        taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)

    self.logger.info('Identifying genomes from the specified outgroup.')
    outgroup = {genome_id
                for genome_id, taxa in taxonomy.items()
                if options.outgroup_taxon in taxa}

    reroot = RerootTree()
    reroot.root_with_outgroup(options.input_tree, options.output_tree, outgroup)

    # Symlink to the tree summary file, if not run independently
    if hasattr(options, 'suffix'):
        if options.suffix == 'bac120':
            rooted_tree = PATH_BAC120_ROOTED_TREE.format(prefix=options.prefix)
        elif options.suffix == 'ar122':
            rooted_tree = PATH_AR122_ROOTED_TREE.format(prefix=options.prefix)
        else:
            raise GenomeMarkerSetUnknown(
                'There was an error determining the marker set.')

        # The link name is taken from the same path as the link target, so
        # a bac120 tree is never linked under the ar122 file name.
        symlink_f(rooted_tree,
                  os.path.join(options.out_dir, os.path.basename(rooted_tree)))

    self.logger.info('Done.')