def __init__(self, options):
    """Initialize the base plot, then reset and resize the figure.

    Parameters
    ----------
    options : object
        Plot options forwarded to ``AbstractPlot.__init__``.  The figure
        size is afterwards read from ``self.options.width`` and
        ``self.options.height``.
    """
    AbstractPlot.__init__(self, options)

    # Start from an empty canvas before applying the requested dimensions.
    self.fig.clear()
    dimensions = (self.options.width, self.options.height)
    self.fig.set_size_inches(*dimensions)
def __init__(self, dpi=96):
    """Initialize the plot with fixed default dimensions and font sizes.

    Parameters
    ----------
    dpi : int
        Resolution stored on the instance as ``self.dpi``.
    """
    self.logger = logging.getLogger()

    option_fields = ['width', 'height', 'tick_font_size', 'label_font_size', 'dpi']
    Options = namedtuple('Options', option_fields)

    # NOTE(review): the options tuple always carries dpi=96; the `dpi`
    # argument is only stored on the instance below -- confirm intended.
    default_options = Options(width=6,
                              height=6,
                              tick_font_size=12,
                              label_font_size=12,
                              dpi=96)
    AbstractPlot.__init__(self, default_options)

    self.dpi = dpi
def __init__(self, dpi=96):
    """Initialize the plot and verify that external tools are available.

    Parameters
    ----------
    dpi : int
        Resolution stored on the instance as ``self.dpi``.
    """
    self.logger = logging.getLogger()

    option_fields = ['width', 'height', 'tick_font_size', 'label_font_size', 'dpi']
    Options = namedtuple('Options', option_fields)

    # NOTE(review): the options tuple always carries dpi=96; the `dpi`
    # argument is only stored on the instance below -- confirm intended.
    default_options = Options(width=6,
                              height=6,
                              tick_font_size=12,
                              label_font_size=12,
                              dpi=96)
    AbstractPlot.__init__(self, default_options)

    self.dpi = dpi

    # Dependency check runs last, after the instance is fully set up.
    required_programs = ['genometreetk']
    check_dependencies(required_programs)
def save_html(self, output_html):
    """Write the figure to an HTML file.

    The script and body fragments are taken from ``Tooltip`` and handed
    to ``AbstractPlot.save_html`` together with the output path.

    Parameters
    ----------
    output_html : str
        Name of output file.
    """
    AbstractPlot.save_html(self,
                           output_html,
                           Tooltip.script_global,
                           Tooltip.html_body)
def __init__(self, infile, outfile):
    """Initialize the plot, parse the input data, and set up colour maps.

    Parameters
    ----------
    infile : object
        Input passed unchanged to ``self._parse_data``.
    outfile : object
        Output destination, stored as ``self.outfile``.
    """
    AbstractPlot.__init__(self, None)

    self.outfile = outfile
    self.genomes = None
    self._parse_data(infile)

    # Continuous colour map.
    self.colormap = pylab.cm.bwr

    # Discrete 12-colour palette given as 8-bit RGB triplets; each channel
    # is scaled to the 0-1 range when building the colour map.
    palette_rgb = [
        (141, 211, 199), (255, 255, 179),
        (190, 186, 218), (251, 128, 114),
        (128, 177, 211), (253, 180, 98),
        (179, 222, 105), (252, 205, 229),
        (217, 217, 217), (188, 128, 189),
        (204, 235, 197), (255, 237, 111),
    ]
    self.discreteColourMap = ListedColormap(
        [tuple(channel / 255.0 for channel in rgb) for rgb in palette_rgb])
def run(self, bac120_metadata_file, ar120_metadata_file):
    """Scatter plot showing quality of GTDB representative genomes.

    Metadata is loaded through ``self.read_metadata``; completeness and
    contamination are collected for every genome flagged as MIMAG high,
    medium or low quality, and the plot is saved as PNG and SVG in
    ``self.output_dir``.

    Parameters
    ----------
    bac120_metadata_file : object
        Bacterial metadata file, forwarded to ``self.read_metadata``.
    ar120_metadata_file : object
        Archaeal metadata file, forwarded to ``self.read_metadata``.
    """
    # get genome metadata
    self.logger.info('Reading GTDB metadata.')
    metadata = self.read_metadata(bac120_metadata_file, ar120_metadata_file)
    self.logger.info(
        f' ...read metadata for {len(metadata):,} representative genomes.')

    # get completeness, contamination, and MIMAG quality of each
    # GTDB species representative
    comp = []
    cont = []
    mimag_category = []
    exception_count = 0
    for m in metadata.values():
        if m.mimag_hq:
            quality = 'hq'
        elif m.mimag_mq:
            quality = 'mq'
        elif m.mimag_lq:
            quality = 'lq'
        else:
            # No MIMAG category assigned.  Per the original author's note
            # this happens because some representatives have >10%
            # contamination; such genomes are only counted, not plotted.
            exception_count += 1
            continue

        mimag_category.append(quality)
        comp.append(m.genome_comp)
        cont.append(m.genome_cont)

    hq_count, mq_count, lq_count = (
        mimag_category.count(label) for label in ('hq', 'mq', 'lq'))
    self.logger.info(
        ' ...HQ = {:,}, MQ = {:,}, LQ = {:,}, exceptions = {:,}'.format(
            hq_count, mq_count, lq_count, exception_count))

    # create the plot
    options = AbstractPlot.Options(width=4,
                                   height=4,
                                   label_font_size=7,
                                   tick_font_size=6,
                                   dpi=600)
    plot = GenomeQualityPlot(options)
    plot.plot(comp,
              cont,
              mimag_category,
              exception_count,
              'Completeness (%)',
              'Contamination (%)',
              num_bins=25,
              xlim=(49, 101),
              ylim=(-0.2, 10.2))

    out_prefix = f'gtdb_r{self.release_number}_genome_quality.species'
    for extension in ('png', 'svg'):
        plot.save_plot(self.output_dir / f'{out_prefix}.{extension}', dpi=600)
def __init__(self, skip_mpld3=False, dpi=96, output_dir=None):
    """Initialize the plot, its colour scheme, and output settings.

    Parameters
    ----------
    skip_mpld3 : bool
        When false, ``mpld3`` is imported here (an ImportError surfaces
        immediately if it is not installed).
    dpi : int
        Resolution stored on the instance as ``self.dpi``.
    output_dir : object
        Output location stored on the instance as ``self.output_dir``.
    """

    def _rgb(red, green, blue):
        """Scale 8-bit colour channels to the 0-1 range."""
        return (red / 255, green / 255, blue / 255)

    self.logger = logging.getLogger()

    self.skip_mpld3 = skip_mpld3
    if not self.skip_mpld3:
        # The module is not used further in this method.
        import mpld3

    option_fields = ['width', 'height', 'tick_font_size', 'label_font_size', 'dpi']
    Options = namedtuple('Options', option_fields)
    default_options = Options(width=5,
                              height=4,
                              tick_font_size=12,
                              label_font_size=12,
                              dpi=96)
    AbstractPlot.__init__(self, default_options)

    self.poly_color = _rgb(89.0, 89.0, 89.0)
    self.near_mono_color = _rgb(255.0, 188.0, 121.0)
    self.mono_color = _rgb(95.0, 158.0, 209.0)
    self.median_color = _rgb(0.0, 107.0, 164.0)

    self.dpi = dpi
    self.output_dir = output_dir
def __init__(self, options):
    """Initialize the plot by delegating to ``AbstractPlot.__init__``.

    Parameters
    ----------
    options : object
        Plot options, forwarded unchanged to the base initializer.
    """
    AbstractPlot.__init__(self, options)
def run(self, bac120_metadata_file, ar120_metadata_file, domain):
    """Plot the type status of GTDB species representatives.

    Every selected species representative is classified as
    'type strain of species', 'Latinized, not type strain' or
    'placeholder', and tallied by the genome type (MAG, SAG, ISOLATE)
    assigned to it.  The per-genome classification is written to
    ``<out_prefix>.tsv`` and the plot is saved as PNG and SVG, all inside
    ``self.output_dir``.

    Parameters
    ----------
    bac120_metadata_file : str
        Tab-separated metadata file with a header row, read as UTF-8.
    ar120_metadata_file : str
        Second metadata file in the same format.
    domain : str
        'Both' keeps every representative; otherwise a representative is
        kept when this string occurs in its first GTDB taxon.  The values
        'Bacteria' and 'Archaea' also add a suffix to the output prefix.

    Raises
    ------
    SystemExit
        If a representative has an unrecognized ``ncbi_genome_category``.
    """
    # NCBI genome categories that are reported as MAGs.
    mag_categories = ('derived from metagenome',
                      'derived from environmental sample',
                      'derived from environmental_sample')

    # parse GTDB metadata files to determine the genomes in each species
    # cluster, plus taxonomy and type status of the representatives
    self.logger.info('Reading GTDB metadata.')
    gtdb_taxonomy = {}
    type_strain = set()
    genome_category = {}
    sp_clusters = defaultdict(set)
    for mf in [bac120_metadata_file, ar120_metadata_file]:
        with open(mf, encoding='utf-8') as f:
            header = f.readline().strip().split('\t')

            gtdb_taxonomy_index = header.index('gtdb_taxonomy')
            gtdb_type_index = header.index('gtdb_type_designation')
            gtdb_rep_index = header.index('gtdb_representative')
            gtdb_genome_rep_index = header.index('gtdb_genome_representative')
            genome_category_index = header.index('ncbi_genome_category')

            for line in f:
                line_split = line.strip().split('\t')
                gid = line_split[0]

                # every genome contributes to its cluster, representative or not
                gtdb_genome_rep = line_split[gtdb_genome_rep_index]
                sp_clusters[gtdb_genome_rep].add(gid)
                genome_category[gid] = line_split[genome_category_index]

                if line_split[gtdb_rep_index] != 't':
                    continue

                taxonomy = line_split[gtdb_taxonomy_index]
                gtdb_taxa = [t.strip() for t in taxonomy.split(';')]
                gtdb_domain = gtdb_taxa[0]
                if domain == 'Both' or domain in gtdb_domain:
                    gtdb_taxonomy[gid] = gtdb_taxa

                if line_split[gtdb_type_index] == 'type strain of species':
                    type_strain.add(gid)

    self.logger.info(' ...identified {:,} GTDB species representatives.'.format(len(gtdb_taxonomy)))

    # determine genome type of each species cluster representative
    sp_genome_types = {}
    for rid in sp_clusters:
        # UBA identifiers are treated as MAGs without consulting the
        # category table, so they need not have a row of their own.
        if rid.startswith('UBA'):
            sp_genome_types[rid] = 'MAG'
            continue

        category = genome_category[rid]
        if category in mag_categories:
            sp_genome_types[rid] = 'MAG'
        elif category == 'derived from single cell':
            sp_genome_types[rid] = 'SAG'
        elif category == 'none':
            sp_genome_types[rid] = 'ISOLATE'
        else:
            self.logger.error(f'Unrecognized genome category: {category}')
            sys.exit(-1)

    # determine type information for GTDB representatives,
    # and genome type category (isolate, MAG/SAG)
    out_prefix = f'gtdb_r{self.release_number}_sp_rep_type'
    if domain == 'Bacteria':
        out_prefix += '.bacteria'
    elif domain == 'Archaea':
        out_prefix += '.archaea'

    self.logger.info('Determining type information for GTDB representatives.')

    type_strain_categories = defaultdict(int)
    latinized_categories = defaultdict(int)
    placeholder_categories = defaultdict(int)
    # The context manager closes the table even if a lookup below fails.
    with open(self.output_dir / f'{out_prefix}.tsv', 'w') as fout:
        fout.write('Genome ID\tGTDB taxonomy\tGTDB species\tClassification\n')

        for gid, taxa in gtdb_taxonomy.items():
            gtdb_sp = taxa[6]
            genome_type = sp_genome_types[gid]

            if gid in type_strain:
                classification = 'type strain of species'
                type_strain_categories[genome_type] += 1
            elif self.latinized_species(gtdb_sp):
                classification = 'Latinized, not type strain'
                latinized_categories[genome_type] += 1
            else:
                classification = 'placeholder'
                placeholder_categories[genome_type] += 1

            fout.write('{}\t{}\t{}\t{}\n'.format(
                gid, ';'.join(taxa), gtdb_sp, classification))

    # create plot
    self.logger.info('Creating plot.')
    options = AbstractPlot.Options(width=3,
                                   height=3,
                                   label_font_size=7,
                                   tick_font_size=6,
                                   dpi=600)
    plot = SpeciesRepTypePlot(options)
    plot.plot(type_strain_categories, latinized_categories, placeholder_categories)

    plot.save_plot(self.output_dir / f'{out_prefix}.png', dpi=600)
    plot.save_plot(self.output_dir / f'{out_prefix}.svg', dpi=600)
def run(self, bac120_metadata_file, ar120_metadata_file, all_genomes, width, height):
    """Plot common genomic statistics and tabulate their summaries.

    Six panels are drawn on a 2 x 3 grid (genome size, GC content, SSU
    count, CDS count, coding density, contig count).  For each statistic
    the median, mean, standard deviation, minimum, maximum and 5th/95th
    percentiles are written to ``<out_prefix>.tsv`` and
    ``<out_prefix>.html``; the figure is saved as PNG and SVG.  All files
    go to ``self.output_dir``.

    Parameters
    ----------
    bac120_metadata_file : object
        Bacterial metadata file, forwarded to ``self.read_metadata``.
    ar120_metadata_file : object
        Archaeal metadata file, forwarded to ``self.read_metadata``.
    all_genomes : bool
        Forwarded to ``self.get_stat_data``; also selects the y-axis
        label ('Genomes' vs. 'Species') and the output file suffix.
    width : float
        Figure width passed to ``AbstractPlot.Options``.
    height : float
        Figure height passed to ``AbstractPlot.Options``.
    """
    # get genome metadata
    self.logger.info('Reading GTDB metadata.')
    metadata = self.read_metadata(bac120_metadata_file, ar120_metadata_file)

    # create plot for each genomic statistic
    options = AbstractPlot.Options(width=width,
                                   height=height,
                                   label_font_size=7,
                                   tick_font_size=6,
                                   dpi=600)
    plot = GenomicStatsPlot(options, 2, 3)

    if all_genomes:
        ylabel = 'Genomes'
        out_prefix = f'gtdb_r{self.release_number}_genomic_stats.genomes'
    else:
        ylabel = 'Species'
        out_prefix = f'gtdb_r{self.release_number}_genomic_stats.species'

    # One entry per panel, in panel order:
    #   (name used in the log message, metadata field,
    #    positional arguments forwarded verbatim to get_stat_data,
    #    axis/table label, extra keyword arguments for plot.plot)
    panels = [
        ('genome size', 'genome_size', (1e6, 0, 20), 'Genome size (Mb)', {}),
        ('GC content', 'gc_percentage', (1, 20, 80), 'GC content (%)', {}),
        ('SSU count', 'ssu_count', (1, -1, 12), 'No. SSU genes', {'center_xticks': True}),
        ('CDS count', 'protein_count', (1, 100, 10000), 'No. CDSs', {}),
        ('coding density', 'coding_density', (1, 70, 100), 'Coding density (%)', {}),
        ('contig count', 'contig_count', (1, 1, 1000), 'No. contigs', {}),
    ]

    table_data = [('', 'Median', 'Mean', 'Std. deviation', 'Min.', 'Max.',
                   '5th percentile', '95th percentile')]

    for panel_index, (log_name, field, stat_args, label, plot_kwargs) in enumerate(panels, start=1):
        self.logger.info(f'Creating {log_name} plot.')
        data = self.get_stat_data(metadata, field, *stat_args, all_genomes)
        plot.plot(panel_index, data, label, f'{ylabel} ({len(data):,})', **plot_kwargs)

        table_data.append((label,
                           f'{np_median(data):.1f}',
                           f'{np_mean(data):.1f}',
                           f'{np_std(data):.1f}',
                           f'{min(data):.1f}',
                           f'{max(data):.1f}',
                           f'{np_percentile(data, 5):.1f}',
                           f'{np_percentile(data, 95):.1f}'))

    plot.save_plot(self.output_dir / f'{out_prefix}.png', dpi=600)
    plot.save_plot(self.output_dir / f'{out_prefix}.svg', dpi=600)

    # write out table as tab-separated text
    with open(self.output_dir / f'{out_prefix}.tsv', 'w') as fout:
        for row in table_data:
            fout.write('{}\n'.format('\t'.join(row)))

    # write out table as HTML
    num_columns = len(table_data[0])
    htmlcode = HTML.table(table_data,
                          col_align=['left'] + ['center'] * (num_columns - 1),
                          col_styles=['font-size: small'] * num_columns,
                          cellpadding=5)
    with open(self.output_dir / f'{out_prefix}.html', 'w') as fout:
        fout.write(htmlcode)
def run(self, bac120_metadata_file, ar120_metadata_file, domain):
    """Plot nomenclatural status of taxa at each taxonomic rank.

    For rank indices 1 through 6 the distinct taxa of the selected GTDB
    representatives are counted as Latinized or placeholder names.  The
    counts are written to ``<out_prefix>.species.tsv`` and the
    percentages are plotted and saved as PNG and SVG in
    ``self.output_dir``.

    Parameters
    ----------
    bac120_metadata_file : str
        Tab-separated metadata file with a header row, read as UTF-8.
    ar120_metadata_file : str
        Second metadata file in the same format.
    domain : str
        'Both' keeps every representative; otherwise a representative is
        kept when this string occurs in its first GTDB taxon.

    Raises
    ------
    ZeroDivisionError
        If no taxa were collected for a rank (e.g. no representative
        matched ``domain``).
    """
    # parse GTDB metadata files to get the taxonomy of each representative
    self.logger.info('Reading GTDB metadata.')
    gtdb_taxonomy = {}
    for mf in [bac120_metadata_file, ar120_metadata_file]:
        with open(mf, encoding='utf-8') as f:
            header = f.readline().strip().split('\t')

            gtdb_taxonomy_index = header.index('gtdb_taxonomy')
            gtdb_rep_index = header.index('gtdb_representative')

            for line in f:
                line_split = line.strip().split('\t')
                gid = line_split[0]

                if line_split[gtdb_rep_index] != 't':
                    continue

                taxonomy = line_split[gtdb_taxonomy_index]
                gtdb_taxa = [t.strip() for t in taxonomy.split(';')]
                gtdb_domain = gtdb_taxa[0]
                if domain == 'Both' or domain in gtdb_domain:
                    gtdb_taxonomy[gid] = gtdb_taxa

    self.logger.info(' ...identified {:,} representative genomes.'.format(
        len(gtdb_taxonomy)))

    # get GTDB taxa at each rank
    gtdb_taxa_at_rank = defaultdict(set)
    for taxa in gtdb_taxonomy.values():
        for rank, taxon in enumerate(taxa):
            gtdb_taxa_at_rank[rank].add(taxon)

    # determine nomenclatural category for each taxon at each rank
    out_prefix = f'gtdb_r{self.release_number}_nomenclatural_per_rank'
    self.logger.info('Determining nomenclatural type of taxa.')

    plot_latinized = []
    plot_placeholder = []
    plot_labels = []
    # The context manager closes the table even if a rank has no taxa
    # and the percentage calculation raises.
    with open(self.output_dir / f'{out_prefix}.species.tsv', 'w') as fout:
        for rank_index in range(1, 7):
            # rank index 6 uses the species-specific test
            is_latinized = (self.latinized_species
                            if rank_index == 6
                            else self.latinized_taxon)

            latinized = 0
            placeholder = 0
            for taxon in gtdb_taxa_at_rank[rank_index]:
                if is_latinized(taxon):
                    latinized += 1
                else:
                    placeholder += 1

            total_taxa = latinized + placeholder
            fout.write(Taxonomy.rank_labels[rank_index])
            fout.write(f'\t{total_taxa}\t{latinized}\t{placeholder}\n')

            plot_latinized.append(latinized * 100.0 / total_taxa)
            plot_placeholder.append(placeholder * 100.0 / total_taxa)
            plot_labels.append('{}\n{:,}'.format(
                Taxonomy.rank_labels[rank_index].capitalize(),
                total_taxa))

    # create plot
    self.logger.info('Creating plot.')
    options = AbstractPlot.Options(width=4,
                                   height=3,
                                   label_font_size=7,
                                   tick_font_size=6,
                                   dpi=600)
    plot = NomenclaturalPerRankPlot(options)
    plot.plot(plot_latinized, plot_placeholder, plot_labels)

    plot.save_plot(self.output_dir / f'{out_prefix}.png', dpi=600)
    plot.save_plot(self.output_dir / f'{out_prefix}.svg', dpi=600)
def run(self, metadata_file, msa_info_file, genome_info_file, output_prefix):
    """Create plots of genome completeness, genome contamination, and percent MSA.

    A per-genome table is written to ``<output_prefix>_table.tsv`` and
    three histograms are saved to ``<output_prefix>.png``.

    Parameters
    ----------
    metadata_file : str
        Tab-separated file with a header row containing the columns
        'checkm_completeness', 'checkm_contamination' and
        'gtdb_type_designation'; the first column is the genome ID.
    msa_info_file : str
        Tab-separated file with a header row containing the column
        'Amino acids (%)'; the first column is the genome ID.
    genome_info_file : str
        Tab-separated file with a header row containing the column
        'Species'; the first column is the genome ID.
    output_prefix : str
        Prefix for the output table and image.

    Raises
    ------
    KeyError
        If a genome in ``msa_info_file`` is missing from either of the
        other two files.
    """
    # read metadata
    comp = {}
    cont = {}
    type_designation = {}
    with open(metadata_file) as f:
        headers = f.readline().strip().split('\t')

        comp_index = headers.index('checkm_completeness')
        cont_index = headers.index('checkm_contamination')
        type_index = headers.index('gtdb_type_designation')

        for line in f:
            line_split = line.strip().split('\t')

            gid = line_split[0]
            comp[gid] = float(line_split[comp_index])
            cont[gid] = float(line_split[cont_index])
            type_designation[gid] = line_split[type_index]

    # read MSA info
    msa_perc = {}
    with open(msa_info_file) as f:
        headers = f.readline().strip().split('\t')

        msa_perc_index = headers.index('Amino acids (%)')

        for line in f:
            line_split = line.strip().split('\t')

            gid = line_split[0]
            msa_perc[gid] = float(line_split[msa_perc_index])

    # read species information
    sp = {}
    with open(genome_info_file) as f:
        headers = f.readline().strip().split('\t')

        sp_index = headers.index('Species')

        for line in f:
            line_split = line.strip().split('\t')

            gid = line_split[0]
            sp[gid] = line_split[sp_index]

    # write out statistics to file; the context manager guarantees the
    # table is flushed and closed, including when a lookup fails
    with open(output_prefix + '_table.tsv', 'w') as fout:
        fout.write(
            'Genome ID\tCompletenss (%)\tContamination (%)\tMSA completenss (%)\tSpecies\tGTDB type designation\n'
        )
        for gid in msa_perc:
            fout.write(
                '%s\t%.2f\t%.2f\t%.2f\t%s\t%s\n' %
                (gid, comp[gid], cont[gid], msa_perc[gid], sp[gid],
                 type_designation[gid]))

    # plot stats
    options = AbstractPlot.Options(6, 7.5, 10, 8, 300)
    hist = Histogram(options)
    hist.plot(1, msa_perc.values(), 'MSA completeness (%)', 'Genomes (%)',
              range(0, 101, 5), 'blue')
    hist.plot(2, comp.values(), 'Completeness (%)', 'Genomes (%)',
              range(0, 101, 5), 'blue')
    hist.plot(3, cont.values(), 'Contamination (%)', 'Genomes (%)',
              range(0, 101, 5), 'blue')
    hist.save_plot(output_prefix + '.png')
def __init__(self):
    """Initialize the plot without any options.

    ``None`` is passed as the options argument of
    ``AbstractPlot.__init__``.
    """
    AbstractPlot.__init__(self, None)
def run(self, bac120_metadata_file, ar120_metadata_file):
    """Plot number of MAGs, SAGs, and isolates for each taxonomic rank.

    Genomes are grouped into species clusters by their GTDB species
    name, each cluster is categorized with ``sp_cluster_type_category``,
    and every taxon at rank indices 1 through 6 is tallied as containing
    isolate species, environmental species, or both.  A final 'Genomes'
    entry reports the split over individual genomes.  Counts go to
    ``<out_prefix>.tsv``, taxon names to ``<out_prefix>.taxa.tsv``, and
    the plot is saved as PNG and SVG, all in ``self.output_dir``.

    Parameters
    ----------
    bac120_metadata_file : str
        Tab-separated metadata file with a header row, read as UTF-8.
    ar120_metadata_file : str
        Second metadata file in the same format.

    Raises
    ------
    SystemExit
        If the species of a taxon carry none of the categories 'ENV',
        'ISOLATE' or 'BOTH'.
    """
    # parse GTDB metadata file to determine genomes in each species clusters
    # and the type of these genomes (MAG, SAG, or isolate)
    self.logger.info('Reading GTDB metadata.')
    gtdb_taxonomy = {}
    sp_clusters = defaultdict(set)
    genome_category = {}
    for mf in [bac120_metadata_file, ar120_metadata_file]:
        with open(mf, encoding='utf-8') as f:
            header = f.readline().strip().split('\t')

            gtdb_taxonomy_index = header.index('gtdb_taxonomy')
            gtdb_rep_index = header.index('gtdb_genome_representative')
            genome_category_index = header.index('ncbi_genome_category')

            for line in f:
                line_split = line.strip().split('\t')
                gid = line_split[0]

                taxonomy = line_split[gtdb_taxonomy_index]
                gtdb_rep = line_split[gtdb_rep_index]
                # skip genomes without a taxonomy or a representative
                if taxonomy == 'none' or gtdb_rep == 'none':
                    continue

                gtdb_taxa = [t.strip() for t in taxonomy.split(';')]
                gtdb_taxonomy[gid] = gtdb_taxa
                sp_clusters[gtdb_taxa[6]].add(gid)
                genome_category[gid] = line_split[genome_category_index]

    self.logger.info(' ...identified {:,} species clusters spanning {:,} genomes.'.format(
        len(sp_clusters),
        sum(len(gids) for gids in sp_clusters.values())))

    # determine genome types in each species cluster
    sp_genome_types = {
        sp: sp_cluster_type_category(gids, genome_category)
        for sp, gids in sp_clusters.items()
    }

    # get species in each taxon
    sp_in_taxa = defaultdict(lambda: defaultdict(set))
    for taxa in gtdb_taxonomy.values():
        for rank_index in range(1, 7):
            cur_taxon = taxa[rank_index]
            if rank_index < 5:
                # canonicalize names above genus
                cur_taxon = canonical_taxon_name(cur_taxon)
            sp_in_taxa[rank_index][cur_taxon].add(taxa[6])

    # tabulate number of genome types at each rank
    out_prefix = f'gtdb_r{self.release_number}_genome_category_per_rank'
    self.logger.info('Tabulating genomes types at each rank.')

    table_header = 'Rank\tNo. taxa\tBoth\tIsolate\tEnvironmental\n'
    plot_both = []
    plot_isolate = []
    plot_env = []
    plot_labels = []
    # Both tables are closed by the context manager on every exit path,
    # including the sys.exit() below.
    with open(self.output_dir / f'{out_prefix}.tsv', 'w') as fout_count, \
            open(self.output_dir / f'{out_prefix}.taxa.tsv', 'w') as fout_taxa:
        fout_count.write(table_header)
        fout_taxa.write(table_header)

        for rank_index in range(1, 7):
            fout_count.write(Taxonomy.rank_labels[rank_index])
            fout_taxa.write(Taxonomy.rank_labels[rank_index])

            both = set()
            env = set()
            isolate = set()
            for taxon, species in sp_in_taxa[rank_index].items():
                taxon_categories = {sp_genome_types[sp] for sp in species}

                if (('ENV' in taxon_categories and 'ISOLATE' in taxon_categories)
                        or 'BOTH' in taxon_categories):
                    both.add(taxon)
                elif 'ISOLATE' in taxon_categories:
                    isolate.add(taxon)
                elif 'ENV' in taxon_categories:
                    env.add(taxon)
                else:
                    self.logger.error(f'Genomes in species have an unassigned category: {taxon_categories}')
                    sys.exit(-1)

            total_taxa = len(both) + len(isolate) + len(env)
            fout_count.write(f'\t{total_taxa}\t{len(both)}\t{len(isolate)}\t{len(env)}\n')
            fout_taxa.write('\t{}\t{}\t{}\t{}\n'.format(
                total_taxa,
                ', '.join(sorted(both)),
                ', '.join(sorted(isolate)),
                ', '.join(sorted(env))))

            plot_both.append(len(both) * 100.0 / total_taxa)
            plot_isolate.append(len(isolate) * 100.0 / total_taxa)
            plot_env.append(len(env) * 100.0 / total_taxa)
            plot_labels.append('{}\n{:,}'.format(
                Taxonomy.rank_labels[rank_index].capitalize(),
                total_taxa))

        # final entry: individual genomes, which are never 'both'
        num_genomes = len(genome_category)
        env_genomes = sum(1 for c in genome_category.values()
                          if c in ENV_CATEGORIES)
        isolate_genomes = num_genomes - env_genomes

        fout_count.write('Genomes\t{}\t{}\t{}\t{}\n'.format(
            num_genomes,
            0,
            isolate_genomes,
            env_genomes))

        plot_both.append(0)
        plot_isolate.append(isolate_genomes * 100.0 / num_genomes)
        plot_env.append(env_genomes * 100.0 / num_genomes)
        plot_labels.append('{}\n{:,}'.format('Genomes', num_genomes))

    # create plot
    self.logger.info('Creating plot.')
    options = AbstractPlot.Options(width=4,
                                   height=3,
                                   label_font_size=7,
                                   tick_font_size=6,
                                   dpi=600)
    plot = GenomeCateogryPerRankPlot(options)
    plot.plot(plot_both, plot_isolate, plot_env, plot_labels)

    plot.save_plot(self.output_dir / f'{out_prefix}.png', dpi=600)
    plot.save_plot(self.output_dir / f'{out_prefix}.svg', dpi=600)
def run(self, bac120_metadata_file, ar120_metadata_file, all_genomes, domain):
    """Bar plot comparing GTDB and NCBI taxonomies.

    For rank indices 1 through 6, every genome with an NCBI taxonomy is
    classified as:

    * 'unchanged'      - NCBI taxon (without 'Candidatus ') equals the
                         GTDB taxon;
    * 'passive change' - NCBI taxon is just the rank prefix, i.e. NCBI
                         gives no name at that rank;
    * 'active change'  - anything else.

    Per-rank counts go to ``<out_prefix>.summary.tsv``, per-genome
    classifications to ``<out_prefix>.taxon.tsv``, and the plot is saved
    as PNG and SVG, all in ``self.output_dir``.

    Parameters
    ----------
    bac120_metadata_file : str
        Tab-separated metadata file with a header row, read as UTF-8.
    ar120_metadata_file : str
        Second metadata file in the same format.
    all_genomes : bool
        If true every genome is considered; otherwise only rows whose
        'gtdb_representative' value is 't'.
    domain : str
        'Both' keeps every genome; otherwise a genome is kept when this
        string occurs in its first GTDB taxon.  'Bacteria' and 'Archaea'
        also add a suffix to the output prefix.

    Raises
    ------
    ZeroDivisionError
        If no selected genome has an NCBI taxonomy.
    """
    # parse GTDB metadata files to get the GTDB and NCBI taxonomy of
    # each selected genome
    self.logger.info('Reading GTDB metadata.')
    gtdb_taxonomy = {}
    ncbi_taxonomy = {}
    for mf in [bac120_metadata_file, ar120_metadata_file]:
        with open(mf, encoding='utf-8') as f:
            header = f.readline().strip().split('\t')

            gtdb_taxonomy_index = header.index('gtdb_taxonomy')
            ncbi_taxonomy_index = header.index('ncbi_taxonomy')
            gtdb_rep_index = header.index('gtdb_representative')

            for line in f:
                line_split = line.strip().split('\t')
                gid = line_split[0]

                gtdb_rep = line_split[gtdb_rep_index]
                if not (all_genomes or gtdb_rep == 't'):
                    continue

                taxonomy = line_split[gtdb_taxonomy_index]
                gtdb_taxa = [t.strip() for t in taxonomy.split(';')]
                gtdb_domain = gtdb_taxa[0]
                if not (domain == 'Both' or domain in gtdb_domain):
                    continue

                gtdb_taxonomy[gid] = gtdb_taxa

                # Only genomes kept above get an NCBI entry, so every key
                # of ncbi_taxonomy is also a key of gtdb_taxonomy.
                taxonomy = line_split[ncbi_taxonomy_index]
                if taxonomy != 'none':
                    ncbi_taxonomy[gid] = [t.strip() for t in taxonomy.split(';')]

    self.logger.info(
        ' ...identified {:,} GTDB genomes and {:,} NCBI genomes.'.format(
            len(gtdb_taxonomy), len(ncbi_taxonomy)))

    # compare NCBI and GTDB taxa at each rank
    out_prefix = f'gtdb_r{self.release_number}_ncbi_compare'
    if all_genomes:
        out_prefix += '.genomes'
    else:
        out_prefix += '.species'

    if domain == 'Bacteria':
        out_prefix += '.bacteria'
    elif domain == 'Archaea':
        out_prefix += '.archaea'

    self.logger.info('Comparing NCBI and GTDB taxa.')

    plot_unchanged = []
    plot_passive = []
    plot_active = []
    plot_labels = []

    active_change = set()
    active_change_no_species = set()
    active_or_passive_change = set()

    # Both tables are closed by the context manager on every exit path.
    with open(self.output_dir / f'{out_prefix}.summary.tsv', 'w') as fout, \
            open(self.output_dir / f'{out_prefix}.taxon.tsv', 'w') as fout_taxon:
        fout.write('Rank\tUnchanged\tPassive change\tActive change\n')
        fout_taxon.write('Genome ID\tNCBI taxon\tGTDB taxon\tClassification\n')

        for rank_index in range(1, 7):
            unchanged = 0
            passive = 0
            active = 0
            for gid, ncbi_taxa in ncbi_taxonomy.items():
                ncbi_taxon = ncbi_taxa[rank_index].replace('Candidatus ', '')
                gtdb_taxon = gtdb_taxonomy[gid][rank_index]

                if ncbi_taxon == gtdb_taxon:
                    unchanged += 1
                    classification = 'unchanged'
                elif ncbi_taxon == Taxonomy.rank_prefixes[rank_index]:
                    passive += 1
                    classification = 'passive change'
                    active_or_passive_change.add(gid)
                else:
                    active += 1
                    classification = 'active change'
                    active_change.add(gid)
                    active_or_passive_change.add(gid)
                    if rank_index != 6:
                        active_change_no_species.add(gid)

                fout_taxon.write(
                    f'{gid}\t{ncbi_taxon}\t{gtdb_taxon}\t{classification}\n')

            total_taxa = unchanged + passive + active

            # One value per header column: unchanged, passive, active.
            fout.write(Taxonomy.rank_labels[rank_index])
            fout.write(f'\t{unchanged}\t{passive}\t{active}\n')

            plot_unchanged.append(unchanged * 100.0 / total_taxa)
            plot_passive.append(passive * 100.0 / total_taxa)
            plot_active.append(active * 100.0 / total_taxa)
            plot_labels.append('{}'.format(
                Taxonomy.rank_labels[rank_index].capitalize()))

            # The y-axis label ends up carrying the total of the last rank.
            if all_genomes:
                ylabel = 'Genomes (%)\n({:,})'.format(total_taxa)
            else:
                ylabel = 'Representative Genomes (%)\n({:,})'.format(
                    total_taxa)

    self.logger.info(
        f'Identified {len(active_change_no_species):,} genomes with one or more active changes above the rank of species.'
    )
    self.logger.info(
        f'Identified {len(active_change):,} genomes with one or more active changes.'
    )
    self.logger.info(
        f'Identified {len(active_or_passive_change):,} genomes with one or more active or passive changes.'
    )

    # create plot
    self.logger.info('Creating plot.')
    options = AbstractPlot.Options(width=4,
                                   height=3,
                                   label_font_size=7,
                                   tick_font_size=6,
                                   dpi=600)
    plot = NCBI_ComparePlot(options)
    plot.plot(plot_unchanged, plot_passive, plot_active, plot_labels, ylabel)

    plot.save_plot(self.output_dir / f'{out_prefix}.png', dpi=600)
    plot.save_plot(self.output_dir / f'{out_prefix}.svg', dpi=600)
def __init__(self):
    """Initialize the plot with fixed default size, font size, and dpi."""
    option_fields = ['width', 'height', 'font_size', 'dpi']
    Options = namedtuple('Options', option_fields)

    default_options = Options(width=6, height=6, font_size=12, dpi=96)
    AbstractPlot.__init__(self, default_options)