Пример #1
0
    def __init__(self, options):
        """Set up the base plot, then reset the figure to the requested size.

        Parameters
        ----------
        options
            Passed unchanged to `AbstractPlot.__init__`. The figure size is
            read afterwards from `self.options.width` and
            `self.options.height` (presumably the same object as stored by
            the base class).
        """
        AbstractPlot.__init__(self, options)

        # start from an empty figure before applying the requested size
        figure = self.fig
        figure.clear()
        figure.set_size_inches(self.options.width, self.options.height)
Пример #2
0
    def __init__(self, dpi=96):
        """Create the plot with a fixed 6 x 6 layout and record the DPI.

        Parameters
        ----------
        dpi : int
            Value stored on the instance as `self.dpi`.
        """
        self.logger = logging.getLogger()

        # fixed options handed to the base class; the `dpi` field here is
        # always 96 and does not follow the `dpi` argument
        # NOTE(review): confirm whether Options.dpi should track `dpi`
        options_type = namedtuple(
            'Options',
            'width height tick_font_size label_font_size dpi')
        plot_options = options_type(width=6,
                                    height=6,
                                    tick_font_size=12,
                                    label_font_size=12,
                                    dpi=96)

        AbstractPlot.__init__(self, plot_options)

        self.dpi = dpi
Пример #3
0
    def __init__(self, dpi=96):
        """Create the plot with a fixed 6 x 6 layout and check dependencies.

        Parameters
        ----------
        dpi : int
            Value stored on the instance as `self.dpi`.
        """
        self.logger = logging.getLogger()

        # fixed options handed to the base class; the `dpi` field here is
        # always 96 and does not follow the `dpi` argument
        # NOTE(review): confirm whether Options.dpi should track `dpi`
        options_type = namedtuple(
            'Options',
            'width height tick_font_size label_font_size dpi')
        plot_options = options_type(width=6,
                                    height=6,
                                    tick_font_size=12,
                                    label_font_size=12,
                                    dpi=96)

        AbstractPlot.__init__(self, plot_options)

        self.dpi = dpi

        # the genometreetk dependency is checked last, after the plot is set up
        required_programs = ['genometreetk']
        check_dependencies(required_programs)
Пример #4
0
    def save_html(self, output_html):
        """Write the figure to an HTML file.

        Delegates to `AbstractPlot.save_html`, supplying
        `Tooltip.script_global` as the script and `Tooltip.html_body` as
        the body.

        Parameters
        ----------
        output_html : str
            Name of output file.
        """
        AbstractPlot.save_html(self,
                               output_html,
                               Tooltip.script_global,
                               Tooltip.html_body)
Пример #5
0
    def save_html(self, output_html):
        """Write the figure to an HTML file.

        The script and body passed to `AbstractPlot.save_html` are taken
        from `Tooltip.script_global` and `Tooltip.html_body`.

        Parameters
        ----------
        output_html : str
            Name of output file.
        """
        script, body = Tooltip.script_global, Tooltip.html_body
        AbstractPlot.save_html(self, output_html, script, body)
Пример #6
0
    def __init__(self, infile, outfile):
        """Initialize the plot, parse the input, and define the colour maps.

        Parameters
        ----------
        infile
            Input handed to `self._parse_data`.
        outfile
            Stored as `self.outfile`.
        """
        AbstractPlot.__init__(self, None)

        self.outfile = outfile
        # reset before parsing; presumably populated by _parse_data
        self.genomes = None
        self._parse_data(infile)

        # pylab's `bwr` colormap
        self.colormap = pylab.cm.bwr

        # 12-colour palette given as 8-bit RGB triples, scaled to [0, 1]
        rgb_palette = [
            (141, 211, 199), (255, 255, 179),
            (190, 186, 218), (251, 128, 114),
            (128, 177, 211), (253, 180, 98),
            (179, 222, 105), (252, 205, 229),
            (217, 217, 217), (188, 128, 189),
            (204, 235, 197), (255, 237, 111),
        ]
        self.discreteColourMap = ListedColormap(
            [(r / 255.0, g / 255.0, b / 255.0) for r, g, b in rgb_palette])
Пример #7
0
    def __init__(self, infile, outfile):
        """Initialize the plot, parse the input, and define the colour maps.

        Parameters
        ----------
        infile
            Input handed to `self._parse_data`.
        outfile
            Stored as `self.outfile`.
        """
        AbstractPlot.__init__(self, None)

        self.outfile = outfile
        # reset before parsing; presumably populated by _parse_data
        self.genomes = None
        self._parse_data(infile)

        # pylab's `bwr` colormap
        self.colormap = pylab.cm.bwr

        # 12-colour palette given as 8-bit RGB triples, scaled to [0, 1]
        rgb_palette = [
            (141, 211, 199), (255, 255, 179),
            (190, 186, 218), (251, 128, 114),
            (128, 177, 211), (253, 180, 98),
            (179, 222, 105), (252, 205, 229),
            (217, 217, 217), (188, 128, 189),
            (204, 235, 197), (255, 237, 111),
        ]
        scaled_palette = []
        for red, green, blue in rgb_palette:
            scaled_palette.append((red / 255.0, green / 255.0, blue / 255.0))
        self.discreteColourMap = ListedColormap(scaled_palette)
Пример #8
0
    def run(self, bac120_metadata_file, ar120_metadata_file):
        """Scatter plot showing quality of GTDB representative genomes.

        Reads metadata through `self.read_metadata`, collects completeness,
        contamination and MIMAG category for each genome with an assigned
        category, and saves the plot as PNG and SVG in `self.output_dir`.

        Parameters
        ----------
        bac120_metadata_file
            Bacterial metadata file passed to `self.read_metadata`.
        ar120_metadata_file
            Archaeal metadata file passed to `self.read_metadata`.
        """

        # get genome metadata
        self.logger.info('Reading GTDB metadata.')
        metadata = self.read_metadata(bac120_metadata_file,
                                      ar120_metadata_file)
        self.logger.info(
            f' ...read metadata for {len(metadata):,} representative genomes.')

        # get completeness, contamination, and MIMAG quality of each
        # GTDB species representative
        comp = []
        cont = []
        mimag_category = []
        exception_count = 0
        for m in metadata.values():
            if m.mimag_hq:
                category = 'hq'
            elif m.mimag_mq:
                category = 'mq'
            elif m.mimag_lq:
                category = 'lq'
            else:
                # representatives without a MIMAG category are counted but
                # not reported individually; this occurs because some
                # representatives have >10% contamination
                exception_count += 1
                continue

            mimag_category.append(category)
            comp.append(m.genome_comp)
            cont.append(m.genome_cont)

        hq_count = mimag_category.count('hq')
        mq_count = mimag_category.count('mq')
        lq_count = mimag_category.count('lq')
        self.logger.info(
            ' ...HQ = {:,}, MQ = {:,}, LQ = {:,}, exceptions = {:,}'.format(
                hq_count, mq_count, lq_count, exception_count))

        # create the genome quality plot
        options = AbstractPlot.Options(width=4,
                                       height=4,
                                       label_font_size=7,
                                       tick_font_size=6,
                                       dpi=600)
        plot = GenomeQualityPlot(options)

        plot.plot(comp,
                  cont,
                  mimag_category,
                  exception_count,
                  'Completeness (%)',
                  'Contamination (%)',
                  num_bins=25,
                  xlim=(49, 101),
                  ylim=(-0.2, 10.2))

        # save in both raster and vector form
        out_prefix = f'gtdb_r{self.release_number}_genome_quality.species'
        for extension in ('png', 'svg'):
            plot.save_plot(self.output_dir / f'{out_prefix}.{extension}',
                           dpi=600)
Пример #9
0
    def __init__(self, skip_mpld3=False, dpi=96, output_dir=None):
        """Configure the plot options, colours, and output settings.

        Parameters
        ----------
        skip_mpld3 : bool
            When false, `mpld3` is imported here, so a missing package
            raises ImportError during construction.
        dpi : int
            Value stored on the instance as `self.dpi`.
        output_dir
            Stored as `self.output_dir`.
        """

        def rgb(red, green, blue):
            """Scale 8-bit colour channels to the [0, 1] range."""
            return (red / 255, green / 255, blue / 255)

        self.logger = logging.getLogger()

        self.skip_mpld3 = skip_mpld3
        if not self.skip_mpld3:
            # imported only for its availability check; the name is unused
            import mpld3

        # fixed options handed to the base class; the `dpi` field here is
        # always 96 and does not follow the `dpi` argument
        # NOTE(review): confirm whether Options.dpi should track `dpi`
        options_type = namedtuple(
            'Options',
            'width height tick_font_size label_font_size dpi')
        plot_options = options_type(width=5,
                                    height=4,
                                    tick_font_size=12,
                                    label_font_size=12,
                                    dpi=96)

        AbstractPlot.__init__(self, plot_options)

        # colours by attribute name: poly, near_mono, mono
        self.poly_color = rgb(89.0, 89.0, 89.0)
        self.near_mono_color = rgb(255.0, 188.0, 121.0)
        self.mono_color = rgb(95.0, 158.0, 209.0)

        self.median_color = rgb(0.0, 107.0, 164.0)

        self.dpi = dpi
        self.output_dir = output_dir
Пример #10
0
 def __init__(self, options):
     """Initialize the plot by delegating to AbstractPlot.

     Parameters
     ----------
     options
         Passed unchanged to `AbstractPlot.__init__`.
     """
     AbstractPlot.__init__(self, options)
    def run(self,
            bac120_metadata_file,
            ar120_metadata_file,
            domain):
        """Tabulate and plot the type status of GTDB species representatives.

        Each representative is classified as a type strain of species, a
        Latinized name that is not a type strain, or a placeholder, and is
        counted per genome type (MAG, SAG, or ISOLATE). The classification
        is written to a TSV file and the counts are plotted to PNG and SVG
        files in `self.output_dir`.

        Parameters
        ----------
        bac120_metadata_file
            Tab-separated bacterial metadata file with a header row.
        ar120_metadata_file
            Tab-separated archaeal metadata file with a header row.
        domain : str
            'Both' keeps every representative; otherwise a representative
            is kept only when this string occurs in its GTDB domain taxon.
            'Bacteria' and 'Archaea' also add a suffix to the output names.

        Raises
        ------
        SystemExit
            If a species cluster representative has an unrecognized
            NCBI genome category.
        """

        # parse GTDB metadata file to determine genomes in each species cluster
        self.logger.info('Reading GTDB metadata.')
        gtdb_taxonomy = {}
        type_strain = set()
        genome_category = {}
        sp_clusters = defaultdict(set)
        for mf in [bac120_metadata_file, ar120_metadata_file]:
            with open(mf, encoding='utf-8') as f:
                header = f.readline().strip().split('\t')

                gtdb_taxonomy_index = header.index('gtdb_taxonomy')
                gtdb_type_index = header.index('gtdb_type_designation')
                gtdb_rep_index = header.index('gtdb_representative')
                gtdb_genome_rep_index = header.index('gtdb_genome_representative')
                genome_category_index = header.index('ncbi_genome_category')

                for line in f:
                    line_split = line.strip().split('\t')

                    gid = line_split[0]

                    # cluster membership and category are recorded for
                    # every genome, not only for representatives
                    gtdb_genome_rep = line_split[gtdb_genome_rep_index]
                    sp_clusters[gtdb_genome_rep].add(gid)
                    genome_category[gid] = line_split[genome_category_index]

                    if line_split[gtdb_rep_index] != 't':
                        continue

                    taxonomy = line_split[gtdb_taxonomy_index]
                    gtdb_taxa = [t.strip() for t in taxonomy.split(';')]
                    gtdb_domain = gtdb_taxa[0]
                    if domain == 'Both' or domain in gtdb_domain:
                        gtdb_taxonomy[gid] = gtdb_taxa

                    type_designation = line_split[gtdb_type_index]
                    if type_designation == 'type strain of species':
                        type_strain.add(gid)

        self.logger.info(' ...identified {:,} GTDB species representatives.'.format(len(gtdb_taxonomy)))

        # determine genome type of the representative of each species cluster
        category_to_type = {
            'derived from metagenome': 'MAG',
            'derived from environmental sample': 'MAG',
            'derived from environmental_sample': 'MAG',
            'derived from single cell': 'SAG',
            'none': 'ISOLATE',
        }
        sp_genome_types = {}
        for rid in sp_clusters:
            # UBA genomes are treated as MAGs without consulting the category
            if rid.startswith('UBA'):
                sp_genome_types[rid] = 'MAG'
                continue

            category = genome_category[rid]
            if category not in category_to_type:
                self.logger.error(f'Unrecognized genome category: {category}')
                sys.exit(-1)
            sp_genome_types[rid] = category_to_type[category]

        # determine type information for GTDB representatives,
        # and genome type category (isolate, MAG/SAG)
        out_prefix = f'gtdb_r{self.release_number}_sp_rep_type'
        if domain == 'Bacteria':
            out_prefix += '.bacteria'
        elif domain == 'Archaea':
            out_prefix += '.archaea'

        self.logger.info('Determining type information for GTDB representatives.')
        type_strain_categories = defaultdict(int)
        latinized_categories = defaultdict(int)
        placeholder_categories = defaultdict(int)

        # the context manager closes the table even if a lookup fails mid-way
        with open(self.output_dir / f'{out_prefix}.tsv', 'w') as fout:
            fout.write('Genome ID\tGTDB taxonomy\tGTDB species\tClassification\n')

            for gid, taxa in gtdb_taxonomy.items():
                gtdb_sp = taxa[6]

                if gid in type_strain:
                    classification = 'type strain of species'
                    category_counts = type_strain_categories
                elif self.latinized_species(gtdb_sp):
                    classification = 'Latinized, not type strain'
                    category_counts = latinized_categories
                else:
                    classification = 'placeholder'
                    category_counts = placeholder_categories

                category_counts[sp_genome_types[gid]] += 1
                fout.write('{}\t{}\t{}\t{}\n'.format(
                    gid, ';'.join(taxa), gtdb_sp, classification))

        # create plot
        self.logger.info('Creating plot.')
        options = AbstractPlot.Options(width=3,
                                       height=3,
                                       label_font_size=7,
                                       tick_font_size=6,
                                       dpi=600)
        plot = SpeciesRepTypePlot(options)
        plot.plot(type_strain_categories,
                  latinized_categories,
                  placeholder_categories)

        plot.save_plot(self.output_dir / f'{out_prefix}.png', dpi=600)
        plot.save_plot(self.output_dir / f'{out_prefix}.svg', dpi=600)
Пример #12
0
    def run(self,
            bac120_metadata_file,
            ar120_metadata_file,
            all_genomes,
            width,
            height):
        """Plot of common genomic statistics.

        Draws six histogram panels (genome size, GC content, SSU count, CDS
        count, coding density, contig count) on a 2 x 3 grid, saves the
        figure as PNG and SVG, and writes the summary statistics of each
        panel as TSV and HTML tables in `self.output_dir`.

        Parameters
        ----------
        bac120_metadata_file
            Bacterial metadata file passed to `self.read_metadata`.
        ar120_metadata_file
            Archaeal metadata file passed to `self.read_metadata`.
        all_genomes : bool
            Forwarded to `self.get_stat_data`; also selects the 'Genomes'
            or 'Species' axis label and output file suffix.
        width
            Figure width passed to `AbstractPlot.Options`.
        height
            Figure height passed to `AbstractPlot.Options`.
        """

        # get genome metadata
        self.logger.info('Reading GTDB metadata.')
        metadata = self.read_metadata(bac120_metadata_file, ar120_metadata_file)

        # create plot for each genomic statistic
        options = AbstractPlot.Options(width=width,
                                       height=height,
                                       label_font_size=7,
                                       tick_font_size=6,
                                       dpi=600)
        plot = GenomicStatsPlot(options, 2, 3)

        if all_genomes:
            ylabel = 'Genomes'
            out_prefix = f'gtdb_r{self.release_number}_genomic_stats.genomes'
        else:
            ylabel = 'Species'
            out_prefix = f'gtdb_r{self.release_number}_genomic_stats.species'

        table_data = [('',
                       'Median',
                       'Mean',
                       'Std. deviation',
                       'Min.',
                       'Max.',
                       '5th percentile',
                       '95th percentile')]

        # one entry per panel, in plotting order:
        #   (name used in the log message,
        #    x-axis label / table row label,
        #    positional arguments forwarded to get_stat_data after `metadata`
        #    -- presumably field, scale factor, lower and upper bound,
        #    extra keyword arguments for plot.plot)
        panels = [
            ('genome size', 'Genome size (Mb)',
             ('genome_size', 1e6, 0, 20), {}),
            ('GC content', 'GC content (%)',
             ('gc_percentage', 1, 20, 80), {}),
            ('SSU count', 'No. SSU genes',
             ('ssu_count', 1, -1, 12), {'center_xticks': True}),
            ('CDS count', 'No. CDSs',
             ('protein_count', 1, 100, 10000), {}),
            ('coding density', 'Coding density (%)',
             ('coding_density', 1, 70, 100), {}),
            ('contig count', 'No. contigs',
             ('contig_count', 1, 1, 1000), {}),
        ]

        for panel_index, panel in enumerate(panels, start=1):
            panel_name, xlabel, stat_args, plot_kwargs = panel

            self.logger.info(f'Creating {panel_name} plot.')
            data = self.get_stat_data(metadata, *stat_args, all_genomes)
            plot.plot(panel_index,
                      data,
                      xlabel,
                      f'{ylabel} ({len(data):,})',
                      **plot_kwargs)

            # summary statistics for this panel, all to one decimal place
            table_data.append((xlabel,
                               f'{np_median(data):.1f}',
                               f'{np_mean(data):.1f}',
                               f'{np_std(data):.1f}',
                               f'{min(data):.1f}',
                               f'{max(data):.1f}',
                               f'{np_percentile(data, 5):.1f}',
                               f'{np_percentile(data, 95):.1f}'))

        plot.save_plot(self.output_dir / f'{out_prefix}.png', dpi=600)
        plot.save_plot(self.output_dir / f'{out_prefix}.svg', dpi=600)

        # write out table
        with open(self.output_dir / f'{out_prefix}.tsv', 'w') as fout:
            for row in table_data:
                fout.write('{}\n'.format('\t'.join(row)))

        # render the HTML first so a failure does not leave an empty file
        num_columns = len(table_data[0])
        htmlcode = HTML.table(table_data,
                              col_align=['left'] + ['center'] * (num_columns - 1),
                              col_styles=['font-size: small'] * num_columns,
                              cellpadding=5)
        with open(self.output_dir / f'{out_prefix}.html', 'w') as fout:
            fout.write(htmlcode)
    def run(self, bac120_metadata_file, ar120_metadata_file, domain):
        """Plot nomenclatural status of species for each taxonomic rank.

        For rank indices 1 through 6 of the GTDB taxonomy of representative
        genomes, counts how many distinct taxa are Latinized versus
        placeholder names, writes the counts to a TSV file, and plots the
        percentages to PNG and SVG files in `self.output_dir`.

        Parameters
        ----------
        bac120_metadata_file
            Tab-separated bacterial metadata file with a header row.
        ar120_metadata_file
            Tab-separated archaeal metadata file with a header row.
        domain : str
            'Both' keeps every representative; otherwise a representative
            is kept only when this string occurs in its GTDB domain taxon.

        Raises
        ------
        ZeroDivisionError
            If no taxa are found at a rank (e.g. no representative matches
            `domain`).
        """

        # parse GTDB metadata file to get the taxonomy of each representative
        self.logger.info('Reading GTDB metadata.')
        gtdb_taxonomy = {}
        for mf in [bac120_metadata_file, ar120_metadata_file]:
            with open(mf, encoding='utf-8') as f:
                header = f.readline().strip().split('\t')

                gtdb_taxonomy_index = header.index('gtdb_taxonomy')
                gtdb_rep_index = header.index('gtdb_representative')

                for line in f:
                    line_split = line.strip().split('\t')

                    gid = line_split[0]

                    if line_split[gtdb_rep_index] != 't':
                        continue

                    taxonomy = line_split[gtdb_taxonomy_index]
                    gtdb_taxa = [t.strip() for t in taxonomy.split(';')]

                    gtdb_domain = gtdb_taxa[0]
                    if domain == 'Both' or domain in gtdb_domain:
                        gtdb_taxonomy[gid] = gtdb_taxa

        self.logger.info(' ...identified {:,} representative genomes.'.format(
            len(gtdb_taxonomy)))

        # get GTDB taxa at each rank
        gtdb_taxa_at_rank = defaultdict(set)
        for taxa in gtdb_taxonomy.values():
            for rank, taxon in enumerate(taxa):
                gtdb_taxa_at_rank[rank].add(taxon)

        # determine nomenclatural category for each taxon at each rank
        out_prefix = f'gtdb_r{self.release_number}_nomenclatural_per_rank'
        self.logger.info('Determining nomenclatural type of taxa.')

        plot_latinized = []
        plot_placeholder = []
        plot_labels = []

        # the context manager closes the table even if a rank has no taxa
        with open(self.output_dir / f'{out_prefix}.species.tsv', 'w') as fout:
            for rank_index in range(1, 7):
                rank_label = Taxonomy.rank_labels[rank_index]

                # rank index 6 uses the species-specific test
                if rank_index == 6:
                    is_latinized = self.latinized_species
                else:
                    is_latinized = self.latinized_taxon

                rank_taxa = gtdb_taxa_at_rank[rank_index]
                latinized = sum(1 for taxon in rank_taxa
                                if is_latinized(taxon))
                placeholder = len(rank_taxa) - latinized
                total_taxa = latinized + placeholder

                fout.write(
                    f'{rank_label}\t{total_taxa}\t{latinized}\t{placeholder}\n')

                plot_latinized.append(latinized * 100.0 / total_taxa)
                plot_placeholder.append(placeholder * 100.0 / total_taxa)
                plot_labels.append('{}\n{:,}'.format(
                    rank_label.capitalize(), total_taxa))

        # create plot
        self.logger.info('Creating plot.')
        options = AbstractPlot.Options(width=4,
                                       height=3,
                                       label_font_size=7,
                                       tick_font_size=6,
                                       dpi=600)
        plot = NomenclaturalPerRankPlot(options)
        plot.plot(plot_latinized, plot_placeholder, plot_labels)

        plot.save_plot(self.output_dir / f'{out_prefix}.png', dpi=600)
        plot.save_plot(self.output_dir / f'{out_prefix}.svg', dpi=600)
Пример #14
0
    def run(self, metadata_file, msa_info_file, genome_info_file,
            output_prefix):
        """Create plots of genome completeness, genome contamination, and percent MSA.

        Writes a per-genome table to `<output_prefix>_table.tsv` and a
        three-panel histogram to `<output_prefix>.png`.

        Parameters
        ----------
        metadata_file : str
            Tab-separated file with header columns 'checkm_completeness',
            'checkm_contamination' and 'gtdb_type_designation'; the genome
            ID is read from the first column.
        msa_info_file : str
            Tab-separated file with header column 'Amino acids (%)'; the
            genome ID is read from the first column.
        genome_info_file : str
            Tab-separated file with header column 'Species'; the genome ID
            is read from the first column.
        output_prefix : str
            Prefix of the output table and plot file names.

        Raises
        ------
        KeyError
            If a genome in the MSA file is missing from the metadata or
            genome information file.
        """

        # read metadata
        comp = {}
        cont = {}
        type_designation = {}
        with open(metadata_file) as f:
            headers = f.readline().strip().split('\t')

            comp_index = headers.index('checkm_completeness')
            cont_index = headers.index('checkm_contamination')
            type_index = headers.index('gtdb_type_designation')

            for line in f:
                line_split = line.strip().split('\t')

                gid = line_split[0]
                comp[gid] = float(line_split[comp_index])
                cont[gid] = float(line_split[cont_index])
                type_designation[gid] = line_split[type_index]

        # read MSA info
        msa_perc = {}
        with open(msa_info_file) as f:
            headers = f.readline().strip().split('\t')

            msa_perc_index = headers.index('Amino acids (%)')

            for line in f:
                line_split = line.strip().split('\t')

                gid = line_split[0]
                msa_perc[gid] = float(line_split[msa_perc_index])

        # read species information
        sp = {}
        with open(genome_info_file) as f:
            headers = f.readline().strip().split('\t')

            sp_index = headers.index('Species')

            for line in f:
                line_split = line.strip().split('\t')

                gid = line_split[0]
                sp[gid] = line_split[sp_index]

        # write out statistics to file; the context manager guarantees the
        # table is flushed and closed before plotting
        with open(output_prefix + '_table.tsv', 'w') as fout:
            # header spelling ('Completenss') is kept as-is so existing
            # consumers of this table are unaffected
            fout.write(
                'Genome ID\tCompletenss (%)\tContamination (%)\tMSA completenss (%)\tSpecies\tGTDB type designation\n'
            )
            # only genomes present in the MSA file are reported
            for gid in msa_perc:
                fout.write(
                    '%s\t%.2f\t%.2f\t%.2f\t%s\t%s\n' %
                    (gid, comp[gid], cont[gid], msa_perc[gid], sp[gid],
                     type_designation[gid]))

        # plot stats
        options = AbstractPlot.Options(6, 7.5, 10, 8, 300)

        hist = Histogram(options)
        hist.plot(1, msa_perc.values(), 'MSA completeness (%)', 'Genomes (%)',
                  range(0, 101, 5), 'blue')
        hist.plot(2, comp.values(), 'Completeness (%)', 'Genomes (%)',
                  range(0, 101, 5), 'blue')
        hist.plot(3, cont.values(), 'Contamination (%)', 'Genomes (%)',
                  range(0, 101, 5), 'blue')
        hist.save_plot(output_prefix + '.png')
Пример #15
0
 def __init__(self):
     """Initialize the plot by delegating to AbstractPlot with no options."""
     AbstractPlot.__init__(self, None)
    def run(self, bac120_metadata_file, ar120_metadata_file):
        """Plot number of MAGs, SAGs, and isolates for each taxonomic rank.

        For rank indices 1 through 6, each taxon is classified as isolate,
        environmental, or both, based on the categories of the species it
        contains. The counts and the taxon names are written to two TSV
        files, and the percentages (plus a final per-genome bar) are plotted
        to PNG and SVG files in `self.output_dir`.

        Parameters
        ----------
        bac120_metadata_file
            Tab-separated bacterial metadata file with a header row.
        ar120_metadata_file
            Tab-separated archaeal metadata file with a header row.

        Raises
        ------
        SystemExit
            If the species in a taxon yield none of the categories 'ENV',
            'ISOLATE' or 'BOTH'.
        ZeroDivisionError
            If a rank has no taxa or no genomes were read.
        """

        # parse GTDB metadata file to determine genomes in each species cluster
        # and the type of these genomes (MAG, SAG, or isolate)
        self.logger.info('Reading GTDB metadata.')
        gtdb_taxonomy = {}
        sp_clusters = defaultdict(set)
        genome_category = {}
        for mf in [bac120_metadata_file, ar120_metadata_file]:
            with open(mf, encoding='utf-8') as f:
                header = f.readline().strip().split('\t')

                gtdb_taxonomy_index = header.index('gtdb_taxonomy')
                gtdb_rep_index = header.index('gtdb_genome_representative')
                genome_category_index = header.index('ncbi_genome_category')

                for line in f:
                    line_split = line.strip().split('\t')

                    gid = line_split[0]

                    # skip genomes without a taxonomy or a representative
                    taxonomy = line_split[gtdb_taxonomy_index]
                    gtdb_rep = line_split[gtdb_rep_index]
                    if taxonomy == 'none' or gtdb_rep == 'none':
                        continue

                    gtdb_taxa = [t.strip() for t in taxonomy.split(';')]
                    gtdb_taxonomy[gid] = gtdb_taxa

                    # clusters are keyed by the species name (rank index 6)
                    sp_clusters[gtdb_taxa[6]].add(gid)

                    genome_category[gid] = line_split[genome_category_index]

        self.logger.info(' ...identified {:,} species clusters spanning {:,} genomes.'.format(
            len(sp_clusters),
            sum(len(gids) for gids in sp_clusters.values())))

        # determine genome types in each species cluster
        sp_genome_types = {
            sp: sp_cluster_type_category(gids, genome_category)
            for sp, gids in sp_clusters.items()
        }

        # get species in each taxon
        sp_in_taxa = defaultdict(lambda: defaultdict(set))
        for taxa in gtdb_taxonomy.values():
            for rank_index in range(1, 7):
                cur_taxon = taxa[rank_index]
                if rank_index < 5:
                    # canonicalize names above genus
                    cur_taxon = canonical_taxon_name(cur_taxon)
                sp_in_taxa[rank_index][cur_taxon].add(taxa[6])

        # tabulate number of genome types at each rank
        out_prefix = f'gtdb_r{self.release_number}_genome_category_per_rank'
        self.logger.info('Tabulating genomes types at each rank.')

        plot_both = []
        plot_isolate = []
        plot_env = []
        plot_labels = []

        # context managers close both tables even on an early exit
        count_path = self.output_dir / f'{out_prefix}.tsv'
        taxa_path = self.output_dir / f'{out_prefix}.taxa.tsv'
        with open(count_path, 'w') as fout_count, \
                open(taxa_path, 'w') as fout_taxa:
            fout_count.write('Rank\tNo. taxa\tBoth\tIsolate\tEnvironmental\n')
            fout_taxa.write('Rank\tNo. taxa\tBoth\tIsolate\tEnvironmental\n')

            for rank_index in range(1, 7):
                rank_label = Taxonomy.rank_labels[rank_index]

                both = set()
                env = set()
                isolate = set()
                for taxon, species in sp_in_taxa[rank_index].items():
                    taxon_categories = {sp_genome_types[sp] for sp in species}

                    if (('ENV' in taxon_categories and 'ISOLATE' in taxon_categories)
                            or 'BOTH' in taxon_categories):
                        both.add(taxon)
                    elif 'ISOLATE' in taxon_categories:
                        isolate.add(taxon)
                    elif 'ENV' in taxon_categories:
                        env.add(taxon)
                    else:
                        self.logger.error(f'Genomes in species have an unassigned category: {taxon_categories}')
                        sys.exit(-1)

                total_taxa = len(both) + len(isolate) + len(env)
                fout_count.write(
                    f'{rank_label}\t{total_taxa}\t{len(both)}\t{len(isolate)}\t{len(env)}\n')

                fout_taxa.write('{}\t{}\t{}\t{}\t{}\n'.format(
                    rank_label,
                    total_taxa,
                    ', '.join(sorted(both)),
                    ', '.join(sorted(isolate)),
                    ', '.join(sorted(env))))

                plot_both.append(len(both) * 100.0 / total_taxa)
                plot_isolate.append(len(isolate) * 100.0 / total_taxa)
                plot_env.append(len(env) * 100.0 / total_taxa)
                plot_labels.append('{}\n{:,}'.format(
                    rank_label.capitalize(),
                    total_taxa))

            # final row and bar: individual genomes, which are either
            # environmental or isolate and never "both"
            total_genomes = len(genome_category)
            env_genomes = sum(1 for c in genome_category.values()
                              if c in ENV_CATEGORIES)
            isolate_genomes = total_genomes - env_genomes
            fout_count.write('Genomes\t{}\t{}\t{}\t{}\n'.format(
                total_genomes,
                0,
                isolate_genomes,
                env_genomes))

        plot_both.append(0)
        plot_isolate.append(isolate_genomes * 100.0 / total_genomes)
        plot_env.append(env_genomes * 100.0 / total_genomes)
        plot_labels.append('{}\n{:,}'.format(
            'Genomes',
            total_genomes))

        # create plot
        self.logger.info('Creating plot.')
        options = AbstractPlot.Options(width=4,
                                       height=3,
                                       label_font_size=7,
                                       tick_font_size=6,
                                       dpi=600)
        plot = GenomeCateogryPerRankPlot(options)
        plot.plot(plot_both, plot_isolate, plot_env, plot_labels)

        plot.save_plot(self.output_dir / f'{out_prefix}.png', dpi=600)
        plot.save_plot(self.output_dir / f'{out_prefix}.svg', dpi=600)
Пример #17
0
    def run(self, bac120_metadata_file, ar120_metadata_file, all_genomes,
            domain):
        """Bar plot comparing GTDB and NCBI taxonomies.

        Reads two GTDB metadata files, classifies the NCBI taxon of each
        genome at every rank below domain as unchanged, passively changed or
        actively changed relative to GTDB, writes two TSV tables and saves
        the resulting bar plot as PNG and SVG in `self.output_dir`.

        Parameters
        ----------
        bac120_metadata_file : str
            Tab-separated metadata file with a header row containing the
            'gtdb_taxonomy', 'ncbi_taxonomy' and 'gtdb_representative'
            columns; the genome ID is taken from the first column.
        ar120_metadata_file : str
            Second metadata file with the same layout.
        all_genomes : bool
            If true, every genome is considered; otherwise only rows with
            'gtdb_representative' equal to 't'.
        domain : str
            'Both' to keep every genome, otherwise a string that must occur
            in the GTDB domain taxon of a genome for it to be retained.
        """

        # parse GTDB metadata files to obtain the GTDB and NCBI taxonomy
        # of each genome of interest
        self.logger.info('Reading GTDB metadata.')
        gtdb_taxonomy = {}
        ncbi_taxonomy = {}
        for mf in [bac120_metadata_file, ar120_metadata_file]:
            with open(mf, encoding='utf-8') as f:
                header = f.readline().strip().split('\t')

                gtdb_taxonomy_index = header.index('gtdb_taxonomy')
                ncbi_taxonomy_index = header.index('ncbi_taxonomy')
                gtdb_rep_index = header.index('gtdb_representative')

                for line in f:
                    line_split = line.strip().split('\t')

                    gid = line_split[0]

                    gtdb_rep = line_split[gtdb_rep_index]
                    if not (all_genomes or gtdb_rep == 't'):
                        continue

                    taxonomy = line_split[gtdb_taxonomy_index]
                    gtdb_taxa = [t.strip() for t in taxonomy.split(';')]
                    gtdb_domain = gtdb_taxa[0]
                    if domain == 'Both' or domain in gtdb_domain:
                        gtdb_taxonomy[gid] = gtdb_taxa

                        # NCBI taxonomy is only recorded for genomes that
                        # also passed the GTDB filters above, so every key
                        # of ncbi_taxonomy is also a key of gtdb_taxonomy
                        taxonomy = line_split[ncbi_taxonomy_index]
                        if taxonomy != 'none':
                            ncbi_taxa = [
                                t.strip() for t in taxonomy.split(';')
                            ]
                            ncbi_taxonomy[gid] = ncbi_taxa

        self.logger.info(
            f' ...identified {len(gtdb_taxonomy):,} GTDB genomes'
            f' and {len(ncbi_taxonomy):,} NCBI genomes.')

        # build output prefix reflecting the genome set and domain
        out_prefix = f'gtdb_r{self.release_number}_ncbi_compare'
        if all_genomes:
            out_prefix += '.genomes'
        else:
            out_prefix += '.species'

        if domain == 'Bacteria':
            out_prefix += '.bacteria'
        elif domain == 'Archaea':
            out_prefix += '.archaea'

        # compare NCBI and GTDB taxa at each rank
        self.logger.info('Comparing NCBI and GTDB taxa.')
        summary_file = self.output_dir / f'{out_prefix}.summary.tsv'
        taxon_file = self.output_dir / f'{out_prefix}.taxon.tsv'

        plot_unchanged = []
        plot_passive = []
        plot_active = []
        plot_labels = []
        active_change = set()
        active_change_no_species = set()
        active_or_passive_change = set()
        total_taxa = 0

        # context managers guarantee both tables are closed even if a
        # malformed taxonomy string raises part way through
        with open(summary_file, 'w') as fout, \
                open(taxon_file, 'w') as fout_taxon:
            fout.write('Rank\tUnchanged\tPassive change\tActive change\n')
            fout_taxon.write(
                'Genome ID\tNCBI taxon\tGTDB taxon\tClassification\n')

            # index 0 is the domain, which is skipped; index 6 is treated
            # as the species rank below
            for rank_index in range(1, 7):
                rank_label = Taxonomy.rank_labels[rank_index]
                fout.write(rank_label)

                unchanged = 0
                passive = 0
                active = 0
                for gid, ncbi_taxa in ncbi_taxonomy.items():
                    ncbi_taxon = ncbi_taxa[rank_index].replace(
                        'Candidatus ', '')
                    gtdb_taxon = gtdb_taxonomy[gid][rank_index]

                    if ncbi_taxon == gtdb_taxon:
                        unchanged += 1
                        classification = 'unchanged'
                    elif ncbi_taxon == Taxonomy.rank_prefixes[rank_index]:
                        # NCBI taxon is only the bare rank prefix, i.e.
                        # no name was given at this rank
                        passive += 1
                        classification = 'passive change'
                        active_or_passive_change.add(gid)
                    else:
                        active += 1
                        classification = 'active change'
                        active_change.add(gid)
                        active_or_passive_change.add(gid)

                        if rank_index != 6:
                            active_change_no_species.add(gid)

                    fout_taxon.write(
                        f'{gid}\t{ncbi_taxon}\t{gtdb_taxon}'
                        f'\t{classification}\n')

                total_taxa = unchanged + passive + active

                # columns must follow the header order:
                # unchanged, passive change, active change
                fout.write(f'\t{unchanged}\t{passive}\t{active}\n')

                # all counts are zero when no genome has an NCBI taxonomy;
                # dividing by 1 then yields 0% instead of raising
                denominator = max(total_taxa, 1)
                plot_unchanged.append(unchanged * 100.0 / denominator)
                plot_passive.append(passive * 100.0 / denominator)
                plot_active.append(active * 100.0 / denominator)
                plot_labels.append(rank_label.capitalize())

        if not total_taxa:
            self.logger.warning(
                'No genomes with an NCBI taxonomy were identified.')

        if all_genomes:
            ylabel = f'Genomes (%)\n({total_taxa:,})'
        else:
            ylabel = f'Representative Genomes (%)\n({total_taxa:,})'

        self.logger.info(
            f'Identified {len(active_change_no_species):,} genomes with one or more active changes above the rank of species.'
        )
        self.logger.info(
            f'Identified {len(active_change):,} genomes with one or more active changes.'
        )
        self.logger.info(
            f'Identified {len(active_or_passive_change):,} genomes with one or more active or passive changes.'
        )

        # create plot
        self.logger.info('Creating plot.')
        options = AbstractPlot.Options(width=4,
                                       height=3,
                                       label_font_size=7,
                                       tick_font_size=6,
                                       dpi=600)
        plot = NCBI_ComparePlot(options)
        plot.plot(plot_unchanged, plot_passive, plot_active, plot_labels,
                  ylabel)

        plot.save_plot(self.output_dir / f'{out_prefix}.png', dpi=600)
        plot.save_plot(self.output_dir / f'{out_prefix}.svg', dpi=600)
Пример #18
0
 def __init__(self, options):
     """Initialize the plot by forwarding the options to AbstractPlot.

     Parameters
     ----------
     options : object
         Plot settings passed unchanged to AbstractPlot.__init__.
     """
     AbstractPlot.__init__(self, options)
Пример #19
0
    def __init__(self):
        """Set up the plot with a fixed set of default options.

        The options passed to AbstractPlot are width=6, height=6,
        font_size=12 and dpi=96.
        """
        Options = namedtuple('Options',
                             ['width', 'height', 'font_size', 'dpi'])

        AbstractPlot.__init__(
            self, Options(width=6, height=6, font_size=12, dpi=96))
Пример #20
0
    def __init__(self):
        """Create the plot using hard-coded default options.

        AbstractPlot receives a named tuple holding the width (6),
        height (6), font size (12) and dpi (96).
        """
        field_names = ('width', 'height', 'font_size', 'dpi')
        Options = namedtuple('Options', field_names)
        default_options = Options(6, 6, 12, 96)

        AbstractPlot.__init__(self, default_options)
Пример #21
0
 def __init__(self):
     """Initialize the plot, passing None as the options to AbstractPlot."""
     AbstractPlot.__init__(self, None)