示例#1
0
    def __worker(self,
                    metadata_file,
                    nt_files,
                    max_genomes,
                    queue_in,
                    queue_out):
        """Process each species in parallel.

        Consumes (species, genome_ids) tuples from queue_in until a tuple
        whose first element is None is received. For each species, the
        external `ani_calculator` program is run on every pair of genomes
        and the tuple (species, ani, af, genome_ids, results) is placed on
        queue_out.

        Parameters
        ----------
        metadata_file
            GTDB metadata file passed to read_gtdb_metadata.
        nt_files
            Mapping from genome ID to the nucleotide file given to
            `ani_calculator`.
        max_genomes
            Maximum number of genomes compared per species; when a species
            has more, only the highest scoring ones
            (completeness - 5*contamination) are kept.
        queue_in
            Queue yielding (species, genome_ids) work items.
        queue_out
            Queue receiving one result tuple per species.
        """

        metadata = read_gtdb_metadata(metadata_file, ['checkm_completeness',
                                                        'checkm_contamination'])

        genome_quality = {genome_id: m.checkm_completeness - 5*m.checkm_contamination
                            for genome_id, m in metadata.items()}

        while True:
            species, genome_ids = queue_in.get(block=True, timeout=None)
            if species is None:
                # sentinel indicating there is no more work
                break

            # select highest quality genomes
            if len(genome_ids) > max_genomes:
                # Iterate genome_quality (not genome_ids) so ties keep being
                # resolved in metadata order; the set only speeds up lookups.
                candidates = set(genome_ids)
                t = [(genome_id, q)
                        for genome_id, q in genome_quality.items()
                        if genome_id in candidates]
                hq_genomes = sorted(t, key=lambda x: x[1], reverse=True)[0:max_genomes]
                genome_ids = [x[0] for x in hq_genomes]

            ani = []
            af = []
            result_rows = []
            tmp_dir = tempfile.mkdtemp()
            try:
                for gi, gj in itertools.combinations(genome_ids, 2):
                    # only the unique file name is needed; ani_calculator
                    # writes the actual content
                    tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=tmp_dir)
                    tmp_file.close()
                    cmd = ('ani_calculator ' +
                            '-genome1fna %s ' +
                            '-genome2fna %s ' +
                            '-outfile %s -outdir %s ' +
                            '> /dev/null') % (nt_files[gi],
                                                nt_files[gj],
                                                tmp_file.name,
                                                tmp_dir)

                    os.system(cmd)
                    with open(tmp_file.name) as f:
                        f.readline()  # skip first line (presumably a header)
                        for line in f:
                            result_rows.append('%s\t%s' % (species, line))
                            line_split = line.strip().split('\t')
                            # columns 2-3 and 4-5 are averaged; presumably the
                            # ANI and AF values in each direction -- TODO confirm
                            ani.append(0.5*(float(line_split[2]) + float(line_split[3])))
                            af.append(0.5*(float(line_split[4]) + float(line_split[5])))
            finally:
                # always remove the scratch directory, even if a comparison fails
                shutil.rmtree(tmp_dir)

            results = ''.join(result_rows)
            queue_out.put((species, ani, af, genome_ids, results))
示例#2
0
def read_quality_metadata(metadata_file):
    """Load the metadata fields used when judging genome quality.

    The fixed list of field names below is requested from
    read_gtdb_metadata and its result is returned unchanged.
    """

    quality_fields = [
        'gtdb_taxonomy',
        'checkm_completeness',
        'checkm_contamination',
        'checkm_strain_heterogeneity_100',
        'genome_size',
        'contig_count',
        'n50_contigs',
        'scaffold_count',
        'ambiguous_bases',
        'total_gap_length',
        'ssu_count',
        'ssu_length',
        'mimag_high_quality',
        'ncbi_assembly_level',
        'ncbi_genome_representation',
        'ncbi_refseq_category',
        'ncbi_type_material_designation',
        'ncbi_molecule_count',
        'ncbi_unspanned_gaps',
        'ncbi_spanned_gaps',
        'ncbi_genome_category',
    ]
    return read_gtdb_metadata(metadata_file, quality_fields)
    def _gtdb_user_genomes(self, gtdb_user_genomes_file, metadata_file):
        """Get map between GTDB User genomes and GenBank accessions.

        Parameters
        ----------
        gtdb_user_genomes_file
            Tab-separated file whose first column is read as the GenBank
            accession and whose fifth column is read as the UBA identifier.
        metadata_file
            GTDB metadata file; only the 'organism_name' field is read.

        Returns
        -------
        dict
            Maps each genome ID whose organism name contains '(UBA' and
            whose UBA identifier appears in gtdb_user_genomes_file to the
            corresponding GenBank accession.
        """

        uba_to_genbank = {}
        # use a context manager so the file handle is released promptly
        with open(gtdb_user_genomes_file) as f:
            for line in f:
                line_split = line.strip().split('\t')
                gb_acc = line_split[0]
                uba_id = line_split[4]
                uba_to_genbank[uba_id] = gb_acc

        user_to_genbank = {}
        m = read_gtdb_metadata(metadata_file, ['organism_name'])
        for gid, metadata in m.items():
            # str() guards against a missing (non-string) organism name
            if '(UBA' in str(metadata.organism_name):
                # take the text between the first '(' and the final character,
                # which is assumed to be the closing ')'
                uba_id = metadata.organism_name[metadata.organism_name.find('(')+1:-1]
                if uba_id in uba_to_genbank:
                    user_to_genbank[gid] = uba_to_genbank[uba_id]

        return user_to_genbank
    def _gtdb_user_genomes(self, gtdb_user_genomes_file, metadata_file):
        """Get map between GTDB User genomes and GenBank accessions.

        The first column of each tab-separated line in
        gtdb_user_genomes_file is read as a GenBank accession and the
        fifth column as a UBA identifier. Genomes in metadata_file whose
        'organism_name' contains '(UBA' are then mapped to the GenBank
        accession recorded for their UBA identifier, when one exists.

        Returns a dict from genome ID to GenBank accession.
        """

        uba_to_genbank = {}
        # a with-block closes the file even if a line is malformed
        with open(gtdb_user_genomes_file) as fin:
            for line in fin:
                line_split = line.strip().split('\t')
                gb_acc = line_split[0]
                uba_id = line_split[4]
                uba_to_genbank[uba_id] = gb_acc

        user_to_genbank = {}
        m = read_gtdb_metadata(metadata_file, ['organism_name'])
        for gid, metadata in m.items():
            organism_name = metadata.organism_name
            # str() guards against a missing (non-string) organism name
            if '(UBA' in str(organism_name):
                # text after the first '(' up to, but excluding, the last
                # character (assumed to be the closing ')')
                uba_id = organism_name[organism_name.find('(') + 1:-1]
                if uba_id in uba_to_genbank:
                    user_to_genbank[gid] = uba_to_genbank[uba_id]

        return user_to_genbank
    def _genome_stats(self, metadata_file):
        """Read genome and assembly quality metadata.

        Requests a fixed set of quality, assembly and taxonomy fields from
        read_gtdb_metadata and returns its result unchanged.
        """

        stat_fields = [
            'checkm_completeness',
            'checkm_contamination',
            'contig_count',
            'n50_scaffolds',
            'ambiguous_bases',
            'total_gap_length',
            'scaffold_count',
            'ssu_count',
            'gtdb_taxonomy',
            'ncbi_molecule_count',
            'ncbi_unspanned_gaps',
            'ncbi_genome_representation',
            'ncbi_spanned_gaps',
            'ncbi_assembly_level',
            'ncbi_taxonomy',
            'ncbi_organism_name',
            'lpsn_strain',
        ]

        return read_gtdb_metadata(metadata_file, stat_fields)
def read_quality_metadata(metadata_file):
    """Fetch the metadata fields required for assessing genome quality.

    The field list is assembled from thematic groups (order preserved)
    and handed to read_gtdb_metadata, whose result is returned as is.
    """

    fields = ['gtdb_taxonomy']

    # CheckM estimates
    fields += ['checkm_completeness',
               'checkm_contamination',
               'checkm_strain_heterogeneity_100']

    # assembly statistics
    fields += ['genome_size',
               'contig_count',
               'n50_contigs',
               'scaffold_count',
               'ambiguous_bases',
               'total_gap_length',
               'ssu_count',
               'ssu_length',
               'mimag_high_quality']

    # NCBI assembly annotations
    fields += ['ncbi_assembly_level',
               'ncbi_genome_representation',
               'ncbi_refseq_category',
               'ncbi_type_material_designation',
               'ncbi_molecule_count',
               'ncbi_unspanned_gaps',
               'ncbi_spanned_gaps',
               'ncbi_genome_category']

    return read_gtdb_metadata(metadata_file, fields)
示例#7
0
    def run(self, metadata_file,
                gtdb_user_genomes_file,
                gtdb_user_reps,
                ncbi_refseq_assembly_file,
                ncbi_genbank_assembly_file,
                gtdb_domain_report,
                qc_exception_file,
                species_exception_file,
                min_comp,
                max_cont,
                min_quality,
                sh_exception,
                min_perc_markers,
                max_contigs,
                min_N50,
                max_ambiguous,
                output_dir):
        """Quality check all potential GTDB genomes.

        Every genome in the metadata file is evaluated with pass_qc and
        written to either 'qc_passed.tsv' or 'qc_failed.tsv'. Genomes are
        then re-evaluated per NCBI species in order to report species whose
        type genomes all fail QC ('type_genomes_fail_qc.tsv'), species with
        no non-type genome passing QC ('species_lost.tsv'), and the failing
        genomes of those species ('species_fail_qc.tsv'). All reports are
        written to output_dir.

        Parameters
        ----------
        metadata_file
            GTDB metadata file.
        gtdb_user_genomes_file
            File passed to _gtdb_user_genomes to map User genomes to
            GenBank accessions.
        gtdb_user_reps
            Two-column, tab-separated file of genome ID and taxonomy string.
            Genomes lacking a GenBank accession are retained unless their
            taxonomy contains 'd__Bacteria'.
        ncbi_refseq_assembly_file, ncbi_genbank_assembly_file
            NCBI assembly files passed to exclude_from_refseq.
        gtdb_domain_report
            File passed to parse_marker_percentages.
        qc_exception_file
            Tab-separated file; genomes named in the first column are
            retained even if they fail QC.
        species_exception_file
            File passed to read_gtdb_ncbi_taxonomy.
        min_comp, max_cont, min_quality, sh_exception, min_perc_markers,
        max_contigs, min_N50, max_ambiguous
            QC thresholds forwarded unchanged to pass_qc.
        output_dir
            Directory receiving the report files.
        """

        # keys reported for each genome, in output column order
        test_names = ('comp', 'cont', 'qual', 'marker_perc', 'contig_count', 'N50', 'ambig')

        # get GTDB and NCBI taxonomy strings for each genome
        self.logger.info('Reading NCBI taxonomy from GTDB metadata file.')
        ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(metadata_file, species_exception_file)
        ncbi_species = binomial_species(ncbi_taxonomy)
        gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
        self.logger.info('Read NCBI taxonomy for %d genomes with %d manually defined updates.' % (len(ncbi_taxonomy), ncbi_update_count))
        self.logger.info('Read GTDB taxonomy for %d genomes.' % len(gtdb_taxonomy))

        # determine User genomes to retain for consideration
        gtdb_user_to_genbank = self._gtdb_user_genomes(gtdb_user_genomes_file, metadata_file)
        self.logger.info('Identified %d GTDB User genomes with GenBank accessions to retain for potential inclusion in GTDB.' % len(gtdb_user_to_genbank))

        user_genomes = 0
        with open(gtdb_user_reps) as f:
            for line in f:
                line_split = line.strip().split('\t')
                gid, taxonomy = line_split
                if gid not in gtdb_user_to_genbank:
                    if 'd__Bacteria' in taxonomy:
                        self.logger.warning('Bacterial genome %s has no NCBI accession and is being skipped.' % gid)
                    else:
                        # retained under its own ID as no accession exists
                        gtdb_user_to_genbank[gid] = gid
                        user_genomes += 1
        self.logger.info('Identified %d archaeal GTDB User genome WITHOUT GenBank accessions to retain for potential inclusion in GTDB.' % user_genomes)

        # parse genomes flagged as exceptions from QC
        qc_exceptions = set()
        with open(qc_exception_file) as f:
            for line in f:
                qc_exceptions.add(line.split('\t')[0].strip())
        self.logger.info('Identified %d genomes flagged as exceptions from QC.' % len(qc_exceptions))

        # calculate quality score for genomes
        self.logger.info('Parsing QC statistics for each genome.')
        quality_metadata = read_gtdb_metadata(metadata_file, ['checkm_completeness',
                                                                'checkm_contamination',
                                                                'checkm_strain_heterogeneity_100',
                                                                'contig_count',
                                                                'n50_contigs',
                                                                'ambiguous_bases',
                                                                'genome_size'])

        marker_perc = parse_marker_percentages(gtdb_domain_report)

        # parse NCBI assembly files
        self.logger.info('Parsing NCBI assembly files.')
        excluded_from_refseq_note = exclude_from_refseq(ncbi_refseq_assembly_file, ncbi_genbank_assembly_file)

        # get type material designations for each genome
        self.logger.info('Reading type material designations for genomes from GTDB metadata file.')
        type_metadata = read_gtdb_metadata(metadata_file, ['ncbi_type_material_designation',
                                                                'gtdb_type_designation',
                                                                'gtdb_type_designation_sources'])

        ncbi_tsp = ncbi_type_strain_of_species(type_metadata)
        gtdb_tsp = gtdb_type_strain_of_species(type_metadata)

        # QC all genomes
        self.logger.info('Validating genomes.')

        header = 'Accession\tNCBI species'
        header += '\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%'
        header += '\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases'

        num_retained = 0
        num_filtered = 0
        with open(os.path.join(output_dir, 'qc_passed.tsv'), 'w') as fout_retained, \
                open(os.path.join(output_dir, 'qc_failed.tsv'), 'w') as fout_failed:
            fout_retained.write(header + '\tNote\n')
            fout_failed.write(header)
            fout_failed.write('\tFailed completeness\tFailed contamination\tFailed quality')
            fout_failed.write('\tFailed marker percentage\tFailed no. contigs\tFailed N50 contigs\tFailed ambiguous bases\n')

            for gid in quality_metadata:
                if gid.startswith('U_') and gid not in gtdb_user_to_genbank:
                    # skip user genomes not marked for retention
                    continue

                qm = quality_metadata[gid]
                failed_tests = defaultdict(int)
                passed_qc = pass_qc(qm,
                                        marker_perc[gid],
                                        min_comp,
                                        max_cont,
                                        min_quality,
                                        sh_exception,
                                        min_perc_markers,
                                        max_contigs,
                                        min_N50,
                                        max_ambiguous,
                                        failed_tests)

                # a missing (or zero) strain heterogeneity is reported as '-'
                if qm.checkm_strain_heterogeneity_100:
                    sh_str = '%.2f' % qm.checkm_strain_heterogeneity_100
                else:
                    sh_str = '-'

                if passed_qc or gid in qc_exceptions:
                    num_retained += 1
                    fout_retained.write('%s\t%s' % (gid, ncbi_taxonomy[gid][6]))
                    fout_retained.write('\t%.2f\t%.2f\t%.2f\t%s\t%.2f\t%d\t%d\t%d\t%s\n' % (
                                            qm.checkm_completeness,
                                            qm.checkm_contamination,
                                            qm.checkm_completeness-5*qm.checkm_contamination,
                                            sh_str,
                                            marker_perc[gid],
                                            qm.contig_count,
                                            qm.n50_contigs,
                                            qm.ambiguous_bases,
                                            'Passed QC' if passed_qc else 'Flagged as exception'))
                else:
                    num_filtered += 1
                    fout_failed.write('%s\t%s' % (gid, ncbi_taxonomy[gid][6]))
                    fout_failed.write('\t%.2f\t%.2f\t%.2f\t%s\t%.2f\t%d\t%d\t%d' % (
                                            qm.checkm_completeness,
                                            qm.checkm_contamination,
                                            qm.checkm_completeness-5*qm.checkm_contamination,
                                            sh_str,
                                            marker_perc[gid],
                                            qm.contig_count,
                                            qm.n50_contigs,
                                            qm.ambiguous_bases))
                    fout_failed.write('\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n' % tuple(
                                        failed_tests[test] for test in test_names))

        self.logger.info('Retained %d genomes and filtered %d genomes.' % (num_retained, num_filtered))

        # QC genomes in each named species
        self.logger.info('Performing QC of type genome for each of the %d NCBI species.' % len(ncbi_species))

        lost_type = 0
        lost_sp = 0
        filtered_genomes = 0
        failed_tests_cumulative = defaultdict(int)
        with open(os.path.join(output_dir, 'type_genomes_fail_qc.tsv'), 'w') as fout_type_fail, \
                open(os.path.join(output_dir, 'species_fail_qc.tsv'), 'w') as fout_fail_sp, \
                open(os.path.join(output_dir, 'species_lost.tsv'), 'w') as fout_sp_lost:
            fout_type_fail.write('Species\tAccession\tGTDB taxonomy\tNCBI taxonomy\tType sources\tNCBI assembly type\tGenome size (bp)')
            fout_type_fail.write('\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%')
            fout_type_fail.write('\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases\tNCBI exclude from RefSeq\tLost species\n')

            fout_fail_sp.write('Species\tAccession\tGTDB taxonomy\tNCBI taxonomy\tAssembled from type material\tGenome size (bp)')
            fout_fail_sp.write('\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%')
            fout_fail_sp.write('\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases')
            fout_fail_sp.write('\tFailed completeness\tFailed contamination\tFailed quality')
            fout_fail_sp.write('\tFailed marker percentage\tFailed no. contigs\tFailed N50 contigs\tFailed ambiguous bases')
            fout_fail_sp.write('\tNCBI exclude from RefSeq\n')

            fout_sp_lost.write('Species\tNo. genomes\tNo. type genomes')
            fout_sp_lost.write('\tFail completeness\tFail contamination\tFail quality\tFailed percent markers')
            fout_sp_lost.write('\tFail no. contigs\tFail N50 contigs\tFail ambiguous bases\n')

            for sp, gids in ncbi_species.items():
                type_pass = set()
                type_fail = set()
                other_pass = set()
                other_fail = set()

                failed_tests_gids = {}
                for gid in gids:
                    failed_tests = defaultdict(int)
                    passed_qc = pass_qc(quality_metadata[gid],
                                        marker_perc[gid],
                                        min_comp,
                                        max_cont,
                                        min_quality,
                                        sh_exception,
                                        min_perc_markers,
                                        max_contigs,
                                        min_N50,
                                        max_ambiguous,
                                        failed_tests)

                    failed_tests_gids[gid] = failed_tests

                    # Tally failures for every genome. This previously ran
                    # after the loop and so only counted the last genome of
                    # each species.
                    for test, count in failed_tests.items():
                        failed_tests_cumulative[test] += count

                    if gid in gtdb_tsp or gid in ncbi_tsp:
                        if passed_qc:
                            type_pass.add(gid)
                        else:
                            type_fail.add(gid)
                            filtered_genomes += 1
                    else:
                        if passed_qc:
                            other_pass.add(gid)
                        else:
                            other_fail.add(gid)
                            filtered_genomes += 1

                if type_pass:
                    # great: one or more type genomes pass QC and will be selected as the type genome
                    continue

                if type_fail:
                    # all potential type genomes for species failed QC so report these for manual inspection
                    lost_type += 1
                    for gid in type_fail:
                        qm = quality_metadata[gid]
                        # NOTE(review): unlike the qc_passed/qc_failed reports, strain
                        # heterogeneity is formatted with %.2f here without a guard,
                        # which would raise if the value is None -- confirm.
                        fout_type_fail.write('%s\t%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%d\t%s\t%s\n' % (
                                                sp,
                                                gid,
                                                '; '.join(gtdb_taxonomy[gid]),
                                                '; '.join(ncbi_taxonomy[gid]),
                                                type_metadata[gid].gtdb_type_designation_sources,
                                                type_metadata[gid].ncbi_type_material_designation,
                                                float(qm.genome_size)/1e6,
                                                qm.checkm_completeness,
                                                qm.checkm_contamination,
                                                qm.checkm_completeness-5*qm.checkm_contamination,
                                                qm.checkm_strain_heterogeneity_100,
                                                marker_perc[gid],
                                                qm.contig_count,
                                                qm.n50_contigs,
                                                qm.ambiguous_bases,
                                                excluded_from_refseq_note[gid],
                                                len(other_pass) == 0))

                if len(other_pass) == 0:
                    # no genomes for species pass QC so report loss of species
                    lost_sp += 1
                    fout_sp_lost.write('%s\t%d\t%d' % (sp, len(gids), len(type_fail)))
                    fout_sp_lost.write('\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n' % tuple(
                                        sum(failed_tests_gids[gid][test] for gid in gids)
                                        for test in test_names))

                    for gid in type_fail.union(other_fail):
                        qm = quality_metadata[gid]
                        fout_fail_sp.write('%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%d' % (
                                                sp,
                                                gid,
                                                '; '.join(gtdb_taxonomy[gid]),
                                                '; '.join(ncbi_taxonomy[gid]),
                                                gid in type_fail,
                                                float(qm.genome_size)/1e6,
                                                qm.checkm_completeness,
                                                qm.checkm_contamination,
                                                qm.checkm_completeness-5*qm.checkm_contamination,
                                                qm.checkm_strain_heterogeneity_100,
                                                marker_perc[gid],
                                                qm.contig_count,
                                                qm.n50_contigs,
                                                qm.ambiguous_bases))
                        fout_fail_sp.write('\t%d\t%d\t%d\t%d\t%d\t%d\t%d' % tuple(
                                            failed_tests_gids[gid][test] for test in test_names))
                        fout_fail_sp.write('\t%s\n' % excluded_from_refseq_note[gid])

        self.logger.info('Genomes filtered for each criterion:')
        for test in sorted(failed_tests_cumulative):
            self.logger.info('%s: %d' % (test, failed_tests_cumulative[test]))

        self.logger.info('Filtered %d genomes assigned to NCBI species.' % filtered_genomes)
        self.logger.info('Identified %d species with type genomes failing QC and %d total species failing QC.' % (lost_type, lost_sp))
示例#8
0
    def run(self, 
                qc_file,
                gtdb_metadata_file,
                gtdb_final_clusters,
                species_exception_file,
                output_dir):
        """Select genomes and taxonomy strings for canonical and validation trees.

        After reporting incongruent specific and genus names, the
        representative genome of every species cluster is written to both
        the canonical and validation outputs of its domain. For clusters
        with additional genomes, the highest-scoring genome according to
        quality_score is also written to the validation outputs.

        Parameters
        ----------
        qc_file
            File passed to read_qc_file to obtain genomes passing QC.
        gtdb_metadata_file
            GTDB metadata file.
        gtdb_final_clusters
            File passed to read_clusters describing the species clusters.
        species_exception_file
            File passed to read_gtdb_ncbi_taxonomy.
        output_dir
            Directory receiving the taxonomy (*.tsv) and genome ID (*.lst)
            files.

        The program is terminated via sys.exit(-1) if a genome in a species
        cluster fails QC or a representative has no GTDB domain assignment.
        """

        # identify genomes failing quality criteria
        self.logger.info('Reading QC file.')
        passed_qc = read_qc_file(qc_file)
        self.logger.info('Identified %d genomes passing QC.' % len(passed_qc))

        # get GTDB and NCBI taxonomy strings for each genome
        self.logger.info('Reading NCBI and GTDB taxonomy from GTDB metadata file.')
        ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(gtdb_metadata_file, species_exception_file)
        prev_gtdb_taxonomy = read_gtdb_taxonomy(gtdb_metadata_file)
        self.logger.info('Read NCBI taxonomy for %d genomes with %d manually defined updates.' % (len(ncbi_taxonomy), ncbi_update_count))
        self.logger.info('Read GTDB taxonomy for %d genomes.' % len(prev_gtdb_taxonomy))

        # get GTDB metadata
        type_metadata = read_gtdb_metadata(gtdb_metadata_file, ['gtdb_type_designation',
                                                                    'gtdb_type_designation_sources',
                                                                    'gtdb_type_species_of_genus'])

        quality_metadata = read_quality_metadata(gtdb_metadata_file)

        # read species clusters
        sp_clusters, species, _rep_radius = read_clusters(gtdb_final_clusters)
        self.logger.info('Read %d species clusters.' % len(sp_clusters))

        # sanity check species clusters all defined by genomes passing QC
        for gid in sp_clusters:
            if gid not in passed_qc:
                self.logger.error('Genome %s defines a species cluster, but fails QC.' % gid)
                sys.exit(-1)

        # modify GTDB taxonomy to reflect new species clustering and report incongruencies
        self.logger.info('Identifying species with incongruent specific names.')
        self._incongruent_specific_names(species,
                                            ncbi_taxonomy,
                                            prev_gtdb_taxonomy,
                                            type_metadata,
                                            output_dir)

        self._incongruent_genus_names(species,
                                            ncbi_taxonomy,
                                            prev_gtdb_taxonomy,
                                            type_metadata,
                                            output_dir)

        # get GIDs for canonical and validation trees
        taxonomy_names = ('bac_can_taxonomy.tsv',
                            'bac_val_taxonomy.tsv',
                            'ar_can_taxonomy.tsv',
                            'ar_val_taxonomy.tsv')
        gid_list_names = ('gids_bac_validation.lst',
                            'gids_ar_validation.lst',
                            'gids_bac_canonical.lst',
                            'gids_ar_canonical.lst')

        # Handles are tracked in a dict and closed in the finally-clause so
        # they are released even when sys.exit() or an exception interrupts
        # the loop below.
        fouts = {}
        try:
            for name in taxonomy_names + gid_list_names:
                fouts[name] = open(os.path.join(output_dir, name), 'w')

            for name in gid_list_names:
                fouts[name].write('#Accession\tSpecies\tNote\n')

            # per-domain handles: (validation list, canonical list,
            #                      canonical taxonomy, validation taxonomy)
            domain_files = {
                'd__Bacteria': (fouts['gids_bac_validation.lst'],
                                fouts['gids_bac_canonical.lst'],
                                fouts['bac_can_taxonomy.tsv'],
                                fouts['bac_val_taxonomy.tsv']),
                'd__Archaea': (fouts['gids_ar_validation.lst'],
                                fouts['gids_ar_canonical.lst'],
                                fouts['ar_can_taxonomy.tsv'],
                                fouts['ar_val_taxonomy.tsv']),
            }

            for rid in sp_clusters:
                domain = prev_gtdb_taxonomy[rid][0]
                if domain not in domain_files:
                    self.logger.error('Genome %s has no GTDB domain assignment.' % rid)
                    sys.exit(-1)
                fout_val, fout_can, fout_can_gtdb, fout_val_gtdb = domain_files[domain]

                # substitute proposed species name into GTDB taxonomy
                sp = species[rid]
                canonical_sp = parse_canonical_sp(sp)
                taxa = prev_gtdb_taxonomy[rid][0:6] + [canonical_sp]
                new_gtdb_str = '; '.join(taxa)
                fout_can_gtdb.write('%s\t%s\n' % (rid, new_gtdb_str))
                fout_val_gtdb.write('%s\t%s\n' % (rid, new_gtdb_str))

                fout_val.write('%s\t%s\t%s\n' % (rid, sp, 'GTDB type or representative genome'))
                fout_can.write('%s\t%s\t%s\n' % (rid, sp, 'GTDB type or representative genome'))

                cluster_gids = set(sp_clusters[rid])
                for gid in cluster_gids:
                    if gid not in passed_qc:
                        self.logger.error('Genome %s is in a species cluster, but fails QC.' % gid)
                        sys.exit(-1)

                if cluster_gids:
                    # select highest-quality genome
                    q = quality_score(cluster_gids, quality_metadata)
                    gid = max(q.items(), key=operator.itemgetter(1))[0]

                    taxa = prev_gtdb_taxonomy[gid][0:6] + [canonical_sp]
                    new_gtdb_str = '; '.join(taxa)

                    # the additional genome only goes to the validation outputs
                    fout_val.write('%s\t%s\t%s\n' % (gid, sp, 'selected highest-quality genome (Q=%.2f)' % q[gid]))
                    fout_val_gtdb.write('%s\t%s\n' % (gid, new_gtdb_str))
        finally:
            for fout in fouts.values():
                fout.close()
        
示例#9
0
    def run(self, qc_file, gtdb_metadata_file, gtdb_final_clusters,
            output_dir):
        """Write genome lists and taxonomy files for canonical and validation trees.

        Every species cluster representative is written to both the canonical
        and the validation outputs of its domain. In addition, the genome with
        the highest quality score among the genomes clustered with a
        representative is written to the validation outputs.

        Parameters
        ----------
        qc_file : str
            File parsed by read_qc_file() to obtain the genomes passing QC.
        gtdb_metadata_file : str
            GTDB metadata file providing taxonomy, type and quality metadata.
        gtdb_final_clusters : str
            File parsed by read_clusters() to obtain the species clusters.
        output_dir : str
            Directory receiving the *.lst and *_taxonomy.tsv output files.

        The process is terminated with exit status -1 if a representative or
        clustered genome is not in the set passing QC, or if a representative
        has neither a bacterial nor an archaeal domain assignment.
        """

        # identify genomes passing quality criteria
        self.logger.info('Reading QC file.')
        passed_qc = read_qc_file(qc_file)
        self.logger.info('Identified %d genomes passing QC.' % len(passed_qc))

        # get GTDB and NCBI taxonomy strings for each genome
        self.logger.info(
            'Reading NCBI and GTDB taxonomy from GTDB metadata file.')
        ncbi_taxonomy = read_gtdb_ncbi_taxonomy(gtdb_metadata_file)
        prev_gtdb_taxonomy = read_gtdb_taxonomy(gtdb_metadata_file)
        self.logger.info('Read NCBI taxonomy for %d genomes.' %
                         len(ncbi_taxonomy))
        self.logger.info('Read GTDB taxonomy for %d genomes.' %
                         len(prev_gtdb_taxonomy))

        # get GTDB metadata
        type_metadata = read_gtdb_metadata(gtdb_metadata_file, [
            'gtdb_type_designation', 'gtdb_type_designation_sources',
            'gtdb_type_species_of_genus'
        ])

        quality_metadata = read_quality_metadata(gtdb_metadata_file)

        # read species clusters
        sp_clusters, species = read_clusters(gtdb_final_clusters)
        self.logger.info('Read %d species clusters.' % len(sp_clusters))

        # sanity check species clusters all defined by genomes passing QC
        for gid in sp_clusters:
            if gid not in passed_qc:
                self.logger.error(
                    'Genome %s defines a species cluster, but fails QC.' % gid)
                sys.exit(-1)

        # modify GTDB taxonomy to reflect new species clustering and report incongruencies
        self.logger.info(
            'Identifying species with incongruent specific names.')
        self._incongruent_specific_names(species, ncbi_taxonomy,
                                         prev_gtdb_taxonomy, type_metadata,
                                         output_dir)

        self._incongruent_genus_names(species, ncbi_taxonomy,
                                      prev_gtdb_taxonomy, type_metadata,
                                      output_dir)

        # Track every file opened so all of them are closed even when the
        # loop below terminates early via sys.exit() or an exception.
        opened_files = []

        def open_output(filename):
            """Open a file in output_dir for writing and register it for cleanup."""
            fout = open(os.path.join(output_dir, filename), 'w')
            opened_files.append(fout)
            return fout

        try:
            # get GIDs for canonical and validation trees
            fout_bac_can_gtdb = open_output('bac_can_taxonomy.tsv')
            fout_bac_val_gtdb = open_output('bac_val_taxonomy.tsv')
            fout_ar_can_gtdb = open_output('ar_can_taxonomy.tsv')
            fout_ar_val_gtdb = open_output('ar_val_taxonomy.tsv')

            fout_bac_val = open_output('gids_bac_validation.lst')
            fout_ar_val = open_output('gids_ar_validation.lst')
            fout_bac_can = open_output('gids_bac_canonical.lst')
            fout_ar_can = open_output('gids_ar_canonical.lst')

            for fout in (fout_bac_val, fout_ar_val, fout_bac_can,
                         fout_ar_can):
                fout.write('#Accession\tSpecies\tNote\n')

            for rid in sp_clusters:
                # route output to the files of the representative's domain
                domain = prev_gtdb_taxonomy[rid][0]
                if domain == 'd__Bacteria':
                    fout_val = fout_bac_val
                    fout_can = fout_bac_can
                    fout_can_gtdb = fout_bac_can_gtdb
                    fout_val_gtdb = fout_bac_val_gtdb
                elif domain == 'd__Archaea':
                    fout_val = fout_ar_val
                    fout_can = fout_ar_can
                    fout_can_gtdb = fout_ar_can_gtdb
                    fout_val_gtdb = fout_ar_val_gtdb
                else:
                    self.logger.error(
                        'Genome %s has no GTDB domain assignment.' % rid)
                    sys.exit(-1)

                # substitute proposed species name into GTDB taxonomy
                sp = species[rid]
                canonical_sp = parse_canonical_sp(sp)
                taxa = prev_gtdb_taxonomy[rid][0:6] + [canonical_sp]
                new_gtdb_str = '; '.join(taxa)
                fout_can_gtdb.write('%s\t%s\n' % (rid, new_gtdb_str))
                fout_val_gtdb.write('%s\t%s\n' % (rid, new_gtdb_str))

                fout_val.write('%s\t%s\t%s\n' %
                               (rid, sp, 'GTDB type or representative genome'))
                fout_can.write('%s\t%s\t%s\n' %
                               (rid, sp, 'GTDB type or representative genome'))

                # every genome clustered with the representative must pass QC
                cluster_gids = set(sp_clusters[rid])
                for gid in cluster_gids:
                    if gid not in passed_qc:
                        self.logger.error(
                            'Genome %s is in a species cluster, but fails QC.' %
                            gid)
                        sys.exit(-1)

                if len(cluster_gids) > 0:
                    # select highest-quality genome; it is written with the
                    # taxonomy string derived from the representative
                    q = quality_score(cluster_gids, quality_metadata)
                    gid = max(q.items(), key=operator.itemgetter(1))[0]

                    fout_val.write(
                        '%s\t%s\t%s\n' %
                        (gid, sp,
                         'selected highest-quality genome (Q=%.2f)' % q[gid]))
                    fout_val_gtdb.write('%s\t%s\n' % (gid, new_gtdb_str))
        finally:
            for fout in opened_files:
                fout.close()
示例#10
0
    def propagate(self, options):
        """Propagate labels to all genomes in a cluster.

        The taxonomy in options.input_taxonomy is expanded so that every
        genome clustered with a representative receives the cluster's
        taxonomy string. Ranks left unspecified for the representative are
        filled in from clustered genomes when those genomes agree. If a
        clustered genome conflicts with a rank the representative does
        specify, the representative's taxonomy is used unchanged for the
        whole cluster. The result is written to options.output_taxonomy as
        tab-separated '<genome id>\\t<taxonomy string>' lines.

        Parameters
        ----------
        options : object
            Must provide the attributes input_taxonomy, metadata_file and
            output_taxonomy (all file paths).
        """

        check_file_exists(options.input_taxonomy)
        check_file_exists(options.metadata_file)

        # get representative genome information
        rep_metadata = read_gtdb_metadata(options.metadata_file, ['gtdb_representative',
                                                                  'gtdb_clustered_genomes'])

        taxonomy = Taxonomy()
        explict_tax = taxonomy.read(options.input_taxonomy)
        expanded_taxonomy = {}
        incongruent_count = 0
        for genome_id, taxon_list in explict_tax.items():
            taxonomy_str = ';'.join(taxon_list)

            # Propagate taxonomy strings if genome is a representatives. Also, determine
            # if genomes clustered together have compatible taxonomies. Note that a genome
            # may not have metadata as it is possible a User has removed a genome that is
            # in the provided taxonomy file.
            _rep_genome, clustered_genomes = rep_metadata.get(genome_id, (None, None))
            if clustered_genomes:  # genome is a representative
                clustered_genome_ids = clustered_genomes.split(';')

                # get taxonomy of all genomes in cluster with a specified taxonomy
                clustered_genome_tax = {}
                for cluster_genome_id in clustered_genome_ids:
                    if cluster_genome_id == genome_id:
                        continue

                    if cluster_genome_id not in rep_metadata:
                        continue  # genome is no longer in the GTDB so ignore it

                    if cluster_genome_id in explict_tax:
                        clustered_genome_tax[cluster_genome_id] = explict_tax[cluster_genome_id]

                # determine if representative and clustered genome taxonomy strings are congruent
                working_cluster_taxonomy = list(taxon_list)
                for cluster_genome_id, cluster_tax in clustered_genome_tax.items():
                    incongruent_with_rep = False
                    for r in range(len(Taxonomy.rank_prefixes)):
                        rank_prefix = Taxonomy.rank_prefixes[r]
                        if cluster_tax[r] == rank_prefix:
                            break  # no more taxonomy information to consider

                        if cluster_tax[r] == taxon_list[r]:
                            continue  # rank agrees with representative

                        if taxon_list[r] != rank_prefix:
                            # genomes in cluster have incongruent taxonomies so defer to representative
                            self.logger.warning("Genomes in cluster have incongruent taxonomies.")
                            self.logger.warning("Representative %s: %s" % (genome_id, taxonomy_str))
                            self.logger.warning("Clustered genome %s: %s" % (cluster_genome_id, ';'.join(cluster_tax)))
                            self.logger.warning("Deferring to taxonomy specified for representative.")

                            incongruent_count += 1
                            incongruent_with_rep = True
                            break

                        # clustered genome has a more specific taxonomy string which
                        # should be propagate to the representative if all clustered
                        # genomes are in agreement
                        if working_cluster_taxonomy[r] == rank_prefix:
                            # make taxonomy more specific based on genomes in cluster
                            working_cluster_taxonomy[r] = cluster_tax[r]
                        elif working_cluster_taxonomy[r] != cluster_tax[r]:
                            # not all genomes agree on the assignment of this rank so leave it unspecified
                            working_cluster_taxonomy[r] = rank_prefix
                            break

                    if incongruent_with_rep:
                        # Default to rep taxonomy. This reset is done right
                        # after the conflict is detected so it also takes
                        # effect when the conflicting genome is the last one
                        # examined in the cluster.
                        working_cluster_taxonomy = list(taxon_list)
                        break

                cluster_taxonomy_str = ';'.join(working_cluster_taxonomy)

                # assign taxonomy to representative and all genomes in the cluster
                expanded_taxonomy[genome_id] = cluster_taxonomy_str
                for cluster_genome_id in clustered_genome_ids:
                    expanded_taxonomy[cluster_genome_id] = cluster_taxonomy_str
            elif genome_id not in expanded_taxonomy:
                # genome is a singleton; genomes already assigned a taxonomy
                # based on their representative are left untouched
                expanded_taxonomy[genome_id] = taxonomy_str

        self.logger.info('Identified %d clusters with incongruent taxonomies.' % incongruent_count)

        with open(options.output_taxonomy, 'w') as fout:
            for genome_id, taxonomy_str in expanded_taxonomy.items():
                fout.write('%s\t%s\n' % (genome_id, taxonomy_str))

        self.logger.info('Taxonomy written to: %s' % options.output_taxonomy)
示例#11
0
    def run(self, input_tree, lineage_of_interest, outgroup, gtdb_metadata,
            num_taxa_to_retain, msa_file, keep_unclassified, output_dir):
        """Dereplicate tree.

        The pruned tree is written to output_dir as
        '<tree name>.derep<tree extension>'. If an MSA is given, the
        dereplicated alignment is written alongside it using the same naming
        scheme. The process is terminated with exit status -1 if the lineage
        of interest or the outgroup cannot be found among the node labels.

        Parameters
        ----------
        input_tree : str
            Tree to dereplicate
        lineage_of_interest : str
            Named lineage where all taxa should be retained.
        outgroup : str
            Named lineage to use as outgroup.
        gtdb_metadata : str
            File containing metadata for taxa in tree.
        num_taxa_to_retain: int
            Taxa to retain in dereplicated lineages.
        msa_file : str
            Multiple sequence alignment to dereplicate along with tree.
        keep_unclassified : boolean
            Keep all taxa in unclassified lineages.
        output_dir : str
            Output dir.
        """

        # read GTDB metadata
        self.logger.info('Reading metadata.')
        genome_metadata = read_gtdb_metadata(gtdb_metadata, [
            'checkm_completeness', 'checkm_contamination',
            'gtdb_representative'
        ])

        # read tree
        self.logger.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        # locate node of interest and outgroup node; when several nodes carry
        # the same taxon label the last one visited in preorder is used
        self.logger.info('Identifying lineage of interest and outgroup.')
        node_of_interest = None
        outgroup_node = None
        for node in tree.preorder_node_iter():
            _support, taxon_str, _auxiliary_info = parse_label(node.label)

            if not taxon_str:
                continue

            for taxon in [t.strip() for t in taxon_str.split(';')]:
                if taxon == lineage_of_interest:
                    node_of_interest = node
                elif taxon == outgroup:
                    outgroup_node = node

        # Compare against None explicitly so the check does not depend on the
        # truthiness of a node object, and exit with a non-zero status so
        # callers can detect the failure.
        if node_of_interest is None:
            self.logger.error(
                'Could not find specified lineage of interest: %s' %
                lineage_of_interest)
            sys.exit(-1)

        if outgroup_node is None:
            self.logger.error('Could not find outgroup: %s' % outgroup)
            sys.exit(-1)

        # select taxa to retain
        self.logger.info('Selecting taxa to retain.')
        selected_taxa = self._select_taxa(tree, node_of_interest,
                                          outgroup_node, num_taxa_to_retain,
                                          keep_unclassified, genome_metadata)

        self.logger.info('Retaining %d taxa.' % len(selected_taxa))

        # prune tree
        self.logger.info('Pruning tree.')
        tree.retain_taxa(selected_taxa)

        # dereplicate MSA if requested
        if msa_file:
            self.logger.info('Dereplicating MSA.')
            msa_name, msa_ext = os.path.splitext(os.path.basename(msa_file))
            output_msa = os.path.join(output_dir,
                                      msa_name + '.derep' + msa_ext)
            self._derep_msa(msa_file, selected_taxa, output_msa)

        # write out results
        tree_name, tree_ext = os.path.splitext(os.path.basename(input_tree))
        output_tree = os.path.join(output_dir, tree_name + '.derep' + tree_ext)
        tree.write_to_path(output_tree,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)
示例#12
0
    def propagate(self, options):
        """Propagate labels to all genomes in a cluster.

        Every genome in options.input_taxonomy must be a GTDB representative.
        Its taxonomy string is written to options.output_taxonomy for the
        representative itself and for each genome clustered with it that is
        present in the metadata file. Genome IDs are normalised with
        canonical_gid() and then translated through the optional
        user-to-UBA mapping before any lookup.

        Parameters
        ----------
        options : object
            Must provide the attributes input_taxonomy, metadata_file,
            output_taxonomy (file paths) and uba_mapping_file (path to a
            two-column tab-separated mapping file, or a false value).

        The process is terminated with exit status -1 if a representative is
        missing from the input taxonomy, or if the input taxonomy contains a
        genome that is not a representative.
        """

        check_file_exists(options.input_taxonomy)
        check_file_exists(options.metadata_file)

        user_to_uba = {}
        if options.uba_mapping_file:
            self.logger.info('Parsing genome ID mapping file.')
            with open(options.uba_mapping_file) as f:
                for line in f:
                    tokens = line.strip().split('\t')
                    if len(tokens) == 2:
                        user_to_uba[tokens[0]] = tokens[1]
            self.logger.info(' - found mappings for {:,} genomes.'.format(
                len(user_to_uba)))

        # get representative genome information
        rep_metadata = read_gtdb_metadata(
            options.metadata_file,
            ['gtdb_representative', 'gtdb_clustered_genomes'])

        # normalise the metadata keys in two passes: canonical ID first,
        # then the user-to-UBA translation
        rep_metadata = {
            canonical_gid(gid): values
            for gid, values in rep_metadata.items()
        }

        rep_metadata = {
            user_to_uba.get(gid, gid): values
            for gid, values in rep_metadata.items()
        }

        explict_tax = Taxonomy().read(options.input_taxonomy)

        self.logger.info(f' - identified {len(rep_metadata):,} genomes')

        # sanity check all representatives have a taxonomy string
        rep_count = 0
        for gid, (is_rep_genome, _clustered_genomes) in rep_metadata.items():
            if is_rep_genome:
                rep_count += 1
                if gid not in explict_tax:
                    self.logger.error(
                        'Expected to find {} in input taxonomy as it is a GTDB representative.'
                        .format(gid))
                    sys.exit(-1)

        self.logger.info(
            'Identified {:,} representatives in metadata file and {:,} genomes in input taxonomy file.'
            .format(rep_count, len(explict_tax)))

        # propagate taxonomy to genomes clustered with each representative;
        # the context manager guarantees the output is flushed and closed,
        # including when the loop exits early
        with open(options.output_taxonomy, 'w') as fout:
            for rid, taxon_list in explict_tax.items():
                taxonomy_str = ';'.join(taxon_list)
                rid = canonical_gid(rid)
                rid = user_to_uba.get(rid, rid)

                is_rep_genome, clustered_genomes = rep_metadata[rid]
                if not is_rep_genome:
                    self.logger.error(
                        'Did not expected to find {} in input taxonomy as it is not a GTDB representative.'
                        .format(rid))
                    sys.exit(-1)

                # assign taxonomy to representative and all genomes in the cluster
                fout.write('{}\t{}\n'.format(rid, taxonomy_str))

                # a representative may have no clustered genomes, in which
                # case the field is missing or empty
                for cid in (clustered_genomes or '').split(';'):
                    cid = cid.strip()
                    if not cid:
                        continue  # ignore empty entries

                    cid = canonical_gid(cid)
                    cid = user_to_uba.get(cid, cid)
                    if cid == rid:
                        continue  # representative already written

                    if cid in rep_metadata:
                        fout.write('{}\t{}\n'.format(cid, taxonomy_str))
                    else:
                        self.logger.warning(
                            'Skipping {} as it is not in GTDB metadata file.'
                            .format(cid))

        self.logger.info('Taxonomy written to: {}'.format(
            options.output_taxonomy))
    def run(self, max_species,
                prev_rep_file,
                trusted_genomes_file,
                metadata_file,
                min_rep_comp,
                max_rep_cont,
                min_quality,
                max_contigs,
                min_N50,
                max_ambiguous,
                max_gap_length,
                strict_filtering,
                output_file):
        """Dereplicate genomes to a specific number per named species.

        Genomes are first filtered on genome and assembly quality, then
        passed to self._dereplicate(). RefSeq representative or reference
        genomes removed by the filter, and genomes skipped for lacking an
        NCBI taxonomy, are reported in '<output_file>.filtered_reps'.

        Parameters
        ----------
        max_species : int
            Maximum number of genomes of the same species to retain.
        prev_rep_file : str
            File indicating previous representatives to favour during selection.
        trusted_genomes_file:
            File containing list of genomes to retain regardless of filtering criteria.
        metadata_file : str
            Metadata, including CheckM estimates, for all genomes.
        min_rep_comp : float [0, 100]
            Minimum completeness for a genome to be a representative.
        max_rep_cont : float [0, 100]
            Maximum contamination for a genome to be a representative.
        min_quality : float [0, 100]
            Minimum genome quality (comp-5*cont) for a genome to be a representative.
        max_contigs : int
            Maximum number of contigs for a genome to be a representative.
        min_N50 : int
            Minimum N50 of scaffolds for a genome to be a representative.
        max_ambiguous : int
            Maximum number of ambiguous bases for a genome to be a representative.
        max_gap_length : int
            Maximum number of ambiguous bases between contigs for a genome to be a representative.
        strict_filtering : boolean
            If True apply filtering to all genomes, otherwise apply lenient
            filtering to genomes where the chromosome and plasmids are reported
            as complete.
        output_file : str
            Output file to contain list of dereplicated genomes.

        The process is terminated with exit status -1 if the trusted genome
        list contains non-representatives or if any genome lacks metadata.
        """

        trusted_accessions = set()
        if trusted_genomes_file:
            with open(trusted_genomes_file) as f:
                for line in f:
                    line_split = line.split('\t')
                    trusted_accessions.add(line_split[0].strip())

        accession_to_taxid, complete_genomes, representative_genomes = ncbi.read_refseq_metadata(metadata_file, keep_db_prefix=True)
        self.logger.info('Identified %d RefSeq genomes.' % len(accession_to_taxid))
        self.logger.info('Identified %d representative or reference genomes.' % len(representative_genomes))
        self.logger.info('Identified %d complete genomes.' % len(complete_genomes))
        self.logger.info('Identified %d genomes in exception list.' % len(trusted_accessions))

        if trusted_accessions.difference(representative_genomes):
            self.logger.error('There are genomes in the exception list which are not representatives.')
            sys.exit(-1)

        gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
        ncbi_taxonomy = read_gtdb_ncbi_taxonomy(metadata_file)
        ncbi_organism_names = read_gtdb_ncbi_organism_name(metadata_file)
        species = species_label(gtdb_taxonomy, ncbi_taxonomy, ncbi_organism_names)
        self.logger.info('Identified %d genomes with a GTDB or NCBI species names.' % len(species))

        # get previous representatives
        prev_gtdb_reps = set()
        with open(prev_rep_file) as f:
            for line in f:
                prev_gtdb_reps.add(line.strip().split('\t')[0])

        self.logger.info('Identified %d previous GTDB representatives.' % len(prev_gtdb_reps))

        # get genome quality
        genome_stats = read_gtdb_metadata(metadata_file, ['checkm_completeness',
                                                            'checkm_contamination',
                                                            'contig_count',
                                                            'n50_scaffolds',
                                                            'ambiguous_bases',
                                                            'total_gap_length',
                                                            'scaffold_count',
                                                            'ssu_count',
                                                            'ncbi_molecule_count',
                                                            'ncbi_unspanned_gaps',
                                                            'ncbi_genome_representation',
                                                            'ncbi_spanned_gaps',
                                                            'ncbi_assembly_level',
                                                            'ncbi_taxonomy',
                                                            'ncbi_organism_name',
                                                            'lpsn_strain'])
        missing_quality = set(accession_to_taxid.keys()) - set(genome_stats.keys())
        if missing_quality:
            self.logger.error('There are %d genomes without metadata information.' % len(missing_quality))
            sys.exit(-1)

        filtered_reps_file = output_file + '.filtered_reps'
        # one row per reported genome; columns match the header written below
        row_fmt = '%s\t%.2f\t%.2f\t%d\t%d\t%d\t%d\t%s\n'

        lpsn_type_strains = defaultdict(set)
        genomes_to_consider = []
        genome_quality = {}
        filtered_reps = 0
        lack_ncbi_taxonomy = 0
        contig_filter_count = 0
        with open(filtered_reps_file, 'w') as fout:
            fout.write('Genome ID\tCompleteness\tContamination\tContig Count\tN50'
                       '\tAmbiguous Bases\tTotal Gap Length\tNote\n')

            for genome_id in accession_to_taxid:
                stats = genome_stats[genome_id]

                # read the CheckM estimates before any branch reports them
                comp = stats.checkm_completeness
                cont = stats.checkm_contamination

                if not stats.ncbi_taxonomy:
                    lack_ncbi_taxonomy += 1
                    fout.write(row_fmt % (genome_id,
                                          comp,
                                          cont,
                                          stats.contig_count,
                                          stats.n50_scaffolds,
                                          stats.ambiguous_bases,
                                          stats.total_gap_length,
                                          'no NCBI taxonomy'))
                    self.logger.warning('Skipping %s as it has no assigned NCBI taxonomy.' % genome_id)
                    continue

                keep = False
                if genome_id in trusted_accessions:
                    keep = True
                elif (comp >= min_rep_comp
                        and cont <= max_rep_cont
                        and (comp - 5*cont) >= min_quality
                        and stats.contig_count <= max_contigs
                        and stats.n50_scaffolds >= min_N50
                        and stats.ambiguous_bases <= max_ambiguous
                        and stats.total_gap_length <= max_gap_length):
                    keep = True
                elif not strict_filtering:
                    # check if genome appears to consist of only an unspanned
                    # chromosome and unspanned plasmids and thus can be
                    # subjected to a more lenient quality check
                    if (stats.ncbi_assembly_level in ['Complete Genome', 'Chromosome']
                            and stats.ncbi_genome_representation == 'full'
                            and stats.scaffold_count == stats.ncbi_molecule_count
                            and stats.ncbi_unspanned_gaps == 0
                            and stats.ncbi_spanned_gaps <= 10
                            and stats.ambiguous_bases <= 1000
                            and stats.total_gap_length <= 100000
                            and stats.ssu_count >= 1):

                        # apply lenient quality check
                        if comp >= 50 and cont <= 15:
                            keep = True

                if keep:
                    genomes_to_consider.append(genome_id)
                    genome_quality[genome_id] = comp - 5*cont
                    if stats.lpsn_strain:
                        ncbi_species = stats.ncbi_taxonomy.split(';')[6].strip()
                        lpsn_type_strains[ncbi_species].add(genome_id)

                # check if a representative at NCBI is being filtered; a
                # genome is in the retained list exactly when keep is True
                if genome_id in representative_genomes and not keep:
                    fout.write(row_fmt % (genome_id,
                                          comp,
                                          cont,
                                          stats.contig_count,
                                          stats.n50_scaffolds,
                                          stats.ambiguous_bases,
                                          stats.total_gap_length,
                                          stats.ncbi_organism_name))

                    if stats.contig_count > 300:
                        contig_filter_count += 1

                    self.logger.warning('Filtered RefSeq representative %s with comp=%.2f, cont=%.2f, contigs=%d, N50=%d' % (genome_id,
                                                                                                                                comp,
                                                                                                                                cont,
                                                                                                                                stats.contig_count,
                                                                                                                                stats.n50_scaffolds))
                    filtered_reps += 1

        print('contig_filter_count', contig_filter_count)

        self.logger.info('Skipped %d genomes without an assigned NCBI taxonomy.' % lack_ncbi_taxonomy)
        self.logger.info('Filtered %d representative or reference genomes based on genome or assembly quality.' % filtered_reps)
        self.logger.info('Filtered representative or reference genomes written to %s' % filtered_reps_file)
        self.logger.info('Considering %d genomes after filtering for genome quality.' % (len(genomes_to_consider)))

        ncbi_type_strains = read_gtdb_ncbi_type_strain(metadata_file)
        self.logger.info('Identified %d genomes marked as type strains at NCBI.' % len(ncbi_type_strains))
        self.logger.info('Identified %d genomes marked as type strains at LPSN.' % sum(len(x) for x in lpsn_type_strains.values()))

        genomes_to_retain = self._dereplicate(genomes_to_consider,
                                                max_species,
                                                species,
                                                representative_genomes,
                                                complete_genomes,
                                                ncbi_type_strains,
                                                lpsn_type_strains,
                                                prev_gtdb_reps,
                                                genome_quality)

        self.logger.info('Retained %d genomes.' % len(genomes_to_retain))

        if not trusted_genomes_file:
            trusted_genomes_file = ''

        with open(output_file, 'w') as fout:
            fout.write('# Selection criteria:\n')
            fout.write('# Maximum species: %d\n' % max_species)
            fout.write('# Trusted genomes file: %s\n' % trusted_genomes_file)
            fout.write('# Genome quality metadata file: %s\n' % str(metadata_file))
            fout.write('# Min. representative completeness: %s\n' % str(min_rep_comp))
            fout.write('# Max. representative contamination: %s\n' % str(max_rep_cont))
            fout.write('#\n')
            fout.write('# Genome Id\tGTDB Taxonomy\tNCBI Taxonomy\tType strain\tComplete\tRepresentative\n')
            for assembly_accession in genomes_to_retain:
                # membership is tested on the unprefixed accession
                representative = 'yes' if assembly_accession in representative_genomes else 'no'
                complete = 'yes' if assembly_accession in complete_genomes else 'no'
                ts = 'yes' if assembly_accession in ncbi_type_strains else 'no'
                gtdb_taxa_str = ';'.join(gtdb_taxonomy.get(assembly_accession, Taxonomy.rank_prefixes))
                ncbi_taxa_str = ';'.join(ncbi_taxonomy.get(assembly_accession, Taxonomy.rank_prefixes))

                # add the database prefix used in the output file
                if assembly_accession.startswith('GCF_'):
                    assembly_accession = 'RS_' + assembly_accession
                elif assembly_accession.startswith('GCA_'):
                    assembly_accession = 'GB_' + assembly_accession

                fout.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (assembly_accession,
                                                             gtdb_taxa_str,
                                                             ncbi_taxa_str,
                                                             ts,
                                                             complete,
                                                             representative))
示例#14
0
    def run(self,
            rna_name,
            gtdb_metadata_file,
            rna_file,
            min_rna_length,
            min_scaffold_length,
            min_quality,
            max_contigs,
            min_N50,
            tax_filter,
            genome_list,
            output_dir,
            align_method='ssu_align'):
        """Infer rRNA gene tree spanning select GTDB genomes.

        Genomes are selected (either from an explicit list or by filtering
        representative genomes on quality statistics), their rRNA genes are
        extracted, optionally filtered on taxonomic congruence, aligned with
        external tools invoked through the shell, trimmed, and passed to
        FastTreeMP. All results are written under `output_dir`.

        Parameters
        ----------
        rna_name : str
            Name of rRNA gene ('ssu' or 'lsu'); any other value logs an
            error and exits the process.
        gtdb_metadata_file : str
            File specifying GTDB metadata for each genome.
        rna_file : str
            File with rRNA gene sequences in FASTA format.
        min_rna_length : int
            Minimum required length of rRNA gene sequences.
        min_scaffold_length : int
            Minimum required length of scaffold containing rRNA gene sequence.
        min_quality : float [0, 100]
            Minimum genome quality (completeness - 5*contamination) for a
            genome to be included in tree.
        max_contigs : int
            Maximum number of contigs to include genome.
        min_N50 : int
            Minimum N50 to include genome.
        tax_filter : boolean
            Filter sequences based on incongruent taxonomy classification.
        genome_list : str
            Explicit list of genomes to use (ignores --ncbi_rep_only and --user_genomes).
        output_dir : str
            Directory to store results
        align_method : str
            'ssu_align' or 'mothur'. Selects the aligner for SSU genes and
            which alignment the final trim/tree step consumes; the LSU
            alignment step itself does not consult this value.
        """

        if rna_name not in ['ssu', 'lsu']:
            self.logger.error('Unrecognized rRNA gene type: %s' % rna_name)
            sys.exit(-1)

        genome_metadata = read_gtdb_metadata(gtdb_metadata_file, [
            'checkm_completeness', 'checkm_contamination', 'scaffold_count',
            'n50_scaffolds', 'organism_name', 'gtdb_representative'
        ])

        gtdb_taxonomy = read_gtdb_taxonomy(gtdb_metadata_file)

        # classify genomes by source and collect GTDB representatives;
        # metadata fields are indexed in the order requested above
        user_genomes = set()
        uba_genomes = set()
        ncbi_genomes = set()
        rep_genomes = set()
        for genome_id in genome_metadata:
            org_name = str(genome_metadata[genome_id][4])
            if genome_id.startswith('U_'):
                if '(UBA' in org_name:
                    uba_genomes.add(genome_id)
                else:
                    user_genomes.add(genome_id)
            elif genome_id.startswith('RS_') or genome_id.startswith('GB_'):
                ncbi_genomes.add(genome_id)
            else:
                self.logger.warning('Unrecognized genome prefix: %s' %
                                    genome_id)

            rep = genome_metadata[genome_id][5] == 't'
            if rep:
                rep_genomes.add(genome_id)

        self.logger.info(
            'Initially considering %d genomes (%d NCBI, %d UBA, %d User).' %
            (len(genome_metadata), len(ncbi_genomes), len(uba_genomes),
             len(user_genomes)))
        self.logger.info('Identified %d representative genomes.' %
                         len(rep_genomes))

        # get genomes specified in genome list by user
        genomes_to_consider = set()
        if genome_list:
            with open(genome_list) as f:
                for line in f:
                    gid = line.rstrip().split('\t')[0]
                    if gid.startswith(('RS_', 'GB_', 'U_')):
                        genomes_to_consider.add(gid)
            self.logger.info(
                'Restricting genomes to the %d in the genome list.' %
                len(genomes_to_consider))
        else:
            # filter genomes based on quality and database source
            self.logger.info('Filtering genomes based on specified critieria.')
            self.logger.info('Filtering on minimum quality <%d.' % min_quality)
            self.logger.info('Filtering on number of contigs >%d.' %
                             max_contigs)
            self.logger.info('Filtering on scaffold N50 <%d.' % min_N50)

            new_genomes_to_consider = []
            filtered_genomes = 0
            gt = 0   # filtered on genome type
            gq = 0   # filtered on genome quality
            sc = 0   # filtered on scaffold count
            n50 = 0  # filtered on N50
            for genome_id in genome_metadata:
                if genome_id not in rep_genomes:
                    gt += 1
                    filtered_genomes += 1
                    continue

                if genome_id not in ncbi_genomes and genome_id not in uba_genomes:
                    gt += 1
                    filtered_genomes += 1
                    continue

                comp, cont, scaffold_count, n50_contigs, _org_name, _rep = genome_metadata[
                    genome_id]
                q = float(comp) - 5 * float(cont)
                if q < min_quality or int(scaffold_count) > max_contigs or int(
                        n50_contigs) < min_N50:
                    # a genome may fail several criteria; count each one
                    if q < min_quality:
                        gq += 1

                    if int(scaffold_count) > max_contigs:
                        sc += 1

                    if int(n50_contigs) < min_N50:
                        n50 += 1

                    filtered_genomes += 1
                    continue

                new_genomes_to_consider.append(genome_id)

            genomes_to_consider = new_genomes_to_consider
            self.logger.info(
                'Filtered %d genomes (%d on genome type, %d on genome quality, %d on number of contigs, %d on N50).'
                % (filtered_genomes, gt, gq, sc, n50))
            self.logger.info('Considering %d genomes after filtering.' %
                             len(genomes_to_consider))

        # limit taxonomy to genomes being considered
        cur_gtdb_taxonomy = {}
        for gid in genomes_to_consider:
            cur_gtdb_taxonomy[gid] = gtdb_taxonomy[gid]

        # get rRNA gene sequences for each genome
        rna_output_file = self._get_rna_seqs(rna_name, rna_file,
                                             min_rna_length,
                                             min_scaffold_length,
                                             cur_gtdb_taxonomy,
                                             genomes_to_consider, output_dir)

        # identify erroneous rRNA gene sequences
        if tax_filter:
            self.logger.info(
                'Filtering sequences with incongruent taxonomy strings.')
            filtered_seq_ids = self._tax_filter(rna_output_file,
                                                cur_gtdb_taxonomy, output_dir)

            self.logger.info('Filtered %d sequences.' % len(filtered_seq_ids))
            if len(filtered_seq_ids) > 0:
                rna_filtered_output = os.path.join(
                    output_dir, 'gtdb_%s.tax_filter.fna' % rna_name)
                with open(rna_filtered_output, 'w') as fout:
                    for seq_id, seq, annotation in seq_io.read_seq(
                            rna_output_file, keep_annotation=True):
                        if seq_id not in filtered_seq_ids:
                            fout.write('>' + seq_id + ' ' + annotation + '\n')
                            fout.write(seq + '\n')

                rna_output_file = rna_filtered_output

        # align sequences with ssu-align or mothur
        if rna_name == 'ssu':
            if align_method == 'ssu_align':
                self.logger.info('Aligning sequences with ssu-align.')
                align_dir = os.path.join(output_dir, '%s_align' % rna_name)
                os.system('ssu-align --dna %s %s' %
                          (rna_output_file, align_dir))
                os.system('ssu-mask --afa %s' % align_dir)
            elif align_method == 'mothur':
                self.logger.info('Aligning sequences with mothur.')
                align_dir = os.path.join(output_dir, 'mothur')
                if not os.path.exists(align_dir):
                    os.makedirs(align_dir)

                mothur_cmd = 'mothur "#set.dir(output=%s, blastdir=/srv/sw/Mothur/1.39.5)' % align_dir
                mothur_cmd += '; align.seqs(candidate=%s, template=/srv/db/mothur/silva_128/silva.seed_v128.align, search=blast, flip=t, processors=%d)' % (
                    rna_output_file, self.cpus)
                input_prefix = remove_extension(rna_output_file)
                align_file = os.path.join(align_dir, input_prefix + '.align')
                mothur_cmd += '; filter.seqs(fasta=%s, hard=/srv/db/mothur/silva_128/Lane1349.silva.filter, processors=%d);"' % (
                    align_file, self.cpus)
                os.system(mothur_cmd)
                input_msa = os.path.join(align_dir,
                                         input_prefix + '.filter.fasta')
        elif rna_name == 'lsu':
            self.logger.info('Aligning sequences with ssu-align.')
            align_dir = os.path.join(output_dir, '%s_align' % rna_name)
            if not os.path.exists(align_dir):
                os.makedirs(align_dir)

            os.system('esl-sfetch --index %s' % rna_output_file)

            # covariance models live in 'cm_files' next to this module
            cm_dir = os.path.join(
                os.path.dirname(os.path.realpath(__file__)), 'cm_files')

            # search for sequences using domain-specific LSU HMMs
            for domain in ['archaea', 'bacteria', 'eukaryote']:
                self.logger.info(
                    'Matching LSU rRNA genes to %s-specific HMM.' % domain)
                table_out = os.path.join(
                    align_dir, 'cmsearch.%s.%s.tblout' % (rna_name, domain))
                cm_file = os.path.join(cm_dir, 'lsu_%s.cm' % domain)
                log_file = os.path.join(
                    align_dir, 'cmsearch.%s.%s.out' % (rna_name, domain))
                os.system(
                    'cmsearch --hmmonly --cpu %d --noali --tblout %s %s %s > %s'
                    %
                    (self.cpus, table_out, cm_file, rna_output_file, log_file))

            # identify top hits for each domain
            self.logger.info(
                'Identifying best domain-specific HMM for each LSU rRNA gene.')
            top_hits = {}
            for domain in ['archaea', 'bacteria', 'eukaryote']:
                table_out = os.path.join(
                    align_dir, 'cmsearch.%s.%s.tblout' % (rna_name, domain))
                with open(table_out) as f:
                    for line in f:
                        if line[0] == '#':
                            continue

                        line_split = line.split()
                        seq_id = line_split[0]
                        start_seq = int(line_split[7])
                        end_seq = int(line_split[8])
                        bitscore = float(line_split[14])

                        # keep only the highest scoring hit seen for a sequence
                        prev_bitscore = top_hits.get(
                            seq_id, [None, 0, 0, 0, 0])[4]
                        if bitscore > prev_bitscore:
                            top_hits[seq_id] = [
                                domain, seq_id, start_seq, end_seq, bitscore
                            ]

            # create MSA for each bacteria and archaea
            for domain in ['archaea', 'bacteria']:
                # create file of top hits
                top_hits_out = os.path.join(
                    align_dir, 'top_hits.%s.%s.tsv' % (rna_name, domain))
                num_hits = 0
                with open(top_hits_out, 'w') as fout:
                    for top_domain, seq_id, start_seq, end_seq, bitscore in top_hits.values():
                        if top_domain == domain:
                            # columns must be tab separated: the awk command
                            # below reads the coordinates from fields 2 and 3
                            fout.write('%s\t%d\t%d\t%f\n' %
                                       (seq_id, start_seq, end_seq, bitscore))
                            num_hits += 1

                # align top hits
                self.logger.info(
                    'Creating MSA for %s LSU rRNA genes (%d sequences).' %
                    (domain, num_hits))

                if num_hits > 0:
                    seq_file = os.path.join(
                        align_dir, 'cmsearch.%s.%s.fna' % (rna_name, domain))
                    os.system(
                        "grep -v '^#' %s | awk '{print $1, $2, $3, $1}' | esl-sfetch -Cf %s - > %s"
                        % (top_hits_out, rna_output_file, seq_file))

                    # align against the model of the domain being processed,
                    # not whichever model the search loop above ended on
                    cm_file = os.path.join(cm_dir, 'lsu_%s.cm' % domain)
                    align_file = os.path.join(
                        align_dir, 'cmalign.%s.%s.stk' % (rna_name, domain))
                    os.system('cmalign --dnaout --outformat Pfam %s %s > %s' %
                              (cm_file, seq_file, align_file))

                    masked_file = os.path.join(
                        align_dir,
                        'cmalign.%s.%s.mask.afa' % (rna_name, domain))
                    os.system('esl-alimask -p --outformat AFA %s > %s' %
                              (align_file, masked_file))

        # trim sequences and infer tree
        if align_method == 'ssu_align':
            for domain in ['archaea', 'bacteria']:
                if rna_name == 'ssu':
                    input_msa = os.path.join(
                        align_dir, 'ssu_align.' + domain + '.mask.afa')
                elif rna_name == 'lsu':
                    input_msa = os.path.join(
                        align_dir,
                        'cmalign.%s.%s.mask.afa' % (rna_name, domain))

                # a domain without an alignment produces no tree
                if not os.path.exists(input_msa):
                    continue

                trimmed_msa = os.path.join(output_dir, domain + '.trimmed.fna')
                self._trim_seqs(input_msa, trimmed_msa)

                # infer tree
                self.logger.info('Inferring tree for %s genes.' % domain)
                output_tree = os.path.join(output_dir, domain + '.tree')
                os.system('FastTreeMP -nosupport -nt -gtr -gamma %s > %s' %
                          (trimmed_msa, output_tree))
        elif align_method == 'mothur':
            trimmed_msa = os.path.join(output_dir,
                                       input_prefix + '.trimmed.fna')
            self._trim_seqs(input_msa, trimmed_msa)

            # infer tree
            self.logger.info('Inferring tree for %s genes.' % rna_name)
            output_tree = os.path.join(output_dir, input_prefix + '.tree')
            os.system('FastTreeMP -nosupport -nt -gtr -gamma %s > %s' %
                      (trimmed_msa, output_tree))
示例#15
0
    def propagate(self, options):
        """Propagate labels to all genomes in a cluster.

        The taxonomy of each representative genome is reconciled with the
        taxonomies explicitly given for the genomes clustered with it, and
        the resulting string is assigned to the representative and to every
        genome listed in its cluster. Genomes without cluster information
        keep their own taxonomy unless one was already assigned through a
        representative.

        Parameters
        ----------
        options : object
            Must provide `input_taxonomy` and `metadata_file` (paths of
            existing files) and `output_taxonomy` (path of the tab-separated
            file to write, one `genome_id<TAB>taxonomy` line per genome).
        """

        check_file_exists(options.input_taxonomy)
        check_file_exists(options.metadata_file)

        # get representative genome information
        rep_metadata = read_gtdb_metadata(
            options.metadata_file,
            ['gtdb_representative', 'gtdb_clustered_genomes'])

        taxonomy = Taxonomy()
        explict_tax = taxonomy.read(options.input_taxonomy)
        expanded_taxonomy = {}
        incongruent_count = 0
        num_ranks = len(Taxonomy.rank_prefixes)
        for genome_id, taxon_list in explict_tax.items():
            taxonomy_str = ';'.join(taxon_list)

            # Propagate taxonomy strings if genome is a representatives. Also, determine
            # if genomes clustered together have compatible taxonomies. Note that a genome
            # may not have metadata as it is possible a User has removed a genome that is
            # in the provided taxonomy file.
            _rep_genome, clustered_genomes = rep_metadata.get(
                genome_id, (None, None))
            if not clustered_genomes:
                # genome is a singleton unless it has already been assigned
                # a taxonomy based on its representative
                if genome_id not in expanded_taxonomy:
                    expanded_taxonomy[genome_id] = taxonomy_str
                continue

            # genome is a representative
            clustered_genome_ids = clustered_genomes.split(';')

            # get taxonomy of all genomes in cluster with a specified taxonomy
            clustered_genome_tax = {}
            for cluster_genome_id in clustered_genome_ids:
                if cluster_genome_id == genome_id:
                    continue

                if cluster_genome_id not in rep_metadata:
                    continue  # genome is no longer in the GTDB so ignore it

                if cluster_genome_id in explict_tax:
                    clustered_genome_tax[cluster_genome_id] = explict_tax[
                        cluster_genome_id]

            # determine if representative and clustered genome taxonomy strings are congruent
            working_cluster_taxonomy = list(taxon_list)
            for cluster_genome_id, cluster_tax in clustered_genome_tax.items():
                incongruent_with_rep = False
                for r in range(num_ranks):
                    if cluster_tax[r] == Taxonomy.rank_prefixes[r]:
                        break  # no more taxonomy information to consider

                    if cluster_tax[r] == taxon_list[r]:
                        continue

                    if taxon_list[r] == Taxonomy.rank_prefixes[r]:
                        # clustered genome has a more specific taxonomy string which
                        # should be propagate to the representative if all clustered
                        # genomes are in agreement
                        if working_cluster_taxonomy[
                                r] == Taxonomy.rank_prefixes[r]:
                            # make taxonomy more specific based on genomes in cluster
                            working_cluster_taxonomy[r] = cluster_tax[r]
                        elif working_cluster_taxonomy[r] != cluster_tax[r]:
                            # not all genomes agree on the assignment of this rank so leave it unspecified
                            working_cluster_taxonomy[
                                r] = Taxonomy.rank_prefixes[r]
                            break
                    else:
                        # genomes in cluster have incongruent taxonomies so defer to representative
                        self.logger.warning(
                            "Genomes in cluster have incongruent taxonomies."
                        )
                        self.logger.warning("Representative %s: %s" %
                                            (genome_id, taxonomy_str))
                        self.logger.warning(
                            "Clustered genome %s: %s" %
                            (cluster_genome_id, ';'.join(cluster_tax)))
                        self.logger.warning(
                            "Deferring to taxonomy specified for representative."
                        )

                        incongruent_count += 1
                        incongruent_with_rep = True
                        break

                if incongruent_with_rep:
                    # Reset immediately rather than on the next iteration:
                    # otherwise, when the incongruent genome is the last one
                    # examined, ranks filled in from other cluster members
                    # would survive despite deferring to the representative.
                    working_cluster_taxonomy = list(taxon_list)
                    break

            cluster_taxonomy_str = ';'.join(working_cluster_taxonomy)

            # assign taxonomy to representative and all genomes in the cluster
            expanded_taxonomy[genome_id] = cluster_taxonomy_str
            for cluster_genome_id in clustered_genome_ids:
                expanded_taxonomy[cluster_genome_id] = cluster_taxonomy_str

        self.logger.info(
            'Identified %d clusters with incongruent taxonomies.' %
            incongruent_count)

        with open(options.output_taxonomy, 'w') as fout:
            for genome_id, taxonomy_str in expanded_taxonomy.items():
                fout.write('%s\t%s\n' % (genome_id, taxonomy_str))

        self.logger.info('Taxonomy written to: %s' % options.output_taxonomy)
示例#16
0
    def run(self, metadata_file, gtdb_user_genomes_file, gtdb_user_reps,
            ncbi_refseq_assembly_file, ncbi_genbank_assembly_file,
            gtdb_domain_report, min_comp, max_cont, min_quality, sh_exception,
            min_perc_markers, max_contigs, min_N50, max_ambiguous, output_dir):
        """Quality check all potential GTDB genomes.

        Writes the following tab-separated reports to `output_dir`:
        `qc_passed.tsv`, `qc_failed.tsv`, `type_genomes_fail_qc.tsv`,
        `species_fail_qc.tsv` and `species_lost.tsv`.

        Parameters
        ----------
        metadata_file : str
            GTDB metadata file; source of taxonomy, QC statistics and type
            material designations.
        gtdb_user_genomes_file : str
            File passed to `_gtdb_user_genomes` to determine User genomes
            with GenBank accessions.
        gtdb_user_reps : str
            Tab-separated file with a genome ID and taxonomy string per line.
        ncbi_refseq_assembly_file : str
        ncbi_genbank_assembly_file : str
            NCBI assembly files passed to `exclude_from_refseq`.
        gtdb_domain_report : str
            File passed to `parse_marker_percentages`.
        min_comp, max_cont, min_quality, sh_exception, min_perc_markers,
        max_contigs, min_N50, max_ambiguous
            QC thresholds forwarded unchanged to `pass_qc`.
        output_dir : str
            Directory receiving the report files.
        """

        def strain_het_str(value):
            # Missing strain heterogeneity values are reported as '-' instead
            # of raising on '%.2f'; zero is still written as a number.
            if value or value == 0:
                return '%.2f' % value
            return '-'

        # get GTDB and NCBI taxonomy strings for each genome
        self.logger.info('Reading NCBI taxonomy from GTDB metadata file.')
        ncbi_taxonomy = read_gtdb_ncbi_taxonomy(metadata_file)
        ncbi_species = binomial_species(ncbi_taxonomy)
        gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
        self.logger.info('Read NCBI taxonomy for %d genomes.' %
                         len(ncbi_taxonomy))
        self.logger.info('Read GTDB taxonomy for %d genomes.' %
                         len(gtdb_taxonomy))

        # determine User genomes to retain for consideration
        gtdb_user_to_genbank = self._gtdb_user_genomes(gtdb_user_genomes_file,
                                                       metadata_file)
        self.logger.info(
            'Identified %d GTDB User genomes with GenBank accessions to retain for potential inclusion in GTDB.'
            % len(gtdb_user_to_genbank))

        user_genomes = 0
        with open(gtdb_user_reps) as f:
            for line in f:
                line_split = line.strip().split('\t')
                gid, taxonomy = line_split
                if gid not in gtdb_user_to_genbank:
                    if 'd__Bacteria' in taxonomy:
                        self.logger.warning(
                            'Bacterial genome %s has no NCBI accession and is being skipped.'
                            % gid)
                    else:
                        # retained under its own ID as no accession exists
                        gtdb_user_to_genbank[gid] = gid
                        user_genomes += 1
        self.logger.info(
            'Identified %d archaeal GTDB User genome WITHOUT GenBank accessions to retain for potential inclusion in GTDB.'
            % user_genomes)

        # calculate quality score for genomes
        self.logger.info('Parsing QC statistics for each genome.')
        quality_metadata = read_gtdb_metadata(metadata_file, [
            'checkm_completeness', 'checkm_contamination',
            'checkm_strain_heterogeneity_100', 'contig_count', 'n50_contigs',
            'ambiguous_bases', 'genome_size'
        ])

        marker_perc = parse_marker_percentages(gtdb_domain_report)

        # parse NCBI assembly files
        self.logger.info('Parsing NCBI assembly files.')
        excluded_from_refseq_note = exclude_from_refseq(
            ncbi_refseq_assembly_file, ncbi_genbank_assembly_file)

        # get type material designations for each genome
        self.logger.info(
            'Reading type material designations for genomes from GTDB metadata file.'
        )
        type_metadata = read_gtdb_metadata(metadata_file, [
            'ncbi_type_material_designation', 'gtdb_type_designation',
            'gtdb_type_designation_sources'
        ])

        ncbi_tsp = ncbi_type_strain_of_species(type_metadata)
        gtdb_tsp = gtdb_type_strain_of_species(type_metadata)

        # QC all genomes
        self.logger.info('Validating genomes.')

        # plain strings (never %-formatted), so a single '%' is the literal
        header = 'Accession\tNCBI species'
        header += '\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%'
        header += '\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases'

        num_retained = 0
        num_filtered = 0
        with open(os.path.join(output_dir, 'qc_passed.tsv'), 'w') as fout_retained, \
                open(os.path.join(output_dir, 'qc_failed.tsv'), 'w') as fout_failed:
            fout_retained.write(header + '\n')
            fout_failed.write(header)
            fout_failed.write(
                '\tFailed completeness\tFailed contamination\tFailed quality')
            fout_failed.write(
                '\tFailed marker percentage\tFailed no. contigs\tFailed N50 contigs\tFailed ambiguous bases\n'
            )

            for gid in quality_metadata:
                if gid.startswith('U_') and gid not in gtdb_user_to_genbank:
                    # skip user genomes not marked for retention
                    continue

                failed_tests = defaultdict(int)
                passed_qc = pass_qc(quality_metadata[gid], marker_perc[gid],
                                    min_comp, max_cont, min_quality,
                                    sh_exception, min_perc_markers,
                                    max_contigs, min_N50, max_ambiguous,
                                    failed_tests)

                # columns shared by the passed and failed reports
                qm = quality_metadata[gid]
                row = '%s\t%s' % (gid, ncbi_taxonomy[gid][6])
                row += '\t%.2f\t%.2f\t%.2f\t%s\t%.2f\t%d\t%d\t%d' % (
                    qm.checkm_completeness,
                    qm.checkm_contamination,
                    qm.checkm_completeness - 5 * qm.checkm_contamination,
                    ('%.2f' % qm.checkm_strain_heterogeneity_100)
                    if qm.checkm_strain_heterogeneity_100 else '-',
                    marker_perc[gid],
                    qm.contig_count,
                    qm.n50_contigs,
                    qm.ambiguous_bases)

                if passed_qc:
                    num_retained += 1
                    fout_retained.write(row + '\n')
                else:
                    num_filtered += 1
                    fout_failed.write(row)
                    fout_failed.write(
                        '\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n' %
                        (failed_tests['comp'], failed_tests['cont'],
                         failed_tests['qual'], failed_tests['marker_perc'],
                         failed_tests['contig_count'], failed_tests['N50'],
                         failed_tests['ambig']))

        self.logger.info('Retained %d genomes and filtered %d genomes.' %
                         (num_retained, num_filtered))

        # QC genomes in each named species
        self.logger.info(
            'Performing QC of type genome for each of the %d NCBI species.' %
            len(ncbi_species))

        lost_type = 0
        lost_sp = 0
        filtered_genomes = 0
        failed_tests_cumulative = defaultdict(int)
        with open(os.path.join(output_dir, 'type_genomes_fail_qc.tsv'), 'w') as fout_type_fail, \
                open(os.path.join(output_dir, 'species_fail_qc.tsv'), 'w') as fout_fail_sp, \
                open(os.path.join(output_dir, 'species_lost.tsv'), 'w') as fout_sp_lost:
            # NOTE(review): the genome size column is headed '(bp)' but the
            # rows below write genome_size / 1e6 -- confirm intended unit.
            fout_type_fail.write(
                'Species\tAccession\tGTDB taxonomy\tNCBI taxonomy\tType sources\tNCBI assembly type\tGenome size (bp)'
            )
            fout_type_fail.write(
                '\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%'
            )
            fout_type_fail.write(
                '\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases\tNCBI exclude from RefSeq\tLost species\n'
            )

            fout_fail_sp.write(
                'Species\tAccession\tGTDB taxonomy\tNCBI taxonomy\tAssembled from type material\tGenome size (bp)'
            )
            fout_fail_sp.write(
                '\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%'
            )
            fout_fail_sp.write(
                '\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases')
            fout_fail_sp.write(
                '\tFailed completeness\tFailed contamination\tFailed quality')
            fout_fail_sp.write(
                '\tFailed marker percentage\tFailed no. contigs\tFailed N50 contigs\tFailed ambiguous bases'
            )
            fout_fail_sp.write('\tNCBI exclude from RefSeq\n')

            fout_sp_lost.write('Species\tNo. genomes\tNo. type genomes')
            fout_sp_lost.write(
                '\tFail completeness\tFail contamination\tFail quality\tFailed percent markers'
            )
            fout_sp_lost.write(
                '\tFail no. contigs\tFail N50 contigs\tFail ambiguous bases\n')

            for sp, gids in ncbi_species.items():
                type_pass = set()
                type_fail = set()
                other_pass = set()
                other_fail = set()

                failed_tests_gids = {}
                for gid in gids:
                    failed_tests = defaultdict(int)
                    passed_qc = pass_qc(quality_metadata[gid],
                                        marker_perc[gid], min_comp, max_cont,
                                        min_quality, sh_exception,
                                        min_perc_markers, max_contigs,
                                        min_N50, max_ambiguous, failed_tests)

                    failed_tests_gids[gid] = failed_tests

                    # tally every genome's failed tests; doing this after the
                    # loop would count only the last genome of each species
                    for test, count in failed_tests.items():
                        failed_tests_cumulative[test] += count

                    if gid in gtdb_tsp or gid in ncbi_tsp:
                        if passed_qc:
                            type_pass.add(gid)
                        else:
                            type_fail.add(gid)
                            filtered_genomes += 1
                    else:
                        if passed_qc:
                            other_pass.add(gid)
                        else:
                            other_fail.add(gid)
                            filtered_genomes += 1

                if len(type_pass) >= 1:
                    # great: one or more type genomes pass QC and will be selected as the type genome
                    continue

                if len(type_fail):
                    # all potential type genomes for species failed QC so report these for manual inspection
                    lost_type += 1
                    for gid in type_fail:
                        qm = quality_metadata[gid]
                        fout_type_fail.write(
                            '%s\t%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%s\t%.2f\t%d\t%d\t%d\t%s\t%s\n'
                            %
                            (sp, gid, '; '.join(gtdb_taxonomy[gid]),
                             '; '.join(ncbi_taxonomy[gid]),
                             type_metadata[gid].gtdb_type_designation_sources,
                             type_metadata[gid].ncbi_type_material_designation,
                             float(qm.genome_size) / 1e6,
                             qm.checkm_completeness,
                             qm.checkm_contamination,
                             qm.checkm_completeness -
                             5 * qm.checkm_contamination,
                             strain_het_str(
                                 qm.checkm_strain_heterogeneity_100),
                             marker_perc[gid], qm.contig_count,
                             qm.n50_contigs, qm.ambiguous_bases,
                             excluded_from_refseq_note[gid],
                             len(other_pass) == 0))

                if len(other_pass) == 0:
                    # no genomes for species pass QC so report loss of species
                    lost_sp += 1
                    fout_sp_lost.write('%s\t%d\t%d' %
                                       (sp, len(gids), len(type_fail)))
                    fout_sp_lost.write(
                        '\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n' %
                        (sum([failed_tests_gids[gid]['comp'] for gid in gids]),
                         sum([failed_tests_gids[gid]['cont'] for gid in gids]),
                         sum([failed_tests_gids[gid]['qual'] for gid in gids]),
                         sum([
                             failed_tests_gids[gid]['marker_perc']
                             for gid in gids
                         ]),
                         sum([
                             failed_tests_gids[gid]['contig_count']
                             for gid in gids
                         ]),
                         sum([failed_tests_gids[gid]['N50'] for gid in gids]),
                         sum([failed_tests_gids[gid]['ambig'] for gid in gids])))

                    for gid in type_fail.union(other_fail):
                        qm = quality_metadata[gid]
                        fout_fail_sp.write(
                            '%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%s\t%.2f\t%d\t%d\t%d'
                            %
                            (sp, gid, '; '.join(gtdb_taxonomy[gid]),
                             '; '.join(ncbi_taxonomy[gid]), gid in type_fail,
                             float(qm.genome_size) / 1e6,
                             qm.checkm_completeness,
                             qm.checkm_contamination,
                             qm.checkm_completeness -
                             5 * qm.checkm_contamination,
                             strain_het_str(
                                 qm.checkm_strain_heterogeneity_100),
                             marker_perc[gid], qm.contig_count,
                             qm.n50_contigs, qm.ambiguous_bases))
                        fout_fail_sp.write(
                            '\t%d\t%d\t%d\t%d\t%d\t%d\t%d' %
                            (failed_tests_gids[gid]['comp'],
                             failed_tests_gids[gid]['cont'],
                             failed_tests_gids[gid]['qual'],
                             failed_tests_gids[gid]['marker_perc'],
                             failed_tests_gids[gid]['contig_count'],
                             failed_tests_gids[gid]['N50'],
                             failed_tests_gids[gid]['ambig']))
                        fout_fail_sp.write('\t%s\n' %
                                           excluded_from_refseq_note[gid])

        self.logger.info('Genomes filtered for each criterion:')
        for test in sorted(failed_tests_cumulative):
            self.logger.info('%s: %d' % (test, failed_tests_cumulative[test]))

        self.logger.info('Filtered %d genomes assigned to NCBI species.' %
                         filtered_genomes)
        self.logger.info(
            'Identified %d species with type genomes failing QC and %d total species failing QC.'
            % (lost_type, lost_sp))