Пример #1
0
    def binomial(self, options):
        """Ensure species are designated using binomial nomenclature.

        Reads the taxonomy in ``options.input_taxonomy`` and writes one
        ``genome_id<TAB>taxonomy`` line per genome to
        ``options.output_taxonomy``. A non-empty species name that does not
        contain the genus name is rewritten as ``s__<genus> <species>``.

        Calls ``sys.exit(-1)`` as soon as a taxonomy string fails
        ``Taxonomy.check_full``.
        """

        check_file_exists(options.input_taxonomy)

        # Read the input before opening the output: opening with 'w'
        # truncates immediately, so a failure while reading must not be able
        # to leave an emptied output file behind.
        taxonomy = Taxonomy()
        t = taxonomy.read(options.input_taxonomy)

        # The context manager closes the file on every exit path, including
        # the sys.exit() below.
        with open(options.output_taxonomy, 'w') as fout:
            for genome_id, taxon_list in t.items():
                taxonomy_str = ';'.join(taxon_list)
                if not taxonomy.check_full(taxonomy_str):
                    sys.exit(-1)

                # entries 5 and 6 are read as genus and species; [3:] drops
                # the three-character rank prefix (e.g. 's__')
                genus = taxon_list[5][3:]
                species = taxon_list[6][3:]
                if species and genus not in species:
                    taxon_list[6] = 's__' + genus + ' ' + species
                    taxonomy_str = ';'.join(taxon_list)

                fout.write('%s\t%s\n' % (genome_id, taxonomy_str))

        self.logger.info('Revised taxonomy written to: %s' %
                         options.output_taxonomy)
Пример #2
0
    def validate(self, options):
        """Validate a taxonomy file and log a summary of any problems.

        The prefix, rank, hierarchy and species checks are each disabled by
        the matching ``options.no_*`` flag; group-name and duplicate-name
        checks are always requested.
        """

        check_file_exists(options.taxonomy_file)

        taxonomy = Taxonomy()
        parsed_taxonomy = taxonomy.read(options.taxonomy_file)

        errors = taxonomy.validate(
            parsed_taxonomy,
            check_prefixes=not options.no_prefix,
            check_ranks=not options.no_all_ranks,
            check_hierarchy=not options.no_hierarhcy,
            check_species=not options.no_species,
            check_group_names=True,
            check_duplicate_names=True,
            report_errors=True)

        # the result is unpacked as exactly five collections, in this order
        (invalid_ranks,
         invalid_prefixes,
         invalid_species_name,
         invalid_hierarchies,
         invalid_group_name) = errors

        if sum(len(e) for e in errors) == 0:
            self.logger.info('No errors identified in taxonomy file.')
            return

        # one summary line per error category
        summary = (
            ('Identified %d incomplete taxonomy strings.', invalid_ranks),
            ('Identified %d rank prefix errors.', invalid_prefixes),
            ('Identified %d invalid species names.', invalid_species_name),
            ('Identified %d taxa with multiple parents.', invalid_hierarchies),
            ('Identified %d invalid group names.', invalid_group_name),
        )
        for template, offenders in summary:
            self.logger.info(template % len(offenders))
Пример #3
0
    def validate(self, options):
        """Run every validation check on the taxonomy file.

        All checks are requested with error reporting enabled; this method
        only logs a completion message afterwards.
        """

        taxonomy_path = options.input_taxonomy
        check_file_exists(taxonomy_path)

        taxonomy = Taxonomy()
        parsed_taxonomy = taxonomy.read(taxonomy_path)

        # every check is switched on, so build the flags in one go
        all_checks = dict.fromkeys(('check_prefixes',
                                    'check_ranks',
                                    'check_hierarchy',
                                    'check_species',
                                    'check_group_names',
                                    'check_duplicate_names',
                                    'report_errors'), True)
        taxonomy.validate(parsed_taxonomy, **all_checks)

        self.logger.info('Finished performing validation tests.')
Пример #4
0
    def fill_ranks(self, options):
        """Ensure taxonomy strings contain all 7 canonical ranks.

        Each taxon list read from ``options.input_taxonomy`` is passed
        through ``Taxonomy.fill_missing_ranks`` and written to
        ``options.output_taxonomy`` as a ``genome_id<TAB>taxonomy`` line.

        Calls ``sys.exit(-1)`` as soon as a filled taxonomy string fails
        ``Taxonomy.check_full``.
        """

        check_file_exists(options.input_taxonomy)

        # Read the input before opening the output: opening with 'w'
        # truncates immediately, so a failure while reading must not be able
        # to leave an emptied output file behind.
        taxonomy = Taxonomy()
        t = taxonomy.read(options.input_taxonomy)

        # The context manager closes the file on every exit path, including
        # the sys.exit() below.
        with open(options.output_taxonomy, 'w') as fout:
            # items() exists on both Python 2 and 3; iteritems() is 2-only
            for genome_id, taxon_list in t.items():
                full_taxon_list = taxonomy.fill_missing_ranks(taxon_list)

                taxonomy_str = ';'.join(full_taxon_list)
                if not taxonomy.check_full(taxonomy_str):
                    sys.exit(-1)

                fout.write('%s\t%s\n' % (genome_id, taxonomy_str))

        self.logger.info('Revised taxonomy written to: %s' % options.output_taxonomy)
Пример #5
0
    def fill_ranks(self, options):
        """Ensure taxonomy strings contain all 7 canonical ranks.

        Each taxon list read from ``options.input_taxonomy`` is passed
        through ``Taxonomy.fill_missing_ranks`` and written to
        ``options.output_taxonomy`` as a ``genome_id<TAB>taxonomy`` line.

        Calls ``sys.exit(-1)`` as soon as a filled taxonomy string fails
        ``Taxonomy.check_full``.
        """

        check_file_exists(options.input_taxonomy)

        # Parse the input first. Opening the output with 'w' truncates it
        # right away, so doing that before the read could leave an empty
        # output file behind if the read fails.
        taxonomy = Taxonomy()
        t = taxonomy.read(options.input_taxonomy)

        # 'with' guarantees the handle is closed even when sys.exit() or an
        # exception interrupts the loop.
        with open(options.output_taxonomy, 'w') as fout:
            for genome_id, taxon_list in t.items():
                full_taxon_list = taxonomy.fill_missing_ranks(taxon_list)

                taxonomy_str = ';'.join(full_taxon_list)
                if not taxonomy.check_full(taxonomy_str):
                    sys.exit(-1)

                fout.write('%s\t%s\n' % (genome_id, taxonomy_str))

        self.logger.info('Revised taxonomy written to: %s' % options.output_taxonomy)
Пример #6
0
    def validate(self, options):
        """Validate a taxonomy file and log a summary of any problems.

        The first four checks passed to ``Taxonomy.validate`` are each
        disabled by the matching ``options.no_*`` flag.
        """

        check_file_exists(options.taxonomy_file)

        taxonomy = Taxonomy()
        parsed_taxonomy = taxonomy.read(options.taxonomy_file)

        # passed positionally, in the order validate() expects them
        check_flags = (not options.no_prefix,
                       not options.no_all_ranks,
                       not options.no_hierarhcy,
                       not options.no_species,
                       True)
        errors = taxonomy.validate(parsed_taxonomy, *check_flags)

        # the result is unpacked as exactly four collections, in this order
        (invalid_ranks,
         invalid_prefixes,
         invalid_species_name,
         invalid_hierarchies) = errors

        if sum(len(e) for e in errors) == 0:
            self.logger.info('No errors identified in taxonomy file.')
            return

        # one summary line per error category
        summary = (
            ('Identified %d incomplete taxonomy strings.', invalid_ranks),
            ('Identified %d rank prefix errors.', invalid_prefixes),
            ('Identified %d invalid species names.', invalid_species_name),
            ('Identified %d taxa with multiple parents.', invalid_hierarchies),
        )
        for template, offenders in summary:
            self.logger.info(template % len(offenders))
Пример #7
0
    def propagate(self, options):
        """Propagate labels to all genomes in a cluster.

        For every genome in ``options.input_taxonomy`` whose metadata lists
        clustered genomes (a ';'-separated id list), one taxonomy string is
        assigned to that genome and to every genome in its cluster. Ranks
        the representative leaves unspecified may be filled in from the
        clustered genomes. If a clustered genome contradicts a rank the
        representative does specify, warnings are logged and the
        representative's own taxonomy is used unchanged.

        Genomes without clustered genomes keep their own taxonomy string
        unless one was already assigned through a representative. The result
        is written to ``options.output_taxonomy`` as
        ``genome_id<TAB>taxonomy`` lines.
        """

        check_file_exists(options.input_taxonomy)
        check_file_exists(options.metadata_file)

        # get representative genome information
        rep_metadata = read_gtdb_metadata(options.metadata_file, ['gtdb_representative',
                                                                  'gtdb_clustered_genomes'])

        taxonomy = Taxonomy()
        explict_tax = taxonomy.read(options.input_taxonomy)
        expanded_taxonomy = {}
        incongruent_count = 0
        # items()/enumerate() work on Python 2 and 3 alike, unlike
        # iteritems()/xrange()
        for genome_id, taxon_list in explict_tax.items():
            taxonomy_str = ';'.join(taxon_list)

            # Propagate taxonomy strings if genome is a representatives. Also, determine
            # if genomes clustered together have compatible taxonomies. Note that a genome
            # may not have metadata as it is possible a User has removed a genome that is
            # in the provided taxonomy file.
            _rep_genome, clustered_genomes = rep_metadata.get(genome_id, (None, None))
            if clustered_genomes:  # genome is a representative
                clustered_genome_ids = clustered_genomes.split(';')

                # get taxonomy of all genomes in cluster with a specified taxonomy
                clustered_genome_tax = {}
                for cluster_genome_id in clustered_genome_ids:
                    if cluster_genome_id == genome_id:
                        continue

                    if cluster_genome_id not in rep_metadata:
                        continue  # genome is no longer in the GTDB so ignore it

                    if cluster_genome_id in explict_tax:
                        clustered_genome_tax[cluster_genome_id] = explict_tax[cluster_genome_id]

                # determine if representative and clustered genome taxonomy strings are congruent
                working_cluster_taxonomy = list(taxon_list)
                incongruent_with_rep = False
                for cluster_genome_id, cluster_tax in clustered_genome_tax.items():
                    for r, rank_prefix in enumerate(Taxonomy.rank_prefixes):
                        if cluster_tax[r] == rank_prefix:
                            break  # no more taxonomy information to consider

                        if cluster_tax[r] == taxon_list[r]:
                            continue  # agrees with the representative at this rank

                        if taxon_list[r] != rank_prefix:
                            # genomes in cluster have incongruent taxonomies so defer to representative
                            self.logger.warning("Genomes in cluster have incongruent taxonomies.")
                            self.logger.warning("Representative %s: %s" % (genome_id, taxonomy_str))
                            self.logger.warning("Clustered genome %s: %s" % (cluster_genome_id, ';'.join(cluster_tax)))
                            self.logger.warning("Deferring to taxonomy specified for representative.")

                            incongruent_count += 1
                            incongruent_with_rep = True
                            break

                        # clustered genome has a more specific taxonomy string which
                        # should be propagate to the representative if all clustered
                        # genomes are in agreement
                        if working_cluster_taxonomy[r] == rank_prefix:
                            # make taxonomy more specific based on genomes in cluster
                            working_cluster_taxonomy[r] = cluster_tax[r]
                        elif working_cluster_taxonomy[r] != cluster_tax[r]:
                            # not all genomes agree on the assignment of this rank so leave it unspecified
                            # NOTE(review): a later clustered genome can fill this rank
                            # again, so the outcome depends on iteration order - confirm
                            # whether a disputed rank should stay unspecified.
                            working_cluster_taxonomy[r] = rank_prefix
                            break

                    if incongruent_with_rep:
                        # Reset right away rather than at the top of the next
                        # iteration; otherwise ranks filled in from earlier
                        # genomes survive when the incongruent genome happens
                        # to be the last one visited.
                        working_cluster_taxonomy = list(taxon_list)  # default to rep taxonomy
                        break

                cluster_taxonomy_str = ';'.join(working_cluster_taxonomy)

                # assign taxonomy to representative and all genomes in the cluster
                expanded_taxonomy[genome_id] = cluster_taxonomy_str
                for cluster_genome_id in clustered_genome_ids:
                    expanded_taxonomy[cluster_genome_id] = cluster_taxonomy_str
            elif genome_id not in expanded_taxonomy:
                # genome is a singleton; one already assigned a taxonomy based
                # on its representative is left untouched
                expanded_taxonomy[genome_id] = taxonomy_str

        self.logger.info('Identified %d clusters with incongruent taxonomies.' % incongruent_count)

        # the context manager closes the file even if a write fails
        with open(options.output_taxonomy, 'w') as fout:
            for genome_id, taxonomy_str in expanded_taxonomy.items():
                fout.write('%s\t%s\n' % (genome_id, taxonomy_str))

        self.logger.info('Taxonomy written to: %s' % options.output_taxonomy)
Пример #8
0
    def propagate(self, options):
        """Propagate labels to all genomes in a cluster.

        For every genome in ``options.input_taxonomy`` whose metadata lists
        clustered genomes (a ';'-separated id list), one taxonomy string is
        assigned to that genome and to every genome in its cluster. Ranks
        the representative leaves unspecified may be filled in from the
        clustered genomes. If a clustered genome contradicts a rank the
        representative does specify, warnings are logged and the
        representative's own taxonomy is used unchanged.

        Genomes without clustered genomes keep their own taxonomy string
        unless one was already assigned through a representative. The result
        is written to ``options.output_taxonomy`` as
        ``genome_id<TAB>taxonomy`` lines.
        """

        check_file_exists(options.input_taxonomy)
        check_file_exists(options.metadata_file)

        # get representative genome information
        rep_metadata = read_gtdb_metadata(
            options.metadata_file,
            ['gtdb_representative', 'gtdb_clustered_genomes'])

        taxonomy = Taxonomy()
        explict_tax = taxonomy.read(options.input_taxonomy)
        expanded_taxonomy = {}
        incongruent_count = 0
        # items()/enumerate() work on Python 2 and 3 alike, unlike
        # iteritems()/xrange()
        for genome_id, taxon_list in explict_tax.items():
            taxonomy_str = ';'.join(taxon_list)

            # Propagate taxonomy strings if genome is a representatives. Also, determine
            # if genomes clustered together have compatible taxonomies. Note that a genome
            # may not have metadata as it is possible a User has removed a genome that is
            # in the provided taxonomy file.
            _rep_genome, clustered_genomes = rep_metadata.get(
                genome_id, (None, None))
            if clustered_genomes:  # genome is a representative
                clustered_genome_ids = clustered_genomes.split(';')

                # get taxonomy of all genomes in cluster with a specified taxonomy
                clustered_genome_tax = {}
                for cluster_genome_id in clustered_genome_ids:
                    if cluster_genome_id == genome_id:
                        continue

                    if cluster_genome_id not in rep_metadata:
                        continue  # genome is no longer in the GTDB so ignore it

                    if cluster_genome_id in explict_tax:
                        clustered_genome_tax[cluster_genome_id] = explict_tax[
                            cluster_genome_id]

                # determine if representative and clustered genome taxonomy strings are congruent
                working_cluster_taxonomy = list(taxon_list)
                incongruent_with_rep = False
                for cluster_genome_id, cluster_tax in clustered_genome_tax.items():
                    for r, rank_prefix in enumerate(Taxonomy.rank_prefixes):
                        if cluster_tax[r] == rank_prefix:
                            break  # no more taxonomy information to consider

                        if cluster_tax[r] == taxon_list[r]:
                            continue  # agrees with the representative at this rank

                        if taxon_list[r] != rank_prefix:
                            # genomes in cluster have incongruent taxonomies so defer to representative
                            self.logger.warning(
                                "Genomes in cluster have incongruent taxonomies."
                            )
                            self.logger.warning("Representative %s: %s" %
                                                (genome_id, taxonomy_str))
                            self.logger.warning(
                                "Clustered genome %s: %s" %
                                (cluster_genome_id, ';'.join(cluster_tax)))
                            self.logger.warning(
                                "Deferring to taxonomy specified for representative."
                            )

                            incongruent_count += 1
                            incongruent_with_rep = True
                            break

                        # clustered genome has a more specific taxonomy string which
                        # should be propagate to the representative if all clustered
                        # genomes are in agreement
                        if working_cluster_taxonomy[r] == rank_prefix:
                            # make taxonomy more specific based on genomes in cluster
                            working_cluster_taxonomy[r] = cluster_tax[r]
                        elif working_cluster_taxonomy[r] != cluster_tax[r]:
                            # not all genomes agree on the assignment of this rank so leave it unspecified
                            # NOTE(review): a later clustered genome can fill this rank
                            # again, so the outcome depends on iteration order - confirm
                            # whether a disputed rank should stay unspecified.
                            working_cluster_taxonomy[r] = rank_prefix
                            break

                    if incongruent_with_rep:
                        # Reset right away rather than at the top of the next
                        # iteration; otherwise ranks filled in from earlier
                        # genomes survive when the incongruent genome happens
                        # to be the last one visited.
                        working_cluster_taxonomy = list(
                            taxon_list)  # default to rep taxonomy
                        break

                cluster_taxonomy_str = ';'.join(working_cluster_taxonomy)

                # assign taxonomy to representative and all genomes in the cluster
                expanded_taxonomy[genome_id] = cluster_taxonomy_str
                for cluster_genome_id in clustered_genome_ids:
                    expanded_taxonomy[cluster_genome_id] = cluster_taxonomy_str
            elif genome_id not in expanded_taxonomy:
                # genome is a singleton; one already assigned a taxonomy based
                # on its representative is left untouched
                expanded_taxonomy[genome_id] = taxonomy_str

        self.logger.info(
            'Identified %d clusters with incongruent taxonomies.' %
            incongruent_count)

        # the context manager closes the file even if a write fails
        with open(options.output_taxonomy, 'w') as fout:
            for genome_id, taxonomy_str in expanded_taxonomy.items():
                fout.write('%s\t%s\n' % (genome_id, taxonomy_str))

        self.logger.info('Taxonomy written to: %s' % options.output_taxonomy)
Пример #9
0
    def check_tree(self, options):
        """Validate the taxonomy of a decorated tree and report polyphyletic groups.

        The taxonomy comes from ``options.taxonomy_file`` when given,
        otherwise from the labels of ``options.decorated_tree``. With a
        taxonomy file, a taxon is reported when the MRCA of its genomes has
        a different number of leaves than the taxon has genomes. Without
        one, a taxon is reported when its label appears on more than one
        internal node. Reported taxa are printed to stdout.
        """

        check_file_exists(options.decorated_tree)

        # validate taxonomy
        taxonomy = Taxonomy()
        genome_taxonomy = (taxonomy.read(options.taxonomy_file)
                           if options.taxonomy_file
                           else taxonomy.read_from_tree(options.decorated_tree))

        taxonomy.validate(genome_taxonomy,
                          check_prefixes=True,
                          check_ranks=True,
                          check_hierarchy=True,
                          check_species=True,
                          check_group_names=True,
                          check_duplicate_names=True,
                          report_errors=True)

        # check for polyphyletic groups
        polyphyletic_groups = set()
        tree = dendropy.Tree.get_from_path(options.decorated_tree,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)

        if options.taxonomy_file:
            # reduce taxonomy to taxa in tree and map taxon labels to Taxon objects
            reduced_taxonomy = {}
            taxon_map = {}
            for leaf in tree.leaf_node_iter():
                leaf_label = leaf.taxon.label
                reduced_taxonomy[leaf_label] = genome_taxonomy[leaf_label]
                taxon_map[leaf_label] = leaf.taxon

            # find taxa with an MRCA spanning additional taxa
            for rank_label in Taxonomy.rank_labels[1:]:
                extant_taxa = taxonomy.extant_taxa_for_rank(
                    rank_label, reduced_taxonomy)
                for taxon, taxa_ids in extant_taxa.items():
                    member_taxa = [taxon_map[genome_id]
                                   for genome_id in taxa_ids]
                    mrca = tree.mrca(taxa=member_taxa)
                    mrca_leaf_count = sum(1 for _ in mrca.leaf_iter())
                    if mrca_leaf_count != len(taxa_ids):
                        polyphyletic_groups.add(taxon)
        else:
            # find duplicate taxon labels in tree
            seen_taxa = set()

            internal_nodes = tree.preorder_node_iter(
                lambda n: not n.is_leaf())
            for node in internal_nodes:
                _support, taxon_label, _aux_info = parse_label(node.label)
                if not taxon_label:
                    continue

                # a node label may hold several ';'-separated taxa
                for raw_name in taxon_label.split(';'):
                    taxon = raw_name.strip()
                    if taxon in seen_taxa:
                        polyphyletic_groups.add(taxon)

                    seen_taxa.add(taxon)

        if polyphyletic_groups:
            print('')
            print('Tree contains polyphyletic groups:')
            for taxon in polyphyletic_groups:
                print('%s' % taxon)

        self.logger.info('Finished performing validation tests.')