Exemplo n.º 1
0
    def pull(self, options):
        """Write the taxonomy encoded in a decorated tree to a taxonomy file.

        When `options.no_validation` is set, the Newick tree at
        `options.input_tree` is parsed directly and the taxonomy of each leaf
        is assembled from the taxon labels found on its ancestors. Otherwise
        the tree file is handed to `Taxonomy.read_from_tree`. In both cases
        the result is written to `options.output_taxonomy`.
        """

        check_file_exists(options.input_tree)

        if not options.no_validation:
            taxonomy = Taxonomy().read_from_tree(options.input_tree)
        else:
            tree = dendropy.Tree.get_from_path(options.input_tree,
                                               schema='newick',
                                               rooting="force-rooted",
                                               preserve_underscores=True)

            taxonomy = {}
            for leaf in tree.leaf_node_iter():
                taxon_id = leaf.taxon.label

                # walk from the leaf towards the root, collecting taxa in
                # leaf-to-root order
                leaf_to_root = []
                ancestor = leaf.parent_node
                while ancestor:
                    _support, taxon, _aux_info = parse_label(ancestor.label)
                    if taxon:
                        # a single node may carry several ';'-separated taxa
                        names = [name.strip() for name in taxon.split(';')]
                        leaf_to_root.extend(reversed(names))
                    ancestor = ancestor.parent_node

                # store the taxa in root-to-leaf order
                leaf_to_root.reverse()
                taxonomy[taxon_id] = leaf_to_root

        Taxonomy().write(taxonomy, options.output_taxonomy)

        self.logger.info('Stripped tree written to: %s' %
                         options.output_taxonomy)
Exemplo n.º 2
0
    def validate(self, options):
        """Validate a taxonomy file and log a summary of any problems.

        The file at `options.taxonomy_file` is read and passed to
        `Taxonomy.validate`. The prefix, rank, hierarchy and species checks
        are each disabled by the corresponding `options.no_*` flag; group
        name and duplicate name checks are always requested.
        """

        check_file_exists(options.taxonomy_file)

        validator = Taxonomy()
        parsed = validator.read(options.taxonomy_file)

        errors = validator.validate(parsed,
                                    check_prefixes=not options.no_prefix,
                                    check_ranks=not options.no_all_ranks,
                                    check_hierarchy=not options.no_hierarhcy,
                                    check_species=not options.no_species,
                                    check_group_names=True,
                                    check_duplicate_names=True,
                                    report_errors=True)

        (invalid_ranks, invalid_prefixes, invalid_species_name,
         invalid_hierarchies, invalid_group_name) = errors

        if not any(len(e) for e in errors):
            self.logger.info('No errors identified in taxonomy file.')
            return

        # one log line per error category, in a fixed order
        summary = (
            ('Identified %d incomplete taxonomy strings.', invalid_ranks),
            ('Identified %d rank prefix errors.', invalid_prefixes),
            ('Identified %d invalid species names.', invalid_species_name),
            ('Identified %d taxa with multiple parents.',
             invalid_hierarchies),
            ('Identified %d invalid group names.', invalid_group_name),
        )
        for template, offenders in summary:
            self.logger.info(template % len(offenders))
Exemplo n.º 3
0
    def append(self, options):
        """Append the taxonomy string of each leaf to its label in a tree.

        Reads the Newick tree at `options.input_tree` and the taxonomy at
        `options.input_taxonomy`, relabels every leaf as
        '<label>|<taxon>; <taxon>; ...' and writes the result to
        `options.output_tree`. Logs an error and exits with status -1 if a
        leaf has no entry in the taxonomy.
        """

        check_file_exists(options.input_tree)
        check_file_exists(options.input_taxonomy)

        taxonomy = Taxonomy().read(options.input_taxonomy)

        tree = dendropy.Tree.get_from_path(options.input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        for leaf in tree.leaf_node_iter():
            leaf_id = leaf.taxon.label
            taxa = taxonomy.get(leaf_id)
            if taxa is None:
                # report the taxon label, since that is the key that was
                # looked up in the taxonomy (not the node's own label)
                self.logger.error(
                    'Taxonomy file does not contain an entry for %s.' %
                    leaf_id)
                sys.exit(-1)
            leaf.taxon.label = leaf_id + '|' + '; '.join(taxa)

        tree.write_to_path(options.output_tree,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)

        self.logger.info('Decorated tree written to: %s' % options.output_tree)
Exemplo n.º 4
0
    def run(self, taxonomy_file, genome_list_file):
        """Add the taxon at each rank of a taxonomy to the database.

        For every rank in `Taxonomy.rank_labels`, a temporary two-column
        (genome ID, taxon) file is written and imported into the
        'gtdb_<rank>' field of the 'metadata_taxonomy' table using the
        `gtdb` command line tool.

        Parameters
        ----------
        taxonomy_file : str
            Taxonomy file passed to `Taxonomy.read`.
        genome_list_file : str
            Optional file restricting the genomes to process. The genome ID
            is the first field of each line; lines are split on tabs if they
            contain one and on commas otherwise.
        """

        genome_list = set()
        if genome_list_file:
            with open(genome_list_file) as f:
                for line in f:
                    sep = '\t' if '\t' in line else ','
                    genome_list.add(line.rstrip().split(sep)[0])

        # read taxonomy file
        taxonomy = Taxonomy().read(taxonomy_file)

        # add each taxonomic rank to database
        for i, rank in enumerate(Taxonomy.rank_labels):
            # text mode so str can be written under both Python 2 and 3
            temp_file = tempfile.NamedTemporaryFile(mode='w', delete=False)
            try:
                with temp_file:
                    for genome_id, taxa in taxonomy.items():
                        if genome_list_file and genome_id not in genome_list:
                            continue

                        temp_file.write('%s\t%s\n' % (genome_id, taxa[i]))

                cmd = 'gtdb -r metadata import --table %s --field %s --type %s --metadatafile %s' % (
                    'metadata_taxonomy', 'gtdb_' + rank, 'TEXT',
                    temp_file.name)
                print(cmd)
                os.system(cmd)
            finally:
                # always clean up, even if writing or the import fails
                os.remove(temp_file.name)
Exemplo n.º 5
0
    def append(self, options):
        """Append the taxonomy string of each leaf to its label in a tree.

        Reads the Newick tree at `options.input_tree` and the taxonomy at
        `options.input_taxonomy`, relabels every leaf as
        '<label>|<taxon>; <taxon>; ...' and writes the result to
        `options.output_tree`. Logs an error and exits with status -1 if a
        leaf has no entry in the taxonomy.
        """

        check_file_exists(options.input_tree)
        check_file_exists(options.input_taxonomy)

        taxonomy = Taxonomy().read(options.input_taxonomy)

        tree = dendropy.Tree.get_from_path(options.input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        for leaf in tree.leaf_node_iter():
            leaf_id = leaf.taxon.label
            taxa = taxonomy.get(leaf_id)
            if taxa is None:
                # report the taxon label, since that is the key that was
                # looked up in the taxonomy (not the node's own label)
                self.logger.error('Taxonomy file does not contain an entry for %s.' % leaf_id)
                sys.exit(-1)
            leaf.taxon.label = leaf_id + '|' + '; '.join(taxa)

        tree.write_to_path(options.output_tree,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)

        self.logger.info('Decorated tree written to: %s' % options.output_tree)
Exemplo n.º 6
0
    def run(self, taxonomy_file, genome_list):
        """Add full taxonomy strings to the database.

        Writes a temporary two-column (genome ID, ';'-joined taxonomy) file
        and imports it into the 'ncbi_taxonomy_unfiltered' field of the
        'metadata_taxonomy' table using the `gtdb` command line tool.
        Genome IDs starting with 'GCA_' or 'GCF_' are prefixed with 'GB_'
        or 'RS_', respectively, before being filtered and written.

        Parameters
        ----------
        taxonomy_file : str
            Taxonomy file passed to `Taxonomy.read`.
        genome_list : str
            Optional tab-separated file whose first column lists the genome
            IDs to process. Lines starting with '#' are ignored.
        """

        genomes_to_process = set()
        if genome_list:
            with open(genome_list) as f:
                for line in f:
                    if line.startswith('#'):
                        continue

                    genomes_to_process.add(line.rstrip().split('\t')[0])

        # read taxonomy file
        taxonomy = Taxonomy().read(taxonomy_file)

        # add full taxonomy string to database
        # (text mode so str can be written under both Python 2 and 3)
        temp_file = tempfile.NamedTemporaryFile(mode='w', delete=False)
        try:
            with temp_file:
                for genome_id, taxa in taxonomy.items():
                    if genome_id.startswith('GCA_'):
                        genome_id = 'GB_' + genome_id
                    elif genome_id.startswith('GCF_'):
                        genome_id = 'RS_' + genome_id

                    if not genome_list or genome_id in genomes_to_process:
                        taxa_str = ';'.join(taxa)
                        temp_file.write('%s\t%s\n' % (genome_id, taxa_str))

            cmd = 'gtdb -r metadata import --table %s --field %s --type %s --metadatafile %s' % (
                'metadata_taxonomy', 'ncbi_taxonomy_unfiltered', 'TEXT',
                temp_file.name)
            print(cmd)
            os.system(cmd)
        finally:
            # always clean up, even if writing or the import fails
            os.remove(temp_file.name)
Exemplo n.º 7
0
    def binomial(self, options):
        """Ensure species are designated using binomial nomenclature.

        Reads the taxonomy at `options.input_taxonomy` and writes it to
        `options.output_taxonomy` as '<genome ID>\\t<taxonomy string>' lines.
        A non-empty species name that does not contain the genus name is
        rewritten as 's__<genus> <species>'. Exits with status -1 as soon as
        a taxonomy string fails `Taxonomy.check_full`.
        """

        check_file_exists(options.input_taxonomy)

        taxonomy = Taxonomy()
        # read the input before opening (and truncating) the output file
        t = taxonomy.read(options.input_taxonomy)

        with open(options.output_taxonomy, 'w') as fout:
            for genome_id, taxon_list in t.items():
                taxonomy_str = ';'.join(taxon_list)
                if not taxonomy.check_full(taxonomy_str):
                    sys.exit(-1)

                # strip the 3 character rank prefix (e.g. 'g__', 's__')
                genus = taxon_list[5][3:]
                species = taxon_list[6][3:]
                if species and genus not in species:
                    taxon_list[6] = 's__' + genus + ' ' + species
                    taxonomy_str = ';'.join(taxon_list)

                fout.write('%s\t%s\n' % (genome_id, taxonomy_str))

        self.logger.info('Revised taxonomy written to: %s' %
                         options.output_taxonomy)
    def manual_species(self, init_taxonomy, manually_curated_tree):
        """Identify species names manually set by curators.

        Compares the specific epithet of each genome in the initial taxonomy
        with the one read from the manually-curated tree, and writes the
        genomes where they differ to 'manual_species_names.tsv' in
        `self.output_dir`. Genome IDs starting with 'D-' are ignored.

        Parameters
        ----------
        init_taxonomy : str
            Taxonomy file with the initial species names.
        manually_curated_tree : str
            Newick tree file decorated with manually-curated taxa.
        """

        # read initial and manually curated taxonomy
        self.logger.info('Reading initial species names.')
        init_taxonomy = Taxonomy().read(init_taxonomy, use_canonical_gid=True)
        init_num_gids = sum(1 for gid in init_taxonomy
                            if not gid.startswith('D-'))
        self.logger.info(
            ' - read taxonomy for {:,} genomes.'.format(init_num_gids))

        self.logger.info('Reading manually-curated species names from tree.')
        mc_tree = dendropy.Tree.get_from_path(manually_curated_tree,
                                              schema='newick',
                                              rooting='force-rooted',
                                              preserve_underscores=True)
        mc_taxonomy = Taxonomy().read_from_tree(mc_tree)

        mc_specific = {}
        for gid, taxa in mc_taxonomy.items():
            if gid.startswith('D-'):
                continue

            mc_sp = taxa[-1]
            if not mc_sp.startswith('s__') or mc_sp == 's__':
                # genome is not classified down to species in the tree
                self.logger.error(
                    'Most specific classification for {} is {}.'.format(
                        gid, taxa))
                continue

            mc_specific[gid] = specific_epithet(mc_sp)

        self.logger.info(' - read taxonomy for {:,} genomes.'.format(
            len(mc_specific)))

        # report genomes with modified specific name assignment
        self.logger.info(
            'Identifying genomes with manually-curated species names.')
        out_file = os.path.join(self.output_dir, 'manual_species_names.tsv')
        num_mc = 0
        # context manager closes the report even if a lookup below fails
        with open(out_file, 'w') as fout:
            fout.write(
                'Genome ID\tInitial species\tManually-curated species\n')
            for gid, mc_sp in mc_specific.items():
                init_species = init_taxonomy[gid][Taxonomy.SPECIES_INDEX]
                init_specific = specific_epithet(init_species)

                if init_specific != mc_sp:
                    mc_generic = mc_taxonomy[gid][
                        Taxonomy.GENUS_INDEX].replace('g__', '')
                    mc_species = 's__{} {}'.format(mc_generic, mc_sp)
                    num_mc += 1
                    fout.write('{}\t{}\t{}\n'.format(gid, init_species,
                                                     mc_species))

        self.logger.info(
            ' - identified {:,} manually-curated species names.'.format(
                num_mc))
Exemplo n.º 9
0
    def pull(self, options):
        """Extract the taxonomy from a decorated tree and save it to file.

        The tree at `options.input_tree` is read with
        `Taxonomy.read_from_tree` and the resulting taxonomy is written to
        `options.output_taxonomy`.
        """

        tree_file = options.input_tree
        check_file_exists(tree_file)

        tree_taxonomy = Taxonomy().read_from_tree(tree_file)

        taxonomy_file = options.output_taxonomy
        Taxonomy().write(tree_taxonomy, taxonomy_file)

        message = 'Stripped tree written to: %s' % taxonomy_file
        self.logger.info(message)
Exemplo n.º 10
0
def species_label(gtdb_taxonomy, ncbi_taxonomy, ncbi_organism_name):
    """Determine 'best' species label for each genome.

    Currently, this is just being set to the species label in the
    GTDB taxonomy. In theory, the NCBI taxonomy and organism name
    could also be consulted. However, since the GTDB taxonomy redefines
    some species this might be problematic so isn't currently being
    done.

    Parameters
    ----------
    gtdb_taxonomy : d[assembly_accession] -> [d__, ..., s__]
        GTDB taxonomy of each genome.
    ncbi_taxonomy : d[assembly_accession] -> [d__, ..., s__]
        NCBI taxonomy of each genome (currently unused).
    ncbi_organism_name : d[assembly_accession] -> name
        NCBI organism name of each genome (currently unused).

    Return
    ------
    dict : d[assembly_accession] -> species name
        Species name of each genome with a non-empty GTDB species label.
    """

    species_index = Taxonomy.rank_index['s__']

    # NCBI information is deliberately not considered as it may conflict
    # with GTDB information in unwanted ways. The previously disabled
    # fallback filled in genomes lacking a GTDB species using
    # Taxonomy().extract_valid_species_name() on the NCBI species and
    # then on the NCBI organism name.
    return {
        genome_id: taxa[species_index]
        for genome_id, taxa in gtdb_taxonomy.items()
        if taxa[species_index] != 's__'
    }
Exemplo n.º 11
0
def species_label(gtdb_taxonomy, ncbi_taxonomy, ncbi_organism_name):
    """Determine 'best' species label for each genome.

    Currently, this is just being set to the species label in the
    GTDB taxonomy. In theory, the NCBI taxonomy and organism name
    could also be consulted. However, since the GTDB taxonomy redefines
    some species this might be problematic so isn't currently being
    done.

    Parameters
    ----------
    gtdb_taxonomy : d[assembly_accession] -> [d__, ..., s__]
        GTDB taxonomy of each genome.
    ncbi_taxonomy : d[assembly_accession] -> [d__, ..., s__]
        NCBI taxonomy of each genome (currently unused).
    ncbi_organism_name : d[assembly_accession] -> name
        NCBI organism name of each genome (currently unused).

    Return
    ------
    dict : d[assembly_accession] -> species name
        Species name of each genome with a non-empty GTDB species label.
    """

    species_index = Taxonomy.rank_index['s__']

    # NCBI information is deliberately not considered as it may conflict
    # with GTDB information in unwanted ways. The previously disabled
    # fallback filled in genomes lacking a GTDB species using
    # Taxonomy().extract_valid_species_name() on the NCBI species and
    # then on the NCBI organism name.
    return {
        genome_id: taxa[species_index]
        for genome_id, taxa in gtdb_taxonomy.items()
        if taxa[species_index] != 's__'
    }
Exemplo n.º 12
0
    def pull(self, options):
        """Write the taxonomy strings found in a decorated tree to file.

        The taxonomy is read from `options.input_tree` and, unless
        `options.no_rank_fill` is set, each entry is passed through
        `Taxonomy.fill_missing_ranks` before being written to
        `options.output_file`.
        """
        check_file_exists(options.input_tree)

        t = Taxonomy().read_from_tree(options.input_tree)
        if not options.no_rank_fill:
            # items() works under both Python 2 and 3; only values of
            # existing keys are replaced, so iterating while assigning is safe
            for taxon_id, taxa in t.items():
                t[taxon_id] = Taxonomy().fill_missing_ranks(taxa)

        Taxonomy().write(t, options.output_file)

        self.logger.info('Taxonomy strings written to: %s' % options.output_file)
Exemplo n.º 13
0
    def clean_ftp(self,
                  new_list_genomes,
                  ftp_genome_dir_file,
                  ftp_genome_dir,
                  report_dir,
                  taxonomy_file=None):
        """Remove genomes no longer in the release and report the changes.

        Genomes listed in `ftp_genome_dir_file` but absent from the new
        release have their directory removed and are listed in
        'deleted_genomes.tsv'. Genomes in the new release that are not in
        `ftp_genome_dir_file` are listed in 'added_genomes.tsv' together
        with the 7th taxon of their taxonomy ('N/A' when unknown).

        Parameters
        ----------
        new_list_genomes : str
            Comma-separated list of tab-separated files whose first column
            holds the genome IDs of the new release.
        ftp_genome_dir_file : str
            Tab-separated file mapping genome ID to its directory.
        ftp_genome_dir : str
            Not used by this method.
        report_dir : str
            Directory receiving the two report files.
        taxonomy_file : str, optional
            Taxonomy file passed to `Taxonomy.read`.
        """
        make_sure_path_exists(report_dir)

        genome_in_new_rel = set()
        for new_genome_file in new_list_genomes.split(','):
            with open(new_genome_file, 'r') as ngf:
                for line in ngf:
                    genome_in_new_rel.add(line.strip().split('\t')[0])

        # read taxonomy file
        taxonomy = {}
        if taxonomy_file is not None:
            taxonomy = Taxonomy().read(taxonomy_file)

        current_ftp_genomes = {}
        with open(ftp_genome_dir_file) as fgdf:
            for line in fgdf:
                infos = line.strip().split('\t')
                current_ftp_genomes[infos[0]] = infos[1]

        # sorted so the reports are deterministic between runs
        deleted_genomes = sorted(set(current_ftp_genomes) - genome_in_new_rel)
        added_genomes = sorted(genome_in_new_rel - set(current_ftp_genomes))

        deleted_report = os.path.join(report_dir, 'deleted_genomes.tsv')
        added_report = os.path.join(report_dir, 'added_genomes.tsv')

        # both reports are opened before anything is deleted and are closed
        # (and flushed) even if a removal fails part way through
        with open(deleted_report, 'w') as deleted_genome_file, \
                open(added_report, 'w') as added_genome_file:
            print('{} genomes have been deleted in the release'.format(
                len(deleted_genomes)))
            print('{} genomes have been added in the release'.format(
                len(added_genomes)))

            for idx, deleted_genome in enumerate(deleted_genomes):
                print("{}/{} genomes deleted".format(idx,
                                                     len(deleted_genomes)),
                      end="\r")
                deleted_genome_file.write('{}\n'.format(deleted_genome))
                genome_dir = current_ftp_genomes[deleted_genome]
                shutil.rmtree(genome_dir)
                self.delete_empty_directory(os.path.dirname(genome_dir))

            for added_genome in added_genomes:
                added_genome_file.write('{}\t{}\n'.format(
                    added_genome,
                    taxonomy.get(added_genome, ['N/A'] * 7)[6]))
Exemplo n.º 14
0
    def pull(self, options):
        """Write the taxonomy strings found in a decorated tree to file.

        The taxonomy is read from `options.input_tree`. Unless
        `options.no_rank_fill` is set, each entry is passed through
        `Taxonomy.fill_missing_ranks` before the taxonomy is written to
        `options.output_file`.
        """
        check_file_exists(options.input_tree)

        tree_taxonomy = Taxonomy().read_from_tree(options.input_tree)

        fill_ranks = not options.no_rank_fill
        if fill_ranks:
            for gid, ranks in tree_taxonomy.items():
                tree_taxonomy[gid] = Taxonomy().fill_missing_ranks(ranks)

        Taxonomy().write(tree_taxonomy, options.output_file)

        message = 'Taxonomy strings written to: %s' % options.output_file
        self.logger.info(message)
  def run(self, taxonomy_file, genome_list_file):
    """Add taxonomy to database.

    Imports the full ';'-joined taxonomy string of each genome into the
    'gtdb_taxonomy' field, followed by the taxon at each rank into the
    matching 'gtdb_<rank>' field, of the 'metadata_taxonomy' table using
    the `gtdb` command line tool.

    Parameters
    ----------
    taxonomy_file : str
        Taxonomy file passed to `Taxonomy.read`.
    genome_list_file : str
        Optional file restricting the genomes to process. The genome ID
        is the first field of each line; lines are split on tabs if they
        contain one and on commas otherwise.
    """

    genome_list = set()
    if genome_list_file:
      with open(genome_list_file) as f:
        for line in f:
          sep = '\t' if '\t' in line else ','
          genome_list.add(line.rstrip().split(sep)[0])

    # read taxonomy file
    taxonomy = Taxonomy().read(taxonomy_file)

    def import_field(field, value_for):
      """Import value_for(taxa) of each selected genome into `field`."""
      # text mode so str can be written under both Python 2 and 3
      temp_file = tempfile.NamedTemporaryFile(mode='w', delete=False)
      try:
        with temp_file:
          for genome_id, taxa in taxonomy.items():
            if genome_list_file and genome_id not in genome_list:
              continue

            temp_file.write('%s\t%s\n' % (genome_id, value_for(taxa)))

        cmd = 'gtdb -r metadata import --table %s --field %s --type %s --metadatafile %s' % ('metadata_taxonomy', field, 'TEXT', temp_file.name)
        print(cmd)
        os.system(cmd)
      finally:
        # always clean up, even if writing or the import fails
        os.remove(temp_file.name)

    def rank_value(taxa, i):
      """Return the taxon at rank index i, with genus-qualified species."""
      rank_str = taxa[i]
      if Taxonomy.rank_labels[i] == 'species':
        genus = taxa[i - 1][3:]
        species = taxa[i][3:]
        # ensure species name includes genus; an empty species ('s__') is
        # left untouched rather than turned into 's__<genus> '
        if species and genus not in taxa[i]:
          rank_str = 's__' + genus + ' ' + species
      return rank_str

    # add full taxonomy string to database
    import_field('gtdb_taxonomy', ';'.join)

    # add each taxonomic rank to database
    for i, rank in enumerate(Taxonomy.rank_labels):
      import_field('gtdb_' + rank, lambda taxa, i=i: rank_value(taxa, i))
Exemplo n.º 16
0
    def add_sp_label(self, options):
        """Write a copy of a tree whose leaf labels include the species.

        Every leaf of the tree at `options.tree_file` is relabelled as
        '<label> | <species>', where the species comes from the taxonomy at
        `options.taxonomy_file` with its 's__' prefix removed. The result is
        written to `options.output_tree`.
        """

        for required_file in (options.taxonomy_file, options.tree_file):
            check_file_exists(required_file)

        log = self.logger

        log.info('Reading GTDB taxonomy.')
        gtdb_taxonomy = Taxonomy().read(options.taxonomy_file)

        log.info('Reading input tree.')
        tree = dendropy.Tree.get_from_path(options.tree_file,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        log.info('Appending species labels.')
        for node in tree.postorder_node_iter():
            if not node.is_leaf():
                continue

            leaf_taxon = node.taxon
            sp = gtdb_taxonomy[leaf_taxon.label][Taxonomy.SPECIES_INDEX]
            leaf_taxon.label += ' | {}'.format(sp.replace('s__', ''))

        log.info('Writing output tree.')
        tree.write_to_path(options.output_tree,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)

        log.info('Done.')
Exemplo n.º 17
0
    def _assign_taxon_labels(self, fmeasure_for_taxa):
        """Place taxon labels on tree nodes.

        Only taxa with exactly one candidate node are placed. The taxon is
        appended ('; '-separated) to any taxon label already on the node.

        Parameters
        ----------
        fmeasure_for_taxa : d[taxon] -> [(Node, F-measure, precision, recall), ...]
          Node with highest F-measure for each taxon.

        Returns
        -------
        set
            Taxon labels placed in tree.
        """

        placed_taxon = set()
        ordered_taxa = Taxonomy().sort_taxa(list(fmeasure_for_taxa.keys()))
        for taxon in ordered_taxa:
            candidates = fmeasure_for_taxa[taxon]
            if len(candidates) != 1:
                # taxon without a single best node is not placed
                continue

            placed_taxon.add(taxon)
            node, _fmeasure, _precision, _recall = candidates[0]

            support, existing, aux_info = parse_label(node.label)
            combined = existing + '; ' + taxon if existing else taxon
            node.label = create_label(support, combined, aux_info)

        return placed_taxon
Exemplo n.º 18
0
    def classify(self, seq_file, db, taxonomy_file, evalue_threshold, output_dir):
        """Classify rRNA genes.

        Sequences are compared against the database with blastn and the
        taxonomy of the first reported hit of each query is written to
        '<rna_name>.taxonomy.tsv' in the output directory.

        Parameters
        ----------
        seq_file : str
            Name of fasta file containing rRNA sequences.
        db : str
            BLAST database of rRNA genes.
        taxonomy_file : str
            Taxonomy file for genes in the rRNA database.
        evalue_threshold : float
            E-value threshold for defining valid hits.
        output_dir : str
            Output directory.
        """

        # blast sequences against rRNA database
        blast = Blast(self.cpus)
        blast_file = os.path.join(output_dir, '%s.blastn.tsv' % self.rna_name)
        blast.blastn(seq_file, db, blast_file, evalue=evalue_threshold,
                     max_matches=5, output_fmt='custom')

        # read taxonomy file
        taxonomy = Taxonomy().read(taxonomy_file)

        # write out classification file
        classification_file = os.path.join(
            output_dir, '%s.taxonomy.tsv' % self.rna_name)
        with open(classification_file, 'w') as fout:
            fout.write(
                'query_id\ttaxonomy\tlength\tblast_subject_id\tblast_evalue\tblast_bitscore\tblast_align_len\tblast_perc_identity\n')

            processed_query_ids = set()
            with open(blast_file) as blast_hits:
                for line in blast_hits:
                    line_split = [x.strip() for x in line.split('\t')]
                    query_id = line_split[0]

                    if query_id in processed_query_ids:
                        # A query may have multiple hits to different genes
                        # or sections of a gene. Blast results are organized
                        # by bitscore so only the first hit is considered.
                        continue

                    processed_query_ids.add(query_id)
                    query_len = int(line_split[1])
                    subject_id = line_split[2]
                    align_len = line_split[5]
                    perc_identity = line_split[6]
                    evalue = line_split[7]
                    bitscore = line_split[8]

                    taxonomy_str = ';'.join(taxonomy[subject_id])

                    fout.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
                        query_id, taxonomy_str, query_len, subject_id,
                        evalue, bitscore, align_len, perc_identity))
Exemplo n.º 19
0
    def validate(self, options):
        """Run every validation test on a taxonomy file.

        The file at `options.input_taxonomy` is read and passed to
        `Taxonomy.validate` with all checks and error reporting enabled.
        """

        check_file_exists(options.input_taxonomy)

        validator = Taxonomy()
        parsed = validator.read(options.input_taxonomy)

        # every available check is requested
        checks = dict(check_prefixes=True,
                      check_ranks=True,
                      check_hierarchy=True,
                      check_species=True,
                      check_group_names=True,
                      check_duplicate_names=True,
                      report_errors=True)
        validator.validate(parsed, **checks)

        self.logger.info('Finished performing validation tests.')
Exemplo n.º 20
0
    def run(self, gtdb_bac_taxonomy_file, gtdb_ar_taxonomy_file, silva_ssu_ref,
            silva_lsu_ref, ssu_blast_table, lsu_blast_table, output_dir):
        """Create tables relating GTDB taxonomy to SILVA accessions.

        The bacterial and archaeal GTDB taxonomies are merged, the sequence
        annotations of the SILVA SSU and LSU reference files are read, and
        each BLAST table is passed to `_parse_blast_table`, which is given
        'ssu_silva.tsv' or 'lsu_silva.tsv' in `output_dir` as its last
        argument.
        """

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # read GTDB taxonomy
        print('Reading GTDB taxonomy.')
        gtdb_bac_taxonomy = Taxonomy().read(gtdb_bac_taxonomy_file)
        gtdb_ar_taxonomy = Taxonomy().read(gtdb_ar_taxonomy_file)
        gtdb_taxonomy = gtdb_bac_taxonomy.copy()
        gtdb_taxonomy.update(gtdb_ar_taxonomy)

        num_bac = len(gtdb_bac_taxonomy)
        num_ar = len(gtdb_ar_taxonomy)
        print('Identified %d bacterial genomes to process.' % num_bac)
        print('Identified %d archaeal genomes to process.' % num_ar)
        print('Identified %d genomes to process.' % len(gtdb_taxonomy))

        # read SILVA taxonomy (the annotation of each reference sequence)
        print('Reading SILVA 16S and 23S rRNA taxonomies.')
        silva_ssu_taxonomy = {
            seq_id: annotation
            for seq_id, _seq, annotation in seq_io.read_seq(
                silva_ssu_ref, keep_annotation=True)
        }
        silva_lsu_taxonomy = {
            seq_id: annotation
            for seq_id, _seq, annotation in seq_io.read_seq(
                silva_lsu_ref, keep_annotation=True)
        }

        # parse BLAST tables
        print('Parsing BLAST tables.')

        jobs = (
            (ssu_blast_table, silva_ssu_taxonomy, self.min_ssu_len,
             'ssu_silva.tsv'),
            (lsu_blast_table, silva_lsu_taxonomy, self.min_lsu_len,
             'lsu_silva.tsv'),
        )
        for blast_table, silva_taxonomy, min_len, table_name in jobs:
            out_table = os.path.join(output_dir, table_name)
            self._parse_blast_table(blast_table, gtdb_taxonomy,
                                    silva_taxonomy, min_len, out_table)
Exemplo n.º 21
0
    def __init__(self, genome_id, taxonomy):
        """Set up empty hit-tracking state for a genome.

        Parameters
        ----------
        genome_id : str
            Unique id of genome
        taxonomy : dict[ref_genome_id] -> [domain, phylum, ..., species]
            Taxonomic assignment of each reference genome.

        """

        self.genome_id = genome_id
        self.taxonomy = taxonomy

        self.percent_to_classify = 0.2
        self.unclassified = 'unclassified'

        self.rank_prefixes = Taxonomy().rank_prefixes
        self.rank_labels = Taxonomy().rank_labels

        # record types used for per-taxon summaries and individual hits
        self.TaxaInfo = namedtuple('TaxaInfo', """evalue
                                                perc_identity
                                                aln_length
                                                num_seqs
                                                num_basepairs""")
        self.HitInfo = namedtuple('HitInfo', """evalue
                                                perc_identity
                                                aln_length""")

        # track hits at each rank: dict[contig_id][rank][taxa] -> [HitInfo, ...]
        def _hits_for_rank():
            return defaultdict(list)

        def _hits_for_contig():
            return defaultdict(_hits_for_rank)

        self.hits = defaultdict(_hits_for_contig)

        # total fragments from genome
        self.total_fragments = 0

        # length of contigs
        self.seq_len = {}

        # number of fragments from each sequence
        self.fragments_from_seq = {}
Exemplo n.º 22
0
    def __init__(self):
        """Record the highest alphabetic suffix used by each taxon.

        Reads every '.tsv' file containing 'gtdb' in its name from the
        previous taxonomy directory, groups them by the first two
        '_'-separated fields of the file name, and fills
        `self.taxon_suffix` with, for each canonical (suffix-free) taxon
        name, the suffix ranked highest by `self._suffix_value`.
        """

        self.logger = logging.getLogger('timestamp')
        self.prev_taxonomy_dir = '/srv/projects/gtdb/data/taxonomy_gtdb'

        # get all previous taxonomy files
        self.logger.info('Reading previous GTDB taxonomy files in {}:'.format(
            self.prev_taxonomy_dir))
        taxonomies = defaultdict(dict)
        for fname in os.listdir(self.prev_taxonomy_dir):
            if not (fname.endswith('.tsv') and 'gtdb' in fname):
                continue

            self.logger.info('  %s' % fname)
            taxonomy_file = os.path.join(self.prev_taxonomy_dir, fname)

            # files sharing their first two name fields are merged
            release_id = '_'.join(fname.split('_')[0:2])
            taxonomies[release_id].update(Taxonomy().read(taxonomy_file))

        self.logger.info(
            'Considering taxonomy from {:,} previous releases.'.format(
                len(taxonomies)))

        # get highest alphabetic suffix for each taxon
        self.logger.info(
            'Determining highest polyphyletic alphabetic suffix for each taxon.'
        )

        self.taxon_suffix = {}
        for release_taxonomy in taxonomies.values():
            for taxa in release_taxonomy.values():
                for taxon in taxa:
                    rank_prefix = taxon[0:3]
                    taxon_name = taxon[3:]

                    if '_' not in taxon_name:
                        continue

                    if rank_prefix != 's__':
                        taxon_name, suffix = taxon_name.rsplit('_', 1)
                    else:
                        # check if the specific name has a suffix
                        generic_name, specific_name = taxon_name.split()
                        if '_' not in specific_name:
                            continue

                        canonical_specific_name, suffix = specific_name.rsplit(
                            '_', 1)
                        taxon_name = '{} {}'.format(generic_name,
                                                    canonical_specific_name)

                    canonical_taxon = '{}{}'.format(rank_prefix, taxon_name)
                    cur_suffix = self.taxon_suffix.get(canonical_taxon, 'A')

                    new_value = self._suffix_value(suffix)
                    if new_value >= self._suffix_value(cur_suffix):
                        self.taxon_suffix[canonical_taxon] = suffix
Exemplo n.º 23
0
    def root(self, options):
        """Root tree using outgroup.

        The outgroup consists of every genome in the GTDB taxonomy
        (`Config.TAXONOMY_FILE`) whose taxa include
        `options.outgroup_taxon`. The tree at `options.input_tree` is
        rerooted with `RerootTree.root_with_outgroup` and written to
        `options.output_tree`.
        """
        self.logger.warning("Tree rooting is still under development!")

        check_file_exists(options.input_tree)

        gtdb_taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)

        self.logger.info('Identifying genomes from the specified outgroup.')
        # items() works under both Python 2 and 3
        outgroup = {
            genome_id
            for genome_id, taxa in gtdb_taxonomy.items()
            if options.outgroup_taxon in taxa
        }

        reroot = RerootTree()
        reroot.root_with_outgroup(options.input_tree, options.output_tree,
                                  outgroup)

        self.logger.info('Done.')
Exemplo n.º 24
0
    def taxon_stats(self, options):
        """Write a table of the number of descendant taxa under each taxon.

        The taxonomy at `options.taxonomy_file` is read and, for every
        taxon, the number of distinct children at each successive level
        below it is written to `options.output_file`. Columns for ranks
        above the taxon's own rank are filled with '-'.
        """

        check_file_exists(options.taxonomy_file)

        taxonomy = Taxonomy().read(options.taxonomy_file)
        taxon_children = Taxonomy().taxon_children(taxonomy)

        # context manager closes the table even if a lookup below fails
        with open(options.output_file, 'w') as fout:
            fout.write('Taxa')
            for rank in Taxonomy.rank_labels[1:]:
                fout.write('\t# named %s' % rank)
            fout.write('\t# extant taxon with complete taxonomy')
            fout.write('\n')

            for rank_prefix in Taxonomy.rank_prefixes:
                rank_index = Taxonomy.rank_index[rank_prefix]

                # find taxon at the specified rank
                cur_taxa = sorted(taxon for taxon in taxon_children
                                  if taxon.startswith(rank_prefix))

                for taxon in cur_taxa:
                    fout.write(taxon)
                    fout.write('\t-' * rank_index)

                    # descend one level per column, counting distinct
                    # children of all taxa at the previous level
                    next_taxa = [taxon]
                    for _ in range(rank_index,
                                   Taxonomy.rank_index['s__'] + 1):
                        children_taxa = set()
                        for t in next_taxa:
                            children_taxa.update(taxon_children[t])

                        fout.write('\t%d' % len(children_taxa))
                        next_taxa = children_taxa
                    fout.write('\n')

        self.logger.info('Summary statistics written to: %s' %
                         options.output_file)
Exemplo n.º 25
0
    def _write_summary_table(self, fmeasure_for_taxa, taxonomy, summary_table):
        """Write table containing statistics for each taxonomic rank.

        A taxon counts as monophyletic when its F-measure is exactly 1.0,
        operationally monophyletic when it is at least 0.95, and
        polyphyletic otherwise. Exits via sys.exit() if any taxon has more
        than one entry in fmeasure_for_taxa.

        Parameters
        ----------
        fmeasure_for_taxa : d[taxon] -> [(Node, F-measure, precision, recall)]
          Node with highest F-measure for each taxon.
        taxonomy : d[unique_id] -> [d__<taxon>; ...; s__<taxon>]
          Taxonomic information for taxa in tree of interest (not used by
          this method).
        summary_table : str
          Output table to write statistics for assigned labels.
        """

        # get number of monophyletic, operationally monophyletic, and polyphyletic
        # taxa at each taxonomic rank
        taxon_count = defaultdict(int)
        mono = defaultdict(int)
        op_mono = defaultdict(int)
        poly = defaultdict(int)
        for taxon in Taxonomy().sort_taxa(fmeasure_for_taxa.keys()):
            if len(fmeasure_for_taxa[taxon]) != 1:
                self.logger.error(
                    'Multiple positions specified for taxon label.')
                sys.exit()

            rank_prefix = taxon[0:3]
            taxon_count[rank_prefix] += 1

            stat_table = fmeasure_for_taxa[taxon][0]
            if stat_table.fmeasure == 1.0:
                mono[rank_prefix] += 1
            elif stat_table.fmeasure >= 0.95:
                op_mono[rank_prefix] += 1
            else:
                poly[rank_prefix] += 1

        with open(summary_table, 'w') as fout:
            fout.write('Rank\tNo. taxon')
            fout.write(
                '\tNo. monophyletic\tNo. operationally monophyletic\tNo. polyphyletic'
            )
            fout.write(
                '\tMonophyletic (%)\tOperationally monophyletic (%)\tPolyphyletic (%)\n'
            )
            for idx, rank_prefix in enumerate(Taxonomy.rank_prefixes):
                total = taxon_count[rank_prefix]

                def percent(count, total=total):
                    # a rank without any labelled taxon has total == 0;
                    # report 0% instead of raising ZeroDivisionError
                    return count * 100.0 / total if total else 0.0

                fout.write('{}\t{}'.format(Taxonomy.rank_labels[idx], total))
                fout.write('\t{}\t{}\t{}'.format(mono[rank_prefix],
                                                 op_mono[rank_prefix],
                                                 poly[rank_prefix]))
                fout.write('\t{:.3f}\t{:.3f}\t{:.3f}\n'.format(
                    percent(mono[rank_prefix]),
                    percent(op_mono[rank_prefix]),
                    percent(poly[rank_prefix])))
Exemplo n.º 26
0
    def __init__(self, cpus, output_dir):
        """Store run settings and prepare empty per-genome profile storage.

        Parameters
        ----------
        cpus : int
            Number of cpus to use.
        output_dir : str
            Directory to store results.
        """
        # settings supplied by the caller
        self.output_dir = output_dir
        self.cpus = cpus

        self.logger = logging.getLogger()

        # rank prefixes and labels as exposed by Taxonomy instances
        self.rank_prefixes, self.rank_labels = (Taxonomy().rank_prefixes,
                                                Taxonomy().rank_labels)

        # genome profiles; nothing has been profiled yet
        self.profiles = dict()
Exemplo n.º 27
0
    def fill_ranks(self, options):
        """Ensure taxonomy strings contain all 7 canonical ranks.

        Each taxon list read from options.input_taxonomy is passed through
        Taxonomy.fill_missing_ranks and written to options.output_taxonomy
        as '<genome id>\\t<taxa joined by ;>'. The process exits with
        status -1 as soon as Taxonomy.check_full rejects a filled string.

        Parameters
        ----------
        options : object
            Must provide the attributes input_taxonomy and output_taxonomy.
        """

        check_file_exists(options.input_taxonomy)

        taxonomy = Taxonomy()
        # read before opening the output so a failed read does not leave a
        # truncated output file behind
        t = taxonomy.read(options.input_taxonomy)

        # 'with' closes the output even when sys.exit() fires mid-loop
        with open(options.output_taxonomy, 'w') as fout:
            # items() instead of iteritems() so this also runs on Python 3
            for genome_id, taxon_list in t.items():
                full_taxon_list = taxonomy.fill_missing_ranks(taxon_list)

                taxonomy_str = ';'.join(full_taxon_list)
                if not taxonomy.check_full(taxonomy_str):
                    sys.exit(-1)

                fout.write('%s\t%s\n' % (genome_id, taxonomy_str))

        self.logger.info('Revised taxonomy written to: %s' % options.output_taxonomy)
Exemplo n.º 28
0
    def fill_ranks(self, options):
        """Ensure taxonomy strings contain all 7 canonical ranks.

        Each taxon list read from options.input_taxonomy is passed through
        Taxonomy.fill_missing_ranks and written to options.output_taxonomy
        as '<genome id>\\t<taxa joined by ;>'. The process exits with
        status -1 as soon as Taxonomy.check_full rejects a filled string.

        Parameters
        ----------
        options : object
            Must provide the attributes input_taxonomy and output_taxonomy.
        """

        check_file_exists(options.input_taxonomy)

        taxonomy = Taxonomy()
        # read before opening the output so a failed read does not leave a
        # truncated output file behind
        t = taxonomy.read(options.input_taxonomy)

        # 'with' closes the output even when sys.exit() fires mid-loop
        with open(options.output_taxonomy, 'w') as fout:
            for genome_id, taxon_list in t.items():
                full_taxon_list = taxonomy.fill_missing_ranks(taxon_list)

                taxonomy_str = ';'.join(full_taxon_list)
                if not taxonomy.check_full(taxonomy_str):
                    sys.exit(-1)

                fout.write('%s\t%s\n' % (genome_id, taxonomy_str))

        self.logger.info('Revised taxonomy written to: %s' % options.output_taxonomy)
Exemplo n.º 29
0
 def _write_statistics_table(self, fmeasure_for_taxa, taxonomy, out_table):
     """Write table containing statistics for each taxon.

     Exits via sys.exit() if any taxon has more than one entry in
     fmeasure_for_taxa.

     Parameters
     ----------
     fmeasure_for_taxa : d[taxon] -> [(Node, F-measure, precision, recall)]
       Node with highest F-measure for each taxon.
     taxonomy : d[unique_id] -> [d__<taxon>; ...; s__<taxon>]
       Taxonomic information for taxa in tree of interest.
     out_table : str
       Output table to write statistics for assigned labels.
     """

     # get extant taxa
     extant_taxa = Taxonomy().extant_taxa(taxonomy)

     # the context manager closes the table even if sys.exit() or a lookup
     # error interrupts the loop
     with open(out_table, 'w') as fout_table:
         fout_table.write('Taxon\tNo. Expected in Tree\tF-measure\tPrecision\tRecall')
         fout_table.write('\tNo. Genomes from Taxon\tNo. Genome In Lineage')
         fout_table.write('\tRogue out\tRogue in\n')
         for taxon in Taxonomy().sort_taxa(fmeasure_for_taxa.keys()):
             if len(fmeasure_for_taxa[taxon]) != 1:
                 self.logger.error('Multiple positions specified for taxon label.')
                 sys.exit()

             num_genomes = len(extant_taxa[taxon])

             stat_table = fmeasure_for_taxa[taxon][0]
             fout_table.write('%s\t%d\t%.4f\t%.4f\t%.4f\t%d\t%d\t%s\t%s\n' % (
                                 taxon,
                                 num_genomes,
                                 stat_table.fmeasure,
                                 stat_table.precision,
                                 stat_table.recall,
                                 stat_table.taxa_in_lineage,
                                 stat_table.num_leaves_with_taxa,
                                 ','.join(stat_table.rogue_out),
                                 ','.join(stat_table.rogue_in)))
Exemplo n.º 30
0
    def diff(self, options):
        """Compare two taxonomy files.

        Prints, for the rank named by options.rank, every identifier whose
        taxon differs between the two files. Identifiers are reported in
        sorted order so that repeated runs give identical output.

        Parameters
        ----------
        options : object
            Must provide input_taxonomy1, input_taxonomy2, rank (a value
            from Taxonomy.rank_labels), report_missing_taxa and
            report_missing_ranks.
        """

        check_file_exists(options.input_taxonomy1)
        check_file_exists(options.input_taxonomy2)

        taxonomy1 = Taxonomy().read(options.input_taxonomy1)
        taxonomy2 = Taxonomy().read(options.input_taxonomy2)

        all_taxon_ids = set(taxonomy1).union(taxonomy2)

        rank_index = Taxonomy.rank_labels.index(options.rank)
        # set iteration order is arbitrary; sort for reproducible reports
        for taxon_id in sorted(all_taxon_ids):
            if options.report_missing_taxa:
                if taxon_id not in taxonomy1:
                    print('Missing in taxonomy 1: %s' % taxon_id)
                elif taxon_id not in taxonomy2:
                    print('Missing in taxonomy 2: %s' % taxon_id)

            if taxon_id not in taxonomy1 or taxon_id not in taxonomy2:
                continue

            taxon1 = taxonomy1[taxon_id][rank_index]
            taxon2 = taxonomy2[taxon_id][rank_index]
            if taxon1 == taxon2:
                continue

            # taxon[3:] is empty when only the rank prefix is present, i.e.
            # the rank is unnamed; such cases are reported only on request
            if options.report_missing_ranks or (taxon1[3:] and taxon2[3:]):
                print('Different taxon for %s: %s %s' %
                      (taxon_id, taxon1, taxon2))

        print('Done.')
Exemplo n.º 31
0
 def _tax_diff_table(self, tax1, tax2, output_table):
     """Tabulate incongruency of taxonomy strings at each rank.

     For every named lineage in tax1 above the species rank, writes one
     row giving the number of extant genomes in the lineage and, for each
     lower rank, the percentage of those genomes whose taxon differs
     between tax1 and tax2. Ranks at or above the lineage's own rank are
     written as '-'.

     Parameters
     ----------
     tax1 : d[genome_id] -> list of taxa
       Taxonomy defining the lineages.
     tax2 : d[genome_id] -> list of taxa
       Taxonomy compared against tax1; indexed with every genome id taken
       from tax1's lineages.
     output_table : str
       Path of the table to write.
     """

     taxonomy = Taxonomy()

     # the context manager closes the table even if a lookup below raises
     with open(output_table, 'w') as fout:
         fout.write('Lineage\tNo. Extent Taxa')
         for rank_label in Taxonomy.rank_labels:
             fout.write('\t%s (%%)' % rank_label.title())
         fout.write('\n')

         named_lineages_at_rank = taxonomy.named_lineages_at_rank(tax1)
         for rank, taxa in named_lineages_at_rank.items():
             rank_label = Taxonomy.rank_labels[rank]
             if rank_label == 'species':
                 continue

             extant_taxa_for_rank = taxonomy.extant_taxa_for_rank(rank_label, tax1)

             for taxon in taxa:
                 extent_taxa = extant_taxa_for_rank[taxon]
                 fout.write('%s\t%d' % (taxon, len(extent_taxa)))

                 # row[rank index] -> one boolean per genome: do the two
                 # taxonomies agree at that rank?
                 row = defaultdict(list)
                 for genome_id in extent_taxa:
                     lineage1 = tax1[genome_id]
                     lineage2 = tax2[genome_id]

                     for cur_rank, (taxon1, taxon2) in enumerate(zip(lineage1, lineage2)):
                         row[cur_rank].append(taxon1 == taxon2)

                 # sort so columns always follow rank order
                 for cur_rank, matches in sorted(row.items()):
                     if cur_rank <= rank:
                         fout.write('\t-')
                     else:
                         perc_match = sum(matches) * 100.0 / len(matches)
                         fout.write('\t%.1f' % (100.0 - perc_match))
                 fout.write('\n')
Exemplo n.º 32
0
 def _tax_diff_table(self, tax1, tax2, output_table):
     """Tabulate incongruency of taxonomy strings at each rank.

     For every named lineage in tax1 above the species rank, writes one
     row giving the number of extant genomes in the lineage and, for each
     lower rank, the percentage of those genomes whose taxon differs
     between tax1 and tax2. Ranks at or above the lineage's own rank are
     written as '-'.

     Parameters
     ----------
     tax1 : d[genome_id] -> list of taxa
       Taxonomy defining the lineages.
     tax2 : d[genome_id] -> list of taxa
       Taxonomy compared against tax1; indexed with every genome id taken
       from tax1's lineages.
     output_table : str
       Path of the table to write.
     """

     taxonomy = Taxonomy()

     # the context manager closes the table even if a lookup below raises
     with open(output_table, 'w') as fout:
         fout.write('Lineage\tNo. Extent Taxa')
         for rank_label in Taxonomy.rank_labels:
             fout.write('\t%s (%%)' % rank_label.title())
         fout.write('\n')

         named_lineages_at_rank = taxonomy.named_lineages_at_rank(tax1)
         # items() instead of iteritems() so this also runs on Python 3
         for rank, taxa in named_lineages_at_rank.items():
             rank_label = Taxonomy.rank_labels[rank]
             if rank_label == 'species':
                 continue

             extant_taxa_for_rank = taxonomy.extant_taxa_for_rank(rank_label, tax1)

             for taxon in taxa:
                 extent_taxa = extant_taxa_for_rank[taxon]
                 fout.write('%s\t%d' % (taxon, len(extent_taxa)))

                 # row[rank index] -> one boolean per genome: do the two
                 # taxonomies agree at that rank?
                 row = defaultdict(list)
                 for genome_id in extent_taxa:
                     lineage1 = tax1[genome_id]
                     lineage2 = tax2[genome_id]

                     for cur_rank, (taxon1, taxon2) in enumerate(zip(lineage1, lineage2)):
                         row[cur_rank].append(taxon1 == taxon2)

                 # sort so columns follow rank order regardless of dict order
                 for cur_rank, matches in sorted(row.items()):
                     if cur_rank <= rank:
                         fout.write('\t-')
                     else:
                         perc_match = sum(matches) * 100.0 / len(matches)
                         fout.write('\t%.1f' % (100.0 - perc_match))
                 fout.write('\n')
Exemplo n.º 33
0
    def classify_seqs(self):
        """Classify sequences.

        Sequences are classified using a majority vote
        over all fragments originating from the sequence
        with a valid hit. If the winning taxon is supported by
        fewer than self.percent_to_classify of the sequence's
        fragments, the sequence is considered unclassified.
        Classification is performed from the highest (domain)
        to lowest (species) rank. If a rank is taxonomically
        inconsistent with a higher ranks classification, this
        rank and all lower ranks are set to unclassified.

        Returns
        -------
        dict : d[contig_id][rank] -> [taxa, HitInfo]
            Classification of each sequence along with summary statistics
            of hits to the specified taxa.
        """

        expected_parent = Taxonomy().taxonomic_consistency(self.taxonomy)
        num_ranks = len(self.rank_prefixes)

        # classify each sequence using a majority vote
        seq_assignments = defaultdict(lambda: defaultdict(list))
        # items()/range() instead of iteritems()/xrange() so this also runs
        # on Python 3
        for seq_id, rank_hits in self.hits.items():
            parent_taxa = None
            for rank in range(num_ranks):
                hits_at_rank = rank_hits[rank]
                taxa = max(hits_at_rank,
                           key=lambda x: len(hits_at_rank[x]))
                count = len(hits_at_rank[taxa])

                min_count = (self.percent_to_classify *
                             self.fragments_from_seq[seq_id])
                if count >= min_count and (
                        rank == 0 or expected_parent[taxa] == parent_taxa):
                    seq_assignments[seq_id][rank] = [taxa, hits_at_rank[taxa]]
                    parent_taxa = taxa
                else:
                    # Too little support, or the winning taxon does not
                    # descend from the taxon chosen at the previous rank:
                    # set this and all lower ranks to unclassified.
                    # Previously the inconsistent case fell through and
                    # left the rank without any assignment.
                    for r in range(rank, num_ranks):
                        seq_assignments[seq_id][r] = [self.unclassified, None]
                    break

        # identify sequences with no hits
        for seq_id in self.seq_len:
            if seq_id not in seq_assignments:
                for rank in range(num_ranks):
                    seq_assignments[seq_id][rank] = [self.unclassified, None]

        return seq_assignments
Exemplo n.º 34
0
    def outgroup(self, options):
        """Reroot a tree on the genomes belonging to a given taxon.

        Every genome in options.taxonomy_file whose taxa include
        options.outgroup_taxon is placed in the outgroup, which is then
        passed to RerootTree.root_with_outgroup together with
        options.input_tree and options.output_tree.
        """

        check_file_exists(options.taxonomy_file)

        self.logger.info('Identifying genomes from the specified outgroup.')
        genome_taxa = Taxonomy().read(options.taxonomy_file)
        outgroup = {
            gid
            for gid, lineage in genome_taxa.items()
            if options.outgroup_taxon in lineage
        }
        self.logger.info('Identifying %d genomes in the outgroup.' % len(outgroup))

        RerootTree().root_with_outgroup(options.input_tree,
                                        options.output_tree,
                                        outgroup)
Exemplo n.º 35
0
    def validate(self, options):
        """Validate a taxonomy file and log a count for each error class.

        Reads options.taxonomy_file, runs Taxonomy.validate with the checks
        enabled unless the matching options.no_* flag is set, and logs
        either a single all-clear message or one count per error category.
        """

        check_file_exists(options.taxonomy_file)

        taxonomy = Taxonomy()
        errors = taxonomy.validate(taxonomy.read(options.taxonomy_file),
                                   not options.no_prefix,
                                   not options.no_all_ranks,
                                   not options.no_hierarhcy,
                                   not options.no_species,
                                   True)

        (invalid_ranks, invalid_prefixes,
         invalid_species_name, invalid_hierarchies) = errors

        # nothing to report when every error collection is empty
        if not any(len(e) for e in errors):
            self.logger.info('No errors identified in taxonomy file.')
            return

        # one log line per error category, in a fixed order
        report = (
            ('Identified %d incomplete taxonomy strings.', invalid_ranks),
            ('Identified %d rank prefix errors.', invalid_prefixes),
            ('Identified %d invalid species names.', invalid_species_name),
            ('Identified %d taxa with multiple parents.', invalid_hierarchies),
        )
        for message, offenders in report:
            self.logger.info(message % len(offenders))
Exemplo n.º 36
0
    def create_records(self, metadata_file, msa_file, taxonomy_file,
                       genome_list, output_file):
        """Create ARB records from GTDB metadata.

        The first row of the metadata file supplies the field names
        (lower-cased, with spaces and hyphens replaced by underscores);
        every later row yields one record written through self._record.

        Parameters
        ----------
        metadata_file : str
            Delimited metadata table; tab-separated when the name ends in
            '.tsv', comma-separated otherwise. The first column holds the
            genome identifier.
        msa_file : str or None
            Optional file of aligned sequences read with seq_io.read.
        taxonomy_file : str or None
            Optional taxonomy file; adds a 'gtdb_taxonomy' field.
        genome_list : str or None
            Optional file with one genome identifier per line; when given,
            only these genomes are written.
        output_file : str
            Destination for the ARB records.
        """

        seqs = {}
        if msa_file:
            seqs = seq_io.read(msa_file)

        taxonomy = {}
        if taxonomy_file:
            taxonomy = Taxonomy().read(taxonomy_file)

        genomes_to_keep = set()
        if genome_list:
            with open(genome_list) as keep_file:
                for line in keep_file:
                    genomes_to_keep.add(line.strip())

        delimiter = ','
        if metadata_file.endswith('.tsv'):
            delimiter = '\t'

        # NOTE(review): 'rb' is the mode the Python 2 csv module expects; a
        # Python 3 port would need text mode with newline='' - confirm the
        # target interpreter before changing it.
        with open(output_file, 'w') as fout, \
                open(metadata_file, 'rb') as metadata:
            header = True
            for row in csv.reader(metadata, delimiter=delimiter):
                if header:
                    fields = [
                        f.lower().replace(' ', '_').replace('-', '_')
                        for f in row[1:]
                    ]
                    if taxonomy:
                        fields.append('gtdb_taxonomy')
                    header = False
                    continue

                genome_id = row[0]
                # filter first so genomes that are not written never
                # trigger a taxonomy lookup
                if genomes_to_keep and genome_id not in genomes_to_keep:
                    continue

                values = row[1:]
                if taxonomy:
                    values.append('; '.join(taxonomy[genome_id]))
                aligned_seq = seqs.get(genome_id, '')

                self._record(fout, genome_id, fields, values, aligned_seq)
Exemplo n.º 37
0
    def dump(self, genomic_file, gtdb_taxonomy, min_5S_len, min_16S_ar_len,
             min_16S_bac_len, min_23S_len, min_contig_len, include_user,
             genome_list, output_dir):
        """Dump 5S, 16S, and 23S sequences to files.

        Exits with status -1 when include_user is set, as User genomes are
        not supported. When genome_list is given, its first tab-separated
        column is read as genome identifiers ('GCA_' identifiers gain a
        'GB_' prefix and 'GCF_' identifiers an 'RS_' prefix) and passed to
        self._dump_seqs as the genomes of interest.

        Parameters
        ----------
        gtdb_taxonomy : str
            Path to a taxonomy file read with Taxonomy().read.
        genome_list : str or None
            Optional file listing the genomes of interest.

        The remaining arguments are forwarded unchanged to self._dump_seqs.
        """

        if include_user:
            self.logger.warning('User genomes not currently supported.')
            sys.exit(-1)

        gtdb_taxonomy = Taxonomy().read(gtdb_taxonomy)

        genomes_of_interest = set()
        if genome_list:
            # context manager so the list file is closed once parsed
            with open(genome_list) as list_file:
                for line in list_file:
                    line_split = line.strip().split('\t')
                    gid = line_split[0]
                    if gid.startswith('GCA_'):
                        gid = 'GB_' + gid
                    elif gid.startswith('GCF_'):
                        gid = 'RS_' + gid
                    genomes_of_interest.add(gid)

            self.logger.info('Restricting gene dump to %d genomes.' %
                             len(genomes_of_interest))

        self.logger.info('Dumping 5S sequences.')
        self._dump_seqs(genomic_file, gtdb_taxonomy, genomes_of_interest,
                        'lsu_5S/lsu_5S', min_5S_len, min_5S_len,
                        min_contig_len, 'lsu_5s', output_dir)

        self.logger.info('Dumping 16S sequences.')
        self._dump_seqs(genomic_file, gtdb_taxonomy, genomes_of_interest,
                        'rna_silva/ssu', min_16S_ar_len, min_16S_bac_len,
                        min_contig_len, 'ssu', output_dir)

        self.logger.info('Dumping 23S sequences.')
        self._dump_seqs(genomic_file, gtdb_taxonomy, genomes_of_interest,
                        'rna_silva/lsu_23S', min_23S_len, min_23S_len,
                        min_contig_len, 'lsu_23s', output_dir)
Exemplo n.º 38
0
    def propagate(self, options):
        """Propagate labels to all genomes in a cluster.

        For each representative genome in options.input_taxonomy, a
        cluster taxonomy is derived and assigned to the representative and
        to every genome listed in its cluster. The cluster taxonomy starts
        as the representative's taxonomy; ranks the representative leaves
        unnamed may be filled from clustered genomes. If any clustered
        genome disagrees with a rank the representative does name, the
        representative's taxonomy is used unchanged. Genomes without
        cluster information keep their own taxonomy unless a
        representative has already assigned one.

        Parameters
        ----------
        options : object
            Must provide input_taxonomy, metadata_file and output_taxonomy.
        """

        check_file_exists(options.input_taxonomy)
        check_file_exists(options.metadata_file)

        # get representative genome information
        rep_metadata = read_gtdb_metadata(options.metadata_file, ['gtdb_representative',
                                                                  'gtdb_clustered_genomes'])

        taxonomy = Taxonomy()
        explict_tax = taxonomy.read(options.input_taxonomy)
        expanded_taxonomy = {}
        incongruent_count = 0
        # items()/range() instead of iteritems()/xrange() so this also runs
        # on Python 3
        for genome_id, taxon_list in explict_tax.items():
            taxonomy_str = ';'.join(taxon_list)

            # Propagate taxonomy strings if genome is a representatives. Also, determine
            # if genomes clustered together have compatible taxonomies. Note that a genome
            # may not have metadata as it is possible a User has removed a genome that is
            # in the provided taxonomy file.
            _rep_genome, clustered_genomes = rep_metadata.get(genome_id, (None, None))
            if clustered_genomes:  # genome is a representative
                clustered_genome_ids = clustered_genomes.split(';')

                # get taxonomy of all genomes in cluster with a specified taxonomy
                clustered_genome_tax = {}
                for cluster_genome_id in clustered_genome_ids:
                    if cluster_genome_id == genome_id:
                        continue

                    if cluster_genome_id not in rep_metadata:
                        continue  # genome is no longer in the GTDB so ignore it

                    if cluster_genome_id in explict_tax:
                        clustered_genome_tax[cluster_genome_id] = explict_tax[cluster_genome_id]

                # determine if representative and clustered genome taxonomy strings are congruent
                working_cluster_taxonomy = list(taxon_list)
                for cluster_genome_id, cluster_tax in clustered_genome_tax.items():
                    incongruent_with_rep = False
                    for r in range(len(Taxonomy.rank_prefixes)):
                        if cluster_tax[r] == Taxonomy.rank_prefixes[r]:
                            break  # no more taxonomy information to consider

                        if cluster_tax[r] != taxon_list[r]:
                            if taxon_list[r] == Taxonomy.rank_prefixes[r]:
                                # clustered genome has a more specific taxonomy string which
                                # should be propagate to the representative if all clustered
                                # genomes are in agreement
                                if working_cluster_taxonomy[r] == Taxonomy.rank_prefixes[r]:
                                    # make taxonomy more specific based on genomes in cluster
                                    working_cluster_taxonomy[r] = cluster_tax[r]
                                elif working_cluster_taxonomy[r] != cluster_tax[r]:
                                    # not all genomes agree on the assignment of this rank so leave it unspecified
                                    working_cluster_taxonomy[r] = Taxonomy.rank_prefixes[r]
                                    break
                            else:
                                # genomes in cluster have incongruent taxonomies so defer to representative
                                self.logger.warning("Genomes in cluster have incongruent taxonomies.")
                                self.logger.warning("Representative %s: %s" % (genome_id, taxonomy_str))
                                self.logger.warning("Clustered genome %s: %s" % (cluster_genome_id, ';'.join(cluster_tax)))
                                self.logger.warning("Deferring to taxonomy specified for representative.")

                                incongruent_count += 1
                                incongruent_with_rep = True
                                break

                    if incongruent_with_rep:
                        # Default to the representative's taxonomy right
                        # away. Resetting only at the start of the next
                        # iteration (as before) missed the case where the
                        # incongruent genome was the last one examined,
                        # leaving ranks propagated from other genomes in
                        # place.
                        working_cluster_taxonomy = list(taxon_list)
                        break

                cluster_taxonomy_str = ';'.join(working_cluster_taxonomy)

                # assign taxonomy to representative and all genomes in the cluster
                expanded_taxonomy[genome_id] = cluster_taxonomy_str
                for cluster_genome_id in clustered_genome_ids:
                    expanded_taxonomy[cluster_genome_id] = cluster_taxonomy_str
            elif genome_id not in expanded_taxonomy:
                # genome is a singleton; genomes already assigned a taxonomy
                # based on their representative are left untouched
                expanded_taxonomy[genome_id] = taxonomy_str

        self.logger.info('Identified %d clusters with incongruent taxonomies.' % incongruent_count)

        with open(options.output_taxonomy, 'w') as fout:
            for genome_id, taxonomy_str in expanded_taxonomy.items():
                fout.write('%s\t%s\n' % (genome_id, taxonomy_str))

        self.logger.info('Taxonomy written to: %s' % options.output_taxonomy)
Exemplo n.º 39
0
   def tax_diff(self, tax1_file, tax2_file, include_user_taxa, output_dir):
       """Tabulate differences between two taxonomies.

       Writes two tables to output_dir, both named after the first
       taxonomy file: '<name>.tax_diff.tsv' lists every changed rank for
       each genome present in both taxonomies, and
       '<name>.tax_diff_summary.tsv' gives per-rank counts and
       percentages. The per-rank counts are also printed to stdout.

       Parameters
       ----------
       tax1_file : str
           First taxonomy file.
       tax2_file : str
           Second taxonomy file.
       include_user_taxa : boolean
           Flag indicating if User genomes should be considered.
       output_dir : str
           Output directory.
       """

       tax1 = Taxonomy().read(tax1_file)
       tax2 = Taxonomy().read(tax2_file)

       if not include_user_taxa:
           # drop genomes whose identifier starts with 'U_'
           # (items() instead of iteritems() so this also runs on Python 3)
           tax1 = {genome_id: taxonomy
                   for genome_id, taxonomy in tax1.items()
                   if not genome_id.startswith('U_')}
           tax2 = {genome_id: taxonomy
                   for genome_id, taxonomy in tax2.items()
                   if not genome_id.startswith('U_')}

       common_taxa = set(tax1.keys()).intersection(tax2.keys())

       self.logger.info('First taxonomy contains %d taxa.' % len(tax1))
       self.logger.info('Second taxonomy contains %d taxa.' % len(tax2))
       self.logger.info('Taxonomies have %d taxa in common.' % len(common_taxa))

       # identify differences between taxonomies
       tax_file_name1 = os.path.splitext(os.path.basename(tax1_file))[0]
       tax_file_name2 = os.path.splitext(os.path.basename(tax2_file))[0]
       output_table = os.path.join(output_dir, '%s.tax_diff.tsv' % tax_file_name1)

       unchanged = defaultdict(int)           # T2 = g__Bob -> T1 = g__Bob, or T2 = g__ -> T1 = g__
       active_change = defaultdict(int)       # T2 = g__Bob -> T1 = g__Jane, or T2 = g__Bob -> T1 = g__Bob_A
       passive_change = defaultdict(int)      # T2 = g__??? -> T1 = g__Jane
       unresolved_change = defaultdict(int)   # T2 = g__Box -> T1 = g__???

       with open(output_table, 'w') as fout:
           fout.write('Genome ID\tChange\tRank\t%s\t%s\n' % (tax_file_name1, tax_file_name2))

           for taxa in common_taxa:
               t1 = tax1[taxa]
               t2 = tax2[taxa]

               for rank, (taxon1, taxon2) in enumerate(zip(t1, t2)):
                   if taxon1 == taxon2:
                       unchanged[rank] += 1
                   elif taxon1 != Taxonomy.rank_prefixes[rank] and taxon2 != Taxonomy.rank_prefixes[rank]:
                       active_change[rank] += 1
                       fout.write('%s\t%s\t%s\t%s\t%s\n' % (taxa, 'active', Taxonomy.rank_labels[rank], ';'.join(t1), ';'.join(t2)))
                   elif taxon2 == Taxonomy.rank_prefixes[rank]:
                       passive_change[rank] += 1
                       fout.write('%s\t%s\t%s\t%s\t%s\n' % (taxa, 'passive', Taxonomy.rank_labels[rank], ';'.join(t1), ';'.join(t2)))
                   elif taxon1 == Taxonomy.rank_prefixes[rank]:
                       unresolved_change[rank] += 1
                       fout.write('%s\t%s\t%s\t%s\t%s\n' % (taxa, 'unresolved', Taxonomy.rank_labels[rank], ';'.join(t1), ';'.join(t2)))

       # report results
       output_table = os.path.join(output_dir, '%s.tax_diff_summary.tsv' % tax_file_name1)

       # the summary file was previously never closed; 'with' guarantees
       # it is flushed and closed
       with open(output_table, 'w') as fout:
           fout.write('Rank\tUnchanged\tUnchanged (%)\tActive\t Active (%)\tPassive\tPassive (%)\tUnresolved\tUnresolved (%)\n')
           # single-argument print(...) gives the same output on Python 2
           # and Python 3
           print('Rank\tUnchanged\tActive\tPassive\tUnresolved\tTotal')
           for rank in range(len(Taxonomy.rank_prefixes)):
               total = unchanged[rank] + active_change[rank] + passive_change[rank] + unresolved_change[rank]
               if total != 0:
                   fout.write('%s\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\n' %
                                       (Taxonomy.rank_labels[rank],
                                       unchanged[rank], unchanged[rank] * 100.0 / total,
                                       active_change[rank], active_change[rank] * 100.0 / total,
                                       passive_change[rank], passive_change[rank] * 100.0 / total,
                                       unresolved_change[rank], unresolved_change[rank] * 100.0 / total))
                   print('%s\t%d\t%d\t%d\t%d\t%d' % (Taxonomy.rank_labels[rank],
                                                       unchanged[rank],
                                                       active_change[rank],
                                                       passive_change[rank],
                                                       unresolved_change[rank],
                                                       total))
Exemplo n.º 40
0
    def run(self,
            input_tree,
            taxonomy_file,
            trusted_taxa_file,
            min_children,
            min_support,
            output_tree):
        """Label the internal nodes of a tree with taxa.

        Besides the decorated tree, a statistics table
        (output_tree + '-table') and the taxonomy of the extant taxa
        (output_tree + '-taxonomy') are written; the latter is read back
        and validated.

        Parameters
        ----------
        input_tree : str
          Tree to decorate
        taxonomy_file : str
          File indicating taxonomic information for extant taxa.
        trusted_taxa_file : str
          File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
        min_children : int
          Only consider taxa with at least the specified number of children taxa when inferring distribution.
        min_support : float
          Only consider taxa with at least this level of support when inferring distribution.
        output_tree: str
          Name of output tree.
        """

        log = self.logger

        # load the tree as a rooted tree, keeping underscores in labels
        log.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        # start from a tree without taxon labels on internal nodes
        log.info('Removing any previous internal node labels.')
        self._strip_taxon_labels(tree)

        # keep only the taxonomy of leaves in the tree; leaves missing
        # from the taxonomy file fall back to Taxonomy.rank_prefixes
        log.info('Reading taxonomy.')
        full_taxonomy = Taxonomy().read(taxonomy_file)
        taxonomy = {
            leaf.taxon.label: full_taxonomy.get(leaf.taxon.label,
                                                Taxonomy.rank_prefixes)
            for leaf in tree.leaf_node_iter()
        }

        # F-measure statistic for each taxon
        log.info('Calculating F-measure statistic for each taxa.')
        fmeasure_for_taxa = self._fmeasure(tree, taxonomy)

        # labels with a single acceptable position are placed first; they
        # guide the placement of the remaining labels
        log.info('Placing labels with unambiguous position in tree.')
        placed_taxon = self._assign_taxon_labels(fmeasure_for_taxa)

        # median relative divergence of each rank
        log.info('Establishing median relative divergence for taxonomic ranks.')
        median_rank_rd = self._median_rank_rd(tree,
                                              placed_taxon,
                                              taxonomy,
                                              trusted_taxa_file,
                                              min_children,
                                              min_support)

        # settle the labels that had more than one candidate position
        log.info('Resolving ambiguous taxon label placements using median relative divergences.')
        self._resolve_ambiguous_placements(tree, fmeasure_for_taxa, median_rank_rd)

        # statistics table for the placed labels
        log.info('Writing out statistics for taxa.')
        out_table = '%s-table' % output_tree
        self._write_statistics_table(fmeasure_for_taxa, out_table)

        # taxonomy of the extant taxa as implied by the tree
        log.info('Writing out taxonomy for extant taxa.')
        out_taxonomy = '%s-taxonomy' % output_tree
        self._write_taxonomy(tree, out_taxonomy)

        # read the written taxonomy back and validate it with every check on
        log.info('Validating taxonomy for extant taxa.')
        tree_taxonomy = Taxonomy().read(out_taxonomy)
        Taxonomy().validate(tree_taxonomy,
                            check_prefixes=True,
                            check_ranks=True,
                            check_hierarchy=True,
                            check_species=True,
                            check_group_names=True,
                            check_duplicate_names=True,
                            report_errors=True)

        # finally, write the decorated tree itself
        log.info('Writing out decorated tree.')
        tree.write_to_path(output_tree,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)
Exemplo n.º 41
0
def filter_taxa_for_dist_inference(tree, taxonomy, trusted_taxa, min_children, min_support):
    """Determine taxa to use for inferring distribution of relative divergences.

    Starts from every taxon name that appears in `taxonomy` and then
    successively narrows that set by the number of children, by the support
    of the labelled node in `tree`, and finally by the trusted set.

    Parameters
    ----------
    tree : Dendropy Tree
        Phylogenetic tree.
    taxonomy : d[taxon ID] -> [d__x; p__y; ...]
        Taxonomy for each taxon.
    trusted_taxa : iterable
        Trusted taxa to consider when inferring distribution. An empty or
        None value disables this filter.
    min_children : int
        Only consider taxa with at least the specified number of children taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when inferring distribution.

    Returns
    -------
    set
        Names of the taxa that passed all requested filters.
    """

    # determine children taxa for each named group
    taxon_children = Taxonomy().taxon_children(taxonomy)

    # get all named groups
    taxa_for_dist_inference = set()
    for taxa in taxonomy.values():
        taxa_for_dist_inference.update(taxa)

    # sanity check species names as these are a common problem
    species_index = Taxonomy.rank_index['s__']
    species = set()
    for taxon_id, taxa in taxonomy.items():
        if len(taxa) <= species_index:
            # lineage does not extend to the species rank
            continue

        species_name = taxa[species_index]
        valid, error_msg = True, None
        if species_name != 's__':
            valid, error_msg = Taxonomy().validate_species_name(species_name,
                                                                require_full=True,
                                                                require_prefix=True)
        if not valid:
            # single parenthesised argument prints identically on Python 2 and 3
            print('[Warning] Species name %s for %s is invalid: %s' % (species_name, taxon_id, error_msg))
            continue

        species.add(species_name)

    # restrict taxa to those with a sufficient number of named children
    # Note: a taxonomic group with no children will not end up in the
    # taxon_children data structure so care must be taken when applying
    # this filtering criteria.
    if min_children > 0:
        valid_taxa = set(taxon
                         for taxon, children_taxa in taxon_children.items()
                         if len(children_taxa) >= min_children)
        taxa_for_dist_inference.intersection_update(valid_taxa)

        # explicitly add in the species since they have no
        # children and thus are absent from the taxon_children dictionary
        taxa_for_dist_inference.update(species)

    # restrict taxa used for inferring distribution to those with sufficient support
    if min_support > 0:
        missing_support_reported = False
        for node in tree.preorder_node_iter():
            if not node.label or node.is_leaf():
                continue

            # check for support value
            support, taxon_name, _auxiliary_info = parse_label(node.label)

            if not taxon_name:
                continue

            # compare against None/'' explicitly so that a genuine support
            # value of 0 is filtered rather than mistaken for "no support"
            if support is None or support == '':
                # no support value, so inform user (once) if they were
                # trying to filter on this property
                if not missing_support_reported:
                    print('[Error] Tree does not contain support values. As such, --min_support should be set to 0.')
                    missing_support_reported = True
                continue

            if float(support) < min_support:
                # a single node may be labelled with several taxa
                # separated by ';', so each name is removed individually
                for taxon in taxon_name.split(';'):
                    taxa_for_dist_inference.discard(taxon.strip())

    # restrict taxa used for inferring distribution to the trusted set
    if trusted_taxa:
        # calling intersection on our own set accepts any iterable,
        # not only a set, for trusted_taxa
        taxa_for_dist_inference = taxa_for_dist_inference.intersection(trusted_taxa)

    return taxa_for_dist_inference
Exemplo n.º 42
0
    def tree_tax_diff(self, tree1_file, tree2_file, output_dir):
        """Tabulate differences between two taxonomies on a tree.

        Both trees are pruned to their shared leaves, then the named
        lineages of the first tree are classified against those of the
        second as retained, corrected, more general, more specific or new.
        Names found only in the second tree are reported as deprecated.

        Two files are written to `output_dir`, both named after the
        basename of `tree1_file` with its extension removed:
        '<name>.taxa_diff.tsv' holds the per-taxon classification and
        '<name>.perc_diff.tsv' is produced by self._tax_diff_table.

        Parameters
        ----------
        tree1_file : str
            File with tree in Newick format.
        tree2_file : str
            File with tree in Newick format.
        output_dir : str
            Output directory.
        """

        tree1 = dendropy.Tree.get_from_path(tree1_file,
                                            schema='newick',
                                            rooting='force-rooted',
                                            preserve_underscores=True)

        tree2 = dendropy.Tree.get_from_path(tree2_file,
                                            schema='newick',
                                            rooting='force-rooted',
                                            preserve_underscores=True)

        # prune both trees to a set of common taxa
        taxa1 = set(leaf.taxon.label for leaf in tree1.leaf_node_iter())
        taxa2 = set(leaf.taxon.label for leaf in tree2.leaf_node_iter())

        taxa_in_common = taxa1.intersection(taxa2)
        self.logger.info('Tree 1 contains %d taxa.' % len(taxa1))
        self.logger.info('Tree 2 contains %d taxa.' % len(taxa2))
        self.logger.info('Pruning trees to the %d taxa in common.' % len(taxa_in_common))

        tree1.retain_taxa_with_labels(taxa_in_common)
        tree2.retain_taxa_with_labels(taxa_in_common)

        # get named lineages at each taxonomic rank
        taxonomy = Taxonomy()
        tax1 = taxonomy.read_from_tree(tree1)
        tax2 = taxonomy.read_from_tree(tree2)

        taxa_at_rank1 = taxonomy.named_lineages_at_rank(tax1)
        taxa_at_rank2 = taxonomy.named_lineages_at_rank(tax2)

        tax_file_name = os.path.splitext(os.path.basename(tree1_file))[0]
        output_file = os.path.join(output_dir, '%s.taxa_diff.tsv' % tax_file_name)

        # the last rank label is deliberately excluded from the comparison
        # NOTE(review): presumably this is the species rank - confirm
        ranks = list(enumerate(Taxonomy.rank_labels[0:-1]))
        row = '%s\t%s\t%s\t%s\n'

        # names from the second taxonomy already matched to a name in the
        # first, keyed by the rank at which they occur in the second
        taxon2_accounted_for = defaultdict(set)

        # the context manager guarantees the file is closed even if one of
        # the helper calls below raises
        with open(output_file, 'w') as fout:
            fout.write('Rank\tClassification\tTaxonomy 1\tTaxonomy 2\n')

            # identify retained taxonomic names
            for rank, rank_label in ranks:
                for taxon in taxa_at_rank1[rank]:
                    # check if taxon has been retained
                    if taxon in taxa_at_rank2[rank]:
                        fout.write(row % (rank_label, 'retained', taxon, taxon))
                        taxon2_accounted_for[rank].add(taxon)
                        continue

                    # check if name was simply corrected by changing suffix
                    old_taxon = self._change_suffix(taxon, rank, taxa_at_rank2)
                    if old_taxon:
                        fout.write(row % (rank_label, 'corrected', taxon, old_taxon))
                        taxon2_accounted_for[rank].add(old_taxon)
                        continue

                    # check if taxon has been moved up or down in rank
                    old_taxon, old_rank = self._renamed(taxon, rank, taxa_at_rank2)
                    if old_taxon:
                        if rank < old_rank:
                            classification = 'more general'
                        elif rank == old_rank:
                            classification = 'corrected'
                        else:
                            classification = 'more specific'

                        fout.write(row % (rank_label, classification, taxon, old_taxon))
                        taxon2_accounted_for[old_rank].add(old_taxon)
                        continue

                    # otherwise, the taxon appears to be new
                    fout.write(row % (rank_label, 'new', taxon, 'NA'))

            # report deprecated taxa
            for rank, rank_label in ranks:
                for taxon in taxa_at_rank2[rank]:
                    if taxon not in taxon2_accounted_for[rank]:
                        fout.write(row % (rank_label, 'deprecated', 'NA', taxon))

        # tabulate congruence of taxonomy strings
        output_table = os.path.join(output_dir, '%s.perc_diff.tsv' % tax_file_name)
        self._tax_diff_table(tax1, tax2, output_table)