示例#1
0
    def append(self, options):
        """Append each leaf's taxonomy string to its label and write the tree.

        Every leaf label in the input tree is rewritten as
        '<label>|<taxon>; <taxon>; ...' using the entry for that label in
        the taxonomy file. If a leaf has no entry, an error is logged and
        the process exits with status -1.

        Parameters
        ----------
        options : object
          Object with the attributes input_tree, input_taxonomy, and
          output_tree, each used as a file path.
        """

        check_file_exists(options.input_tree)
        check_file_exists(options.input_taxonomy)

        taxonomy = Taxonomy().read(options.input_taxonomy)

        tree = dendropy.Tree.get_from_path(options.input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        for n in tree.leaf_node_iter():
            leaf_label = n.taxon.label
            taxa = taxonomy.get(leaf_label)
            if taxa is None:
                # report the taxon label that was looked up; n.label is the
                # node's own label, not the key used for the taxonomy lookup
                self.logger.error('Taxonomy file does not contain an entry for %s.' % leaf_label)
                sys.exit(-1)
            n.taxon.label = leaf_label + '|' + '; '.join(taxa)

        tree.write_to_path(options.output_tree,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)

        self.logger.info('Decorated tree written to: %s' % options.output_tree)
示例#2
0
    def append(self, options):
        """Decorate leaf labels with their taxonomy and write the tree.

        Each leaf label becomes '<label>|<taxon>; <taxon>; ...', where the
        taxa come from the taxonomy file entry for that label. A leaf
        without an entry is reported as an error and the process exits
        with status -1.

        Parameters
        ----------
        options : object
          Object with the attributes input_tree, input_taxonomy, and
          output_tree, each used as a file path.
        """

        check_file_exists(options.input_tree)
        check_file_exists(options.input_taxonomy)

        taxonomy = Taxonomy().read(options.input_taxonomy)

        tree = dendropy.Tree.get_from_path(options.input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        for n in tree.leaf_node_iter():
            taxon_label = n.taxon.label
            taxa = taxonomy.get(taxon_label)
            if taxa is None:
                # name the taxon label used as the lookup key rather than
                # n.label, which is a separate attribute of the node
                self.logger.error(
                    'Taxonomy file does not contain an entry for %s.' %
                    taxon_label)
                sys.exit(-1)
            n.taxon.label = taxon_label + '|' + '; '.join(taxa)

        tree.write_to_path(options.output_tree,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)

        self.logger.info('Decorated tree written to: %s' % options.output_tree)
示例#3
0
    def clean_ftp(self,
                  new_list_genomes,
                  ftp_genome_dir_file,
                  ftp_genome_dir,
                  report_dir,
                  taxonomy_file=None):
        """Delete genomes absent from the new release and report changes.

        Genome identifiers in the new release are compared with those in
        the current FTP listing. The directory of every genome missing
        from the new release is removed, and the deleted and added genomes
        are written to 'deleted_genomes.tsv' and 'added_genomes.tsv' in
        the report directory.

        Parameters
        ----------
        new_list_genomes : str
          Comma-separated paths to tab-separated files; the first column
          of each line is taken as a genome identifier.
        ftp_genome_dir_file : str
          Tab-separated file giving a genome identifier in the first
          column and its directory in the second.
        ftp_genome_dir : str
          Not used by this method.
        report_dir : str
          Directory receiving the report files; passed to
          make_sure_path_exists before use.
        taxonomy_file : str
          Optional taxonomy file. When given, the seventh entry of each
          added genome's taxonomy is reported; otherwise 'N/A' is written.
        """
        list_of_files = new_list_genomes.split(',')
        genome_in_new_rel = []
        make_sure_path_exists(report_dir)
        for new_genome_file in list_of_files:
            with open(new_genome_file, 'r') as ngf:
                for line in ngf:
                    genome_in_new_rel.append(line.strip().split('\t')[0])

        # read taxonomy file
        taxonomy = {}
        if taxonomy_file is not None:
            taxonomy = Taxonomy().read(taxonomy_file)

        current_ftp_genomes = {}
        with open(ftp_genome_dir_file) as fgdf:
            for line in fgdf:
                infos = line.strip().split('\t')
                current_ftp_genomes[infos[0]] = infos[1]

        deleted_genomes = list(
            set(current_ftp_genomes) - set(genome_in_new_rel))
        added_genomes = list(
            set(genome_in_new_rel) - set(current_ftp_genomes))

        deleted_path = os.path.join(report_dir, 'deleted_genomes.tsv')
        added_path = os.path.join(report_dir, 'added_genomes.tsv')

        # context managers guarantee both reports are flushed and closed,
        # even if removing a directory fails part way through
        with open(deleted_path, 'w') as deleted_genome_file:
            with open(added_path, 'w') as added_genome_file:
                print('{} genomes have been deleted in the release'.format(
                    len(deleted_genomes)))
                print('{} genomes have been added in the release'.format(
                    len(added_genomes)))

                for idx, deleted_genome in enumerate(deleted_genomes):
                    # carriage return keeps the progress counter on one line
                    print("{}/{} genomes deleted".format(
                        idx, len(deleted_genomes)),
                          end="\r")
                    deleted_genome_file.write('{}\n'.format(deleted_genome))
                    genome_dir = current_ftp_genomes[deleted_genome]
                    shutil.rmtree(genome_dir)
                    self.delete_empty_directory(os.path.dirname(genome_dir))

                for added_genome in added_genomes:
                    added_genome_file.write('{}\t{}\n'.format(
                        added_genome,
                        taxonomy.get(added_genome, ['N/A'] * 7)[6]))
示例#4
0
    def run(self, input_tree, taxonomy_file, trusted_taxa_file, min_children,
            min_support, skip_rd_refine, output_tree):
        """Label the internal nodes of a tree with taxon names.

        Parameters
        ----------
        input_tree : str
          Path of the tree to decorate.
        taxonomy_file : str
          Path of the file giving the taxonomy of extant taxa.
        trusted_taxa_file : str
          Path of the file listing trusted taxa used when inferring the
          distribution; None means all taxa are considered.
        min_children : int
          Minimum number of children taxa a taxon needs to be used when
          inferring the distribution.
        min_support : float
          Minimum support a taxon needs to be used when inferring the
          distribution.
        skip_rd_refine : boolean
          When True, the relative divergence refinement is skipped.
        output_tree: str
          Path of the decorated tree; also the prefix of the '-table' and
          '-taxonomy' output files.
        """

        log = self.logger

        # load the tree
        log.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(
            input_tree,
            schema='newick',
            rooting='force-rooted',
            preserve_underscores=True)

        # clear labels left on the tree by an earlier decoration
        log.info('Removing any previous internal node labels.')
        self._strip_taxon_labels(tree)

        # load the taxonomy and keep one entry per leaf of the tree;
        # leaves without an entry fall back to the bare rank prefixes
        log.info('Reading taxonomy.')
        all_taxa = Taxonomy().read(taxonomy_file)

        taxonomy = {}
        for leaf in tree.leaf_node_iter():
            leaf_id = leaf.taxon.label
            taxonomy[leaf_id] = all_taxa.get(leaf_id, Taxonomy.rank_prefixes)

        # score candidate placements of each taxon with the F-measure
        log.info('Calculating F-measure statistic for each taxa.')
        fmeasure_for_taxa = self._fmeasure(tree, taxonomy)

        # first pass: place the labels that have a single acceptable position
        log.info('Placing labels with unambiguous position in tree.')
        placed_taxon = self._assign_taxon_labels(fmeasure_for_taxa)

        if skip_rd_refine:
            # no refinement: conservatively keep the last (most terminal)
            # candidate of every taxon with several placements
            multi_placed = set()
            for name, candidates in list(fmeasure_for_taxa.items()):
                if len(candidates) == 1:
                    continue
                multi_placed.add(name)
                fmeasure_for_taxa[name] = [candidates[-1]]

            if multi_placed:
                log.warning(
                    'There are %d taxon with multiple placements of equal quality.'
                    % len(multi_placed))
                log.warning(
                    'These were resolved by placing the label at a terminal position.'
                )

            # second pass: redo the labelling from the reduced candidates
            log.info('Placing labels on tree.')
            self._strip_taxon_labels(tree)
            placed_taxon = self._assign_taxon_labels(fmeasure_for_taxa)
        else:
            # use the unambiguous placements to derive per-rank medians
            log.info(
                'Establishing median relative divergence for taxonomic ranks.')
            median_rank_rd = self._median_rank_rd(
                tree, placed_taxon, taxonomy, trusted_taxa_file,
                min_children, min_support)

            # settle the remaining placements with those medians
            log.info(
                'Resolving ambiguous taxon label placements using median relative divergences.'
            )
            self._resolve_ambiguous_placements(fmeasure_for_taxa,
                                               median_rank_rd)

        # statistics table for the placed labels
        log.info('Writing out statistics for taxa.')
        self._write_statistics_table(fmeasure_for_taxa,
                                     output_tree + '-table')

        # taxonomy of the extant taxa as decorated on the tree
        log.info('Writing out taxonomy for extant taxa.')
        out_taxonomy = output_tree + '-taxonomy'
        self._write_taxonomy(tree, out_taxonomy)

        # decorated tree
        log.info('Writing out decorated tree.')
        tree.write_to_path(
            output_tree,
            schema='newick',
            suppress_rooting=True,
            unquoted_underscores=True)

        # taxonomy validation is switched off; the code is kept for reference
        if False:
            log.info('Validating taxonomy for extant taxa.')
            written_taxonomy = Taxonomy().read(out_taxonomy)
            Taxonomy().validate(
                written_taxonomy,
                check_prefixes=True,
                check_ranks=True,
                check_hierarchy=True,
                check_species=True,
                check_group_names=True,
                check_duplicate_names=True,
                report_errors=True)
示例#5
0
    def run(self,
            input_tree,
            taxonomy_file,
            trusted_taxa_file,
            min_children,
            min_support,
            output_tree):
        """Label the internal nodes of a tree with taxon names.

        Parameters
        ----------
        input_tree : str
          Path of the tree to decorate.
        taxonomy_file : str
          Path of the file giving the taxonomy of extant taxa.
        trusted_taxa_file : str
          Path of the file listing trusted taxa used when inferring the
          distribution; None means all taxa are considered.
        min_children : int
          Minimum number of children taxa a taxon needs to be used when
          inferring the distribution.
        min_support : float
          Minimum support a taxon needs to be used when inferring the
          distribution.
        output_tree: str
          Path of the decorated tree; also the prefix of the '-table' and
          '-taxonomy' output files.
        """

        log = self.logger

        # load the tree
        log.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(
            input_tree,
            schema='newick',
            rooting='force-rooted',
            preserve_underscores=True)

        # clear labels left on the tree by an earlier decoration
        log.info('Removing any previous internal node labels.')
        self._strip_taxon_labels(tree)

        # load the taxonomy and keep one entry per leaf of the tree;
        # leaves without an entry fall back to the bare rank prefixes
        log.info('Reading taxonomy.')
        all_taxa = Taxonomy().read(taxonomy_file)

        taxonomy = {}
        for leaf in tree.leaf_node_iter():
            leaf_id = leaf.taxon.label
            taxonomy[leaf_id] = all_taxa.get(leaf_id, Taxonomy.rank_prefixes)

        # score candidate placements of each taxon with the F-measure
        log.info('Calculating F-measure statistic for each taxa.')
        fmeasure_for_taxa = self._fmeasure(tree, taxonomy)

        # place the labels that have a single acceptable position; these
        # guide the relative divergence medians computed next
        log.info('Placing labels with unambiguous position in tree.')
        placed_taxon = self._assign_taxon_labels(fmeasure_for_taxa)

        log.info('Establishing median relative divergence for taxonomic ranks.')
        median_rank_rd = self._median_rank_rd(
            tree, placed_taxon, taxonomy, trusted_taxa_file, min_children,
            min_support)

        # settle the remaining placements with those medians
        log.info('Resolving ambiguous taxon label placements using median relative divergences.')
        self._resolve_ambiguous_placements(tree, fmeasure_for_taxa,
                                           median_rank_rd)

        # statistics table for the placed labels
        log.info('Writing out statistics for taxa.')
        self._write_statistics_table(fmeasure_for_taxa,
                                     output_tree + '-table')

        # taxonomy of the extant taxa as decorated on the tree
        log.info('Writing out taxonomy for extant taxa.')
        out_taxonomy = output_tree + '-taxonomy'
        self._write_taxonomy(tree, out_taxonomy)

        # re-read the taxonomy just written and run every validation check
        log.info('Validating taxonomy for extant taxa.')
        written_taxonomy = Taxonomy().read(out_taxonomy)
        validation_checks = dict(check_prefixes=True,
                                 check_ranks=True,
                                 check_hierarchy=True,
                                 check_species=True,
                                 check_group_names=True,
                                 check_duplicate_names=True,
                                 report_errors=True)
        Taxonomy().validate(written_taxonomy, **validation_checks)

        # decorated tree
        log.info('Writing out decorated tree.')
        tree.write_to_path(
            output_tree,
            schema='newick',
            suppress_rooting=True,
            unquoted_underscores=True)
示例#6
0
    def run(self, query_proteins, db_file, custom_db_file, taxonomy_file,
            custom_taxonomy_file, evalue, per_identity, per_aln_len,
            max_matches, homology_search, min_per_taxa, consensus, min_per_bp,
            use_trimAl, restrict_taxon, msa_program, tree_program, prot_model,
            skip_rooting, output_dir):
        """Infer a gene tree for homologs genes identified by blast.

        Workflow for inferring a gene tree from sequences identified as being
        homologs to a set of query proteins. Homologs are identified using
        BLASTP or DIAMOND and a set of user-defined parameters. The process
        exits with a non-zero status if no homologs are identified.

        Parameters
        ----------
        query_proteins : str
            Fasta file containing query proteins.
        db_file : str
            BLAST database of reference proteins.
        custom_db_file : str
            Custom database of proteins.
        taxonomy_file : str
            Taxonomic assignment of each reference genomes.
        custom_taxonomy_file : str
            Taxonomic assignment of genomes in custom database.
        evalue : float
            E-value threshold used to define homolog.
        per_identity : float
            Percent identity threshold used to define a homolog.
        per_aln_len : float
            Alignment length threshold used to define a homolog.
        max_matches : int
            Maximum matches per query protein.
        homology_search : str
            Type of homology search to perform.
        min_per_taxa : float
            Minimum percentage of taxa required to retain a column.
        consensus : float
            Minimum percentage of the same amino acid required to retain column.
        min_per_bp : float
            Minimum percentage of base pairs required to keep trimmed sequence.
        use_trimAl : boolean
            Filter columns using trimAl.
        restrict_taxon : str
            Restrict alignment to specific taxonomic group (e.g., k__Archaea).
        msa_program : str
            Program to use for multiple sequence alignment ['mafft', 'muscle'].
        tree_program : str
            Program to use for tree inference ['fasttree', 'raxml'].
        prot_model : str
            Protein substitution model for tree inference ['WAG', 'LG', 'AUTO'].
        skip_rooting : boolean
            Skip midpoint rooting if True.
        output_dir : str
            Directory to store results.
        """

        # validate query sequence names for use with GeneTreeTk
        validate_seq_ids(query_proteins)

        # read taxonomy file
        self.logger.info('Reading taxonomy file.')
        taxonomy = Taxonomy().read(taxonomy_file)

        if custom_taxonomy_file:
            custom_taxonomy = Taxonomy().read(custom_taxonomy_file)
            taxonomy.update(custom_taxonomy)

        # report distribution of query genes
        mean_len, max_len, min_len, p10, p50, p90 = self._gene_distribution(
            query_proteins)
        self.logger.info(
            'Query gene lengths: min, mean, max = %d, %.1f, %d | p10, p50, p90 = %.1f, %.1f, %.1f'
            % (min_len, mean_len, max_len, p10, p50, p90))

        # identify homologs using BLASTP
        self.logger.info('Identifying homologs using %s.' % homology_search)
        blast = Blast(self.cpus)
        blast_output = os.path.join(output_dir, 'reference_hits.tsv')
        if homology_search == 'diamond':
            diamond = Diamond(self.cpus)
            diamond.blastp(query_proteins,
                           db_file,
                           evalue,
                           per_identity,
                           per_aln_len,
                           max_matches,
                           blast_output,
                           output_fmt='custom')
        else:
            blast.blastp(query_proteins,
                         db_file,
                         blast_output,
                         evalue,
                         max_matches,
                         output_fmt='custom',
                         task=homology_search)
        homologs = blast.identify_homologs(blast_output, evalue, per_identity,
                                           per_aln_len)
        self.logger.info('Identified %d homologs in reference database.' %
                         len(homologs))

        custom_homologs = None
        if custom_db_file:
            custom_blast_output = os.path.join(output_dir, 'custom_hits.tsv')
            if homology_search == 'diamond':
                diamond = Diamond(self.cpus)
                diamond.blastp(query_proteins,
                               custom_db_file,
                               evalue,
                               per_identity,
                               per_aln_len,
                               max_matches,
                               custom_blast_output,
                               output_fmt='custom')
            else:
                blast.blastp(query_proteins,
                             custom_db_file,
                             custom_blast_output,
                             evalue,
                             max_matches,
                             output_fmt='custom',
                             task=homology_search)
            custom_homologs = blast.identify_homologs(custom_blast_output,
                                                      evalue, per_identity,
                                                      per_aln_len)
            self.logger.info('Identified %d homologs in custom database.' %
                             len(custom_homologs))

        # restrict homologs to specific taxonomic group
        if restrict_taxon:
            self.logger.info('Restricting homologs to %s.' % restrict_taxon)
            restricted_homologs = {}
            # items() works on both Python 2 and 3, unlike iteritems()
            for query_id, hit in homologs.items():
                genome_id = hit.subject_id.split('~')[0]
                if restrict_taxon in taxonomy[genome_id]:
                    restricted_homologs[query_id] = hit

            self.logger.info(
                '%d of %d homologs in reference database are from the specified group.'
                % (len(restricted_homologs), len(homologs)))
            homologs = restricted_homologs

        if not homologs:
            self.logger.error(
                'Too few homologs were identified. Gene tree cannot be inferred.'
            )
            # non-zero status so calling scripts can detect the failure
            sys.exit(1)

        # extract homologs
        self.logger.info(
            'Extracting homologs and determining local gene context.')
        db_homologs_tmp = os.path.join(output_dir, 'homologs_db.tmp')
        gene_precontext, gene_postcontext = self.extract_homologs_and_context(
            homologs.keys(), db_file, db_homologs_tmp)

        # report gene length distribution of homologs
        mean_len, max_len, min_len, p10, p50, p90 = self._gene_distribution(
            db_homologs_tmp)
        self.logger.info(
            'Homolog gene lengths: min, mean, max = %d, %.1f, %d | p10, p50, p90 = %.1f, %.1f, %.1f'
            % (min_len, mean_len, max_len, p10, p50, p90))

        # concatenate homologs with initial query genes
        homolog_ouput_tmp = os.path.join(output_dir, 'homologs.faa.tmp')
        if custom_homologs:
            custom_db_homologs_tmp = os.path.join(output_dir,
                                                  'custom_homologs_db.tmp')
            custom_gene_precontext, custom_gene_postcontext = self.extract_homologs_and_context(
                custom_homologs.keys(), custom_db_file, custom_db_homologs_tmp)
            gene_precontext.update(custom_gene_precontext)
            gene_postcontext.update(custom_gene_postcontext)
            homologs.update(custom_homologs)
            concatenate_files(
                [query_proteins, db_homologs_tmp, custom_db_homologs_tmp],
                homolog_ouput_tmp)
            os.remove(custom_db_homologs_tmp)
        else:
            concatenate_files([query_proteins, db_homologs_tmp],
                              homolog_ouput_tmp)

        os.remove(db_homologs_tmp)

        # remove stop codons
        homolog_ouput = os.path.join(output_dir, 'homologs.faa')
        self._remove_stop_codons(homolog_ouput_tmp, homolog_ouput)
        os.remove(homolog_ouput_tmp)

        # infer multiple sequence alignment
        msa = MsaWorkflow(self.cpus)
        trimmed_msa_output = msa.run(homolog_ouput, min_per_taxa, consensus,
                                     min_per_bp, use_trimAl, msa_program,
                                     output_dir)

        # infer tree
        tw = TreeWorkflow(self.cpus)
        tree_output = tw.run(trimmed_msa_output, tree_program, prot_model,
                             skip_rooting, output_dir)

        # create tax2tree consensus map and decorate tree
        self.logger.info('Decorating internal tree nodes with tax2tree.')
        output_taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
        # the file must be closed (and flushed) before t2t reads it below
        with open(output_taxonomy_file, 'w') as fout:
            for homolog_id in homologs:
                genome_id = homolog_id.split('~')[0]
                t = taxonomy.get(genome_id, None)
                if t:
                    fout.write(homolog_id + '\t' + ';'.join(t) + '\n')

        t2t_tree = os.path.join(output_dir, 'homologs.tax2tree.tree')
        # NOTE(review): the command is built by string interpolation and run
        # through the shell, so paths containing spaces or shell
        # metacharacters will break it; the exit status is not checked
        cmd = 't2t decorate -m %s -t %s -o %s' % (output_taxonomy_file,
                                                  tree_output, t2t_tree)
        os.system(cmd)

        # create tree with leaf nodes given as genome accessions
        tree = dendropy.Tree.get_from_path(t2t_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        for leaf in tree.leaf_node_iter():
            leaf.taxon.label = leaf.taxon.label.split('~')[0]

        genome_tree = os.path.join(output_dir,
                                   'homologs.tax2tree.genome_accessions.tree')
        tree.write_to_path(genome_tree,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)

        # setup metadata for ARB file
        src_dir = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(src_dir, 'VERSION')) as version_file:
            genetreetk_version = version_file.read().strip()

        metadata = {}
        metadata['genetreetk_version'] = genetreetk_version
        metadata['genetreetk_query_proteins'] = query_proteins
        metadata['genetreetk_db_file'] = db_file
        metadata['genetreetk_taxonomy_file'] = taxonomy_file
        metadata['genetreetk_blast_evalue'] = str(evalue)
        metadata['genetreetk_blast_per_identity'] = str(per_identity)
        metadata['genetreetk_blast_per_aln_len'] = str(per_aln_len)
        metadata['genetreetk_blast_max_matches'] = str(max_matches)
        metadata['genetreetk_homology_search'] = homology_search

        metadata['genetreetk_msa_min_per_taxa'] = str(min_per_taxa)
        metadata['genetreetk_msa_consensus'] = str(consensus)
        metadata['genetreetk_msa_min_per_bp'] = str(min_per_bp)
        metadata['genetreetk_msa_program'] = msa_program

        metadata['genetreetk_tree_program'] = tree_program
        metadata['genetreetk_tree_prot_model'] = prot_model

        # create ARB metadata file
        self.logger.info('Creating ARB metadata file.')
        arb_metadata_file = os.path.join(output_dir, 'arb.metadata.txt')
        self.create_arb_metadata(homologs, trimmed_msa_output, taxonomy,
                                 metadata, gene_precontext, gene_postcontext,
                                 arb_metadata_file)
示例#7
0
    def run(self, genomes, align_dir, out_dir, prefix, debugopt=False):
        try:
            """Classify genomes based on position in reference tree."""

            for marker_set_id in ('bac120', 'ar122'):
                user_msa_file = os.path.join(
                    align_dir, prefix + '.%s.user_msa.fasta' % marker_set_id)
                if not os.path.exists(user_msa_file):
                    # file will not exist if there are no User genomes from a given domain
                    continue

                classify_tree = self.place_genomes(user_msa_file,
                                                   marker_set_id, out_dir,
                                                   prefix)

                # get taxonomic classification of each user genome
                tree = dendropy.Tree.get_from_path(classify_tree,
                                                   schema='newick',
                                                   rooting='force-rooted',
                                                   preserve_underscores=True)

                gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)

                fout = open(
                    os.path.join(
                        out_dir,
                        prefix + '.%s.classification.tsv' % marker_set_id),
                    'w')
                fastaniout = open(
                    os.path.join(
                        out_dir,
                        prefix + '.%s.fastani_results.tsv' % marker_set_id),
                    'w')
                redfout = open(
                    os.path.join(out_dir,
                                 prefix + '.%s.summary.tsv' % marker_set_id),
                    'w')
                if debugopt:
                    parchiinfo = open(
                        os.path.join(
                            out_dir,
                            prefix + '.%s.debug_file.tsv' % marker_set_id),
                        'w')

                reddictfile = open(
                    os.path.join(
                        out_dir,
                        prefix + '.%s.red_dictionary.tsv' % marker_set_id),
                    'w')

                marker_dict = {}
                if marker_set_id == 'bac120':
                    marker_dict = Config.RED_DIST_BAC_DICT
                elif marker_set_id == 'ar122':
                    marker_dict = Config.RED_DIST_ARC_DICT
                reddictfile.write('Phylum\t{0}\n'.format(
                    marker_dict.get('p__')))
                reddictfile.write('Class\t{0}\n'.format(
                    marker_dict.get('c__')))
                reddictfile.write('Order\t{0}\n'.format(
                    marker_dict.get('o__')))
                reddictfile.write('Family\t{0}\n'.format(
                    marker_dict.get('f__')))
                reddictfile.write('Genus\t{0}\n'.format(
                    marker_dict.get('g__')))
                reddictfile.close()

                fastaniout.write("User genome\tReference genome\tANI\n")
                redfout.write(
                    "user_genome\tclassification_method\tred_value\n")
                if debugopt:
                    parchiinfo.write(
                        "User genome\tHigher rank\tHigher value\tLower rank\tLower value\tcase\tclosest_rank\n"
                    )

                # Genomes can be classified by using Mash or RED values
                # We go through all leaves of the tree. if the leaf is a user genome we take it's parent node and look at all the leaves for this node.
                # If the parent node has only one Reference genome ( GB or RS ) we calculate the mash distance between the user genome and the reference genome
                analysed_nodes = []
                fastani_dict = {}
                all_fastani_dict = {}

                fastani_list = []
                # some genomes of Case C are handled here, if Mash distance is close enough
                self.logger.info(
                    'Calculating Average Nucleotide Identity using FastANI.')

                for nd in tree.preorder_node_iter():
                    #We store the prefixes of each leaves to check if one starts with GB_ or RS_
                    list_subnode_initials = [
                        subnd.taxon.label.replace("'", '')[0:3]
                        for subnd in nd.leaf_iter()
                    ]
                    list_subnode = [
                        subnd.taxon.label.replace("'", '')
                        for subnd in nd.leaf_iter()
                    ]
                    #if only one genome is a reference genome
                    if (list_subnode_initials.count('RS_') +
                            list_subnode_initials.count('GB_') +
                            list_subnode_initials.count('UBA')) == 1 and len(
                                list_subnode_initials
                            ) > 1 and list_subnode[0] not in analysed_nodes:
                        fastani_list.append(list_subnode)
                        analysed_nodes.extend(list_subnode)

                manager = multiprocessing.Manager()
                out_q = manager.dict()
                procs = []
                nprocs = self.cpus
                if len(fastani_list) > 0:
                    for item in splitchunks_list(fastani_list, nprocs):
                        p = multiprocessing.Process(target=self._fastaniWorker,
                                                    args=(item, genomes,
                                                          out_q))
                        procs.append(p)
                        p.start()

                    # Collect all results into a single result dict. We know how many dicts
                    # with results to expect.
                    #while out_q.empty():
                    #    time.sleep(1)

                    # Wait for all worker processes to finish
                    for p in procs:
                        p.join()
                        if p.exitcode == 1:
                            raise ValueError("Stop!!")

                    all_fastani_dict = dict(out_q)

                for k, v in all_fastani_dict.iteritems():
                    fastaniout.write("{0}\t{1}\t{2}\n".format(
                        k, v.get("ref_genome"), v.get("ani")))
                    if Config.FASTANI_SPECIES_THRESHOLD <= v.get("ani"):
                        suffixed_name = add_ncbi_prefix(v.get("ref_genome"))
                        taxa_str = ";".join(gtdb_taxonomy.get(suffixed_name))
                        if taxa_str.endswith("s__"):
                            taxa_str = taxa_str + v.get("ref_genome")
                        fout.write('%s\t%s\n' % (k, taxa_str))
                        fastani_dict[k] = v
                        redfout.write("{0}\tani\tNone\n".format(k))
                fastaniout.close()

                self.logger.info(
                    '{0} genomes have been classify with FastANI.'.format(
                        len(fastani_dict)))

                scaled_tree = self._calculate_red_distances(
                    classify_tree, out_dir)

                user_genome_ids = set(read_fasta(user_msa_file).keys())
                user_genome_ids = user_genome_ids.difference(
                    set(fastani_dict.keys()))
                # for all other cases we measure the RED distance between a leaf and a parent node ( RED = 1-edge_length). This RED value will tell us
                # the rank level that can be associated with a User genome.
                # As an example if the RED value is close to the order level, the user genome will take the order level of the Reference genome under the same parent node.
                # Is there are multiple orders under the parent node. The user genome is considered as a new order
                for leaf in scaled_tree.leaf_node_iter():
                    if leaf.taxon.label in user_genome_ids:
                        taxa = []
                        # In some cases , pplacer can associate 2 user genomes on the same parent node so we need to go up the tree to find a node with a reference genome as leaf.
                        cur_node = leaf.parent_node
                        list_subnode_initials = [
                            subnd.taxon.label.replace("'", '')[0:3]
                            for subnd in cur_node.leaf_iter()
                        ]
                        while 'RS_' not in list_subnode_initials and 'GB_' not in list_subnode_initials and 'UBA' not in list_subnode_initials:
                            cur_node = cur_node.parent_node
                            list_subnode_initials = [
                                subnd.taxon.label.replace("'", '')[0:3]
                                for subnd in cur_node.leaf_iter()
                            ]

                        current_rel_list = cur_node.rel_dist

                        parent_taxon_node = cur_node.parent_node
                        _support, parent_taxon, _aux_info = parse_label(
                            parent_taxon_node.label)

                        while parent_taxon_node is not None and not parent_taxon:
                            parent_taxon_node = parent_taxon_node.parent_node
                            _support, parent_taxon, _aux_info = parse_label(
                                parent_taxon_node.label)

                        parent_rank = parent_taxon.split(";")[-1][0:3]
                        parent_rel_dist = parent_taxon_node.rel_dist

                        genome_parent_child = [
                            leaf.taxon.label, parent_rank, parent_rel_dist, '',
                            '', '', ''
                        ]

                        child_taxons = []
                        closest_rank = None
                        detection = "RED"
                        # if the genome is placed between the genus and specie ranks , it will be associated with the genus when _get_closest_red_rank is called
                        if parent_rank != 'g__':
                            child_rk = self.order_rank[
                                self.order_rank.index(parent_rank) + 1]
                            list_subnode = [
                                childnd.taxon.label.replace("'", '')
                                for childnd in cur_node.leaf_iter()
                                if (childnd.taxon.label.startswith('RS_')
                                    or childnd.taxon.label.startswith('GB_'))
                            ]
                            list_ranks = [
                                gtdb_taxonomy.get(name)[self.order_rank.index(
                                    child_rk)] for name in list_subnode
                            ]
                            if len(set(list_ranks)) == 1:
                                for subranknd in cur_node.preorder_iter():
                                    _support, subranknd_taxon, _aux_info = parse_label(
                                        subranknd.label)
                                    if subranknd.is_internal(
                                    ) and subranknd_taxon is not None and subranknd_taxon.startswith(
                                            child_rk):
                                        child_taxons = subranknd_taxon.split(
                                            ";")
                                        child_taxon_node = subranknd
                                        child_rel_dist = child_taxon_node.rel_dist
                                        break
                            else:
                                #case 2a and 2b
                                closest_rank = parent_rank
                                detection = "Topology"
                        else:
                            #case 1a
                            closest_rank = parent_rank
                            detection = "Topology"

                        #case 1b
                        if len(child_taxons) == 0 and closest_rank is None:
                            list_leaves = [
                                childnd.taxon.label.replace("'", '')
                                for childnd in cur_node.leaf_iter()
                                if (childnd.taxon.label.startswith('RS_')
                                    or childnd.taxon.label.startswith('GB_'))
                            ]
                            if len(list_leaves) != 1:
                                self.logger.error(
                                    'There should be only one leaf.')
                                sys.exit(-1)
                            list_leaf_ranks = gtdb_taxonomy.get(
                                list_leaves[0])[self.order_rank.index(child_rk
                                                                      ):-1]
                            for leaf_taxon in reversed(list_leaf_ranks):
                                if leaf_taxon == list_leaf_ranks[0]:
                                    if abs(current_rel_list - marker_dict.get(
                                            leaf_taxon[:3])) < abs(
                                                (current_rel_list) -
                                                marker_dict.get(parent_rank)):
                                        #and current_rel_list - marker_dict.get(leaf_taxon[:3]) > 0 ):
                                        closest_rank = leaf_taxon[:3]
                                        genome_parent_child[3] = leaf_taxon
                                        genome_parent_child[
                                            5] = 'case 1b - III'
                                        break
                                else:
                                    pchildrank = list_leaf_ranks[
                                        list_leaf_ranks.index(leaf_taxon) - 1]
                                    if abs(
                                            current_rel_list -
                                            marker_dict.get(leaf_taxon[:3])
                                    ) < abs(current_rel_list -
                                            marker_dict.get(pchildrank[:3])):
                                        #and current_rel_list - marker_dict.get(leaf_taxon[:3]) > 0 ) :
                                        closest_rank = leaf_taxon[:3]
                                        genome_parent_child[1] = pchildrank
                                        genome_parent_child[2] = 1.0
                                        genome_parent_child[3] = leaf_taxon
                                        genome_parent_child[5] = 'case 1b - II'
                                        break
                            if closest_rank is None:
                                closest_rank = parent_rank
                                genome_parent_child[3] = list_leaf_ranks[0]
                                genome_parent_child[5] = 'case 1b - IV'

                        #if there is multiple ranks on the child node (i.e genome between p__Nitrospirae and c__Nitrospiria;o__Nitrospirales;f__Nitropiraceae)
                        #we loop through the list of rank from f_ to c_ rank
                        for child_taxon in reversed(child_taxons):
                            # if lower rank is c__Nitropiria
                            if child_taxon == child_taxons[0]:
                                if (abs(current_rel_list -
                                        marker_dict.get(child_taxon[:3])) <
                                        abs(child_rel_dist -
                                            marker_dict.get(child_taxon[:3]))
                                        and
                                        abs(current_rel_list -
                                            marker_dict.get(child_taxon[:3])) <
                                        abs(current_rel_list -
                                            marker_dict.get(parent_rank))):
                                    genome_parent_child[3] = ';'.join(
                                        child_taxons)
                                    genome_parent_child[4] = child_rel_dist
                                    genome_parent_child[5] = 'case 3b - II'
                                    closest_rank = child_taxon[:3]
                                elif closest_rank is None:
                                    closest_rank = parent_rank
                                    genome_parent_child[3] = ';'.join(
                                        child_taxons)
                                    genome_parent_child[4] = child_rel_dist
                                    genome_parent_child[5] = 'case 3b - III'
                            else:
                                pchildrank = child_taxons[
                                    child_taxons.index(child_taxon) - 1]
                                if (abs(current_rel_list -
                                        marker_dict.get(child_taxon[:3])) <
                                        abs(current_rel_list -
                                            marker_dict.get(pchildrank[:3]))
                                        and
                                        abs(current_rel_list -
                                            marker_dict.get(child_taxon[:3])) <
                                        abs(child_rel_dist -
                                            marker_dict.get(child_taxon[:3]))):
                                    closest_rank = child_taxon
                                    genome_parent_child[3] = ';'.join(
                                        child_taxons)
                                    genome_parent_child[4] = child_rel_dist
                                    genome_parent_child[5] = 'case 3b - I'
                                    break

                        # case 1b
                        if closest_rank is None:
                            print "IT SHOULDN'T HAPPEN!!!"

                        genome_parent_child[6] = closest_rank

                        list_subnode = [
                            subnd.taxon.label.replace("'", '')
                            for subnd in cur_node.leaf_iter()
                        ]
                        red_taxonomy = self._get_redtax(
                            list_subnode, closest_rank, gtdb_taxonomy)

                        fout.write('{0}\t{1}\n'.format(leaf.taxon.label,
                                                       red_taxonomy))
                        del genome_parent_child[0]
                        redfout.write("{0}\t{1}\t{2}\n".format(
                            leaf.taxon.label, detection, current_rel_list))
                        if debugopt:
                            parchiinfo.write('{0}\t{1}\t{2}\t{3}\n'.format(
                                leaf.taxon.label, current_rel_list,
                                '\t'.join(str(x) for x in genome_parent_child),
                                detection))

                redfout.close()
                fout.close()
                if debugopt:
                    parchiinfo.close()

                pplaceout = open(
                    os.path.join(
                        out_dir, prefix +
                        '.%s.classification_pplacer.tsv' % marker_set_id), 'w')

                # We get the pplacer taxonomy for comparison
                user_genome_ids = set(read_fasta(user_msa_file).keys())
                for leaf in tree.leaf_node_iter():
                    if leaf.taxon.label in user_genome_ids:
                        taxa = []
                        cur_node = leaf
                        while cur_node.parent_node:
                            _support, taxon, _aux_info = parse_label(
                                cur_node.label)
                            if taxon:
                                for t in taxon.split(';')[::-1]:
                                    taxa.append(t.strip())
                            cur_node = cur_node.parent_node
                        taxa_str = ';'.join(taxa[::-1])
                        pplaceout.write('%s\t%s\n' %
                                        (leaf.taxon.label, taxa_str))
                pplaceout.close()
        except ValueError as error:
            print "GTDB-Tk has stopped before finishing"
            sys.exit(-1)
        except Exception as error:
            print "GTDB-Tk has stopped before finishing"
            sys.exit(-1)
示例#8
0
    def run(self,
                taxonomy_file, type_strains_file,
                genome_prot_dir, extension,
                max_taxa, rank,
                per_identity, per_aln_len,
                genomes_to_process, keep_all_genes,
                no_reformat_gene_ids,
                output_dir):
        """Create dereplicated set of genes.

        Taxonomy file should have the following format:
            <genome_id>\t<taxonomy_str>

            where taxonomy_str is in GreenGenes format:
                d__Bacteria;p__Proteobacteria;...;s__Escherichia coli

        Type strain file should have the following format:
            <genome_id>\t<genome name>

        The following files are written to the output directory:
            genomes_without_called_genes.tsv
            custom_taxonomy.tsv
            custom_db.faa (makeblastdb is then run on this file)

        Parameters
        ----------
        taxonomy_file : str
            File indicating taxonomy string for all genomes of interest
        type_strains_file : str
            File indicating type strains.
        genome_prot_dir : str
            Directory containing amino acid genes for each genome.
        extension : str
            Extension of files with called genes.
        max_taxa : int
            Maximum taxa to retain in a named group.
        rank : int
            Taxonomic rank to perform dereplication (0 = domain, ..., 6 = species).
        per_identity : float
            Percent identity for subsampling similar genes.
        per_aln_len : float
            Percent alignment length for subsampling similar genes.
        genomes_to_process : str
            File with list of genomes to retain instead of performing taxon subsampling.
        keep_all_genes : boolean
            Flag indicating that no gene subsampling should be performed.
        no_reformat_gene_ids : boolean
            If set, gene files are copied unchanged; otherwise gene ids are
            reformatted using the genome's GFF file.
        output_dir : str
            Desired output directory for storing results.
        """

        make_sure_path_exists(output_dir)
        self.logger.info('Dereplicating at the rank of %s.' % self.rank_labels[rank])

        # get taxonomy string for each genome
        taxonomy = {}
        if taxonomy_file:
            self.logger.info('Reading taxonomy file.')
            taxonomy = Taxonomy().read(taxonomy_file)
            self.logger.info('There are %d genomes with taxonomy strings.' % len(taxonomy))

        # get type strains; genomes which should never be dereplicated
        type_strains = set()
        if type_strains_file:
            self.logger.info('Reading type strain file.')
            type_strains = self.read_type_strain(type_strains_file)
            self.logger.info('There are %d type strains.' % len(type_strains))

        # get specific list of genomes to process
        genomes_to_retain = set()
        if genomes_to_process:
            self.logger.info('Reading genomes to retain.')
            with open(genomes_to_process) as retain_file:
                for line in retain_file:
                    line_split = line.split()
                    if line_split:  # tolerate blank lines
                        genomes_to_retain.add(line_split[0])
            self.logger.info('Retaining %d genomes.' % len(genomes_to_retain))

        # make sure extension filter starts with a '.'
        if not extension.startswith('.'):
            extension = '.' + extension

        # identify unique genes in each named group
        rank_genomes = defaultdict(list)
        underclassified_genomes = 0
        genomes_with_missing_data = 0
        missing_genes_file = os.path.join(output_dir, 'genomes_without_called_genes.tsv')
        with open(missing_genes_file, 'w') as fout:
            for genome_file in os.listdir(genome_prot_dir):
                if not genome_file.endswith(extension):
                    continue

                genome_id = remove_extension(genome_file, extension)
                if genomes_to_process and genome_id not in genomes_to_retain:
                    continue

                # genomes absent from the taxonomy fall back to empty rank prefixes
                genome_taxonomy = taxonomy.get(genome_id, self.rank_prefixes)

                genome_file = os.path.join(genome_prot_dir, genome_file)
                if not os.path.exists(genome_file):
                    genomes_with_missing_data += 1
                    fout.write(genome_id + '\t' + ';'.join(genome_taxonomy) + '\n')
                    continue

                taxa = genome_taxonomy[rank]
                if taxa[3:] == '':
                    # nothing after the rank prefix: no name at this rank
                    underclassified_genomes += 1
                    rank_genomes[self.underclassified].append(genome_id)
                else:
                    rank_genomes[taxa].append(genome_id)

                validate_seq_ids(genome_file)

        total_genomes_to_process = sum(len(genome_list) for genome_list in rank_genomes.values())
        if total_genomes_to_process == 0:
            self.logger.error('No genomes found in directory: %s. Check the --extension flag used to identify genomes.' % genome_prot_dir)
            sys.exit(-1)

        self.logger.info('Under-classified genomes automatically placed into the database: %d' % underclassified_genomes)
        self.logger.info('Genomes with missing sequence data: %d' % genomes_with_missing_data)
        self.logger.info('Total named groups: %d' % len(rank_genomes))
        self.logger.info('Total genomes to process: %d' % total_genomes_to_process)

        # process each named group
        gene_file = os.path.join(output_dir, 'custom_db.faa')
        taxonomy_out_file = os.path.join(output_dir, 'custom_taxonomy.tsv')

        total_genes_removed = 0
        total_genes_kept = 0
        total_genomes_kept = 0
        processed_genomes = 0

        tmp_dir = tempfile.mkdtemp()
        try:
            with open(gene_file, 'w') as gene_out, \
                    open(taxonomy_out_file, 'w') as taxonomy_out:
                for taxa, genome_list in rank_genomes.items():
                    processed_genomes += len(genome_list)

                    print('-------------------------------------------------------------------------------')
                    self.logger.info('Processing %s | Finished %d of %d (%.2f%%) genomes.' % (taxa, processed_genomes, total_genomes_to_process, processed_genomes * 100.0 / total_genomes_to_process))

                    # create directory with selected genomes
                    taxon_dir = os.path.join(tmp_dir, 'taxon')
                    os.mkdir(taxon_dir)

                    reduced_genome_list = genome_list
                    if not genomes_to_process and taxa != self.underclassified:  # perform taxon subsampling
                        reduced_genome_list = self.select_taxa(genome_list, taxonomy, type_strains, max_taxa)
                    total_genomes_kept += len(reduced_genome_list)

                    gene_dir = os.path.join(taxon_dir, 'genes')
                    os.mkdir(gene_dir)
                    for genome_id in reduced_genome_list:
                        taxonomy_out.write(genome_id + '\t' + ';'.join(taxonomy.get(genome_id, self.rank_prefixes)) + '\n')

                        genome_gene_file = os.path.join(genome_prot_dir, genome_id + extension)
                        gff_file = os.path.join(genome_prot_dir, genome_id + '.gff')
                        output_gene_file = os.path.join(gene_dir, genome_id + '.faa')
                        if not no_reformat_gene_ids:
                            self.reformat_gene_id_to_scaffold_id(genome_gene_file, gff_file, taxonomy, output_gene_file)
                        else:
                            # copy without a shell so paths containing spaces work
                            shutil.copy(genome_gene_file, output_gene_file)

                    # filter genes based on amino acid identity
                    genes_to_remove = []
                    amended_gene_dir = os.path.join(taxon_dir, 'amended_genes')
                    if keep_all_genes or taxa == self.underclassified:
                        # modify gene identifiers to include genome ids
                        self.amend_gene_identifies(gene_dir, amended_gene_dir)
                    else:
                        # filter genes on AAI
                        genes_to_remove = self.filter_aai(taxon_dir, gene_dir, amended_gene_dir, per_identity, per_aln_len, self.cpus)

                    self.logger.info('Writing unique genes from genomes in %s.' % taxa)
                    genes_kept = self.write_gene_file(gene_out, amended_gene_dir, reduced_genome_list, taxonomy, genes_to_remove)

                    self.logger.info('Retain %d of %d taxa.' % (len(reduced_genome_list), len(genome_list)))
                    self.logger.info('Genes to keep: %d' % genes_kept)
                    self.logger.info('Genes removed: %d' % len(genes_to_remove))

                    total_genes_kept += genes_kept
                    total_genes_removed += len(genes_to_remove)

                    # the same directory name is reused by the next group
                    shutil.rmtree(taxon_dir)
        finally:
            # never leave the scratch directory behind, even on failure
            shutil.rmtree(tmp_dir)

        # guard against there being no genes at all
        total_genes = total_genes_kept + total_genes_removed
        perc_genes_removed = 0.0
        if total_genes:
            perc_genes_removed = total_genes_removed * 100.0 / total_genes

        self.logger.info('Retain %d of %d (%.1f%%) genomes' % (total_genomes_kept, total_genomes_to_process, total_genomes_kept * 100.0 / (total_genomes_to_process)))
        self.logger.info('Total genes kept: %d' % total_genes_kept)
        self.logger.info('Total genes removed: %d (%.1f%%)' % (total_genes_removed, perc_genes_removed))

        self.logger.info('Creating BLAST database.')
        # NOTE(review): command is built as a shell string; gene_file paths
        # containing spaces or shell metacharacters will break this call.
        os.system('makeblastdb -dbtype prot -in %s' % gene_file)
示例#9
0
    def combine(self, ssu_msa, ssu_tree, lsu_msa, lsu_tree, output_dir):
        """Infer 16S + 23S tree spanning GTDB genomes.

        Sequences present in both alignments are concatenated, excluding
        those whose order-level classification differs between the two
        trees. The concatenated alignment is written to
        'ssu_lsu_concatenated.fna' in the output directory and FastTreeMP is
        run on it to produce 'ssu_lsu_concatenated.tree'.

        Parameters
        ----------
        ssu_msa : str
            File with SSU rRNA multiple sequence alignment.
        ssu_tree : str
            Tree from which the SSU taxonomy is read.
        lsu_msa : str
            File with LSU rRNA multiple sequence alignment.
        lsu_tree : str
            Tree from which the LSU taxonomy is read.
        output_dir : str
            Directory receiving the concatenated alignment and tree.
        """

        def _read_genome_seqs(msa_file):
            """Map genome id (text before the first '~') to [seq, annotation]."""
            genome_seqs = {}
            for seq_id, seq, annotation in seq_io.read_seq(msa_file,
                                                           keep_annotation=True):
                genome_seqs[seq_id.split('~')[0]] = [seq, annotation]
            return genome_seqs

        # identify common 16S and 23S sequences
        ssu_seqs = _read_genome_seqs(ssu_msa)
        self.logger.info('Read %d SSU rRNA sequences.' % len(ssu_seqs))

        lsu_seqs = _read_genome_seqs(lsu_msa)
        self.logger.info('Read %d LSU rRNA sequences.' % len(lsu_seqs))

        common_seqs = set(ssu_seqs).intersection(lsu_seqs)
        self.logger.info('Identified %d sequences in common.' %
                         len(common_seqs))

        # identify incongruent taxonomic order classifications between trees
        self.logger.info(
            'Identifying incongruent order-level taxonomic classifications between trees.'
        )
        ssu_taxonomy = Taxonomy().read_from_tree(ssu_tree)
        lsu_taxonomy = Taxonomy().read_from_tree(lsu_tree)

        order_index = Taxonomy.rank_labels.index('order')

        seqs_to_filter = set()
        for seq_id in common_seqs:
            # drop the 3-character rank prefix
            ssu_order = ssu_taxonomy.get(seq_id)[order_index][3:]
            lsu_order = lsu_taxonomy.get(seq_id)[order_index][3:]

            # remove designator of paraphyletic orders
            # (since in the concatenated tree this may be resolved)
            if ssu_order.split('_')[0] != lsu_order.split('_')[0]:
                seqs_to_filter.add(seq_id)

        self.logger.info(
            'Identified %d sequences with incongruent classifcations.' %
            len(seqs_to_filter))
        common_seqs.difference_update(seqs_to_filter)

        # write out MSA; the file must be closed before the tree is inferred
        concatenated_msa = os.path.join(output_dir, 'ssu_lsu_concatenated.fna')
        with open(concatenated_msa, 'w') as fout:
            for seq_id in common_seqs:
                ssu_seq, ssu_annotation = ssu_seqs[seq_id]
                lsu_seq, lsu_annotation = lsu_seqs[seq_id]
                fout.write('>%s %s %s\n' %
                           (seq_id, ssu_annotation, lsu_annotation))
                fout.write('%s%s\n' % (ssu_seq, lsu_seq))

        # infer tree
        output_tree = os.path.join(output_dir, 'ssu_lsu_concatenated.tree')
        os.system('FastTreeMP -nosupport -nt -gtr -gamma %s > %s' %
                  (concatenated_msa, output_tree))
示例#10
0
    def run(self, input_tree, 
                    taxonomy_file, 
                    output_dir, 
                    plot_taxa_file,
                    plot_dist_taxa_only,
                    plot_domain,
                    highlight_polyphyly,
                    highlight_taxa_file,
                    trusted_taxa_file,
                    fixed_root,
                    min_children, 
                    min_support,
                    mblet,
                    fmeasure_table,
                    min_fmeasure,
                    fmeasure_mono,
                    verbose_table):
        """Determine distribution of taxa at each taxonomic rank.

        Parameters
        ----------
        input_tree : str
          Name of input tree.
        taxonomy_file : str
          File with taxonomy strings for each taxa.
        output_dir : str
          Desired output directory.
        plot_taxa_file : str
          File specifying taxa to plot. Set to None to consider all taxa.
        plot_dist_taxa_only : boolean    
          Only plot the taxa used to infer distribution.
        plot_domain : boolean
          Plot domain rank.
        trusted_taxa_file : str
          File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
        fixed_root : boolean
          Usa a single fixed root to infer outliers.
        min_children : int
          Only consider taxa with at least the specified number of children taxa when inferring distribution.
        min_support : float
          Only consider taxa with at least this level of support when inferring distribution.
        verbose_table : boolean
          Print additional columns in output table.
        """
        
        # read tree
        self.logger.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(input_tree, 
                                            schema='newick', 
                                            rooting='force-rooted', 
                                            preserve_underscores=True)

        input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]

        # pull taxonomy from tree and file
        self.logger.info('Reading taxonomy.')
        taxonomy = Taxonomy().read(taxonomy_file)
        tree_taxonomy = Taxonomy().read_from_tree(input_tree,
                                                    warnings=False)
            
        gtdb_parent_ranks = Taxonomy().parents(tree_taxonomy)

        # read trusted taxa
        trusted_taxa = None
        if trusted_taxa_file:
            trusted_taxa = read_taxa_file(trusted_taxa_file)
            
        # read F-measure for taxa
        fmeasure = None
        if fmeasure_table:
            fmeasure = self.read_fmeasure(fmeasure_table)

        # determine taxa to be used for inferring distribution
        taxa_for_dist_inference = filter_taxa_for_dist_inference(tree, 
                                                                    taxonomy, 
                                                                    trusted_taxa, 
                                                                    min_children, 
                                                                    min_support,
                                                                    fmeasure,
                                                                    min_fmeasure)

        # limit plotted taxa
        taxa_to_plot = None
        if plot_dist_taxa_only:
            taxa_to_plot = taxa_for_dist_inference
        elif plot_taxa_file:
            taxa_to_plot = read_taxa_file(plot_taxa_file)
        else:
            # plot every taxon defined in tree
            taxa_to_plot = set()
            for node in tree.preorder_node_iter():
                support, taxon, _auxiliary_info = parse_label(node.label)
                if taxon:
                    taxon = taxon.split(';')[-1].strip() # get most specific taxon from compound names 
                                                         # (e.g. p__Armatimonadetes; c__Chthonomonadetes)
                    taxa_to_plot.add(taxon)
            
            if False:
                # HACK FOR NCBI: only plot taxa with >= 2 taxa
                taxa_to_plot = set()
                for node in tree.preorder_node_iter():
                    if not node.label or node.is_leaf():
                        continue

                    support, taxon, _auxiliary_info = parse_label(node.label)
                    if not taxon:
                        continue
                    taxon = taxon.split(';')[-1].strip() # get most specific taxon from compound names 
                                                         # (e.g. p__Armatimonadetes; c__Chthonomonadetes)
                  
                    # count number of subordinate children
                    rank_prefix = taxon[0:3]
                    if min_children > 0 and rank_prefix != 's__':
                        child_rank_index = Taxonomy().rank_index[rank_prefix] + 1
                        child_rank_prefix = Taxonomy.rank_prefixes[child_rank_index]
                        subordinate_taxa = set()
                        for leaf in node.leaf_iter():
                            taxa = taxonomy.get(leaf.taxon.label, Taxonomy.rank_prefixes)
                            if len(taxa) > child_rank_index:
                                sub_taxon = taxa[child_rank_index]
                                if sub_taxon != Taxonomy.rank_prefixes[child_rank_index] and sub_taxon.startswith(child_rank_prefix):
                                    subordinate_taxa.add(sub_taxon)

                        if len(subordinate_taxa) < min_children:
                            continue
                            
                    taxa_to_plot.add(taxon)
            
        # highlight taxa
        highlight_taxa = set()
        if highlight_taxa_file:
            for line in open(highlight_taxa_file):
                highlight_taxa.add(line.strip().split('\t')[0])
                
        # check if a single fixed root should be used
        if fixed_root or mblet:
            self.logger.info('Using single fixed rooting for inferring distributions.')
            if not mblet:
                rel_dists = self.rd_fixed_root(tree, taxa_for_dist_inference)
            else:
                rel_dists = self.mblet(tree, taxa_for_dist_inference)
                
            # create fixed rooting style tables and plots
            distribution_table = os.path.join(output_dir, '%s.rank_distribution.tsv' % input_tree_name)
            plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
            self._distribution_plot(rel_dists, 
                                        taxa_for_dist_inference,
                                        highlight_polyphyly,
                                        highlight_taxa,
                                        distribution_table,
                                        fmeasure,
                                        fmeasure_mono,
                                        plot_file)

            median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
            self._median_outlier_file(rel_dists, 
                                        taxa_for_dist_inference, 
                                        gtdb_parent_ranks, 
                                        median_outlier_table)
        else:
            # calculate relative distance to taxa
            rd = RelativeDistance()
            rel_dists = rd.rel_dist_to_named_clades(tree)
            
            # restrict to taxa of interest
            if taxa_to_plot:
                for r in rel_dists:
                    for k in set(rel_dists[r].keys()) - set(taxa_to_plot):
                        del rel_dists[r][k]
            
            # report number of taxa at each rank
            print ''
            print 'Rank\tTaxa to Plot\tTaxa for Inference'
            for rank, taxa in rel_dists.iteritems():
                taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
                print '%s\t%d\t%d' % (Taxonomy.rank_labels[rank], len(taxa), len(taxa_for_inference))
            print ''
        
            # *** determine phyla for inferring distribution
            if True:
                phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree, 
                                                                                taxa_for_dist_inference)
            else:                                                                    
                phyla_for_inference = filter_taxa_for_dist_inference(tree, 
                                                                        taxonomy, 
                                                                        trusted_taxa, 
                                                                        2, 
                                                                        min_support,
                                                                        fmeasure,
                                                                        min_fmeasure)
                phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree, 
                                                                                phyla_for_inference)
                print ''
                print 'Phyla for RED Inference:'
                print ','.join(phylum_rel_dists)
                phyla_file = os.path.join(output_dir, '%s.phyla.tsv' % input_tree_name)
                fout = open(phyla_file, 'w')
                for p in phylum_rel_dists:
                    fout.write(p + '\n')
                fout.close()
                                                                            
            # set edge lengths to median value over all rootings
            tree.seed_node.rel_dist = 0.0
            for n in tree.preorder_node_iter(lambda n: n != tree.seed_node):
                n.rel_dist = np_median(rel_node_dists[n.id])
                rd_to_parent = n.rel_dist - n.parent_node.rel_dist
                if rd_to_parent < 0:
                    self.logger.warning('Not all branches are positive after scaling.')
                n.edge_length = rd_to_parent

            for phylum, rel_dists in phylum_rel_dists.iteritems():
                phylum_dir = os.path.join(output_dir, phylum)
                if not os.path.exists(phylum_dir):
                    os.makedirs(phylum_dir)
                    
                # restrict to taxa of interest
                if taxa_to_plot:
                    for r in rel_dists:
                        for k in set(rel_dists[r].keys()) - set(taxa_to_plot):
                            del rel_dists[r][k]
                    
                # create distribution plot
                distribution_table = os.path.join(phylum_dir, '%s.rank_distribution.tsv' % phylum)
                plot_file = os.path.join(phylum_dir, '%s.rank_distribution.png' % phylum)
                self._distribution_plot(rel_dists, 
                                        taxa_for_dist_inference,
                                        highlight_polyphyly,
                                        highlight_taxa,
                                        distribution_table,
                                        fmeasure,
                                        fmeasure_mono,
                                        plot_file)

                median_outlier_table = os.path.join(phylum_dir, '%s.median_outlier.tsv' % phylum)
                self._median_outlier_file(rel_dists, 
                                            taxa_for_dist_inference, 
                                            gtdb_parent_ranks,
                                            median_outlier_table)
   
            plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
            self._distribution_summary_plot(phylum_rel_dists, 
                                            taxa_for_dist_inference,
                                            highlight_polyphyly,
                                            highlight_taxa,
                                            fmeasure,
                                            fmeasure_mono,
                                            plot_file)

            median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
            median_rank_file = os.path.join(output_dir, '%s.dict' % input_tree_name)
            self._median_summary_outlier_file(phylum_rel_dists, 
                                                taxa_for_dist_inference, 
                                                gtdb_parent_ranks, 
                                                median_outlier_table, 
                                                median_rank_file, 
                                                verbose_table)

        output_rd_file = os.path.join(output_dir, '%s.node_rd.tsv' % input_tree_name)
        self._write_rd(tree, output_rd_file)
                                                
        output_tree = os.path.join(output_dir, '%s.scaled.tree' % input_tree_name)
        tree.write_to_path(output_tree, 
                            schema='newick', 
                            suppress_rooting=True, 
                            unquoted_underscores=True)                
示例#11
0
    def run(self,
            input_tree,
            taxonomy_file,
            viral,
            skip_species,
            gtdb_metadata,
            trusted_taxa_file,
            min_children,
            min_support,
            skip_rd_refine,
            output_tree):
        """Label internal tree nodes with taxa chosen by the F-measure statistic.

        The tree is read from `input_tree`, stripped of existing internal
        labels, decorated, and written to `output_tree`. Three companion
        files are written next to it: `output_tree` + '-table',
        '-summary' and '-taxonomy'.

        Parameters
        ----------
        input_tree
            Path to a Newick tree; it is read as force-rooted with
            underscores preserved.
        taxonomy_file
            Path handed to `Taxonomy().read`.
        viral
            When truthy, the taxonomy is passed through
            `translate_viral_taxonomy` before use, and the table, taxonomy
            and tree outputs are passed through `rev_translate_output_file`
            afterwards.
        skip_species
            Forwarded to `self._fmeasure` (presumably excludes species-level
            labels -- confirm against that method).
        gtdb_metadata
            Forwarded to `self.parse_gtdb_metadata`; the two values it
            returns are only consumed when `skip_rd_refine` is truthy.
        trusted_taxa_file, min_children, min_support
            Forwarded to `self._median_rank_rd`; only consulted when
            `skip_rd_refine` is falsy.
        skip_rd_refine
            Truthy: ties in F-measure are settled by
            `self.resolve_equal_fmeasure` and all labels are placed at once.
            Falsy: unambiguous labels are placed first, median relative
            divergences are derived from them, and these guide the
            remaining placements.
        output_tree
            Path of the decorated Newick tree and stem of the companion
            files.
        """

        # the metadata is parsed up front, before any tree I/O is logged
        placeholder_stems, latin_stems = self.parse_gtdb_metadata(gtdb_metadata)

        self.logger.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(
            input_tree,
            schema='newick',
            rooting='force-rooted',
            preserve_underscores=True)

        # start from a clean slate: drop labels left by an earlier decoration
        self.logger.info('Removing any previous internal node labels.')
        self._strip_taxon_labels(tree)

        self.logger.info('Reading taxonomy.')
        all_lineages = Taxonomy().read(taxonomy_file)
        if viral:
            self.logger.info('Translating viral prefixes.')
            all_lineages = translate_viral_taxonomy(all_lineages)

        # restrict the taxonomy to leaves of this tree; a leaf without an
        # entry in the taxonomy file falls back to Taxonomy.rank_prefixes
        leaf_labels = (leaf.taxon.label for leaf in tree.leaf_node_iter())
        leaf_lineages = {
            label: all_lineages.get(label, Taxonomy.rank_prefixes)
            for label in leaf_labels
        }

        # score candidate placements of every taxon with the F-measure
        self.logger.info('Calculating F-measure statistic for each taxa.')
        taxa_fmeasure = self._fmeasure(tree, leaf_lineages, skip_species)

        if skip_rd_refine:
            # no relative-divergence refinement: settle nodes sharing the
            # same F-measure, then place every label in one pass
            self.resolve_equal_fmeasure(taxa_fmeasure,
                                        placeholder_stems,
                                        latin_stems,
                                        output_tree)

            self.logger.info('Placing labels on tree.')
            self._assign_taxon_labels(taxa_fmeasure)
        else:
            # labels with a single acceptable position go on first; the
            # divergence medians computed from them steer the rest
            self.logger.info('Placing labels with unambiguous position in tree.')
            unambiguous_taxa = self._assign_taxon_labels(taxa_fmeasure)

            self.logger.info('Establishing median relative divergence for taxonomic ranks.')
            rank_rd_medians = self._median_rank_rd(tree,
                                                   unambiguous_taxa,
                                                   leaf_lineages,
                                                   trusted_taxa_file,
                                                   min_children,
                                                   min_support)

            self.logger.info('Resolving ambiguous taxon label placements using median relative divergences.')
            self._resolve_ambiguous_placements(taxa_fmeasure, rank_rd_medians)

        # per-taxon statistics and the summary share the output tree's stem
        self.logger.info('Writing out statistics for taxa.')
        stats_path = output_tree + '-table'
        self._write_statistics_table(taxa_fmeasure, leaf_lineages, stats_path)

        summary_path = output_tree + '-summary'
        self._write_summary_table(taxa_fmeasure, leaf_lineages, summary_path)

        # taxonomy of the extant taxa as decorated on the tree
        self.logger.info('Writing out taxonomy for extant taxa.')
        taxonomy_path = output_tree + '-taxonomy'
        self._write_taxonomy(tree, taxonomy_path)

        self.logger.info('Writing out decorated tree.')
        tree.write_to_path(
            output_tree,
            schema='newick',
            suppress_rooting=True,
            unquoted_underscores=True)

        if viral:
            # NOTE(review): the '-summary' file is not back-translated here;
            # confirm whether that is intentional.
            self.logger.info('Translating output files to viral prefixes.')
            for translated_path in (stats_path, taxonomy_path, output_tree):
                rev_translate_output_file(translated_path)