Exemplo n.º 1
0
    def lca_star(self,
                 taxonomy_list,
                 min_tree_depth=3,
                 majority_threshold=0.51):
        """Find the LCA within a list of taxonomies after filtering them by tree depth.

        What constitutes a majority consensus for the counts can be varied, with the
        default being 51%.

        Args:
            taxonomy_list (list): list of taxonomy names or IDs
            min_tree_depth (int): the minimum allowable tree depth of a taxon to be considered
                within the taxonomy list; those found sooner in the tree will be filtered out
                of consideration
            majority_threshold (float): 0-1; the fraction of taxonomy counts which constitutes a
                majority; a lower fraction will classify with less confidence deeper in the tree
                while a higher threshold will classify with more confidence higher in the tree

        Returns:
            dict with the keys 'taxonomy' and 'pvalue'. When the depth filter removes every
            entry, the result is ``{'taxonomy': '1', 'pvalue': 1.0}``.

        Example:
            >>> tree = Tree("ref/ncbi_taxonomy_tree.txt")
            >>> taxonomy_list = ['gamma subgroup', 'RNA similarity group I',
            ...                  'purple photosynthetic bacteria and relatives',
            ...                  'not Bacteria Haeckel 1894',
            ...                  'purple photosynthetic bacteria and relatives', 'gamma subgroup',
            ...                  'gamma subgroup', 'purple photosynthetic bacteria and relatives',
            ...                  'purple photosynthetic bacteria and relatives']
            >>> tree.lca_star(taxonomy_list)
            {'pvalue': 0.012791848981090311, 'taxonomy': '1224'}

        """
        # tree depth based filter
        taxonomy_list = self.filter_taxonomy_list(taxonomy_list,
                                                  min_tree_depth)
        # all have been filtered: nothing left to vote on
        # NOTE(review): "1" is presumably the root node ID of the tree -- confirm
        if not taxonomy_list:
            return {"taxonomy": "1", "pvalue": 1.}

        taxonomy_counts = Counter(taxonomy_list)
        majority_cutoff = len(taxonomy_list) * majority_threshold
        # fetch the single most frequent entry once rather than sorting the
        # whole counter twice; the list is non-empty here so [0] is safe
        top_taxonomy, top_count = taxonomy_counts.most_common(1)[0]

        if top_count > majority_cutoff:
            # majority based on existing taxonomy counts alone
            majority = top_taxonomy
            p = nettleton_pvalue(taxonomy_list, majority)
        else:
            # create majority from lca
            majority, lineages = self.lca_majority(taxonomy_list,
                                                   majority_cutoff)
            aggregate_counts = self.counts_to_majority_list(
                taxonomy_counts, lineages, majority)
            p = nettleton_pvalue(aggregate_counts, majority)
        return {"taxonomy": majority, "pvalue": p}
Exemplo n.º 2
0
def process_orfs_with_tree(orf_assignments, tree, output, aggregation_method, majority_threshold=0.51, table_name="refseq"):
    """Write a tab-delimited, per-ORF table annotated with a contig-level taxonomy.

    A header row is printed first. Then, for every contig, the taxonomy IDs of its
    ORFs are aggregated into one contig taxonomy, its lineage is collected, and one
    row per ORF (in sorted ORF-key order) is printed to ``output``.

    Args:
        orf_assignments (dict): contig -> mapping of ORF key -> 4-item record that
            unpacks as (function, taxonomy ID, bitscore, evalue)
        tree (Tree): taxonomic tree object
        output (filehandle): output file handle
        aggregation_method (str): 'lca-majority' or 'lca'; any other value falls
            back to a simple majority
        majority_threshold (float): majority fraction passed to ``tree.lca_star``;
            only used by the 'lca-majority' aggregation method
        table_name (str): prefix for the product, evalue and bitscore column names
    """
    header = ("contig", "orf", "taxonomy", "erfc", "orf_taxonomy", "%s_product" % table_name,
              "%s_evalue" % table_name, "%s_bitscore" % table_name)
    print(*header, sep="\t", file=output)

    for contig, orfs in orf_assignments.items():
        # second field of every ORF record is its taxonomy ID
        orf_tax_ids = [record[1] for record in orfs.values()]

        if aggregation_method == "lca-majority":
            star = tree.lca_star(orf_tax_ids, majority_threshold=majority_threshold)
            contig_taxonomy = star["taxonomy"]
            error_function = star["pvalue"]
        else:
            if aggregation_method == "lca":
                # TODO incorporate threshold into LCAs?
                contig_taxonomy = tree.lca(orf_tax_ids)
            else:
                # simple majority
                contig_taxonomy = BlastHits(orf_tax_ids).majority()
            error_function = nettleton_pvalue(orf_tax_ids, contig_taxonomy)

        ranks = {}
        for tax_id in tree.taxonomic_lineage(contig_taxonomy):
            node = tree.tree[tax_id]
            # does not account for "no rank" and some other cases of "unclassified"
            if node.tax_level not in TAX_LEVELS:
                continue
            # superkingdom is keyed as "k"; every other level by its first letter
            rank_key = "k" if node.tax_level == "superkingdom" else node.tax_level[0]
            ranks[rank_key] = node.taxonomy
        lineage = validate_lineage(ranks)

        for idx in sorted(orfs):
            orf_function, orf_tax_id, bitscore, evalue = orfs[idx]
            orf_taxonomy = tree.tree[orf_tax_id].taxonomy
            orf_name = "%s_%s" % (contig, idx)
            print(contig, orf_name, lineage, error_function, orf_taxonomy,
                  orf_function, evalue, bitscore, sep="\t", file=output)