Пример #1
0
def unifrac(classifications, weighted=True,
            field='readcount_w_children', rank='species', strict=False):
    """
    Compute the UniFrac beta diversity across a set of classifications.

    UniFrac takes the relatedness of community members into account. The
    weighted variant looks at abundances; the unweighted variant looks at
    presence only.

    The return value is whatever ``skbio.diversity.beta_diversity`` produces
    for the selected metric. When ``strict`` is true, a ``OneCodexException``
    is raised as soon as a classification's job id differs from the job id of
    the first classification.
    """
    # NOTE: this check disappears when Python runs with -O (asserts are stripped)
    assert field in ACCEPTABLE_FIELDS
    counts, tax_ids, ids = beta_counts(classifications, field=field, rank=rank)

    # fold every classification into one shared tree, one at a time
    merged = None
    for classification in classifications:
        if strict:
            if classification.job.id != classifications[0].job.id:
                raise OneCodexException('All Classifications must have the same Job for Unifrac')
        merged = generate_skbio_tree(classification, existing_tree=merged)

    # skbio seems to expect the root to have a single child (possibly a bug),
    # so the real tree is hung beneath a placeholder root
    rooted = TreeNode(name='fake root')
    rooted.rank = 'no rank'
    rooted.append(merged)

    # prune low-level nodes so that the tips are the taxa being compared
    prune_to_rank(rooted, rank=rank)

    metric = 'weighted_unifrac' if weighted else 'unweighted_unifrac'
    return skbio.diversity.beta_diversity(metric, counts, ids,
                                          tree=rooted, otu_ids=tax_ids)
Пример #2
0
    def unifrac(self, weighted=True, rank="auto"):
        """Compute the UniFrac beta diversity metric.

        UniFrac accounts for how related the community members are. The weighted
        form considers abundances, the unweighted form considers presence only.

        Parameters
        ----------
        weighted : `bool`
            Calculate the weighted (True) or unweighted (False) distance metric.
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.

        Returns
        -------
        skbio.stats.distance.DistanceMatrix, a distance matrix.

        Raises
        ------
        OneCodexException
            If ``self._guess_normalized()`` reports that the data is normalized.
        """
        import skbio.diversity

        # UniFrac is computed from read counts, not relative abundances
        if self._guess_normalized():
            raise OneCodexException("UniFrac requires unnormalized read counts.")

        table = self.to_df(rank=rank, normalize=False)

        # one list of counts per row of the table, in index order
        counts = [table.loc[row_id].tolist() for row_id in table.index]
        tax_ids = table.keys().tolist()

        pruned = self.tree_prune_rank(self.tree_build(), rank=table.ocx_rank)

        # skbio seems to expect the root to have a single child (possibly a bug),
        # so the real tree is hung beneath a placeholder root
        from skbio.tree import TreeNode

        rooted = TreeNode(name="fake root")
        rooted.rank = "no rank"
        rooted.append(pruned)

        # pick the metric, then run the calculation
        metric = "weighted_unifrac" if weighted else "unweighted_unifrac"
        return skbio.diversity.beta_diversity(
            metric, counts, table.index.tolist(), tree=rooted, otu_ids=tax_ids
        )
Пример #3
0
    def unifrac(self, weighted=True, rank="auto"):
        """Beta diversity metric that accounts for the relatedness of community members.

        Weighted UniFrac looks at abundances; unweighted UniFrac looks at presence.

        Parameters
        ----------
        weighted : `bool`
            Calculate the weighted (True) or unweighted (False) distance metric.
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.

        Returns
        -------
        skbio.stats.distance.DistanceMatrix, a distance matrix.

        Raises
        ------
        OneCodexException
            If ``self._guess_normalized()`` reports that the data is normalized.
        """
        # guard: the calculation needs read counts, not relative abundances
        normalized_input = self._guess_normalized()
        if normalized_input:
            raise OneCodexException("UniFrac requires unnormalized read counts.")

        abundances = self.to_df(rank=rank, normalize=False)

        # collect each row's values as a plain list, preserving index order
        per_sample_counts = [
            abundances.loc[sample_id].tolist() for sample_id in abundances.index
        ]
        taxon_ids = abundances.keys().tolist()

        full_tree = self.tree_build()
        rank_tree = self.tree_prune_rank(full_tree, rank=abundances.ocx_rank)

        # skbio seems to expect the root to have a single child (possibly a bug),
        # so a placeholder root is added above the real tree
        from skbio.tree import TreeNode

        placeholder_root = TreeNode(name="fake root")
        placeholder_root.rank = "no rank"
        placeholder_root.append(rank_tree)

        # choose the metric name, then run the calculation and return
        if weighted:
            metric_name = "weighted_unifrac"
        else:
            metric_name = "unweighted_unifrac"

        return skbio.diversity.beta_diversity(
            metric_name,
            per_sample_counts,
            abundances.index.tolist(),
            tree=placeholder_root,
            otu_ids=taxon_ids,
        )
Пример #4
0
    def unifrac(self, weighted=True, rank=Rank.Auto):
        """Calculate the UniFrac beta diversity metric.

        UniFrac takes into account the relatedness of community members. Weighted UniFrac considers
        abundances, unweighted UniFrac considers presence.

        Parameters
        ----------
        weighted : `bool`
            Calculate the weighted (True) or unweighted (False) distance metric.
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.
            Defaults to ``Rank.Auto``.

        Returns
        -------
        skbio.stats.distance.DistanceMatrix, a distance matrix.
        """
        import skbio.diversity

        # Evaluate the normalization guess once so that the `normalize` flag passed
        # to `to_df` and the rescaling decision below can never disagree.
        is_normalized = self._guess_normalized()

        df = self.to_df(rank=rank, normalize=is_normalized)

        # Read the resolved rank before `df` is rebound to the product below.
        # NOTE(review): presumably the attribute is not carried over by the
        # multiplication - confirm.
        ocx_rank = df.ocx_rank

        # The scikit-bio implementations of phylogenetic metrics require integer counts,
        # so normalized values are scaled up. Note that the literal 10e9 equals 1e10 and
        # that no explicit integer cast is performed here.
        if is_normalized:
            df = df * 10e9

        tax_ids = df.keys().tolist()

        tree = self.tree_build()
        tree = self.tree_prune_rank(tree, rank=ocx_rank)

        # `scikit-bio` requires that the tree root has no more than 2
        # children, otherwise it considers it "unrooted".
        #
        # https://github.com/biocore/scikit-bio/blob/f3ae1dcfe8ea88e52e19f6693d79e529d05bda04/skbio/diversity/_util.py#L89
        #
        # Our taxonomy root regularly has more than 2 children, so we
        # add a fake parent of `root` to the tree here.
        from skbio.tree import TreeNode

        new_tree = TreeNode(name="fake root")
        new_tree.rank = "no rank"
        new_tree.append(tree)

        # Only the weighted metric is passed `normalized=True`; the unweighted call
        # receives no extra keyword arguments.
        if weighted:
            metric = BetaDiversityMetric.WeightedUnifrac
            metric_kwargs = {"normalized": True}
        else:
            metric = BetaDiversityMetric.UnweightedUnifrac
            metric_kwargs = {}

        # then finally run the calculation and return
        return skbio.diversity.beta_diversity(
            metric,
            df,
            df.index,
            tree=new_tree,
            otu_ids=tax_ids,
            **metric_kwargs,
        )