Пример #1
0
    def _get_distance_matrix(self, X):
        """
        Run UniFrac over the fitted table merged with new samples.

        Parameters
        ----------
        X : biom.Table
            new samples

        Returns
        -------
        dm : DistanceMatrix
            the result of ``ssu`` on the merged table, i.e. it covers the
            fitted samples together with the samples of ``X``

        """
        # TODO: if any sample of X is also present in self.table, the merge
        #  doubles its counts
        combined = self.table.merge(X)
        with self.hdf5_table(combined) as handle:
            ssu_options = {
                'unifrac_method': self.unifrac_method,
                'variance_adjust': False,
                'alpha': 1.0,
                'bypass_tips': False,
                'threads': 1,
            }
            # ssu reads the table from disk, so it is handed the path of
            # the file exposed by the hdf5_table context
            return ssu(handle.name, self.tree_path, **ssu_options)
Пример #2
0
    def test_unweighted_root_eval_issue_46(self):
        """Check unweighted UniFrac on the crawford data against skbio.

        The expected distances come from
        ``skbio.diversity.beta_diversity('unweighted_unifrac', ...)``.
        Three routes are compared with it: ``ssu``, the ``unweighted``
        wrapper, and ``unweighted_to_file`` read back with ``h5unifrac``.
        """
        # Local import so this method does not depend on the module's
        # import block already providing tempfile.
        import tempfile

        tree = self.get_data_path('crawford.tre')
        table = self.get_data_path('crawford.biom')

        table_inmem = load_table(table)
        tree_inmem = skbio.TreeNode.read(tree)

        ids = table_inmem.ids()
        otu_ids = table_inmem.ids(axis='observation')
        # skbio expects samples as rows, hence the transpose
        cnts = table_inmem.matrix_data.astype(int).toarray().T
        exp = skbio.diversity.beta_diversity('unweighted_unifrac',
                                             cnts,
                                             ids=ids,
                                             otu_ids=otu_ids,
                                             tree=tree_inmem)
        obs = ssu(table, tree, 'unweighted', False, 1.0, False, 1)
        npt.assert_almost_equal(obs.data, exp.data)

        obs2 = unweighted(table, tree)
        npt.assert_almost_equal(obs2.data, exp.data)

        # Write into a private scratch directory rather than a fixed /tmp
        # path: the name cannot collide with a concurrent run, and the
        # directory is removed even when writing or reading the file fails.
        with tempfile.TemporaryDirectory() as tmpdir:
            tmpfile = os.path.join(tmpdir, 'uf_ta_1.md5')
            unweighted_to_file(table, tree, tmpfile, pcoa_dims=0)

            obs3 = h5unifrac(tmpfile)
            npt.assert_almost_equal(obs3.data, exp.data)
    def transform(self, X):
        """
        Unweighted UniFrac distances from the samples of X to self.table.

        Parameters
        ----------
        X : biom.Table
            new samples

        Returns
        -------
        The block of the distance data whose rows follow
        ``X.ids('sample')`` and whose columns follow
        ``self.table.ids('sample')``.

        """
        # TODO: if any sample of X is also present in self.table, the merge
        #  doubles its counts
        combined = self.table.merge(X)
        with tempfile.NamedTemporaryFile() as tmp:
            # ssu reads the table from disk, so persist the merged table
            with biom_open(tmp.name, 'w') as out:
                combined.to_hdf5(out, "merged")

            ssu_options = {
                'unifrac_method': 'unweighted',
                'variance_adjust': False,
                'alpha': 1.0,
                'bypass_tips': False,
                'threads': 1,
            }
            dm = ssu(tmp.name, self.tree_path, **ssu_options)

        # positions of the new samples (rows) and fitted samples (columns)
        new_positions = [dm.index(sample_id)
                         for sample_id in X.ids('sample')]
        fitted_positions = [dm.index(sample_id)
                            for sample_id in self.table.ids('sample')]

        # cut the new-by-fitted block out of the full matrix
        return dm.data[np.ix_(new_positions, fitted_positions)]
Пример #4
0
def weighted_unnormalized_fp32(table: str,
                               phylogeny: str,
                               threads: int = 1,
                               variance_adjusted: bool = False,
                               bypass_tips: bool = False
                               ) -> skbio.DistanceMatrix:
    """Compute weighted unnormalized UniFrac using fp32 math

    Parameters
    ----------
    table : str
        A filepath to a BIOM-Format 2.1 file.
    phylogeny : str
        A filepath to a Newick formatted tree.
    threads : int, optional
        The number of threads to use. Default is 1.
    variance_adjusted : bool, optional
        Adjust for variance or not. Default is False.
    bypass_tips : bool, optional
        Bypass the tips of the tree in the computation. This reduces compute
        by about 50%, but is an approximation. Default is False.

    Returns
    -------
    skbio.DistanceMatrix
        The resulting distance matrix.

    Raises
    ------
    IOError
        If the tree file is not found
        If the table is not found
    ValueError
        If the table does not appear to be BIOM-Format v2.1.
        If the phylogeny does not appear to be in Newick format.

    Notes
    -----
    Weighted UniFrac was originally described in [1]_. Variance Adjusted
    Weighted UniFrac was originally described in [2]_.

    References
    ----------
    .. [1] Lozupone, C. A., Hamady, M., Kelley, S. T. & Knight, R. Quantitative
       and qualitative beta diversity measures lead to different insights into
       factors that structure microbial communities. Appl. Environ. Microbiol.
       73, 1576-1585 (2007).
    .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a
       powerful beta diversity measure for comparing communities based on
       phylogeny. BMC Bioinformatics 12:118 (2011).
    """
    _validate(table, phylogeny)

    # the backend is handed plain strings for both paths
    table_path = str(table)
    tree_path = str(phylogeny)
    # this metric exposes no alpha parameter, so a constant is passed
    alpha = 1.0
    return qsu.ssu(table_path, tree_path, 'weighted_unnormalized_fp32',
                   variance_adjusted, alpha, bypass_tips, threads)
Пример #5
0
def unweighted(table: str,
               phylogeny: str,
               threads: int = 1,
               variance_adjusted: bool = False,
               bypass_tips: bool = False) -> skbio.DistanceMatrix:
    """Compute Unweighted UniFrac

    Parameters
    ----------
    table : str
        A filepath to a BIOM-Format 2.1 file.
    phylogeny : str
        A filepath to a Newick formatted tree.
    threads : int, optional
        The number of threads to use. Default is 1.
    variance_adjusted : bool, optional
        Adjust for variance or not. Default is False.
    bypass_tips : bool, optional
        Bypass the tips of the tree in the computation. This reduces compute
        by about 50%, but is an approximation. Default is False.

    Returns
    -------
    skbio.DistanceMatrix
        The resulting distance matrix.

    Raises
    ------
    IOError
        If the tree file is not found
        If the table is not found
    ValueError
        If the table does not appear to be BIOM-Format v2.1.
        If the phylogeny does not appear to be in Newick format.

    Notes
    -----
    Unweighted UniFrac was originally described in [1]_. Variance Adjusted
    UniFrac was originally described in [2]_, and while its application to
    Unweighted UniFrac was not described, factoring in the variance adjustment
    is still feasible and so it is exposed.

    References
    ----------
    .. [1] Lozupone, C. & Knight, R. UniFrac: a new phylogenetic method for
       comparing microbial communities. Appl. Environ. Microbiol. 71, 8228-8235
       (2005).
    .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a
       powerful beta diversity measure for comparing communities based on
       phylogeny. BMC Bioinformatics 12:118 (2011).
    """
    _validate(table, phylogeny)
    # Coerce both paths to str, as the fp32 wrappers do, so path-like
    # arguments reach the backend as plain strings; str inputs pass through
    # unchanged.
    return qsu.ssu(str(table), str(phylogeny), 'unweighted',
                   variance_adjusted, 1.0, bypass_tips, threads)
Пример #6
0
    def test_meta_unifrac(self):
        """Unweighted ssu on t1.newick / e1.biom gives the known matrix."""
        tree_path = self.get_data_path('t1.newick')
        table_path = self.get_data_path('e1.biom')

        observed = ssu(table_path, tree_path,
                       'unweighted', False, 1.0, False, 1)

        # pairwise distances between samples A, B and C
        d_ab = 10 / 16.
        d_ac = 8 / 13.
        d_bc = 8 / 17.
        expected = np.array([[0, d_ab, d_ac],
                             [d_ab, 0, d_bc],
                             [d_ac, d_bc, 0]])

        npt.assert_almost_equal(expected, observed.data)
        self.assertEqual(('A', 'B', 'C'), observed.ids)
Пример #7
0
    def _work(self, u_counts, v_counts, otu_ids, tree, method):
        """Return the ``method`` distance between count vectors u and v.

        The two vectors become samples 'u' and 'v' of a table. The table
        and ``tree`` are written to the temp directory because ``ssu`` is
        given file paths. Both paths are queued on ``self.files_to_delete``.
        """
        stacked = np.array([u_counts, v_counts])
        # one column per sample
        biom_table = Table(stacked.T, otu_ids, ['u', 'v'])

        scratch_dir = gettempdir()
        table_path = os.path.join(scratch_dir, 'table.biom')
        tree_path = os.path.join(scratch_dir, 'tree.biom')
        self.files_to_delete.extend([table_path, tree_path])

        with biom_open(table_path, 'w') as h5_handle:
            biom_table.to_hdf5(h5_handle, 'Table for unit testing')
        tree.write(tree_path)

        # ssu yields a distance matrix; only the u -> v entry is wanted
        distances = ssu(table_path, tree_path, method, False, 1.0, False, 1)
        return distances['u', 'v']
Пример #8
0
    def test_unweighted_root_eval_issue_46(self):
        """Unweighted ssu on the crawford data agrees with skbio."""
        tree_path = self.get_data_path('crawford.tre')
        table_path = self.get_data_path('crawford.biom')

        loaded_table = load_table(table_path)
        loaded_tree = skbio.TreeNode.read(tree_path)

        sample_ids = loaded_table.ids()
        observation_ids = loaded_table.ids(axis='observation')
        # skbio expects samples as rows, hence the transpose
        dense_counts = loaded_table.matrix_data.astype(int).toarray()
        counts = dense_counts.T

        expected = skbio.diversity.beta_diversity(
            'unweighted_unifrac',
            counts,
            ids=sample_ids,
            otu_ids=observation_ids,
            tree=loaded_tree,
        )
        observed = ssu(table_path, tree_path,
                       'unweighted', False, 1.0, False, 1)
        npt.assert_almost_equal(observed.data, expected.data)
Пример #9
0
def generalized_fp32(table: str,
                     phylogeny: str,
                     threads: int = 1,
                     alpha: float = 1.0,
                     variance_adjusted: bool = False,
                     bypass_tips: bool = False) -> skbio.DistanceMatrix:
    """Compute Generalized UniFrac using fp32 math

    Parameters
    ----------
    table : str
        A filepath to a BIOM-Format 2.1 file.
    phylogeny : str
        A filepath to a Newick formatted tree.
    threads : int, optional
        The number of threads to use. Default is 1.
    alpha : float, optional
        The level of contribution of high abundance branches. Higher alpha
        increases the contribution from high abundance branches while lower
        alpha reduces the contribution. Alpha was originally defined over the
        range [0, 1]. Default is 1.0.
    variance_adjusted : bool, optional
        Adjust for variance or not. Default is False.
    bypass_tips : bool, optional
        Bypass the tips of the tree in the computation. This reduces compute
        by about 50%, but is an approximation. Default is False.

    Returns
    -------
    skbio.DistanceMatrix
        The resulting distance matrix.

    Raises
    ------
    IOError
        If the tree file is not found
        If the table is not found
    ValueError
        If the table does not appear to be BIOM-Format v2.1.
        If the phylogeny does not appear to be in Newick format.

    Notes
    -----
    Generalized UniFrac was originally described in [1]_. Variance Adjusted
    UniFrac was originally described in [2]_, but was not described as
    applied to Generalized UniFrac. It is feasible to do, so it is exposed
    here.

    An alpha of 1.0 is Weighted normalized UniFrac. An alpha of 0.0 is
    approximately Unweighted UniFrac, and is if the proportions are
    dichotomized.

    When ``alpha == 1.0`` a ``Warning`` is issued and the computation is
    delegated to ``weighted_normalized_fp32``.

    References
    ----------
    .. [1] Chen, J., Bittinger, K., Charlson, E. S., Hoffmann C., Lewis, J.,
       Wu, G. D., Collman R. G., Bushman, F. D. & Hongzhe L. Associating
       microbiome composition with environmental covariates using generalized
       UniFrac distances. Bioinformatics 28(16), 2106–2113 (2012).
    .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a
       powerful beta diversity measure for comparing communities based on
       phylogeny. BMC Bioinformatics 12:118 (2011).
    """
    _validate(table, phylogeny)
    if alpha == 1.0:
        warn("alpha of 1.0 is weighted-normalized UniFrac. "
             "Weighted-normalized is being used instead as it is more "
             "optimized.",
             Warning)
        # Forward bypass_tips as well; without it the caller's setting was
        # silently dropped on this shortcut. Passed by keyword so it does
        # not depend on the delegate's positional order.
        return weighted_normalized_fp32(table, phylogeny, threads,
                                        variance_adjusted,
                                        bypass_tips=bypass_tips)
    else:
        return qsu.ssu(str(table), str(phylogeny), 'generalized_fp32',
                       variance_adjusted, alpha, bypass_tips, threads)
Пример #10
0
    def test_ssu_bad_method(self):
        """ssu raises ValueError for an unrecognised method name."""
        tree_path = self.get_data_path('t1.newick')
        table_path = self.get_data_path('e1.biom')

        # callable form of assertRaisesRegex: the remaining arguments are
        # forwarded to ssu
        self.assertRaisesRegex(ValueError, "Unknown method.",
                               ssu, table_path, tree_path, 'unweightedfoo',
                               False, 1.0, False, 1)
Пример #11
0
 def test_ssu_bad_table(self):
     """ssu raises IOError when the table path does not exist."""
     tree_path = self.get_data_path('t1.newick')
     # callable form of assertRaisesRegex: the remaining arguments are
     # forwarded to ssu
     self.assertRaisesRegex(IOError, "Table file not found.",
                            ssu, 'bad-file', tree_path, 'unweighted',
                            False, 1.0, False, 1)
Пример #12
0
 def test_ssu_bad_tree(self):
     """ssu raises IOError when the tree path does not exist."""
     table_path = self.get_data_path('e1.biom')
     # callable form of assertRaisesRegex: the remaining arguments are
     # forwarded to ssu
     self.assertRaisesRegex(IOError, "Tree file not found.",
                            ssu, table_path, 'bad-file', 'unweighted',
                            False, 1.0, False, 1)