예제 #1
0
def ilr_transform(table: pd.DataFrame, tree: skbio.TreeNode) -> pd.DataFrame:
    """Performs isometric logratio (ilr) transformation on feature-table.

    This creates a new table with balances (groups of features) that
    distinguish samples. Zeros must first be removed from the table
    (e.g. add-pseudocount). For source documentation check out:
    https://numpydoc.readthedocs.io/en/latest/

    Parameters
    ----------
    table : pd.DataFrame
        Dataframe of the feature table where rows correspond to samples
        and columns are features. The values within the table must be
        positive and nonzero.
    tree : skbio.TreeNode
        A tree relating all of the features to balances or
        log-contrasts (hierarchy). This tree must be bifurcating
        (i.e. every internal node has exactly 2 children). The internal
        nodes of the tree will be renamed.

    Returns
    -------
    balances : pd.DataFrame
        Balances calculated from the feature table. Balance represents
        the log ratio of subchildren values below the specified internal
        node. There is one column per internal node of the matched tree
        (named in level order) and the rows carry the index of `table`.
    """
    _table, _tree = match_tips(table, tree)
    # Only the basis is used; the second value returned by balance_basis
    # is discarded because the column names are rebuilt from the tree below.
    basis, _ = balance_basis(_tree)
    balances = ilr(_table.values, basis)
    in_nodes = [n.name for n in _tree.levelorder() if not n.is_tip()]
    return pd.DataFrame(balances, columns=in_nodes, index=table.index)
예제 #2
0
def ilr_transform(table: pd.DataFrame, tree: skbio.TreeNode) -> pd.DataFrame:
    """Performs isometric logratio (ilr) transformation on feature-table.

    This creates a new table with balances (groups of features) that
    distinguish samples. Zeros must first be removed from the table
    (e.g. add-pseudocount). For source documentation check out:
    https://numpydoc.readthedocs.io/en/latest/

    Parameters
    ----------
    table : pd.DataFrame
        Dataframe of the feature table where rows correspond to samples
        and columns are features. The values within the table must be
        positive and nonzero.
    tree : skbio.TreeNode
        A tree relating all of the features to balances or
        log-contrasts (hierarchy). This tree must be bifurcating
        (i.e. every internal node has exactly 2 children). The internal
        nodes of the tree will be renamed.

    Returns
    -------
    balances : pd.DataFrame
        Balances calculated from the feature table. Balance represents
        the log ratio of subchildren values below the specified internal
        node. Columns are the internal nodes of the matched tree in level
        order; rows carry the index of `table`.
    """
    _table, _tree = match_tips(table, tree)
    # The node list returned next to the basis is not needed here, so it
    # is dropped instead of being bound to an unused local.
    basis, _ = balance_basis(_tree)
    balances = ilr(_table.values, basis)
    in_nodes = [n.name for n in _tree.levelorder() if not n.is_tip()]
    return pd.DataFrame(balances,
                        columns=in_nodes,
                        index=table.index)
예제 #3
0
    def test_regression_results_residuals_projection(self):
        """Residuals requested with a tree must match the expected table.

        The expected residuals are written in balance coordinates (Y1, Y2)
        and mapped through ``ilr_inv`` with the basis of the same tree that
        is handed to ``residuals``.
        """
        tree = TreeNode.read([r'(c, (a, b)Y2)Y1;'])
        basis, _ = balance_basis(tree)

        # Expected residuals per sample, one entry per balance (Y1, Y2).
        resid_by_sample = {
            's1': [-0.986842, -0.236842],
            's2': [-0.065789, -1.815789],
            's3': [1.473684, 0.473684],
            's4': [1.394737, -1.105263],
            's5': [-1.065789, 1.184211],
            's6': [-1.144737, -0.394737],
            's7': [0.394737, 1.894737]
        }
        balance_resid = pd.DataFrame(resid_by_sample, index=['Y1', 'Y2']).T
        sample_ids = ['s1', 's2', 's3', 's4', 's5', 's6', 's7']
        exp_resid = pd.DataFrame(ilr_inv(balance_resid, basis),
                                 index=sample_ids,
                                 columns=['c', 'a', 'b'])

        res = submock(Y=self.balances, Xs=None)
        # The submodels are attached to the mock class after instantiation,
        # exactly as the fixture expects.
        submock.submodels = [self.model1, self.model2]
        res.fit()
        res_resid = res.residuals(tree).sort_index()

        # NOTE(review): `check_less_precise` is deprecated in newer pandas
        # releases - confirm the pandas version this suite targets.
        pdt.assert_frame_equal(res_resid, exp_resid,
                               check_exact=False,
                               check_less_precise=True)
예제 #4
0
파일: _model.py 프로젝트: biocore/gneiss
    def coefficients(self, tree=None):
        """ Returns coefficients from fit.

        Parameters
        ----------
        tree : skbio.TreeNode, optional
            The tree used to perform the ilr transformation.  If this
            is specified, then the coefficients will be represented as
            proportions. Otherwise, if this is not specified, the
            coefficients will be represented as balances. (default: None).

        Returns
        -------
        pd.DataFrame
            A table of coefficients where rows are covariates,
            and the columns are balances. If `tree` is specified, then
            the columns are proportions (one per tip of `tree`).

        Raises
        ------
        ValueError
            If the model has not been fitted yet.
        """
        if not self._fitted:
            # The exception was previously constructed but never raised,
            # so an unfitted model silently fell through.
            raise ValueError('Model not fitted - coefficients not '
                             'calculated. See `fit()`')
        coef = self._beta
        if tree is None:
            return coef

        # Map the balance coefficients back to proportions over the tips.
        basis, _ = balance_basis(tree)
        c = ilr_inv(coef.values, basis=basis)
        ids = [n.name for n in tree.tips()]
        return pd.DataFrame(c, columns=ids, index=coef.index)
예제 #5
0
    def coefficients(self, tree=None):
        """ Returns coefficients from fit.

        Parameters
        ----------
        tree : skbio.TreeNode, optional
            The tree used to perform the ilr transformation.  If this
            is specified, then the coefficients will be represented as
            proportions. Otherwise, if this is not specified, the
            coefficients will be represented as balances. (default: None).

        Returns
        -------
        pd.DataFrame
            Without `tree`: a table of coefficients where rows are
            covariates and the columns are balances (one column per entry
            of ``self.results``). With `tree`: rows are the tips of `tree`
            (proportions) and the columns are covariates.
        """
        # `DataFrame.append` no longer exists in pandas >= 2.0, so the rows
        # are collected first and the frame is built in one step.
        # `rename` returns a relabelled copy, leaving each result's own
        # `params` untouched.
        rows = [r.params.rename(r.model.endog_names) for r in self.results]
        coef = pd.DataFrame(rows)  # one row per fitted model

        if tree is None:
            return coef.T

        basis, _ = balance_basis(tree)
        # ilr_inv is applied to the covariates x balances matrix and the
        # result is transposed back so that tips label the rows.
        c = ilr_inv(coef.values.T, basis=basis).T
        return pd.DataFrame(c, index=[n.name for n in tree.tips()],
                            columns=coef.columns)
예제 #6
0
    def coefficients(self, tree=None):
        """ Returns coefficients from fit.

        Parameters
        ----------
        tree : skbio.TreeNode, optional
            The tree used to perform the ilr transformation.  If this
            is specified, then the coefficients will be represented as
            proportions. Otherwise, if this is not specified, the
            coefficients will be represented as balances. (default: None).

        Returns
        -------
        pd.DataFrame
            A table of coefficients where rows are covariates,
            and the columns are balances. If `tree` is specified, then
            the columns are proportions (one per tip of `tree`).

        Raises
        ------
        ValueError
            If the model has not been fitted yet.
        """
        if not self._fitted:
            # Without `raise` the ValueError instance was simply discarded
            # and execution continued with an unfitted model.
            raise ValueError('Model not fitted - coefficients not '
                             'calculated. See `fit()`')
        coef = self._beta
        if tree is not None:
            # Express the balance coefficients as proportions over the tips.
            basis, _ = balance_basis(tree)
            c = ilr_inv(coef.values, basis=basis)
            ids = [n.name for n in tree.tips()]
            return pd.DataFrame(c, columns=ids, index=coef.index)
        return coef
예제 #7
0
def _to_balances(table, tree):
    """ Projects a table of abundances onto the balances of a tree.

    Parameters
    ----------
    table : pd.DataFrame
        Contingency table where samples correspond to rows and
        features correspond to columns.
    tree : skbio.TreeNode
        Tree object where the leaves correspond to the columns contained in
        the table.

    Returns
    -------
    pd.DataFrame
        Contingency table where samples correspond to rows and
        balances correspond to columns. Columns are named after the
        non-tip nodes of `tree` in level order.
    np.array
        Orthonormal basis in the Aitchison simplex generated from `tree`.
    """
    # Internal node names are collected before the basis is computed,
    # mirroring the original evaluation order.
    balance_names = [
        node.name for node in tree.levelorder() if not node.is_tip()
    ]
    basis, _ = balance_basis(tree)

    ilr_table = pd.DataFrame(ilr(table.values, basis=basis),
                             index=table.index,
                             columns=balance_names)
    return ilr_table, basis
예제 #8
0
 def test_balance_basis_large1(self):
     """Basis of the large tree equals the stored basis, columns reversed."""
     tree_path = get_data_path('large_tree.nwk', subfolder='data')
     large_tree = TreeNode.read(tree_path)
     # note that the stored basis is in reverse level order
     basis_path = get_data_path('large_tree_basis.txt', subfolder='data')
     stored_basis = np.loadtxt(basis_path)
     computed_basis, _ = balance_basis(large_tree)
     npt.assert_allclose(stored_basis[:, ::-1], computed_basis)
예제 #9
0
def ilr_transform(table: pd.DataFrame, tree: skbio.TreeNode) -> pd.DataFrame:
    """Compute ilr balances of `table` using the hierarchy in `tree`.

    The table and tree are first aligned with ``match_tips``; the balance
    basis of the aligned tree is then used for the ilr transform.

    Parameters
    ----------
    table : pd.DataFrame
        Feature table whose values are passed to ``ilr``.
    tree : skbio.TreeNode
        Tree whose non-tip nodes name the resulting balances.

    Returns
    -------
    pd.DataFrame
        Balances with one column per non-tip node of the aligned tree
        (level order) and the index of the input `table`.
    """
    matched_table, matched_tree = match_tips(table, tree)
    basis, _unused = balance_basis(matched_tree)
    coords = ilr(matched_table.values, basis)
    internal_names = [
        node.name for node in matched_tree.levelorder() if not node.is_tip()
    ]
    return pd.DataFrame(coords, index=table.index, columns=internal_names)
예제 #10
0
    def test_balance_basis_base_case(self):
        """A two-tip tree gives a single balance keyed by the root's name."""
        t = TreeNode.read([u"(a,b);"])
        # Expected keys are captured before the basis is computed.
        expected_keys = [t.name]
        expected_basis = np.array([0.19557032, 0.80442968])

        basis, keys = balance_basis(t)

        npt.assert_allclose(expected_basis, basis)
        self.assertListEqual(expected_keys, keys)
예제 #11
0
    def test_balance_basis_base_case(self):
        """Checks the basis and keys returned for the minimal tree (a,b)."""
        newick = u"(a,b);"
        two_tip_tree = TreeNode.read([newick])
        # The root name is read up front, before balance_basis runs.
        want_keys = [two_tip_tree.name]
        want_basis = np.array([0.19557032, 0.80442968])

        got_basis, got_keys = balance_basis(two_tip_tree)

        npt.assert_allclose(want_basis, got_basis)
        self.assertListEqual(want_keys, got_keys)
예제 #12
0
 def test_balance_basis_large1(self):
     """Compares balance_basis on the large tree with the stored basis."""
     t = TreeNode.read(get_data_path('large_tree.nwk', subfolder='data'))
     # note that the stored basis is in reverse level order
     exp_basis = np.loadtxt(
         get_data_path('large_tree_basis.txt', subfolder='data'))
     res_basis, _ = balance_basis(t)
     reversed_expected = exp_basis[:, ::-1]
     npt.assert_allclose(reversed_expected, res_basis)
예제 #13
0
    def test_balance_basis_unbalanced(self):
        """An unbalanced three-tip tree yields the expected two balances."""
        t = TreeNode.read([u"((a,b)c, d);"])
        # Keys are expected for the root first, then its first child.
        expected_keys = [t.name, t[0].name]
        expected_basis = np.array([
            [0.18507216, 0.18507216, 0.62985567],
            [0.14002925, 0.57597535, 0.28399541],
        ])

        basis, keys = balance_basis(t)

        npt.assert_allclose(expected_basis, basis)
        self.assertListEqual(expected_keys, list(keys))
예제 #14
0
    def test_balance_basis_unbalanced(self):
        """Checks basis rows and keys for the tree ((a,b)c, d)."""
        newick = u"((a,b)c, d);"
        unbalanced = TreeNode.read([newick])
        root_name = unbalanced.name
        first_child_name = unbalanced[0].name
        want_keys = [root_name, first_child_name]
        want_basis = np.array([[0.18507216, 0.18507216, 0.62985567],
                               [0.14002925, 0.57597535, 0.28399541]])

        got_basis, got_keys = balance_basis(unbalanced)

        npt.assert_allclose(want_basis, got_basis)
        self.assertListEqual(want_keys, list(got_keys))
예제 #15
0
    def setUp(self):
        """Builds the shared fixtures used by the tests in this case.

        Creates the ``results`` output directory if it is missing, then
        sets up a tree, a taxonomy table, a table of balances, the
        table obtained from those balances through ``ilr_inv`` and
        several metadata columns indexed by the same seven sample ids.
        """
        self.results = "results"
        if not os.path.exists(self.results):
            os.mkdir(self.results)
        self.tree = TreeNode.read([r'((k, q)d, ((x, y)a, z)b)c;'])

        self.taxonomy = pd.DataFrame(
            [['foo;barf;a;b;c;d;e', 1], ['foo;bark;f;g;h;i;j', 1],
             ['foo;bark;f;g;h;w;j', 1], ['nom;tu;k;l;m;n;o', 0.9],
             ['nom;tu;k;l;m;t;o', 0.9]],
            columns=['Taxon', 'Confidence'],
            index=['x', 'y', 'z', 'k', 'q'])

        # A first, smaller `self.balances` frame used to be assigned here
        # and was overwritten below before anything read it; only the
        # effective assignment is kept.
        sample_ids = ['s1', 's2', 's3', 's4', 's5', 's6', 's7']
        self.balances = pd.DataFrame(
            [[1, 2, 3, 4, 5, 6, 7], [-3.1, -2.9, -3, 3, 2.9, 3.2, 3.1],
             [1, 1, 1, 1, 1, 1, 1], [3, 2, 1, 0, -1, -2, -3]],
            index=['d', 'a', 'b', 'c'],
            columns=sample_ids).T
        basis, _ = balance_basis(self.tree)
        self.table = pd.DataFrame(
            ilr_inv(self.balances, basis),
            columns=['x', 'y', 'z', 'k', 'q'],
            index=sample_ids)

        index = pd.Index(sample_ids, name='id')
        self.categorical = CategoricalMetadataColumn(
            pd.Series(['a', 'a', 'a', 'b', 'b', 'b', 'b'],
                      index=index,
                      name='categorical'))
        self.multi_categorical = CategoricalMetadataColumn(
            pd.Series(['a', 'a', 'c', 'b', 'b', 'b', 'c'],
                      index=index,
                      name='multi_categorical'))
        # NOTE(review): this series reuses the name 'multi_categorical';
        # kept as in the original - confirm whether that is intentional.
        self.partial_numerical_categorical = CategoricalMetadataColumn(
            pd.Series(['1', '1', '1', '2', '2', '2', 'a'],
                      index=index,
                      name='multi_categorical'))
        self.full_numerical_categorical = CategoricalMetadataColumn(
            pd.Series(['1', '1', '1.0', '2', '2', '2.0', '3'],
                      index=index,
                      name='numerical_categorical'))
        self.continuous = NumericMetadataColumn(
            pd.Series(np.arange(7), index=index, name='continuous'))
예제 #16
0
 def setUp(self):
     """Prepares the data frame, two OLS models, tree, basis and balances."""
     self.pickle_fname = "test.pickle"

     sample_ids = ['s1', 's2', 's3', 's4', 's5', 's6', 's7']
     observations = [
         [1, 1, 1], [3, 2, 3], [4, 3, 2], [5, 4, 4],
         [2, 5, 3], [3, 6, 5], [4, 7, 4],
     ]
     self.data = pd.DataFrame(observations,
                              index=sample_ids,
                              columns=['Y1', 'Y2', 'X'])

     # One OLS model per response column, both regressed on X.
     self.model1 = smf.ols(formula="Y1 ~ X", data=self.data)
     self.model2 = smf.ols(formula="Y2 ~ X", data=self.data)

     self.tree = TreeNode.read(['((a,b)Y1, c)Y2;'])
     tree_basis, _ = balance_basis(self.tree)
     self.basis = pd.DataFrame(clr_inv(tree_basis),
                               columns=['a', 'b', 'c'],
                               index=['Y1', 'Y2'])

     responses = self.data[['Y1', 'Y2']]
     self.balances = pd.DataFrame(responses,
                                  index=self.data.index,
                                  columns=['Y1', 'Y2'])
예제 #17
0
    def predict(self, X=None, tree=None, **kwargs):
        """ Performs a prediction based on model.

        Parameters
        ----------
        X : pd.DataFrame, optional
            Input table of covariates, where columns are covariates, and
            rows are samples.  If not specified, then the fitted values
            calculated from training the model will be returned.
        tree : skbio.TreeNode, optional
            The tree used to perform the ilr transformation.  If this
            is specified, then the prediction will be represented
            as proportions. Otherwise, if this is not specified,
            the prediction will be represented as balances. (default: None).
        **kwargs : dict
            Other arguments to be passed into the model prediction.

        Returns
        -------
        pd.DataFrame
            A table of predicted values where rows are samples,
            and the columns are balances (one per entry of
            ``self.results``). If `tree` is specified, then
            the columns are proportions (one per tip of `tree`).
        """
        # `DataFrame.append` no longer exists in pandas >= 2.0; collect one
        # labelled Series per model and build the frame in a single step.
        rows = []
        for m in self.results:
            p = pd.Series(m.predict(X, **kwargs))
            p.name = m.model.endog_names
            # Label predictions with the sample ids of X when it is given,
            # otherwise with those of the training fit.
            p.index = X.index if X is not None else m.fittedvalues.index
            rows.append(p)
        prediction = pd.DataFrame(rows)  # balances x samples

        if tree is None:
            return prediction.T

        basis, _ = balance_basis(tree)
        proj_prediction = ilr_inv(prediction.values.T, basis=basis)
        return pd.DataFrame(proj_prediction,
                            columns=[n.name for n in tree.tips()],
                            index=prediction.columns)
예제 #18
0
    def residuals(self, tree=None):
        """ Returns calculated residuals from fit.

        Parameters
        ----------
        tree : skbio.TreeNode, optional
            The tree used to perform the ilr transformation.  If this
            is specified, then the residuals will be represented
            as proportions. Otherwise, if this is not specified,
            the residuals will be represented as balances. (default: None).

        Returns
        -------
        pd.DataFrame
            A table of residuals where rows are samples,
            and the columns are balances (one per entry of
            ``self.results``). If `tree` is specified, then
            the columns are proportions (one per tip of `tree`).

        References
        ----------
        .. [1] Aitchison, J. "A concise guide to compositional data analysis,
           CDA work." Girona 24 (2003): 73-81.
        """
        # `DataFrame.append` no longer exists in pandas >= 2.0, so the rows
        # are gathered first. `rename` yields a relabelled copy, so each
        # result's own `resid` keeps its original name.
        rows = [r.resid.rename(r.model.endog_names) for r in self.results]
        resid = pd.DataFrame(rows)  # balances x samples

        if tree is None:
            return resid.T

        basis, _ = balance_basis(tree)
        proj_resid = ilr_inv(resid.values.T, basis=basis).T
        # Built as tips x samples, then transposed so samples label rows.
        return pd.DataFrame(proj_resid,
                            index=[n.name for n in tree.tips()],
                            columns=resid.columns).T
예제 #19
0
파일: test_ols.py 프로젝트: biocore/gneiss
    def test_ols_ilr_inv_test(self):
        """Model outputs requested with a tree match ilr_inv of the
        reference per-balance results (``self.r1_`` and ``self.r2_``)."""
        tips = ['c', 'b', 'a']
        model = ols('x1 + x2', self.Y, self.X)
        model.fit()
        basis, _ = balance_basis(self.tree)

        # p-values are compared directly in balance space
        exp_pvalues = pd.DataFrame({'y1': self.r1_.pvalues,
                                    'y2': self.r2_.pvalues})
        pdt.assert_frame_equal(model.pvalues, exp_pvalues)

        # coefficients
        coef_balances = pd.DataFrame({'y1': self.r1_.params,
                                      'y2': self.r2_.params})
        exp_coef = pd.DataFrame(ilr_inv(coef_balances, basis),
                                columns=tips,
                                index=self.X.columns)
        res_coef = model.coefficients(tree=self.tree)
        pdt.assert_frame_equal(res_coef, exp_coef)

        # residuals
        resid_balances = pd.DataFrame({'y1': self.r1_.resid,
                                       'y2': self.r2_.resid},
                                      index=self.Y.index)
        exp_resid = pd.DataFrame(ilr_inv(resid_balances, basis),
                                 index=self.Y.index,
                                 columns=tips)
        res_resid = model.residuals(tree=self.tree)
        pdt.assert_frame_equal(res_resid, exp_resid)

        # predictions
        pred_balances = pd.DataFrame({'y1': self.r1_.predict(),
                                      'y2': self.r2_.predict()},
                                     index=self.Y.index)
        exp_pred = pd.DataFrame(ilr_inv(pred_balances, basis),
                                index=self.Y.index,
                                columns=tips)
        res_pred = model.predict(tree=self.tree)
        pdt.assert_frame_equal(res_pred, exp_pred)
예제 #20
0
파일: _model.py 프로젝트: biocore/gneiss
    def predict(self, X=None, tree=None, **kwargs):
        """ Performs a prediction based on model.

        Parameters
        ----------
        X : pd.DataFrame, optional
            Input table of covariates, where columns are covariates, and
            rows are samples.  If not specified, then the design matrix
            stored on the model (``self.design_matrices``) is used.
        tree : skbio.TreeNode, optional
            The tree used to perform the ilr transformation.  If this
            is specified, then the prediction will be represented
            as proportions. Otherwise, if this is not specified,
            the prediction will be represented as balances. (default: None).
        **kwargs : dict
            Accepted for interface compatibility; not used by this
            implementation.

        Returns
        -------
        pd.DataFrame
            A table of predicted values where rows are samples,
            and the columns are balances. If `tree` is specified, then
            the columns are proportions (one per tip of `tree`).

        Raises
        ------
        ValueError
            If the model has not been fitted yet.
        """
        if not self._fitted:
            # The exception was previously created but never raised.
            raise ValueError('Model not fitted - coefficients not '
                             'calculated. See `fit()`')
        if X is None:
            X = self.design_matrices

        prediction = X.dot(self._beta)
        if tree is None:
            return prediction

        # Map the predicted balances back to proportions over the tips.
        basis, _ = balance_basis(tree)
        proj_prediction = ilr_inv(prediction.values, basis=basis)
        ids = [n.name for n in tree.tips()]
        return pd.DataFrame(proj_prediction,
                            columns=ids,
                            index=prediction.index)
예제 #21
0
    def predict(self, X=None, tree=None, **kwargs):
        """ Performs a prediction based on model.

        Parameters
        ----------
        X : pd.DataFrame, optional
            Input table of covariates, where columns are covariates, and
            rows are samples.  If not specified, then the design matrix
            stored on the model (``self.design_matrices``) is used.
        tree : skbio.TreeNode, optional
            The tree used to perform the ilr transformation.  If this
            is specified, then the prediction will be represented
            as proportions. Otherwise, if this is not specified,
            the prediction will be represented as balances. (default: None).
        **kwargs : dict
            Accepted for interface compatibility; not used by this
            implementation.

        Returns
        -------
        pd.DataFrame
            A table of predicted values with one row per row of `X`
            (samples) and one column per balance. If `tree` is
            specified, then the columns are proportions, one per tip
            of `tree`.

        Raises
        ------
        ValueError
            If the model has not been fitted yet.
        """
        if not self._fitted:
            # A bare `ValueError(...)` expression does nothing; it has to
            # be raised to stop an unfitted model from being used.
            raise ValueError('Model not fitted - coefficients not '
                             'calculated. See `fit()`')
        if X is None:
            X = self.design_matrices

        prediction = X.dot(self._beta)
        if tree is not None:
            basis, _ = balance_basis(tree)
            proj_prediction = ilr_inv(prediction.values, basis=basis)
            ids = [n.name for n in tree.tips()]
            return pd.DataFrame(proj_prediction,
                                columns=ids,
                                index=prediction.index)
        return prediction
예제 #22
0
파일: _model.py 프로젝트: biocore/gneiss
    def residuals(self, tree=None):
        """ Returns calculated residuals from fit.

        Parameters
        ----------
        tree : skbio.TreeNode, optional
            The tree used to perform the ilr transformation.  If this
            is specified, then the residuals will be represented
            as proportions. Otherwise, if this is not specified,
            the residuals will be represented as balances. (default: None).

        Returns
        -------
        pd.DataFrame
            The residual table stored on the model, with balances as
            columns. If `tree` is specified, a table with the same
            index whose columns are proportions (one per tip of `tree`).

        Raises
        ------
        ValueError
            If the model has not been fitted yet.

        References
        ----------
        .. [1] Aitchison, J. "A concise guide to compositional data analysis,
           CDA work." Girona 24 (2003): 73-81.
        """
        if not self._fitted:
            # The exception was previously created but never raised.
            raise ValueError('Model not fitted - coefficients not '
                             'calculated. See `fit()`')
        resid = self._resid
        if tree is None:
            return resid

        # Map the residual balances back to proportions over the tips.
        basis, _ = balance_basis(tree)
        proj_resid = ilr_inv(resid.values, basis=basis)
        ids = [n.name for n in tree.tips()]
        return pd.DataFrame(proj_resid,
                            columns=ids,
                            index=resid.index)
예제 #23
0
파일: test_ols.py 프로젝트: biocore/gneiss
    def test_ols_ilr_inv_test(self):
        """Checks p-values, coefficients, residuals and predictions of the
        fitted model against the reference results ``self.r1_``/``self.r2_``,
        with tree-based outputs compared after ``ilr_inv``."""
        model = ols('x1 + x2', self.Y, self.X)
        model.fit()
        basis, _ = balance_basis(self.tree)
        tip_names = ['c', 'b', 'a']
        samples = self.Y.index

        # test pvalues
        want = pd.DataFrame({'y1': self.r1_.pvalues, 'y2': self.r2_.pvalues})
        pdt.assert_frame_equal(model.pvalues, want)

        # test coefficients
        raw = pd.DataFrame({'y1': self.r1_.params, 'y2': self.r2_.params})
        want = pd.DataFrame(ilr_inv(raw, basis),
                            columns=tip_names, index=self.X.columns)
        got = model.coefficients(tree=self.tree)
        pdt.assert_frame_equal(got, want)

        # test residuals
        raw = pd.DataFrame({'y1': self.r1_.resid, 'y2': self.r2_.resid},
                           index=samples)
        want = pd.DataFrame(ilr_inv(raw, basis),
                            index=samples, columns=tip_names)
        got = model.residuals(tree=self.tree)
        pdt.assert_frame_equal(got, want)

        # test prediction
        raw = pd.DataFrame({'y1': self.r1_.predict(),
                            'y2': self.r2_.predict()},
                           index=samples)
        want = pd.DataFrame(ilr_inv(raw, basis),
                            index=samples, columns=tip_names)
        got = model.predict(tree=self.tree)
        pdt.assert_frame_equal(got, want)
예제 #24
0
    def residuals(self, tree=None):
        """ Returns calculated residuals from fit.

        Parameters
        ----------
        tree : skbio.TreeNode, optional
            The tree used to perform the ilr transformation.  If this
            is specified, then the residuals will be represented
            as proportions. Otherwise, if this is not specified,
            the residuals will be represented as balances. (default: None).

        Returns
        -------
        pd.DataFrame
            The residual table stored on the model, with balances as
            columns. If `tree` is specified, a table with the same
            index whose columns are proportions (one per tip of `tree`).

        Raises
        ------
        ValueError
            If the model has not been fitted yet.

        References
        ----------
        .. [1] Aitchison, J. "A concise guide to compositional data analysis,
           CDA work." Girona 24 (2003): 73-81.
        """
        if not self._fitted:
            # A bare `ValueError(...)` expression is a no-op; raise it so
            # callers are told the model still needs `fit()`.
            raise ValueError('Model not fitted - coefficients not '
                             'calculated. See `fit()`')
        resid = self._resid
        if tree is not None:
            basis, _ = balance_basis(tree)
            proj_resid = ilr_inv(resid.values, basis=basis)
            ids = [n.name for n in tree.tips()]
            return pd.DataFrame(proj_resid, columns=ids, index=resid.index)
        return resid