Example #1
    def test_clr_inv(self):
        npt.assert_allclose(clr_inv(self.rdata1), self.ortho1)
        npt.assert_allclose(clr(clr_inv(self.rdata1)), self.rdata1)

        # make sure that inplace modification is not occurring
        clr_inv(self.rdata1)
        npt.assert_allclose(
            self.rdata1,
            np.array([[0.70710678, -0.70710678, 0., 0.],
                      [0.40824829, 0.40824829, -0.81649658, 0.],
                      [0.28867513, 0.28867513, 0.28867513, -0.8660254]]))
Example #2
    def setUp(self):
        np.random.seed(1)
        res = random_multimodal(num_microbes=8,
                                num_metabolites=8,
                                num_samples=150,
                                latent_dim=2,
                                sigmaQ=2,
                                microbe_total=1000,
                                metabolite_total=10000,
                                seed=1)
        (self.microbes, self.metabolites, self.X, self.B, self.U, self.Ubias,
         self.V, self.Vbias) = res
        n, d1 = self.microbes.shape
        n, d2 = self.metabolites.shape

        self.microbes = biom.Table(self.microbes.values.T,
                                   self.microbes.columns, self.microbes.index)
        self.metabolites = biom.Table(self.metabolites.values.T,
                                      self.metabolites.columns,
                                      self.metabolites.index)
        U_ = np.hstack((np.ones((self.U.shape[0], 1)), self.Ubias, self.U))
        V_ = np.vstack((self.Vbias, np.ones((1, self.V.shape[1])), self.V))

        uv = U_ @ V_
        h = np.zeros((d1, 1))
        self.exp_ranks = clr_inv(np.hstack((h, uv)))
Example #3
    def test_fit(self):
        np.random.seed(1)
        tf.reset_default_graph()
        tf.set_random_seed(0)
        latent_dim = 2
        res_ranks, res_biplot = paired_omics(self.microbes,
                                             self.metabolites,
                                             epochs=1000,
                                             latent_dim=latent_dim,
                                             min_feature_count=1,
                                             learning_rate=0.1)
        res_ranks = clr_inv(res_ranks.T)
        s_r, s_p = spearmanr(np.ravel(res_ranks), np.ravel(self.exp_ranks))

        self.assertGreater(s_r, 0.5)
        self.assertLess(s_p, 1e-2)

        # make sure the biplot is of the correct dimensions
        npt.assert_allclose(res_biplot.samples.shape,
                            np.array([self.microbes.shape[0], latent_dim]))
        npt.assert_allclose(res_biplot.features.shape,
                            np.array([self.metabolites.shape[0], latent_dim]))

        # make sure that the biplot has the correct ordering
        self.assertGreater(res_biplot.proportion_explained[0],
                           res_biplot.proportion_explained[1])
        self.assertGreater(res_biplot.eigvals[0], res_biplot.eigvals[1])
Example #4
    def test_regression_results_residuals_projection(self):
        A = np.array  # aliasing np.array for the sake of pep8
        exp_resid = pd.DataFrame(
            {
                's1': ilr_inv(A([-0.986842, -0.236842])),
                's2': ilr_inv(A([-0.065789, -1.815789])),
                's3': ilr_inv(A([1.473684, 0.473684])),
                's4': ilr_inv(A([1.394737, -1.105263])),
                's5': ilr_inv(A([-1.065789, 1.184211])),
                's6': ilr_inv(A([-1.144737, -0.394737])),
                's7': ilr_inv(A([0.394737, 1.894737]))
            },
            index=['a', 'b', 'c']).T
        # note that in the example, the basis is not strictly
        # equivalent to the tree
        basis = pd.DataFrame(clr_inv(_gram_schmidt_basis(3)),
                             index=['Y1', 'Y2'],
                             columns=['a', 'b', 'c'])
        submodels = [self.model1, self.model2]
        res = submock(submodels=submodels,
                      basis=basis,
                      tree=self.tree,
                      balances=self.balances)
        res.fit()
        pdt.assert_frame_equal(res.residuals(project=True),
                               exp_resid,
                               check_exact=False,
                               check_less_precise=True)
Example #5
    def test_regression_results_residuals(self):
        exp_resid = pd.DataFrame(
            {
                's1': [-0.986842, -0.236842],
                's2': [-0.065789, -1.815789],
                's3': [1.473684, 0.473684],
                's4': [1.394737, -1.105263],
                's5': [-1.065789, 1.184211],
                's6': [-1.144737, -0.394737],
                's7': [0.394737, 1.894737]
            },
            index=['Y1', 'Y2']).T
        basis = pd.DataFrame(clr_inv(_gram_schmidt_basis(3)),
                             index=['Y1', 'Y2'],
                             columns=['a', 'b', 'c'])
        submodels = [self.model1, self.model2]
        res = submock(submodels=submodels,
                      basis=basis,
                      tree=self.tree,
                      balances=self.balances)
        res.fit()

        pdt.assert_frame_equal(res.residuals(),
                               exp_resid,
                               check_exact=False,
                               check_less_precise=True)
Example #6
    def test_regression_results_predict_projection(self):
        basis = pd.DataFrame(clr_inv(_gram_schmidt_basis(3)),
                             index=['Y1', 'Y2'],
                             columns=['a', 'b', 'c'])
        submodels = [self.model1, self.model2]
        res = submock(submodels=submodels,
                      basis=basis,
                      tree=self.tree,
                      balances=self.balances)
        res.fit()

        res_predict = res.predict(self.data[['X']], project=True)
        A = np.array  # aliasing np.array for the sake of pep8
        exp_predict = pd.DataFrame(
            {
                's1': ilr_inv(A([1.986842, 1.236842])),
                's2': ilr_inv(A([3.065789, 3.815789])),
                's3': ilr_inv(A([2.526316, 2.526316])),
                's4': ilr_inv(A([3.605263, 5.105263])),
                's5': ilr_inv(A([3.065789, 3.815789])),
                's6': ilr_inv(A([4.144737, 6.394737])),
                's7': ilr_inv(A([3.605263, 5.105263]))
            },
            index=['a', 'b', 'c']).T

        pdt.assert_frame_equal(res_predict, exp_predict)
Example #7
    def test_regression_results_predict_none(self):
        basis = pd.DataFrame(clr_inv(_gram_schmidt_basis(3)),
                             index=['Y1', 'Y2'],
                             columns=['a', 'b', 'c'])
        submodels = [self.model1, self.model2]
        res = submock(submodels=submodels,
                      basis=basis,
                      tree=self.tree,
                      balances=self.balances)
        res.fit()
        res_predict = res.predict()

        exp_predict = pd.DataFrame(
            {
                's1': [1.986842, 1.236842],
                's2': [3.065789, 3.815789],
                's3': [2.526316, 2.526316],
                's4': [3.605263, 5.105263],
                's5': [3.065789, 3.815789],
                's6': [4.144737, 6.394737],
                's7': [3.605263, 5.105263]
            },
            index=['Y1', 'Y2']).T

        pdt.assert_frame_equal(res_predict, exp_predict)
Example #8
    def test_biplot(self):
        exp = clr(centralize(clr_inv(self.beta)))
        res = regression_biplot(self.beta)
        self.assertIsInstance(res, OrdinationResults)
        u = res.samples.values
        v = res.features.values.T
        npt.assert_allclose(u @ v, np.array(exp), atol=0.5, rtol=0.5)
Example #9
def balance_basis(tree_node):
    """
    Determines the basis based on a bifurcating tree.

    This is commonly referred to as a sequential binary partition [1]_.
    Given a binary tree relating a list of features, this function can
    be used to calculate an orthonormal basis, which is used to
    calculate the ilr transform.

    Parameters
    ----------
    tree_node : skbio.TreeNode
        Input bifurcating tree.  Must be strictly bifurcating
        (i.e. every internal node needs to have exactly 2 children).

    Returns
    -------
    basis : np.array
        An orthonormal basis in the Aitchison simplex corresponding
        to the tree. The order of the basis vectors is given by the
        level order of the internal nodes.
    nodes : list, skbio.TreeNode
        List of tree nodes indicating the ordering in the basis.

    Raises
    ------
    ValueError
        If the tree is not strictly bifurcating.

    Examples
    --------
    >>> from gneiss.balances import balance_basis
    >>> from skbio import TreeNode
    >>> tree = u"((b,c)a, d)root;"
    >>> t = TreeNode.read([tree])
    >>> basis, nodes = balance_basis(t)
    >>> basis
    array([[ 0.18507216,  0.18507216,  0.62985567],
           [ 0.14002925,  0.57597535,  0.28399541]])

    Notes
    -----
    The tree must be strictly bifurcating, meaning that
    every internal node has exactly 2 children.

    See Also
    --------
    skbio.stats.composition.ilr

    References
    ----------
    .. [1] J.J. Egozcue and V. Pawlowsky-Glahn "Exploring Compositional Data
        with the CoDa-Dendrogram" (2011)

    """
    basis, nodes = _balance_basis(tree_node)
    basis = clr_inv(basis)
    return basis, nodes
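Since _balance_basis produces orthonormal vectors in clr space, clr_inv merely carries them into the simplex. Continuing from the doctest above, a quick sanity check (a sketch, assuming skbio's clr is in scope):

import numpy as np
from skbio.stats.composition import clr

# every basis vector is a composition: positive entries summing to 1
np.allclose(basis.sum(axis=1), 1.0)       # True
# mapped back to clr space, the vectors are orthonormal
B = clr(basis)
np.allclose(B @ B.T, np.eye(B.shape[0]))  # True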
Example #10
def multinomial(table: biom.Table,
                metadata: Metadata,
                formula: str,
                training_column: str = None,
                num_random_test_examples: int = 10,
                epoch: int = 10,
                batch_size: int = 5,
                beta_prior: float = 1,
                learning_rate: float = 0.1,
                clipnorm: float = 10,
                min_sample_count: int = 10,
                min_feature_count: int = 10,
                summary_interval: int = 60) -> (pd.DataFrame):

    # load metadata and tables
    metadata = metadata.to_dataframe()

    # match them
    table, metadata, design = match_and_filter(table, metadata, formula,
                                               training_column,
                                               num_random_test_examples,
                                               min_sample_count,
                                               min_feature_count)

    # convert to dense representation
    dense_table = table.to_dataframe().to_dense().T

    # split up training and testing
    trainX, testX, trainY, testY = split_training(dense_table, metadata,
                                                  design, training_column,
                                                  num_random_test_examples)

    model = MultRegression(learning_rate=learning_rate,
                           clipnorm=clipnorm,
                           beta_mean=beta_prior,
                           batch_size=batch_size,
                           save_path=None)
    with tf.Graph().as_default(), tf.Session() as session:
        model(session, trainX, trainY, testX, testY)

        model.fit(epoch=epoch,
                  summary_interval=summary_interval,
                  checkpoint_interval=None)

    md_ids = np.array(design.columns)
    obs_ids = table.ids(axis='observation')

    beta_ = clr(clr_inv(np.hstack((np.zeros((model.p, 1)), model.B))))

    beta_ = pd.DataFrame(
        beta_.T,
        columns=md_ids,
        index=obs_ids,
    )
    return beta_
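The clr(clr_inv(np.hstack((zeros, B)))) idiom above recurs in several of these examples: because clr_inv is a row-wise softmax and clr undoes it up to centering, the composition simply prepends a reference column pinned at zero and then centers each row. A small check of that identity (illustrative values only):

import numpy as np
from skbio.stats.composition import clr, clr_inv

B = np.array([[1.0, 2.0, -3.0]])
padded = np.hstack((np.zeros((1, 1)), B))  # reference column fixed at 0
out = clr(clr_inv(padded))
# clr(clr_inv(x)) just centers each row of x
np.allclose(out, padded - padded.mean(axis=1, keepdims=True))  # True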
Example #11
    def test_fit(self):
        tf.set_random_seed(0)
        md = self.md
        md.name = 'sampleid'
        md = qiime2.Metadata(md)
        exp_beta = clr(clr_inv(np.hstack((np.zeros((2, 1)), self.beta.T))))
        res_beta = multinomial(table=self.table,
                               metadata=md,
                               formula="X",
                               epoch=50000)
        npt.assert_allclose(exp_beta, res_beta.T, atol=0.5, rtol=0.5)
Example #12
    def test_fit_float_summary_interval(self):
        tf.set_random_seed(0)
        md = self.md

        multregression = songbird_plugin.actions['multinomial']

        md.name = 'sampleid'
        md = qiime2.Metadata(md)

        # See issue #31
        exp_beta = clr(clr_inv(np.hstack((np.zeros((2, 1)), self.beta.T))))

        q2_table = qiime2.Artifact.import_data('FeatureTable[Frequency]',
                                               self.table)

        q2_res_beta, q2_res_stats, q2_res_biplot = multregression(
            table=q2_table,
            metadata=md,
            min_sample_count=0,
            min_feature_count=0,
            formula="X",
            epochs=1000,
            summary_interval=0.5,
        )

        # try-except is for helpful error message if q2-coercion fails
        try:
            res_biplot = q2_res_biplot.view(OrdinationResults)
        except Exception:
            raise AssertionError('res_biplot unable to be coerced to '
                                 'OrdinationResults')
        try:
            res_beta = q2_res_beta.view(pd.DataFrame)
        except Exception:
            raise AssertionError('res_beta unable to be coerced to '
                                 'pd.DataFrame')
        try:
            res_stats = q2_res_stats.view(qiime2.Metadata)
        except Exception:
            raise AssertionError('res_stats unable to be coerced to '
                                 'qiime2.Metadata')

        u = res_biplot.samples.values
        v = res_biplot.features.values.T
        npt.assert_allclose(u @ v, res_beta.values, atol=0.5, rtol=0.5)

        npt.assert_allclose(exp_beta, res_beta.T, atol=0.6, rtol=0.6)
        self.assertGreater(len(res_stats.to_dataframe().index), 1)
Example #13
    def setUp(self):
        self.pickle_fname = "test.pickle"
        self.data = pd.DataFrame(
            [[1, 1, 1], [3, 2, 3], [4, 3, 2], [5, 4, 4], [2, 5, 3],
             [3, 6, 5], [4, 7, 4]],
            index=['s1', 's2', 's3', 's4', 's5', 's6', 's7'],
            columns=['Y1', 'Y2', 'X'])
        self.model1 = smf.ols(formula="Y1 ~ X", data=self.data)
        self.model2 = smf.ols(formula="Y2 ~ X", data=self.data)
        self.tree = TreeNode.read(['((a,b)Y1, c)Y2;'])
        self.basis = pd.DataFrame(clr_inv(balance_basis(self.tree)[0]),
                                  columns=['a', 'b', 'c'],
                                  index=['Y1', 'Y2'])
        self.balances = pd.DataFrame(self.data[['Y1', 'Y2']],
                                     index=self.data.index,
                                     columns=['Y1', 'Y2'])
Example #14
def regression_biplot(coefficients: pd.DataFrame) -> skbio.OrdinationResults:
    coefs = clr(centralize(clr_inv(coefficients)))
    u, s, v = np.linalg.svd(coefs)
    pc_ids = ['PC%d' % i for i in range(len(s))]
    samples = pd.DataFrame(u[:, :len(s)] @ np.diag(s),
                           columns=pc_ids, index=coefficients.index)
    features = pd.DataFrame(v.T[:, :len(s)],
                            columns=pc_ids, index=coefficients.columns)
    short_method_name = 'regression_biplot'
    long_method_name = 'Multinomial regression biplot'
    eigvals = pd.Series(s, index=pc_ids)
    proportion_explained = eigvals / eigvals.sum()
    res = OrdinationResults(short_method_name, long_method_name, eigvals,
                            samples=samples, features=features,
                            proportion_explained=proportion_explained)
    return res
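A minimal invocation with made-up coefficients, assuming skbio is installed and clr, clr_inv, centralize, and OrdinationResults are imported as the snippet expects:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
coeffs = pd.DataFrame(rng.normal(size=(3, 6)),
                      index=['Intercept', 'X1', 'X2'],
                      columns=['F%d' % i for i in range(6)])
res = regression_biplot(coeffs)
# the low-rank product reconstructs the centralized clr coefficients
approx = res.samples.values @ res.features.values.T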
Example #15
    def test_regression_results_coefficient_projection(self):
        exp_coef = pd.DataFrame(
            {'Intercept': ilr_inv(np.array([[1.447368, -0.052632]])),
             'X': ilr_inv(np.array([[0.539474, 1.289474]]))},
            index=['a', 'b', 'c'])
        # note that in the example, the basis is not strictly
        # equivalent to the tree
        basis = pd.DataFrame(clr_inv(_gram_schmidt_basis(3)),
                             index=['Y1', 'Y2'],
                             columns=['a', 'b', 'c'])

        submodels = [self.model1, self.model2]
        res = submock(submodels=submodels, basis=basis,
                      tree=self.tree, balances=self.balances)
        res.fit()
        pdt.assert_frame_equal(res.coefficients(project=True), exp_coef,
                               check_exact=False,
                               check_less_precise=True)
Example #16
    def test_regression_results_predict_extrapolate(self):
        basis = pd.DataFrame(clr_inv(_gram_schmidt_basis(3)),
                             index=['Y1', 'Y2'],
                             columns=['a', 'b', 'c'])
        submodels = [self.model1, self.model2]
        res = submock(submodels=submodels, basis=basis,
                      tree=self.tree, balances=self.balances)
        res.fit()

        extrapolate = pd.DataFrame({'X': [8, 9, 10]},
                                   index=['k1', 'k2', 'k3'])
        res_predict = res.predict(extrapolate)

        exp_predict = pd.DataFrame({'k1': [5.76315789, 10.26315789],
                                    'k2': [6.30263158, 11.55263158],
                                    'k3': [6.84210526, 12.84210526]},
                                   index=['Y1', 'Y2']).T

        pdt.assert_frame_equal(res_predict, exp_predict)
Example #17
def phylogenetic_basis(tree_node):
    """
    Determines the basis based on a phylogenetic tree.

    Parameters
    ----------
    tree_node : skbio.TreeNode
        Phylogenetic tree.  Must be strictly bifurcating.

    Returns
    -------
    basis : np.array
        An orthonormal basis in the Aitchison simplex corresponding
        to the phylogenetic tree. The order of the basis vectors is
        given by the level order of the internal nodes.
    nodes : list, skbio.TreeNode
        List of tree nodes indicating the ordering in the basis.

    Raises
    ------
    ValueError
        If the tree is not strictly bifurcating.

    Examples
    --------
    >>> from canvas.phylogeny import phylogenetic_basis
    >>> from skbio import TreeNode
    >>> tree = u"((b,c)a, d)root;"
    >>> t = TreeNode.read([tree])
    >>> basis, nodes = phylogenetic_basis(t)
    >>> basis
    array([[ 0.62985567,  0.18507216,  0.18507216],
           [ 0.28399541,  0.57597535,  0.14002925]])

    Notes
    -----
    The tree must be strictly bifurcating, meaning that
    every internal node has exactly 2 children.
    """
    basis, nodes = _balance_basis(tree_node)
    basis = clr_inv(basis)
    return basis, nodes
Example #18
def cross_validation(md, beta, gamma, data, k=50):
    """ Computes two cross validation metrics

    1) Rank difference
    2) Mean squared error on observed entries

    Parameters
    ----------
    md : np.array
       Design matrix
    beta : np.array
       Regression coefficients
    gamma : np.array
       Regression intercepts
    data : np.array
       Dense matrix of counts.  Samples are rows
       and features are columns.
    k : int
       Top k ranks to compare

    Returns
    -------
    mse : float
       Mean prediction error across all of the cells in the matrix
       (computed as the per-feature mean city-block distance, so an
       absolute rather than squared error, despite the name).
    mrc : float
       Mean rank correlation.  This takes the average Spearman
       correlation across every sample, which boils down to matching
       ranked species curves per sample.
    """
    n = data.sum(axis=1).reshape(-1, 1)
    pred = np.multiply(n, clr_inv(md @ beta + gamma))
    mse = np.mean([cityblock(data[i], pred[i])
                   for i in range(data.shape[0])]) / data.shape[1]
    rc = []
    for i in range(data.shape[0]):
        idx = np.argsort(data[i, :])[-k:]
        r = spearmanr(data[i, idx], pred[i, idx])
        rc.append(r.correlation)
    mrc = np.mean(rc)
    return mse, mrc
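A self-contained smoke test with simulated inputs, assuming the function above and its scipy dependencies are in scope; clr_inv turns the linear predictor into per-sample proportions before scaling by sequencing depth:

import numpy as np
from skbio.stats.composition import clr_inv

rng = np.random.RandomState(0)
n, p, d = 30, 2, 100                 # samples, covariates, features
md = rng.normal(size=(n, p))         # design matrix
beta = rng.normal(size=(p, d))       # regression coefficients
gamma = rng.normal(size=(1, d))      # intercepts
probs = clr_inv(md @ beta + gamma)   # per-sample compositions
data = rng.poisson(1000 * probs)     # simulated counts

mse, mrc = cross_validation(md, beta, gamma, data, k=50)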
Example #19
    def test_fit(self):
        tf.set_random_seed(0)
        md = self.md

        md.name = 'sampleid'
        md = qiime2.Metadata(md)

        exp_beta = clr(clr_inv(np.hstack((np.zeros((2, 1)), self.beta.T))))

        res_beta, res_stats, res_biplot = multinomial(table=self.table,
                                                      metadata=md,
                                                      min_sample_count=0,
                                                      min_feature_count=0,
                                                      formula="X",
                                                      epochs=1000)

        # test biplot
        self.assertIsInstance(res_biplot, OrdinationResults)
        u = res_biplot.samples.values
        v = res_biplot.features.values.T
        npt.assert_allclose(u @ v, res_beta.values, atol=0.5, rtol=0.5)

        npt.assert_allclose(exp_beta, res_beta.T, atol=0.6, rtol=0.6)
        self.assertGreater(len(res_stats.to_dataframe().index), 1)
Example #20
def multinomial(
    table: biom.Table,
    metadata: Metadata,
    formula: str,
    training_column: str = None,
    num_random_test_examples: int = 5,
    epochs: int = 1000,
    batch_size: int = 5,
    differential_prior: float = 1,
    learning_rate: float = 1e-3,
    clipnorm: float = 10,
    min_sample_count: int = 1000,
    min_feature_count: int = 10,
    summary_interval: int = 60
) -> (pd.DataFrame, qiime2.Metadata, skbio.OrdinationResults):

    # load metadata and tables
    metadata = metadata.to_dataframe()
    # match them
    table, metadata, design = match_and_filter(table, metadata, formula,
                                               min_sample_count,
                                               min_feature_count)

    # convert to dense representation
    dense_table = table.to_dataframe().to_dense().T

    # split up training and testing
    trainX, testX, trainY, testY = split_training(dense_table, metadata,
                                                  design, training_column,
                                                  num_random_test_examples)

    model = MultRegression(learning_rate=learning_rate,
                           clipnorm=clipnorm,
                           beta_mean=differential_prior,
                           batch_size=batch_size,
                           save_path=None)
    with tf.Graph().as_default(), tf.Session() as session:
        model(session, trainX, trainY, testX, testY)

        loss, cv, its = model.fit(epochs=epochs,
                                  summary_interval=summary_interval,
                                  checkpoint_interval=None)

    md_ids = np.array(design.columns)
    obs_ids = table.ids(axis='observation')

    beta_ = clr(clr_inv(np.hstack((np.zeros((model.p, 1)), model.B))))

    differentials = pd.DataFrame(
        beta_.T,
        columns=md_ids,
        index=obs_ids,
    )
    convergence_stats = pd.DataFrame({
        'loglikehood': loss,
        'cross-validation': cv,
        'iteration': its
    })

    convergence_stats.index.name = 'id'
    convergence_stats.index = convergence_stats.index.astype(np.str)

    c = convergence_stats['loglikehood'].astype(np.float)
    convergence_stats['loglikehood'] = c

    c = convergence_stats['cross-validation'].astype(np.float)
    convergence_stats['cross-validation'] = c

    c = convergence_stats['iteration'].astype(np.int)
    convergence_stats['iteration'] = c

    # regression biplot
    if differentials.shape[-1] > 1:
        u, s, v = np.linalg.svd(differentials)
        pc_ids = ['PC%d' % i for i in range(len(s))]
        samples = pd.DataFrame(u[:, :len(s)] @ np.diag(s),
                               columns=pc_ids,
                               index=differentials.index)
        features = pd.DataFrame(v.T[:, :len(s)],
                                columns=pc_ids,
                                index=differentials.columns)
        short_method_name = 'regression_biplot'
        long_method_name = 'Multinomial regression biplot'
        eigvals = pd.Series(s, index=pc_ids)
        proportion_explained = eigvals**2 / (eigvals**2).sum()
        biplot = OrdinationResults(short_method_name,
                                   long_method_name,
                                   eigvals,
                                   samples=samples,
                                   features=features,
                                   proportion_explained=proportion_explained)
    else:
        # this is to handle the edge case with only intercepts
        biplot = OrdinationResults('', '', pd.Series(), pd.DataFrame())

    return differentials, qiime2.Metadata(convergence_stats), biplot
Example #21
            basetmp_sub = base_truth.loc[(
                rank_,
                power_,
                depth_,
            ), :].copy().T
            # sub sampled
            subtmp_sub = subtmp.copy()
            # metadata grouping samples into two clusters
            meta = np.array([1] * int(subtmp.shape[0] / 2) +
                            [2] * int(subtmp.shape[0] / 2)).T
            meta = pd.DataFrame(meta, index=subtmp.index, columns=['group'])

            # test KL with rclr
            X_sparse = rclr().fit_transform(subtmp_sub.copy())
            U, s, V = OptSpace(iteration=1000).fit_transform(X_sparse)
            clr_res = clr_inv(np.dot(np.dot(U, s), V.T))
            # use just kl_div here because already closed
            kl_clr = entropy(closure(basetmp_sub).T, clr_res.T).mean()
            results[(rank_, power_, depth_, 'rclr', 'KL-Div')] = [kl_clr]

            # test KL without rclr
            X_spn = np.array(subtmp_sub.copy()).astype(float)
            X_spn[X_spn == 0] = np.nan
            U_, s_, V_ = OptSpace(iteration=1000).fit_transform(X_spn)
            res_raw = np.dot(np.dot(U_, s_), V_.T)
            res_raw[res_raw <= 0] = 1
            kl_raw = entropy(closure(basetmp_sub).T, closure(res_raw).T).mean()
            results[(rank_, power_, depth_, 'Raw Counts', 'KL-Div')] = [kl_raw]

            # f-stat
            resfclr = permanova(DistanceMatrix(distance.cdist(U, U)),
Example #22
def band_table(num_samples,
               num_features,
               tree=None,
               low=2,
               high=10,
               sigma=2,
               alpha=6,
               seed=0):
    """ Generates a simulated table of counts.

    Each organism is modeled as a Gaussian distribution.  Then counts
    are simulated using a Poisson distribution.

    Parameters
    ----------
    num_samples : int
        Number of samples to simulate
    num_features : int
        Number of features to simulate
    tree : skbio.TreeNode
        Tree used as a scaffold for the ilr transform.
        If None, then the gram_schmidt_basis will be used.
    low : float
        Smallest gradient value.
    high : float
        Largest gradient value.
    sigma : float
        Variance of each species distribution
    alpha : int
        Global count bias.  This bias is added to every cell in the matrix.
    seed : int or np.random.RandomState
        Random seed

    Returns
    -------
    biom.Table
        Biom representation of the count table.
    pd.DataFrame
        DataFrame containing relevant metadata.
    beta : np.array
        Regression parameter estimates.
    theta : np.array
        Bias per sample.
    """
    state = np.random.RandomState(seed)

    # measured gradient values for each sample
    gradient = np.linspace(low, high, num_samples)
    # optima for features (i.e. optimal ph for species)
    mu = np.linspace(low, high, num_features)
    sigma = np.array([sigma] * num_features)
    # construct species distributions
    table = chain_interactions(gradient, mu, sigma)
    samp_ids = ['S%d' % i for i in range(num_samples)]

    # obtain basis required to convert from balances to proportions.
    if tree is None:
        basis = _gram_schmidt_basis(num_features)
        feat_ids = ['F%d' % i for i in range(num_features)]
        table = pd.DataFrame(table, index=samp_ids, columns=feat_ids)
    else:
        feat_ids = [n.name for n in tree.tips()]
        table = pd.DataFrame(table, index=samp_ids, columns=feat_ids)
        basis = sparse_balance_basis(tree)[0].todense()

    # construct balances from gaussian distribution.
    # this will be necessary when refitting parameters later.
    Y = ilr(table, basis=clr_inv(basis))
    X = gradient.reshape(-1, 1)
    X = np.hstack((np.ones(len(X)).reshape(-1, 1), X.reshape(-1, 1)))
    pY, resid, B = ols(Y, X)
    gamma = B[0]
    beta = B[1].reshape(1, -1)
    # parameter estimates
    r = beta.shape[1]
    # Normal distribution to simulate linear regression
    M = np.eye(r)
    # Generate covariance matrix from inverse wishart
    Sigma = invwishart.rvs(df=r + 2, scale=M.dot(M.T), random_state=state)
    w, v = eigsh(Sigma, k=2)
    # Low rank covariance matrix
    sim_L = (v @ np.diag(w)).T

    # sample
    y = X.dot(B)
    Ys = np.vstack(
        [state.multivariate_normal(y[i, :], Sigma) for i in range(y.shape[0])])
    Yp = Ys @ basis
    # calculate bias terms
    theta = -np.log(np.exp(Yp).sum(axis=1)) + alpha

    # multinomial sample the entries
    #table = np.vstack(multinomial(nd, Yp[i, :]) for i in range(y.shape[0]))

    # poisson sample the entries
    table = np.vstack(
        state.poisson(np.exp(Yp[i, :] + theta[i]))
        for i in range(y.shape[0])).T

    table = Table(table, feat_ids, samp_ids)
    metadata = pd.DataFrame({'G': gradient}, index=samp_ids)
    return table, metadata, beta, theta, gamma
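A hypothetical invocation of the simulator, assuming its helper imports (chain_interactions, _gram_schmidt_basis, sparse_balance_basis, ols, invwishart, eigsh, Table) resolve; with tree=None the Gram-Schmidt basis is used:

table, metadata, beta, theta, gamma = band_table(num_samples=20,
                                                 num_features=8)
print(table.shape)           # (8, 20): biom Tables are features x samples
print(metadata['G'].head())  # the simulated gradient, one value per sample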
Example #23
def mmvec(microbes: biom.Table,
          metabolites: biom.Table,
          metadata: Metadata = None,
          training_column: str = None,
          num_testing_examples: int = 5,
          min_feature_count: int = 10,
          epochs: int = 100,
          batch_size: int = 50,
          latent_dim: int = 3,
          input_prior: float = 1,
          output_prior: float = 1,
          learning_rate: float = 0.001,
          summary_interval: int = 60) -> (pd.DataFrame, OrdinationResults):

    if metadata is not None:
        metadata = metadata.to_dataframe()

    # Note: there are a couple of biom -> pandas conversions taking
    # place here.  This is currently done on purpose, since we
    # haven't figured out how to handle sparse matrix multiplication
    # in the context of this algorithm.  That is a future consideration.
    res = split_tables(microbes,
                       metabolites,
                       metadata=metadata,
                       training_column=training_column,
                       num_test=num_testing_examples,
                       min_samples=min_feature_count)

    (train_microbes_df, test_microbes_df, train_metabolites_df,
     test_metabolites_df) = res

    train_microbes_coo = coo_matrix(train_microbes_df.values)
    test_microbes_coo = coo_matrix(test_microbes_df.values)

    with tf.Graph().as_default(), tf.Session() as session:
        model = MMvec(latent_dim=latent_dim,
                      u_scale=input_prior,
                      v_scale=output_prior,
                      learning_rate=learning_rate)
        model(session, train_microbes_coo, train_metabolites_df.values,
              test_microbes_coo, test_metabolites_df.values)

        loss, cv = model.fit(epoch=epochs, summary_interval=summary_interval)

        U, V = model.U, model.V

        U_ = np.hstack((np.ones(
            (model.U.shape[0], 1)), model.Ubias.reshape(-1, 1), U))
        V_ = np.vstack(
            (model.Vbias.reshape(1, -1), np.ones((1, model.V.shape[1])), V))

        ranks = pd.DataFrame(clr(
            clr_inv(np.hstack((np.zeros((model.U.shape[0], 1)), U_ @ V_)))),
                             index=train_microbes_df.columns,
                             columns=train_metabolites_df.columns)

        ranks = ranks - ranks.mean(axis=0)
        u, s, v = svds(ranks, k=latent_dim)

        microbe_embed = u @ np.diag(s)
        metabolite_embed = v.T

        pc_ids = ['PC%d' % i for i in range(microbe_embed.shape[1])]
        features = pd.DataFrame(microbe_embed,
                                columns=pc_ids,
                                index=train_microbes_df.columns)
        samples = pd.DataFrame(metabolite_embed,
                               columns=pc_ids,
                               index=train_metabolites_df.columns)
        short_method_name = 'mmvec biplot'
        long_method_name = 'Multiomics mmvec biplot'
        eigvals = pd.Series(s, index=pc_ids)
        proportion_explained = pd.Series(s**2 / np.sum(s**2), index=pc_ids)
        biplot = OrdinationResults(short_method_name,
                                   long_method_name,
                                   eigvals,
                                   samples=samples,
                                   features=features,
                                   proportion_explained=proportion_explained)

        return ranks, biplot
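Note how mmvec folds the bias terms into the factorization: padding U with a ones column next to Ubias, and V with a ones row next to Vbias, lets a single matrix product carry both biases. A quick numeric check of that identity with made-up shapes:

import numpy as np

rng = np.random.RandomState(0)
n, m, k = 4, 5, 2
U = rng.normal(size=(n, k)); ubias = rng.normal(size=(n, 1))
V = rng.normal(size=(k, m)); vbias = rng.normal(size=(1, m))

U_ = np.hstack((np.ones((n, 1)), ubias, U))
V_ = np.vstack((vbias, np.ones((1, m)), V))

# one product = interaction term plus both broadcast biases
np.allclose(U_ @ V_, U @ V + ubias + vbias)  # True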