Пример #1
0
def center_log_ratio(exp: Experiment, method=lambda matrix: matrix + 1, centralize=False, inplace=False):
    """Apply the centered log-ratio (clr) transform to every sample.

    Parameters
    ----------
    exp : Experiment
        The experiment whose ``data`` matrix is transformed.
    method : callable, optional
        Pseudocount function applied to the data before the transform, so
        that zeros never reach the logarithm. The default adds 1 to every
        entry.
    centralize : bool, optional
        If True, the pseudocounted data is passed through
        ``skbio.stats.composition.centralize`` before the clr transform.
    inplace : bool, optional
        False (default) to work on a deep copy of ``exp``, True to modify
        ``exp`` itself.

    Returns
    -------
    Experiment
        The transformed experiment (``exp`` itself when ``inplace`` is True).
        All features are clr normalized.

    See Also
    --------
    skbio.stats.composition.clr
    skbio.stats.composition.centralize
    """
    from skbio.stats.composition import clr, centralize as skbio_centralize

    logger.debug('clr transforming the data')
    target = exp if inplace else deepcopy(exp)
    # The sparse flag is cleared before the data is read
    # (presumably this converts the data to a dense matrix -- confirm).
    if target.sparse:
        target.sparse = False
    shifted = method(target.data)
    if centralize:
        shifted = skbio_centralize(shifted)
    target.data = clr(shifted)
    return target
Пример #2
0
    def test_clr(self):
        """clr of closed data matches the hand-computed values and leaves its input untouched."""
        def expected(parts):
            # clr by definition: log of each part over the geometric mean
            return np.log(parts / np.exp(np.log(parts).mean()))

        a = np.array([.2, .2, .6])
        b = np.array([.4, .4, .2])

        # two-row fixture
        npt.assert_allclose(clr(closure(self.data1)),
                            [expected(a), expected(b)])
        # single-composition fixture
        npt.assert_allclose(clr(closure(self.data2)), expected(a))
        # another two-row fixture with the same expected proportions
        npt.assert_allclose(clr(closure(self.data5)),
                            [expected(a), expected(b)])

        # invalid inputs must be rejected
        with self.assertRaises(ValueError):
            clr(self.bad1)
        with self.assertRaises(ValueError):
            clr(self.bad2)

        # calling clr must not modify the fixture in place
        clr(self.data2)
        npt.assert_allclose(self.data2, np.array([2, 2, 6]))
Пример #3
0
    def test_clr(self):
        """Check clr against hand-computed log-ratios, error cases, and input immutability."""
        first = np.array([.2, .2, .6])
        second = np.array([.4, .4, .2])
        # reference clr values: log(x / geometric_mean(x))
        clr_first = np.log(first / np.exp(np.log(first).mean()))
        clr_second = np.log(second / np.exp(np.log(second).mean()))

        observed = clr(closure(self.cdata1))
        npt.assert_allclose(observed, [clr_first, clr_second])

        observed = clr(closure(self.cdata2))
        npt.assert_allclose(observed, clr_first)

        observed = clr(closure(self.cdata5))
        npt.assert_allclose(observed, [clr_first, clr_second])

        # both malformed fixtures have to raise
        with self.assertRaises(ValueError):
            clr(self.bad1)
        with self.assertRaises(ValueError):
            clr(self.bad2)

        # the fixture must be unchanged after a clr call
        clr(self.cdata2)
        npt.assert_allclose(self.cdata2, np.array([2, 2, 6]))
Пример #4
0
def test_convert_beta_coordinates():
    """ALR draws converted by util.convert_beta_coordinates equal skbio's clr and sum to zero."""
    # Stacked layout: (n draws x p covariates x d features);
    # each draw is (p covariates x d features).
    draws = [
        np.array([[0.1, 0.2, 0.3, 0.4], [0.3, 0.1, 0.1, 0.5],
                  [0.2, 0.2, 0.2, 0.3], [0.5, 0.1, 0.2, 0.2]]),
        np.array([[0.2, 0.2, 0.3, 0.3], [0.1, 0.6, 0.2, 0.1],
                  [0.4, 0.4, 0.1, 0.1], [0.1, 0.1, 0.1, 0.7]]),
    ]
    alr_coords = np.stack([alr(draw) for draw in draws])  # 2 x 4 x 3
    observed = util.convert_beta_coordinates(alr_coords)  # 2 x 4 x 4
    expected = np.stack([clr(draw) for draw in draws])
    np.testing.assert_array_almost_equal(observed, expected)

    # the converted coordinates must sum to zero along the feature axis
    zero_sums = np.zeros((2, 4))
    np.testing.assert_array_almost_equal(zero_sums, observed.sum(axis=2))
Пример #5
0
    def normalize_transform(self, mode='clr'):
        """
        Some operations may require transformed data.
        This function normalizes every OTU table in a Batch object by sample,
        replaces zeros via multiplicative replacement and applies a
        log-ratio transform.
        It works on a deep copy of the original Batch object,
        so the original object is not modified.

        Any exception raised while transforming (including the ValueError
        for an unsupported mode) is logged and NOT propagated; the copy is
        returned in whatever state it reached.

        :param mode: transformation mode; 'clr' (centered log-ratio) or 'ilr' (isometric log-ratio)
        :return: Transformed copy of Batch object.
        """
        batchcopy = copy.deepcopy(self)
        try:
            for x in list(self.otu):
                # normalizes the data by samples
                normbiom = batchcopy.otu[x].norm(axis='sample', inplace=False)
                mat = csr_matrix.toarray(normbiom.matrix_data)
                # replaces all zeros with a small value
                # multiplicative replacement preserves ratios between values
                mat = multiplicative_replacement(mat)
                # Compare by value: `is` tests object identity and only
                # matched strings that happened to be the same interned object.
                if mode == 'clr':
                    mat = clr(mat)
                elif mode == 'ilr':
                    mat = ilr(mat)
                else:
                    raise ValueError("Only CLR and ILR transformations are currently supported.")
                normbiom._data = csc_matrix(mat)
                batchcopy.otu[x] = normbiom
        except Exception:
            logger.error("Failed to normalize data", exc_info=True)
        return batchcopy
Пример #6
0
def clr_wrapper(state: PipelineState):
    """Return ``state`` updated with a clr-transformed copy of its DataFrame.

    A pseudocount of 0.5 is added to every value first, because clr fails
    on data without it. Index and columns are carried over unchanged.
    """
    frame = state.df
    shifted = frame.to_numpy() + .5
    transformed = pd.DataFrame(data=clr(shifted),
                               index=frame.index,
                               columns=frame.columns)
    return state.update_df(transformed)
Пример #7
0
 def test_build(self):
     """Test building a tensor from metadata (multi-mode) & matrix_rclr."""
     # collapse the reference tensor into a 9 x 2 matrix, remembering the
     # shape it had right before the reshape
     transposed = self.tensor_true.transpose([0, 2, 1])
     pre_flatten_shape = transposed.shape
     flat_counts = transposed.reshape(9, 2)
     # sample metadata: one row per flattened sample (ID, conditional)
     id_and_condition = np.array([[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                  [0, 1, 2, 0, 1, 2, 0, 1, 2]])
     mapping = pd.DataFrame(id_and_condition.T,
                            columns=['ID', 'conditional'])
     table = pd.DataFrame(flat_counts.T)
     # rebuild the tensor from the flat table plus the mapping
     tensor = build()
     tensor.construct(table, mapping, 'ID', ['conditional'])
     # the rebuilt counts must equal the starting tensor
     npt.assert_allclose(tensor.counts, self.tensor_true.astype(float))
     # every dimension must come back in sorted order
     self.assertListEqual(tensor.subject_order, list(range(3)))
     self.assertListEqual(tensor.feature_order, list(range(2)))
     self.assertListEqual(tensor.condition_orders[0], list(range(3)))
     # clr of the flat matrix, folded back into tensor layout, must match
     # tensor_rclr applied to the rebuilt tensor
     expected_clr = clr(flat_counts).reshape(pre_flatten_shape)
     expected_clr = expected_clr.transpose([0, 2, 1])
     npt.assert_allclose(tensor_rclr(tensor.counts), expected_clr)
Пример #8
0
 def test_biplot(self):
     """regression_biplot yields OrdinationResults whose samples @ features.T approximates the centered clr of beta."""
     expected = np.array(clr(centralize(clr_inv(self.beta))))
     result = regression_biplot(self.beta)
     self.assertIsInstance(result, OrdinationResults)
     # rebuild the coefficient matrix from the two ordination factors
     reconstructed = result.samples.values @ result.features.values.T
     npt.assert_allclose(reconstructed, expected, atol=0.5, rtol=0.5)
Пример #9
0
def clrdata(data):
    """Replace zeros with 1 and clr-transform each column of ``data`` separately.

    Parameters
    ----------
    data : array-like, 2-D
        Input values. The input is copied, so the caller's array is left
        unmodified.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``data``; column ``j`` holds
        ``clr(data[:, j])`` computed after the zero replacement.
    """
    logger.debug('clr transforming data')
    # Work on a copy: the zero replacement below used to overwrite the
    # caller's array as a hidden side effect.
    data = np.array(data)
    data[data == 0] = 1
    transformed = np.zeros(np.shape(data))
    for ncol in range(np.shape(data)[1]):
        transformed[:, ncol] = clr(data[:, ncol])
    return transformed
Пример #10
0
def pls_balances_cmd(table_file, metadata_file, category, output_file):
    """Select features with a one-component PLS model and write their ids to a file.

    The table is loaded with ``load_table``, turned into a samples x
    observations DataFrame, shifted by a pseudocount of 1, centralized and
    clr transformed. A ``PLSRegression(n_components=1)`` is fitted against
    ``metadata[category]``; the feature weights are split by the two
    thresholds returned by ``round_balance`` and the ids of the features
    above the upper / below the lower threshold are written to
    ``output_file`` as one comma-separated line.

    Parameters
    ----------
    table_file : str
        Path passed to ``load_table``.
    metadata_file : str
        Path of a tab-delimited metadata file; the first column is the index.
    category : str
        Metadata column used as the response. A non-float column is
        binarized as "equals the first unique value" (1) versus the rest (0).
    output_file : str
        Path of the text file to write.
    """
    metadata = pd.read_table(metadata_file, index_col=0)
    table = load_table(table_file)
    table = pd.DataFrame(np.array(table.matrix_data.todense()).T,
                         index=table.ids(axis='sample'),
                         columns=table.ids(axis='observation'))

    ctable = pd.DataFrame(clr(centralize(table + 1)),
                          index=table.index,
                          columns=table.columns)

    rfc = PLSRegression(n_components=1)
    # np.float / np.int were aliases of the builtins and were removed in
    # NumPy 1.24; the builtins behave identically here.
    if metadata[category].dtype != float:
        cats = np.unique(metadata[category])
        groups = (metadata[category] == cats[0]).astype(int)
    else:
        groups = metadata[category]

    rfc.fit(X=ctable.values, Y=groups)

    pls_df = pd.DataFrame(rfc.x_weights_,
                          index=ctable.columns,
                          columns=['PLS1'])
    lower, upper = round_balance(pls_df.values,
                                 means_init=[[pls_df.PLS1.min()], [0],
                                             [pls_df.PLS1.max()]],
                                 n_init=100)
    num = pls_df.loc[pls_df.PLS1 > upper]
    denom = pls_df.loc[pls_df.PLS1 < lower]
    diff_features = list(num.index.values)
    diff_features += list(denom.index.values)

    with open(output_file, 'w') as f:
        f.write(','.join(diff_features))
Пример #11
0
 def test_center_log(self):
     """center_log output equals skbio's clr of the reference counts plus 1, with and without centralizing."""
     raw_counts = np.array([[10, 20, 1, 20, 5, 100, 844, 100],
                            [10, 20, 2, 19, 0, 100, 849, 200],
                            [10, 20, 3, 18, 5, 100, 844, 300],
                            [10, 20, 4, 17, 0, 100, 849, 400],
                            [10, 20, 5, 16, 4, 100, 845, 500],
                            [10, 20, 6, 15, 0, 100, 849, 600],
                            [10, 20, 7, 14, 3, 100, 846, 700],
                            [10, 20, 8, 13, 0, 100, 849, 800],
                            [10, 20, 9, 12, 7, 100, 842, 900]])
     # the reference adds a pseudocount of 1 before transforming
     shifted = raw_counts + 1

     # plain clr
     observed = self.test2.center_log()
     assert_array_almost_equal(clr(shifted), observed.data)

     # centralized before clr
     observed = self.test2.center_log(centralize=True)
     assert_array_almost_equal(clr(centralize(shifted)), observed.data)
Пример #12
0
    def clrtransform(self, dataframe):
        """
        Impute zeros, then apply the centred-log-ratio (clr) transformation.

        Parameters
        ------------
        dataframe: pandas dataframe,
            microbiome count data; it is copied, not modified.

        Returns
        ------------
        X_clr: pandas dataframe,
            clr transformed values with the input's columns and index,
            sorted by index.

        """
        counts = dataframe.copy()
        # missing values count as zeros, and every zero is imputed with 0.55
        counts.fillna(0, inplace=True)
        imputed = counts.replace(0, 0.55)
        transformed = composition.clr(imputed)

        result = pd.DataFrame(transformed,
                              columns=counts.columns,
                              index=counts.index)
        return result.sort_index()
Пример #13
0
def globalCLRPermTest(otuDf,
                      labels,
                      statfunc=_sumRhoStat,
                      nperms=999,
                      seed=110820,
                      binary=False):
    """Global permutation test on centered-log-ratio (CLR) transformed data.

    The table is scaled, zero-replaced with multiplicative replacement and
    CLR transformed. ``statfunc`` is evaluated once with the real labels and
    ``nperms`` times with randomly permuted labels to build a null
    distribution.

    Parameters
    ----------
    otuDf : pd.DataFrame [samples x OTUs]
        Contains relative abundance [0-1] for all samples (rows) and OTUs (columns)
    labels: pd.Series (float)
        Variable of interest, one value per sample.
        Must share index with otuDf.
    statfunc : function
        Takes a np.ndarray [n x k] and float index [n] as parameters and
        returns a float summarizing over k.
    nperms : int
        Number of iterations for the permutation test.
    seed :int
        Seed for random permutation generation.
    binary : bool
        If True the label values are cast to bool, otherwise to float.

    Returns:
    --------
    pvalue : float
        (number of permuted statistics with absolute value >= the observed
        absolute value, plus 1) / (nperms + 1).
    obs : float
        Statistic computed with the unpermuted labels."""

    nSamples, _ = otuDf.shape

    labelValues = labels.values.astype(bool if binary else float)

    # Scale the table by otuDf.sum().
    # NOTE(review): DataFrame.sum() defaults to axis=0, so each OTU column
    # is divided by its own total rather than each sample row -- confirm
    # this is the intended "make proportions" step.
    otuDf = otuDf / otuDf.sum()
    # Zero replacement followed by the CLR, wrapped back into a DataFrame
    otuCLR = pd.DataFrame(clr(multiplicative_replacement(otuDf.values)),
                          index=otuDf.index,
                          columns=otuDf.columns)

    np.random.seed(seed)
    obs = statfunc(otuCLR.values, labelValues)
    permStats = []
    for _ in range(nperms):
        permStats.append(
            statfunc(otuCLR.values,
                     labelValues[np.random.permutation(nSamples)]))
    samples = np.array(permStats)

    # The comparison uses absolute values, so the test is two-sided.
    nExtreme = (np.abs(samples) >= np.abs(obs)).sum()
    pvalue = (nExtreme + 1) / (nperms + 1)

    return pvalue, obs
Пример #14
0
 def test_ilr_inv_basis_one_dimension_error(self):
     """ilr_inv must raise ValueError for a basis built from clr of a single two-part composition."""
     basis = clr(np.array([[0.80442968, 0.19557032]]))
     # five log-ratios scaled by sqrt(1/2), arranged as a 5 x 1 column
     ratios = [1/10,
               1.14141414 / 9.90909091,
               1.28282828 / 9.81818182,
               1.42424242 / 9.72727273,
               1.56565657 / 9.63636364]
     table = np.array([[np.log(ratio)*np.sqrt(1/2) for ratio in ratios]]).T
     with self.assertRaises(ValueError):
         ilr_inv(table, basis=basis)
Пример #15
0
 def test_ilr_inv_basis_one_dimension_error(self):
     """A basis obtained from clr of one two-part composition makes ilr_inv raise ValueError."""
     scale = np.sqrt(1/2)
     basis = clr(np.array([[0.80442968, 0.19557032]]))
     # single row of scaled log-ratios, transposed into a 5 x 1 table
     row = [np.log(1/10)*scale,
            np.log(1.14141414 / 9.90909091)*scale,
            np.log(1.28282828 / 9.81818182)*scale,
            np.log(1.42424242 / 9.72727273)*scale,
            np.log(1.56565657 / 9.63636364)*scale]
     table = np.array([row]).T
     with self.assertRaises(ValueError):
         ilr_inv(table, basis=basis)
Пример #16
0
 def test_OptSpace_illformatted_raises(self):
     """Tests ValueError for OptSpace() no infs."""
     # fitting on clr(self.test_table) is expected to be rejected
     # (presumably because the clr output contains infs -- confirm
     # against the fixture)
     raised = False
     try:
         MatrixCompletion().fit(clr(self.test_table))
     except ValueError:
         raised = True
     if not raised:
         raise AssertionError("ValueError was not raised")
Пример #17
0
    def test_center_log_ration(self):
        """center_log_ratio output equals skbio's clr of the reference counts plus 1, with and without centralizing."""
        from skbio.stats.composition import clr, centralize

        reference = np.array([[10, 20, 1, 20, 5, 100, 844, 100],
                              [10, 20, 2, 19, 0, 100, 849, 200],
                              [10, 20, 3, 18, 5, 100, 844, 300],
                              [10, 20, 4, 17, 0, 100, 849, 400],
                              [10, 20, 5, 16, 4, 100, 845, 500],
                              [10, 20, 6, 15, 0, 100, 849, 600],
                              [10, 20, 7, 14, 3, 100, 846, 700],
                              [10, 20, 8, 13, 0, 100, 849, 800],
                              [10, 20, 9, 12, 7, 100, 842, 900]]) + 1

        # default call: clr only
        observed = self.test2.center_log_ratio()
        expected = clr(reference)
        assert_array_almost_equal(expected, observed.data)

        # centralize=True: centralize first, then clr
        observed = self.test2.center_log_ratio(centralize=True)
        expected = clr(centralize(reference))
        assert_array_almost_equal(expected, observed.data)
Пример #18
0
def dendrogram_heatmap(output_dir: str, table: pd.DataFrame,
                       tree: TreeNode, metadata: MetadataCategory,
                       ndim=10, method='clr', color_map='viridis'):
    """Render a dendrogram heatmap and a small HTML page into ``output_dir``.

    Parameters
    ----------
    output_dir : str
        Directory receiving ``heatmap.svg``, ``heatmap.pdf`` and
        ``index.html``.
    table : pd.DataFrame
        Abundance table; its index and columns are kept on the transformed
        matrix.
    tree : TreeNode
        Tree whose internal (non-tip) nodes, in level order, are candidates
        for highlighting.
    metadata : MetadataCategory
        Converted with ``to_series()`` and passed to ``heatmap``.
    ndim : int
        Maximum number of internal nodes to highlight.
    method : str
        ``'clr'`` (centralize, then clr) or ``'log'`` (plain ``np.log``).
    color_map : str
        Colormap name passed to ``heatmap``.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'clr'`` nor ``'log'``.
    """

    nodes = [n.name for n in tree.levelorder() if not n.is_tip()]

    nlen = min(ndim, len(nodes))
    numerator_color, denominator_color = '#fb9a99', '#e31a1c'
    highlights = pd.DataFrame([[numerator_color, denominator_color]] * nlen,
                              index=nodes[:nlen])
    if method == 'clr':
        mat = pd.DataFrame(clr(centralize(table)),
                           index=table.index,
                           columns=table.columns)
    elif method == 'log':
        mat = pd.DataFrame(np.log(table),
                           index=table.index,
                           columns=table.columns)
    else:
        # Previously `mat` stayed unbound here and the heatmap call below
        # failed with an UnboundLocalError.
        raise ValueError("Unsupported method %r; expected 'clr' or 'log'."
                         % (method,))

    # TODO: There are a few hard-coded constants here
    # will need to have some adaptive defaults set in the future
    fig = heatmap(mat, tree, metadata.to_series(), highlights, cmap=color_map,
                  highlight_width=0.01, figsize=(12, 8))
    fig.savefig(os.path.join(output_dir, 'heatmap.svg'))
    fig.savefig(os.path.join(output_dir, 'heatmap.pdf'))

    # legend styling; the two %s slots receive the highlight colors
    css = r"""
        .square {
          float: left;
          width: 100px;
          height: 20px;
          margin: 5px;
          border: 1px solid rgba(0, 0, 0, .2);
        }

        .numerator {
          background: %s;
        }

        .denominator {
          background: %s;
        }
    """ % (numerator_color, denominator_color)

    index_fp = os.path.join(output_dir, 'index.html')
    with open(index_fp, 'w') as index_f:
        index_f.write('<html><body>\n')
        index_f.write('<h1>Dendrogram heatmap</h1>\n')
        index_f.write('<img src="heatmap.svg" alt="heatmap">')
        index_f.write('<a href="heatmap.pdf">')
        index_f.write('Download as PDF</a><br>\n')
        index_f.write('<style>%s</style>' % css)
        index_f.write('<div class="square numerator">'
                      'Numerator<br/></div>')
        index_f.write('<div class="square denominator">'
                      'Denominator<br/></div>')
        index_f.write('</body></html>\n')
Пример #19
0
def balance_classify(table, cats, num_folds, **init_kwds):
    """
    Builds a balance classifier. If categorical, it is assumed
    that the classes are binary.

    For each of ``num_folds`` shuffled K-fold splits a one-component PLS
    model is fitted on the centralized, clr-transformed training rows; the
    feature weights are split into numerator / denominator sets using the
    two thresholds returned by ``round_balance`` and the resulting balance
    is scored on the held-out rows (Q2 and AUROC). Finally a model is
    fitted on all rows.

    ``init_kwds`` are forwarded unchanged to ``round_balance``.

    Returns ``(num, denom, pls_balance, cv)``: the numerator and
    denominator weight frames and the balance from the final model, plus a
    DataFrame with one row of 'Q2' / 'AUROC' per fold.
    """
    # shuffle=True with no random_state passed here
    skf = KFold(n_splits=num_folds, shuffle=True)

    ctable = pd.DataFrame(clr(centralize(table)),
                          index=table.index,
                          columns=table.columns)

    cv = pd.DataFrame(columns=['Q2', 'AUROC'], index=np.arange(num_folds))
    for i, (train, test) in enumerate(skf.split(ctable.values, cats.values)):

        # X_test is not used below; the held-out score is computed from the
        # raw `table` rows instead
        X_train, X_test = ctable.iloc[train], ctable.iloc[test]
        Y_train, Y_test = cats.iloc[train], cats.iloc[test]
        plsc = PLSRegression(n_components=1)
        plsc.fit(X=X_train, Y=Y_train)
        pls_df = pd.DataFrame(plsc.x_weights_,
                              index=ctable.columns,
                              columns=['PLS1'])

        # features with weight below l form the denominator, above r the
        # numerator
        l, r = round_balance(pls_df, **init_kwds)
        denom = pls_df.loc[pls_df.PLS1 < l]
        num = pls_df.loc[pls_df.PLS1 > r]

        # make the prediction and evaluate the accuracy
        # balance = mean log(count + 1) of numerator minus that of denominator
        idx = table.index[test]
        pls_balance = (np.log(table.loc[idx, num.index] + 1).mean(axis=1) -
                       np.log(table.loc[idx, denom.index] + 1).mean(axis=1))

        # NOTE(review): the labels are inverted (1 - ...) and auc() below is
        # called with (tpr, fpr) rather than (fpr, tpr) -- confirm the two
        # are meant to cancel out.
        group_fpr, group_tpr, thresholds = roc_curve(y_true=1 -
                                                     (Y_test == 1).astype(int),
                                                     y_score=pls_balance)

        auroc = auc(group_tpr, group_fpr)
        # Q2 = 1 - PRESS / TSS on the held-out rows
        press = ((pls_balance - Y_test)**2).sum()
        tss = ((Y_test.mean() - Y_test)**2).sum()
        Q2 = 1 - (press / tss)

        cv.loc[i, 'Q2'] = Q2
        cv.loc[i, 'AUROC'] = auroc

    # build model on entire dataset
    # NOTE(review): this fit uses the raw `table.values`, while the folds
    # above were fitted on the clr-transformed `ctable` -- confirm.
    plsc = PLSRegression(n_components=1)
    plsc.fit(X=table.values, Y=cats.values)
    pls_df = pd.DataFrame(plsc.x_weights_,
                          index=ctable.columns,
                          columns=['PLS1'])
    l, r = round_balance(pls_df, **init_kwds)
    denom = pls_df.loc[pls_df.PLS1 < l]
    num = pls_df.loc[pls_df.PLS1 > r]
    # NOTE(review): unlike the per-fold balance, no "+ 1" is added before
    # the log here -- confirm.
    pls_balance = (np.log(table.loc[:, num.index]).mean(axis=1) -
                   np.log(table.loc[:, denom.index]).mean(axis=1))

    return num, denom, pls_balance, cv
Пример #20
0
    def clr_transform_cags_via_mult_rep_method(self):
        """
        NOT GENERALIZABLE - DELETE
        Uses multiplicative replacement to replace zeros with half of
        the lowest non-zero value of the wide CAG table, then performs the
        clr transformation.

        Assigns
        -------
        self.cags_dict : dictionary
        entry keyed on 'cags' with the following attributes:
            1. cags_wide_df - table returned by self._pivot_cags()
            2. cags_wide_mr_clr_df - clr transformed
               abundances (uses multiplicative replacement)
            3. half_nzra - half of the lowest non-zero relative abundance
               (NZRA), used as delta for the Mult Rep step

        Returns
        -------
        pandas.DataFrame
            The clr transformed table, with the index and columns of the
            wide CAG table.
        """

        cag_wide = self._pivot_cags()
        # one solution is to use the lowest non-zero relative abundance (NZRA), or more typically NZRA/2
        flat_values = cag_wide.values.flatten()
        nzra = np.min(flat_values[flat_values > 0])
        half_nzra = nzra / 2
        # zero replacement with delta = NZRA/2
        cag_wide_mr = multiplicative_replacement(cag_wide, delta=half_nzra)
        # clr transform
        cag_wide_mr_clr = clr(cag_wide_mr)
        # wrap the clr array in a data.frame with index and columns matching cag_wide
        cag_wide_mr_clr_df = pd.DataFrame(cag_wide_mr_clr)
        cag_wide_mr_clr_df.columns = cag_wide.columns
        cag_wide_mr_clr_df.index = cag_wide.index

        self.cags_dict["cags"] = {
            "cags_wide_df": cag_wide,
            "cags_wide_mr_clr_df": cag_wide_mr_clr_df,
            "half_nzra": half_nzra
        }
        return cag_wide_mr_clr_df

    # This getter used to be indented inside the method above, after its
    # `return`, so it was never defined on the class and could not be called.
    def fetch_metaphlan_result(self, clr=True, taxonomic_level="phylum"):
        """
        Getter for a stored metaphlan table.

        Returns ``self.metaphlan_dict[taxonomic_level][key]`` where key is
        'mp_wide_taxa_mr_clr_df' when ``clr`` is true and 'mp_wide_taxa_df'
        otherwise. If the entry does not exist a hint is printed and None
        is returned.
        """
        if clr:
            key = 'mp_wide_taxa_mr_clr_df'
        else:
            key = 'mp_wide_taxa_df'
        try:
            return self.metaphlan_dict[taxonomic_level][key]
        except KeyError:
            print(
                "NO METAPHLAN MATRIX CREATED SEE clr_transform_metaphlan_via_mult_rep_method()"
            )
Пример #21
0
def test_alr_to_clr():
    """util.alr_to_clr applied to skbio's alr (reference part 0) reproduces skbio's clr."""
    mat = np.array([[0.1, 0.2, 0.3, 0.4, 0.3], [0.3, 0.1, 0.1, 0.2, 0.5],
                    [0.4, 0.3, 0.5, 0.1, 0.1], [0.2, 0.4, 0.1, 0.3, 0.1]])
    # skbio alr & clr take rows as compositions, columns as components,
    # while the util helper is fed (and returns) the transposed layout
    compositions = mat.T  # 5 x 4

    alr_mat = alr(compositions, 0)  # 5 x 3
    observed = util.alr_to_clr(alr_mat.T).T  # 5 x 4
    expected = clr(compositions)  # 5 x 4

    np.testing.assert_array_almost_equal(observed, expected)
Пример #22
0
def test_clr_to_alr():
    """util.clr_to_alr applied to skbio's clr reproduces skbio's alr (default reference part)."""
    mat = np.array([[0.1, 0.2, 0.3, 0.4, 0.3], [0.3, 0.1, 0.1, 0.2, 0.5],
                    [0.4, 0.3, 0.5, 0.1, 0.1], [0.2, 0.4, 0.1, 0.3, 0.1]])
    # skbio alr & clr take rows as compositions, columns as components,
    # while the util helper is fed (and returns) the transposed layout
    compositions = mat.T

    clr_mat = clr(compositions)
    observed = util.clr_to_alr(clr_mat.T).T
    expected = alr(compositions)

    np.testing.assert_array_almost_equal(observed, expected)
Пример #23
0
 def test_matrix_tensor_rclr(self):
     """Test matrix == tensor matrix_rclr."""
     # on zero-free data the transposed tensor_rclr result equals plain clr
     observed = tensor_rclr(self.count_data_one.T).T
     npt.assert_allclose(observed, clr(self.count_data_one))
     # data with zeros only has to run without raising
     tensor_rclr(self.count_data_two)
     # negative counts must be rejected with ValueError
     with self.assertRaises(ValueError):
         tensor_rclr(self.tensor_true * -1)
Пример #24
0
def balance_regression(table, cats, num_folds, **init_kwds):
    """
    Builds a balance regression model and cross-validates it.

    For each of ``num_folds`` shuffled K-fold splits a one-component PLS
    model is fitted on the centralized, clr-transformed training rows; the
    feature weights are split into numerator / denominator sets using the
    two thresholds returned by ``round_balance``. The resulting balance is
    regressed linearly on the training response and the fit is scored on
    the held-out rows (Q2). Finally a model is fitted on all rows.

    ``init_kwds`` are forwarded unchanged to ``round_balance``.

    Returns ``(num, denom, pls_balance, cv)``: the numerator and
    denominator weight frames and the balance from the final model, plus a
    DataFrame with one 'Q2' value per fold.
    """
    # shuffle=True with no random_state passed here
    skf = KFold(n_splits=num_folds, shuffle=True)
    # NOTE(review): the response is negated here without a documented
    # reason (the original author was unsure too) -- confirm it is needed.
    cats = cats * -1  # wtf??
    ctable = pd.DataFrame(clr(centralize(table)),
                          index=table.index,
                          columns=table.columns)

    cv = pd.DataFrame(columns=['Q2'], index=np.arange(num_folds))
    for i, (train, test) in enumerate(skf.split(ctable.values, cats.values)):

        # X_test is not used below; predictions are computed from the raw
        # `table` rows instead
        X_train, X_test = ctable.iloc[train], ctable.iloc[test]
        Y_train, Y_test = cats.iloc[train], cats.iloc[test]
        plsc = PLSRegression(n_components=1)
        plsc.fit(X=X_train, Y=Y_train)
        pls_df = pd.DataFrame(plsc.x_weights_,
                              index=ctable.columns,
                              columns=['PLS1'])

        # features with weight below l form the denominator, above r the
        # numerator
        l, r = round_balance(pls_df, **init_kwds)
        denom = pls_df.loc[pls_df.PLS1 < l]
        num = pls_df.loc[pls_df.PLS1 > r]

        # balance on the training rows, then slope / intercept of the
        # response regressed on it
        idx = table.index[train]
        pls_balance = (np.log(table.loc[idx, num.index] + 1).mean(axis=1) -
                       np.log(table.loc[idx, denom.index] + 1).mean(axis=1))
        b_, int_, _, _, _ = linregress(pls_balance, Y_train)

        # same balance on the held-out rows, mapped through the fitted line
        idx = table.index[test]
        pls_balance = (np.log(table.loc[idx, num.index] + 1).mean(axis=1) -
                       np.log(table.loc[idx, denom.index] + 1).mean(axis=1))
        pred = pls_balance * b_ + int_

        # Q2 = 1 - PRESS / TSS on the held-out rows
        press = ((pred - Y_test)**2).sum()
        tss = ((Y_test.mean() - Y_test)**2).sum()
        Q2 = 1 - (press / tss)

        cv.loc[i, 'Q2'] = Q2

    # build model on entire dataset
    # NOTE(review): this fit uses the raw `table.values`, while the folds
    # above were fitted on the clr-transformed `ctable` -- confirm.
    plsc = PLSRegression(n_components=1)
    plsc.fit(X=table.values, Y=cats.values)
    pls_df = pd.DataFrame(plsc.x_weights_,
                          index=ctable.columns,
                          columns=['PLS1'])
    l, r = round_balance(pls_df, **init_kwds)
    denom = pls_df.loc[pls_df.PLS1 < l]
    num = pls_df.loc[pls_df.PLS1 > r]
    # NOTE(review): unlike the per-fold balance, no "+ 1" is added before
    # the log here -- confirm.
    pls_balance = (np.log(table.loc[:, num.index]).mean(axis=1) -
                   np.log(table.loc[:, denom.index]).mean(axis=1))

    return num, denom, pls_balance, cv
Пример #25
0
def globalCLRPermTest(otuDf, labels, statfunc=_sumRhoStat, nperms=999, seed=110820, binary=False):
    """Run a global permutation test on centered-log-ratio (CLR) values.

    After scaling, multiplicative zero replacement and the CLR transform,
    ``statfunc`` is computed for the true labels and for ``nperms`` random
    permutations of the labels; the p-value compares their absolute values.

    Parameters
    ----------
    otuDf : pd.DataFrame [samples x OTUs]
        Contains relative abundance [0-1] for all samples (rows) and OTUs (columns)
    labels: pd.Series (float)
        Variable of interest, one value per sample.
        Must share index with otuDf.
    statfunc : function
        Takes a np.ndarray [n x k] and float index [n] as parameters and
        returns a float summarizing over k.
    nperms : int
        Number of iterations for the permutation test.
    seed :int
        Seed for random permutation generation.
    binary : bool
        Cast the label values to bool when True, to float otherwise.

    Returns:
    --------
    pvalue : float
        (count of permuted |statistic| >= observed |statistic|, plus 1)
        divided by (nperms + 1).
    obs : float
        Statistic obtained with the original label order."""

    nSamples = otuDf.shape[0]
    # keep the original requirement that the table is exactly 2-D
    _, _ = otuDf.shape

    if binary:
        castType = bool
    else:
        castType = float
    labelValues = labels.values.astype(castType)

    # Scale by otuDf.sum().
    # NOTE(review): with the default axis=0 this divides every OTU column
    # by its column total, not every sample row by its row total -- confirm.
    otuDf = otuDf / otuDf.sum()
    # Zero replacement, CLR, and back to a labelled DataFrame
    zeroReplaced = multiplicative_replacement(otuDf.values)
    otuCLR = pd.DataFrame(clr(zeroReplaced), index=otuDf.index, columns=otuDf.columns)

    np.random.seed(seed)
    obs = statfunc(otuCLR.values, labelValues)

    def _permutedStat():
        # one draw from the null distribution: shuffle the labels only
        return statfunc(otuCLR.values, labelValues[np.random.permutation(nSamples)])

    samples = np.array([_permutedStat() for _ in range(nperms)])

    # Absolute values are compared, which makes the test two-sided.
    atLeastAsExtreme = (np.abs(samples) >= np.abs(obs)).sum()
    pvalue = (atLeastAsExtreme + 1) / (nperms + 1)

    return pvalue, obs
Пример #26
0
def multinomial(table: biom.Table,
                metadata: Metadata,
                formula: str,
                training_column: str = None,
                num_random_test_examples: int = 10,
                epoch: int = 10,
                batch_size: int = 5,
                beta_prior: float = 1,
                learning_rate: float = 0.1,
                clipnorm: float = 10,
                min_sample_count: int = 10,
                min_feature_count: int = 10,
                summary_interval: int = 60) -> (pd.DataFrame):
    """Fit a ``MultRegression`` model and return its coefficients in clr coordinates.

    The table and metadata are matched and filtered, split into training
    and test sets, and the model is fitted inside a fresh TensorFlow graph
    and session. The fitted ``model.B`` is prefixed with a column of zeros,
    passed through ``clr_inv`` and then ``clr``, and returned transposed as
    a DataFrame indexed by observation id with one column per design
    column.
    """
    # load metadata as a DataFrame
    metadata = metadata.to_dataframe()

    # align table / metadata and build the design matrix
    table, metadata, design = match_and_filter(
        table, metadata, formula,
        training_column, num_random_test_examples,
        min_sample_count, min_feature_count)

    # dense representation, transposed
    dense_table = table.to_dataframe().to_dense().T

    # training / testing split
    trainX, testX, trainY, testY = split_training(
        dense_table, metadata, design,
        training_column, num_random_test_examples)

    model = MultRegression(learning_rate=learning_rate,
                           clipnorm=clipnorm,
                           beta_mean=beta_prior,
                           batch_size=batch_size,
                           save_path=None)
    with tf.Graph().as_default(), tf.Session() as session:
        model(session, trainX, trainY, testX, testY)

        model.fit(epoch=epoch,
                  summary_interval=summary_interval,
                  checkpoint_interval=None)

    md_ids = np.array(design.columns)
    obs_ids = table.ids(axis='observation')

    # prepend a zero column to B, then round-trip through clr_inv / clr
    padded = np.hstack((np.zeros((model.p, 1)), model.B))
    coefficients = clr(clr_inv(padded))

    return pd.DataFrame(
        coefficients.T,
        columns=md_ids,
        index=obs_ids,
    )
def normalize_clr(data):
    """Replace zeros by multiplicative replacement and apply clr.

    ``data`` must have fewer rows than columns (samples are expected on the
    index); otherwise an AssertionError is raised. The result is a
    DataFrame with the index and columns of ``data``.
    """
    n_rows, n_cols = data.shape[0], data.shape[1]
    assert n_rows < n_cols, "samples should be indexes, I don't think you have"

    zero_free = composition.multiplicative_replacement(data)
    transformed = composition.clr(zero_free)

    return pd.DataFrame(transformed,
                        index=data.index,
                        columns=data.columns)
Пример #28
0
    def test_clr_inv(self):
        """clr_inv maps rdata1 to ortho1, round-trips through clr, and leaves rdata1 unchanged."""
        inverse = clr_inv(self.rdata1)
        npt.assert_allclose(inverse, self.ortho1)
        round_trip = clr(clr_inv(self.rdata1))
        npt.assert_allclose(round_trip, self.rdata1)

        # the fixture must still hold its original values after another call
        clr_inv(self.rdata1)
        untouched = np.array([[0.70710678, -0.70710678, 0., 0.],
                              [0.40824829, 0.40824829, -0.81649658, 0.],
                              [0.28867513, 0.28867513, 0.28867513, -0.8660254]])
        npt.assert_allclose(self.rdata1, untouched)
Пример #29
0
 def test_fit(self):
     """multinomial recovers the reference coefficients (in clr coordinates) within a loose tolerance."""
     tf.set_random_seed(0)
     sample_md = self.md
     sample_md.name = 'sampleid'
     metadata = qiime2.Metadata(sample_md)
     # expected: reference beta with a zero column prepended, clr_inv then clr
     padded = np.hstack((np.zeros((2, 1)), self.beta.T))
     expected = clr(clr_inv(padded))
     observed = multinomial(table=self.table,
                            metadata=metadata,
                            formula="X",
                            epoch=50000)
     npt.assert_allclose(expected, observed.T, atol=0.5, rtol=0.5)
Пример #30
0
 def _clr_transform_via_mult_rep_method(self, df):
     """Zero-replace ``df`` with delta = half its smallest positive value, then clr-transform it.

     Returns a DataFrame of the clr values carrying the columns and index
     of ``df``.
     """
     flat_values = df.values.flatten()
     # half of the lowest non-zero value is used as the replacement delta
     half_nzra = np.min(flat_values[flat_values > 0]) / 2
     replaced = multiplicative_replacement(df, delta=half_nzra)
     # clr returns a bare array, so wrap it and restore the labels
     result = pd.DataFrame(clr(replaced))
     result.columns = df.columns
     result.index = df.index
     return result
Пример #31
0
    def test_rclr(self):
        """rclr equals clr on zero-free data, matches the stored expectation with zeros, and rejects bad1."""
        # zero-free fixture: result must coincide with plain clr
        observed = self._rclr.fit_transform(self.cdata1)
        expected = clr(self.cdata1.copy())
        npt.assert_allclose(observed, expected)

        # fixture with zeros: compare with the precomputed values
        observed = self._rclr.fit_transform(self.cdata2)
        npt.assert_allclose(observed, self.true2)

        # malformed input has to raise
        with self.assertRaises(ValueError):
            self._rclr.fit_transform(self.bad1)
Пример #32
0
    def test_clr_inv(self):
        """Check clr_inv against ortho1, the clr round trip, and that rdata1 is not modified in place."""
        npt.assert_allclose(clr_inv(self.rdata1), self.ortho1)
        recovered = clr(clr_inv(self.rdata1))
        npt.assert_allclose(recovered, self.rdata1)

        # one more call, then confirm the fixture kept its literal values
        clr_inv(self.rdata1)
        original_values = np.array([
            [0.70710678, -0.70710678, 0., 0.],
            [0.40824829, 0.40824829, -0.81649658, 0.],
            [0.28867513, 0.28867513, 0.28867513, -0.8660254],
        ])
        npt.assert_allclose(self.rdata1, original_values)
def preprocess_df(df, rep, state):
    """
    Return the clr-transformed subset of ``df`` for one replicate/state.

    The columns are chosen by ``select_rep_state_intensities(rep, state)``,
    rows are filtered with ``drop_zero_rows``, zeros are replaced with
    multiplicative replacement and the clr transform is applied. The result
    keeps the selected columns and the index left after filtering.
    """
    selected = df[select_rep_state_intensities(rep, state)]
    cols = selected.columns
    # index should be the same as protein/peptides
    filtered = drop_zero_rows(selected)
    index = filtered.index
    transformed = clr(multiplicative_replacement(filtered))
    return pd.DataFrame(transformed, index=index, columns=cols)
Пример #34
0
def rhoMetric(npArray):
    """Return the pairwise rho matrix between the columns of the clr-transformed input.

    Entry (i, j) is ``1 - var(ci - cj) / (var(ci) + var(cj))`` where ``ci``
    and ``cj`` are columns i and j of ``clr(npArray)``.
    """
    nColumns = npArray.shape[-1]
    rho = np.zeros(shape=(nColumns, nColumns))
    clrVals = clr(npArray)
    # visit every ordered column pair, row-major
    for i, j in np.ndindex(nColumns, nColumns):
        first = clrVals[:, i]
        second = clrVals[:, j]
        rho[i, j] = 1-(first-second).var()/(first.var()+second.var())
    return rho
Пример #35
0
 def aitchison(x, y, **kwds):
     """Aitchison distance: the Euclidean distance between clr(x) and clr(y).

     Extra keyword arguments are accepted and ignored.
     """
     clr_x = clr(x)
     clr_y = clr(y)
     return euclidean(clr_x, clr_y)
Пример #36
0
def CLRPermTest(otuDf, labels, statfunc=_rhoStat, nperms=999, adjMethod='fdr_bh', seed=110820, binary=False):
    """Per-OTU permutation test on centered-log-ratio (CLR) values.

    After scaling, multiplicative zero replacement and the CLR transform,
    ``statfunc`` is evaluated for the true labels and for ``nperms`` random
    permutations of the labels. Each OTU gets a two-sided permutation
    p-value, optionally adjusted for multiplicity.

    Parameters
    ----------
    otuDf : pd.DataFrame [samples x OTUs]
        Contains relative abundance [0-1] for all samples (rows) and OTUs (columns)
    labels: pd.Series (float)
        Variable of interest, one value per sample.
        Must share index with otuDf.
    statfunc : function
        Takes a np.array [n x k] and float index [n] as parameters and
        returns a 1-D array of the statistic [k].
    nperms : int
        Number of iterations for the permutation test.
    adjMethod : string
        Method name passed to ``_pvalueAdjust``.
        If the value is None or 'none' (any case) no adjustment is made.
    seed :int
        Seed for random permutation generation.
    binary : bool
        Cast the label values to bool when True, to float otherwise.

    Returns:
    --------
    qvalues : pd.Series [index: OTU]
        Q/P-values for each OTU computed.
    observed : pd.Series [index: OTU]
        Statistic for each OTU computed with the original label order."""

    nSamples, nOTUs = otuDf.shape

    labelValues = labels.values.astype(bool if binary else float)

    # Scale by otuDf.sum().
    # NOTE(review): with the default axis=0 this divides every OTU column
    # by its column total, not every sample row by its row total -- confirm.
    otuDf = otuDf / otuDf.sum()
    # Zero replacement, CLR, and back to a labelled DataFrame
    zeroReplaced = multiplicative_replacement(otuDf.values)
    otuCLR = pd.DataFrame(clr(zeroReplaced), index=otuDf.index, columns=otuDf.columns)

    obs = statfunc(otuCLR.values, labelValues)

    # null distribution: one row of per-OTU statistics per permutation
    np.random.seed(seed)
    samples = np.zeros((nperms, nOTUs))
    for permi in range(nperms):
        shuffledLabels = labelValues[np.random.permutation(nSamples)]
        samples[permi, :] = statfunc(otuCLR.values, shuffledLabels)

    # two-sided: compare absolute values, with the +1 correction
    nExtreme = (np.abs(samples) >= np.abs(obs[None, :])).sum(axis=0)
    pvalues = (nExtreme + 1) / (nperms + 1)

    skipAdjustment = adjMethod is None or adjMethod.lower() == 'none'
    if skipAdjustment:
        qvalues = pvalues
    else:
        qvalues = _pvalueAdjust(pvalues, method=adjMethod)

    otuNames = otuDf.columns
    qvalues = pd.Series(qvalues, index=otuNames)
    observed = pd.Series(obs, index=otuNames)

    return qvalues, observed