예제 #1
0
    def test_split_training_random(self):
        """With no training column, test rows should be drawn at random.

        Only the shapes of the four returned arrays are checked here,
        since the row selection is random (seeded for reproducibility).
        """
        np.random.seed(0)
        sample_ids = ['s1', 's2', 's3', 's4', 's5', 's6']
        covariates = np.vstack(
            (
                np.ones(6),
                np.array([0, 0, 1, 1, 0, 0]),
                np.arange(6)
            )
        ).T
        design = pd.DataFrame(
            covariates,
            columns=['Intercept', 'C(categorical)[T.b]', 'continuous'],
            index=sample_ids
        )
        trainX, testX, trainY, testY = split_training(
            self.table.to_dataframe().T,
            self.trimmed_metadata, design,
            training_column=None,
            num_random_test_examples=2)

        # 6 samples split into 4 training and 2 randomly chosen test rows;
        # X carries 3 covariates, Y carries 7 features.
        npt.assert_allclose(trainX.shape, np.array([4, 3]))
        npt.assert_allclose(trainY.shape, np.array([4, 7]))
        npt.assert_allclose(testX.shape, np.array([2, 3]))
        npt.assert_allclose(testY.shape, np.array([2, 7]))
예제 #2
0
    def test_split_training_fixed(self):
        """With a training column, the fixed train/test assignment is honoured.

        The 'train' metadata column marks the first two samples as the test
        set, so the split must be deterministic regardless of the seed.
        """
        np.random.seed(0)
        design = pd.DataFrame(
            np.vstack((np.ones(6),
                       np.array([0, 0, 1, 1, 0, 0]),
                       np.arange(6))).T,
            columns=['Intercept', 'C(categorical)[T.b]', 'continuous'],
            index=['s1', 's2', 's3', 's4', 's5', 's6']
        )
        counts = self.table.to_dataframe().T
        res_trainX, res_testX, res_trainY, res_testY = split_training(
            counts,
            self.metadata, design,
            training_column='train',
            num_random_test_examples=2)

        # First two samples are the designated test set; the rest train.
        npt.assert_allclose(design.iloc[2:].values, res_trainX)
        npt.assert_allclose(counts.iloc[2:].values, res_trainY)
        npt.assert_allclose(design.iloc[:2].values, res_testX)
        npt.assert_allclose(counts.iloc[:2].values, res_testY)
예제 #3
0
def multinomial(table: biom.Table,
                metadata: Metadata,
                formula: str,
                training_column: str = None,
                num_random_test_examples: int = 10,
                epoch: int = 10,
                batch_size: int = 5,
                beta_prior: float = 1,
                learning_rate: float = 0.1,
                clipnorm: float = 10,
                min_sample_count: int = 10,
                min_feature_count: int = 10,
                summary_interval: int = 60) -> (pd.DataFrame):
    """Fit a multinomial regression model to a feature table.

    Builds a design matrix from ``formula`` and ``metadata``, trains a
    ``MultRegression`` model on the (dense) feature table inside a
    TensorFlow 1.x session, and returns the fitted coefficients.

    NOTE(review): the signature is left untouched deliberately — QIIME2-style
    plugins introspect parameter names/defaults, so renaming or re-annotating
    them could break registration; confirm before changing.

    Parameters
    ----------
    table : biom.Table
        Feature table (features x samples).
    metadata : Metadata
        Sample metadata; converted to a DataFrame before matching.
    formula : str
        Formula used by ``match_and_filter`` to build the design matrix
        (presumably patsy-style — confirm against that helper).
    training_column : str, optional
        Metadata column marking the train/test split; if None,
        ``num_random_test_examples`` rows are held out at random.
    epoch, batch_size, beta_prior, learning_rate, clipnorm : numeric
        Optimization hyperparameters forwarded to ``MultRegression``.
    min_sample_count, min_feature_count : int
        Abundance thresholds applied by ``match_and_filter``.
    summary_interval : int
        Seconds between TensorBoard summaries during ``model.fit``.

    Returns
    -------
    pd.DataFrame
        Coefficients, indexed by feature id with one column per
        design-matrix covariate.
    """

    # load metadata and tables
    metadata = metadata.to_dataframe()

    # match them
    table, metadata, design = match_and_filter(table, metadata, formula,
                                               training_column,
                                               num_random_test_examples,
                                               min_sample_count,
                                               min_feature_count)

    # convert to dense representation
    # NOTE(review): DataFrame.to_dense() is removed in modern pandas —
    # this code presumably targets an older pandas; verify the pin.
    dense_table = table.to_dataframe().to_dense().T

    # split up training and testing
    trainX, testX, trainY, testY = split_training(dense_table, metadata,
                                                  design, training_column,
                                                  num_random_test_examples)

    model = MultRegression(learning_rate=learning_rate,
                           clipnorm=clipnorm,
                           beta_mean=beta_prior,
                           batch_size=batch_size,
                           save_path=None)
    # TensorFlow 1.x graph/session style: build the model, then fit.
    with tf.Graph().as_default(), tf.Session() as session:
        model(session, trainX, trainY, testX, testY)

        model.fit(epoch=epoch,
                  summary_interval=summary_interval,
                  checkpoint_interval=None)

    md_ids = np.array(design.columns)
    obs_ids = table.ids(axis='observation')

    # Pad a zero column for the reference feature, then clr(clr_inv(x))
    # recenters each row to zero mean (clr_inv maps to the simplex,
    # clr maps back subtracting the row mean).
    beta_ = clr(clr_inv(np.hstack((np.zeros((model.p, 1)), model.B))))

    beta_ = pd.DataFrame(
        beta_.T,
        columns=md_ids,
        index=obs_ids,
    )
    return beta_
예제 #4
0
def multinomial(table: biom.Table,
                metadata: Metadata,
                formula: str,
                training_column: str = DEFAULTS["training-column"],
                num_random_test_examples: int = (
                    DEFAULTS["num-random-test-examples"]
                ),
                epochs: int = DEFAULTS["epochs"],
                batch_size: int = DEFAULTS["batch-size"],
                differential_prior: float = DEFAULTS["differential-prior"],
                learning_rate: float = DEFAULTS["learning-rate"],
                clipnorm: float = DEFAULTS["clipnorm"],
                min_sample_count: int = DEFAULTS["min-sample-count"],
                min_feature_count: int = DEFAULTS["min-feature-count"],
                summary_interval: int = DEFAULTS["summary-interval"],
                random_seed: int = DEFAULTS["random-seed"],
                ) -> (
                    pd.DataFrame, qiime2.Metadata, skbio.OrdinationResults
                ):
    """Fit a multinomial regression model and summarize the result.

    Builds a design matrix from ``formula`` via ``match_and_filter``,
    trains a ``MultRegression`` model in a TensorFlow 1.x session, and
    returns the centered differentials, the convergence statistics as
    QIIME2 metadata, and a regression biplot.

    Fix: ``np.str``/``np.float``/``np.int`` were deprecated aliases of the
    builtins and are removed in NumPy >= 1.24 (AttributeError at runtime);
    the builtins are drop-in replacements. The empty ``pd.Series()`` in the
    intercept-only edge case now passes an explicit dtype to avoid the
    pandas deprecation of dtype-less empty series.

    Parameters
    ----------
    table : biom.Table
        Feature table (features x samples).
    metadata : Metadata
        Sample metadata; converted to a DataFrame before matching.
    formula : str
        Formula used to construct the design matrix.
    training_column : str
        Metadata column marking the train/test split; when unset,
        ``num_random_test_examples`` rows are held out at random.
    epochs, batch_size, differential_prior, learning_rate, clipnorm : numeric
        Optimization hyperparameters forwarded to ``MultRegression``.
    min_sample_count, min_feature_count : int
        Abundance thresholds applied by ``match_and_filter``.
    summary_interval : int
        Seconds between TensorBoard summaries during ``model.fit``.
    random_seed : int
        Seed forwarded to TensorFlow and ``split_training``.

    Returns
    -------
    (pd.DataFrame, qiime2.Metadata, skbio.OrdinationResults)
        Differentials (features x covariates, rows centered to zero mean),
        convergence statistics, and the regression biplot.
    """

    # load metadata and tables
    metadata = metadata.to_dataframe()
    # match them
    table, metadata, design = match_and_filter(
        table, metadata,
        formula, min_sample_count, min_feature_count
    )

    # convert to dense representation
    # NOTE(review): DataFrame.to_dense() is removed in modern pandas —
    # presumably this targets an older pandas pin; verify.
    dense_table = table.to_dataframe().to_dense().T

    # split up training and testing
    trainX, testX, trainY, testY = split_training(
        dense_table, metadata, design,
        training_column, num_random_test_examples,
        seed=random_seed,
    )

    model = MultRegression(learning_rate=learning_rate, clipnorm=clipnorm,
                           beta_mean=differential_prior,
                           batch_size=batch_size,
                           save_path=None)
    # TensorFlow 1.x graph/session style: seed, build, then fit.
    with tf.Graph().as_default(), tf.Session() as session:
        tf.set_random_seed(random_seed)
        model(session, trainX, trainY, testX, testY)

        loss, cv, its = model.fit(
            epochs=epochs,
            summary_interval=summary_interval,
            checkpoint_interval=None)

    md_ids = np.array(design.columns)
    obs_ids = table.ids(axis='observation')

    # Pad a zero column for the reference feature, then center each row
    # so the differentials sum to zero across features.
    beta_ = np.hstack((np.zeros((model.p, 1)), model.B))
    beta_ = beta_ - beta_.mean(axis=1).reshape(-1, 1)

    differentials = pd.DataFrame(
        beta_.T, columns=md_ids, index=obs_ids,
    )
    differentials.index.name = 'featureid'

    convergence_stats = pd.DataFrame(
        {
            'loss': loss,
            'cross-validation': cv,
            'iteration': its
        }
    )
    convergence_stats.index.name = 'id'
    # qiime2.Metadata requires string ids and concrete column dtypes;
    # use the builtins (np.str/np.float/np.int were removed in NumPy 1.24).
    convergence_stats.index = convergence_stats.index.astype(str)
    convergence_stats = convergence_stats.astype(
        {'loss': float, 'cross-validation': float, 'iteration': int}
    )

    # regression biplot
    if differentials.shape[-1] > 1:
        u, s, v = np.linalg.svd(differentials)
        pc_ids = ['PC%d' % i for i in range(len(s))]
        samples = pd.DataFrame(u[:, :len(s)] @ np.diag(s),
                               columns=pc_ids, index=differentials.index)
        features = pd.DataFrame(v.T[:, :len(s)],
                                columns=pc_ids, index=differentials.columns)
        short_method_name = 'regression_biplot'
        long_method_name = 'Multinomial regression biplot'
        eigvals = pd.Series(s, index=pc_ids)
        proportion_explained = eigvals**2 / (eigvals**2).sum()
        biplot = OrdinationResults(
            short_method_name, long_method_name, eigvals,
            samples=samples, features=features,
            proportion_explained=proportion_explained)
    else:
        # this is to handle the edge case with only intercepts;
        # dtype is explicit to avoid the empty-Series dtype deprecation
        biplot = OrdinationResults('', '',
                                   pd.Series(dtype='float64'),
                                   pd.DataFrame())

    return differentials, qiime2.Metadata(convergence_stats), biplot