def test_split_training_random(self):
    """Random split with 2 held-out examples yields a 4/2 train/test partition.

    Shapes only are checked: the feature table has 7 observations and the
    design matrix has 3 covariate columns.
    """
    # Fix the RNG so the random example selection is reproducible.
    np.random.seed(0)
    covariates = np.column_stack((
        np.ones(6),
        np.array([0, 0, 1, 1, 0, 0]),
        np.arange(6),
    ))
    design = pd.DataFrame(
        covariates,
        columns=['Intercept', 'C(categorical)[T.b]', 'continuous'],
        index=['s1', 's2', 's3', 's4', 's5', 's6'],
    )
    trainX, testX, trainY, testY = split_training(
        self.table.to_dataframe().T,
        self.trimmed_metadata,
        design,
        training_column=None,
        num_random_test_examples=2,
    )
    # 6 samples minus 2 random test examples -> 4 train / 2 test rows.
    npt.assert_allclose(trainX.shape, np.array([4, 3]))
    npt.assert_allclose(trainY.shape, np.array([4, 7]))
    npt.assert_allclose(testX.shape, np.array([2, 3]))
    npt.assert_allclose(testY.shape, np.array([2, 7]))
def test_split_training_fixed(self):
    """A 'train' metadata column fully determines the train/test partition.

    Per the expected arrays below, rows 0-1 land in the test set and
    rows 2-5 in the training set, in original order.
    """
    np.random.seed(0)
    design = pd.DataFrame(
        np.column_stack((
            np.ones(6),
            np.array([0, 0, 1, 1, 0, 0]),
            np.arange(6),
        )),
        columns=['Intercept', 'C(categorical)[T.b]', 'continuous'],
        index=['s1', 's2', 's3', 's4', 's5', 's6'],
    )
    counts = self.table.to_dataframe().T
    obs_trainX, obs_testX, obs_trainY, obs_testY = split_training(
        counts,
        self.metadata,
        design,
        training_column='train',
        num_random_test_examples=2,
    )
    # Expected: first two rows are the test split, the remainder train.
    npt.assert_allclose(design.iloc[2:].values, obs_trainX)
    npt.assert_allclose(counts.iloc[2:].values, obs_trainY)
    npt.assert_allclose(design.iloc[:2].values, obs_testX)
    npt.assert_allclose(counts.iloc[:2].values, obs_testY)
def multinomial(table: biom.Table, metadata: Metadata, formula: str,
                training_column: str = None,
                num_random_test_examples: int = 10,
                epoch: int = 10,
                batch_size: int = 5,
                beta_prior: float = 1,
                learning_rate: float = 0.1,
                clipnorm: float = 10,
                min_sample_count: int = 10,
                min_feature_count: int = 10,
                summary_interval: int = 60) -> (pd.DataFrame):
    """Fit a multinomial regression of feature counts against a formula.

    Parameters
    ----------
    table : biom.Table
        Feature table of counts (features x samples).
    metadata : Metadata
        Sample metadata; converted to a DataFrame and matched to the table.
    formula : str
        Patsy-style formula used by ``match_and_filter`` to build the
        design matrix.
    training_column : str, optional
        Metadata column that marks training vs. test samples; when None,
        ``num_random_test_examples`` samples are held out at random
        (presumably — the split is delegated to ``split_training``).
    num_random_test_examples : int
        Number of randomly chosen held-out test samples.
    epoch : int
        Number of fitting epochs passed to ``model.fit``.
    batch_size : int
        Mini-batch size for the TensorFlow model.
    beta_prior : float
        Mean of the prior on the regression coefficients
        (``beta_mean`` in ``MultRegression``).
    learning_rate, clipnorm : float
        Optimizer step size and gradient-norm clip passed to the model.
    min_sample_count, min_feature_count : int
        Filtering thresholds applied by ``match_and_filter``.
    summary_interval : int
        Seconds between TensorBoard-style summaries (per ``model.fit``).

    Returns
    -------
    pd.DataFrame
        CLR-transformed coefficients, one row per feature (observation id)
        and one column per design-matrix covariate.
    """
    # load metadata and tables
    metadata = metadata.to_dataframe()
    # match them: align samples between table and metadata, apply the
    # formula, and drop low-count samples/features.
    table, metadata, design = match_and_filter(table, metadata,
                                               formula, training_column,
                                               num_random_test_examples,
                                               min_sample_count,
                                               min_feature_count)
    # convert to dense representation (samples x features after the .T)
    # NOTE(review): DataFrame.to_dense() was removed in pandas 1.0 —
    # this code presumably targets an older pandas; verify pin.
    dense_table = table.to_dataframe().to_dense().T
    # split up training and testing
    trainX, testX, trainY, testY = split_training(dense_table, metadata,
                                                  design, training_column,
                                                  num_random_test_examples)
    model = MultRegression(learning_rate=learning_rate, clipnorm=clipnorm,
                           beta_mean=beta_prior,
                           batch_size=batch_size,
                           save_path=None)
    # Build and fit the model inside a fresh TF graph/session so state
    # does not leak between invocations.
    with tf.Graph().as_default(), tf.Session() as session:
        model(session, trainX, trainY, testX, testY)
        model.fit(epoch=epoch, summary_interval=summary_interval,
                  checkpoint_interval=None)
    md_ids = np.array(design.columns)
    obs_ids = table.ids(axis='observation')
    # Prepend a zero column for the reference feature, then round-trip
    # through clr_inv/clr to re-center the coefficients.
    beta_ = clr(clr_inv(np.hstack((np.zeros((model.p, 1)), model.B))))
    beta_ = pd.DataFrame(
        beta_.T,
        columns=md_ids,
        index=obs_ids,
    )
    return beta_
def multinomial(table: biom.Table, metadata: Metadata, formula: str,
                training_column: str = DEFAULTS["training-column"],
                num_random_test_examples: int = (
                    DEFAULTS["num-random-test-examples"]
                ),
                epochs: int = DEFAULTS["epochs"],
                batch_size: int = DEFAULTS["batch-size"],
                differential_prior: float = DEFAULTS["differential-prior"],
                learning_rate: float = DEFAULTS["learning-rate"],
                clipnorm: float = DEFAULTS["clipnorm"],
                min_sample_count: int = DEFAULTS["min-sample-count"],
                min_feature_count: int = DEFAULTS["min-feature-count"],
                summary_interval: int = DEFAULTS["summary-interval"],
                random_seed: int = DEFAULTS["random-seed"],
                ) -> (
                    pd.DataFrame, qiime2.Metadata, skbio.OrdinationResults
                ):
    """Fit a multinomial regression and return differentials + diagnostics.

    Parameters
    ----------
    table : biom.Table
        Feature table of counts (features x samples).
    metadata : Metadata
        Sample metadata; converted to a DataFrame and matched to the table.
    formula : str
        Formula used by ``match_and_filter`` to build the design matrix.
    training_column : str
        Metadata column marking the train/test split; delegated to
        ``split_training``.
    num_random_test_examples : int
        Number of randomly held-out test samples when no training column
        is given.
    epochs, batch_size : int
        Fitting epochs and mini-batch size for the TensorFlow model.
    differential_prior : float
        Mean of the prior on the coefficients (``beta_mean``).
    learning_rate, clipnorm : float
        Optimizer step size and gradient-norm clip.
    min_sample_count, min_feature_count : int
        Filtering thresholds applied by ``match_and_filter``.
    summary_interval : int
        Interval between training summaries (per ``model.fit``).
    random_seed : int
        Seed for both the train/test split and TensorFlow.

    Returns
    -------
    differentials : pd.DataFrame
        Mean-centered coefficients, features x covariates.
    convergence_stats : qiime2.Metadata
        Per-iteration loss / cross-validation error / iteration number.
    biplot : skbio.OrdinationResults
        SVD-based regression biplot of the differentials (empty when only
        an intercept is present).
    """
    # load metadata and tables
    metadata = metadata.to_dataframe()
    # match them: align samples, apply the formula, drop low-count rows.
    table, metadata, design = match_and_filter(
        table, metadata, formula, min_sample_count, min_feature_count
    )
    # convert to dense representation (samples x features after the .T)
    # NOTE(review): DataFrame.to_dense() was removed in pandas 1.0 —
    # presumably an older pandas is pinned; verify.
    dense_table = table.to_dataframe().to_dense().T
    # split up training and testing
    trainX, testX, trainY, testY = split_training(
        dense_table, metadata, design,
        training_column, num_random_test_examples,
        seed=random_seed,
    )
    model = MultRegression(learning_rate=learning_rate, clipnorm=clipnorm,
                           beta_mean=differential_prior,
                           batch_size=batch_size,
                           save_path=None)
    # Fit inside a fresh TF graph/session so state does not leak between
    # invocations; seed TF for reproducibility.
    with tf.Graph().as_default(), tf.Session() as session:
        tf.set_random_seed(random_seed)
        model(session, trainX, trainY, testX, testY)
        loss, cv, its = model.fit(
            epochs=epochs,
            summary_interval=summary_interval,
            checkpoint_interval=None)
    md_ids = np.array(design.columns)
    obs_ids = table.ids(axis='observation')
    # Prepend a zero reference column, then center each row so the
    # coefficients sum to zero across features.
    beta_ = np.hstack((np.zeros((model.p, 1)), model.B))
    beta_ = beta_ - beta_.mean(axis=1).reshape(-1, 1)
    differentials = pd.DataFrame(
        beta_.T, columns=md_ids, index=obs_ids,
    )
    differentials.index.name = 'featureid'
    convergence_stats = pd.DataFrame(
        {
            'loss': loss,
            'cross-validation': cv,
            'iteration': its
        }
    )
    convergence_stats.index.name = 'id'
    # BUGFIX: np.str / np.float / np.int were deprecated aliases for the
    # builtins and were removed in NumPy 1.24 (AttributeError on modern
    # NumPy). The builtins are exact drop-in replacements.
    convergence_stats.index = convergence_stats.index.astype(str)
    convergence_stats['loss'] = convergence_stats['loss'].astype(float)
    convergence_stats['cross-validation'] = (
        convergence_stats['cross-validation'].astype(float)
    )
    convergence_stats['iteration'] = (
        convergence_stats['iteration'].astype(int)
    )
    # regression biplot
    if differentials.shape[-1] > 1:
        u, s, v = np.linalg.svd(differentials)
        pc_ids = ['PC%d' % i for i in range(len(s))]
        samples = pd.DataFrame(u[:, :len(s)] @ np.diag(s),
                               columns=pc_ids,
                               index=differentials.index)
        features = pd.DataFrame(v.T[:, :len(s)],
                                columns=pc_ids,
                                index=differentials.columns)
        short_method_name = 'regression_biplot'
        long_method_name = 'Multinomial regression biplot'
        eigvals = pd.Series(s, index=pc_ids)
        proportion_explained = eigvals**2 / (eigvals**2).sum()
        biplot = OrdinationResults(
            short_method_name, long_method_name, eigvals,
            samples=samples, features=features,
            proportion_explained=proportion_explained)
    else:
        # this is to handle the edge case with only intercepts.
        # BUGFIX: an empty pd.Series() without an explicit dtype emits a
        # DeprecationWarning (dtype default is changing); pin float64.
        biplot = OrdinationResults(
            '', '', pd.Series(dtype=np.float64), pd.DataFrame())
    return differentials, qiime2.Metadata(convergence_stats), biplot