def test_split_tables_bad_column(self):
    """split_tables should raise when asked to split on column 'bad'.

    The fixture metadata is passed together with ``training_column='bad'``
    (presumably a column the fixture metadata does not define -- confirm
    against setUp); any exception type satisfies the check.
    """
    with self.assertRaises(Exception):
        options = {
            'metadata': self.metadata,
            'training_column': 'bad',
            'num_test': 10,
            'min_samples': 0,
        }
        split_tables(self.otu_table, self.metabolite_table, **options)
def test_split_tables_random_filter(self):
    """Check the shapes split_tables returns for a split without metadata.

    With ``num_test=2`` and ``min_samples=2`` the four returned tables are
    expected to have shapes (3, 6), (2, 6), (3, 9) and (2, 9) for
    train/test microbes and train/test metabolites respectively.
    """
    splits = split_tables(self.otu_table, self.metabolite_table,
                          num_test=2, min_samples=2)
    train_microbes, test_microbes, train_metabolites, test_metabolites = splits

    # (table, expected shape) pairs, checked in the order they are returned.
    expectations = (
        (train_microbes, [3, 6]),
        (test_microbes, [2, 6]),
        (train_metabolites, [3, 9]),
        (test_metabolites, [2, 9]),
    )
    for table, expected_shape in expectations:
        npt.assert_allclose(table.shape, np.array(expected_shape))
def test_split_tables_train_column(self):
    """Check the shapes split_tables returns when given a training column.

    The fixture metadata is supplied with ``training_column='testing'``,
    ``num_test=10`` and ``min_samples=0``.  The four returned tables are
    expected to have shapes (3, 7), (2, 7), (3, 9) and (2, 9) for
    train/test microbes and train/test metabolites respectively.
    """
    splits = split_tables(self.otu_table, self.metabolite_table,
                          metadata=self.metadata,
                          training_column='testing',
                          num_test=10, min_samples=0)
    train_microbes, test_microbes, train_metabolites, test_metabolites = splits

    # (table, expected shape) pairs, checked in the order they are returned.
    expectations = (
        (train_microbes, [3, 7]),
        (test_microbes, [2, 7]),
        (train_metabolites, [3, 9]),
        (test_metabolites, [2, 9]),
    )
    for table, expected_shape in expectations:
        npt.assert_allclose(table.shape, np.array(expected_shape))
def mmvec(microbes: biom.Table,
          metabolites: biom.Table,
          metadata: Metadata = None,
          training_column: str = None,
          num_testing_examples: int = 5,
          min_feature_count: int = 10,
          epochs: int = 100,
          batch_size: int = 50,
          latent_dim: int = 3,
          input_prior: float = 1,
          output_prior: float = 1,
          learning_rate: float = 0.001,
          summary_interval: int = 60) -> (pd.DataFrame, OrdinationResults):
    """Fit an MMvec model and return its ranks table and a biplot.

    Parameters
    ----------
    microbes, metabolites : biom.Table
        Input tables; both are handed to ``split_tables`` to obtain the
        training and testing data frames.
    metadata : Metadata, optional
        If given, converted with ``to_dataframe()`` and passed to
        ``split_tables``.
    training_column : str, optional
        Forwarded to ``split_tables`` as ``training_column``.
    num_testing_examples : int
        Forwarded to ``split_tables`` as ``num_test``.
    min_feature_count : int
        Forwarded to ``split_tables`` as ``min_samples``.
    epochs : int
        Forwarded to ``model.fit`` as ``epoch``.
    batch_size : int
        Forwarded to the ``MMvec`` constructor as ``batch_size``.
    latent_dim : int
        Latent dimension of the model; also the number of singular
        triplets kept for the biplot.
    input_prior, output_prior : float
        Forwarded to ``MMvec`` as ``u_scale`` and ``v_scale``.
    learning_rate : float
        Forwarded to the ``MMvec`` constructor.
    summary_interval : int
        Forwarded to ``model.fit``.

    Returns
    -------
    ranks : pd.DataFrame
        Row- and column-centered matrix indexed by the columns of the
        training microbe table, with the columns of the training
        metabolite table as its columns.
    biplot : OrdinationResults
        Truncated SVD of ``ranks`` with axes ordered from the largest to
        the smallest singular value.  Microbe embeddings are stored as
        ``features`` and metabolite embeddings as ``samples``.
    """
    if metadata is not None:
        metadata = metadata.to_dataframe()

    # Note: there are a couple of biom -> pandas conversions taking
    # place here.  This is currently done on purpose, since we
    # haven't figured out how to handle sparse matrix multiplication
    # in the context of this algorithm.  That is a future consideration.
    res = split_tables(microbes, metabolites,
                       metadata=metadata,
                       training_column=training_column,
                       num_test=num_testing_examples,
                       min_samples=min_feature_count)
    (train_microbes_df, test_microbes_df,
     train_metabolites_df, test_metabolites_df) = res

    train_microbes_coo = coo_matrix(train_microbes_df.values)
    test_microbes_coo = coo_matrix(test_microbes_df.values)

    with tf.Graph().as_default(), tf.Session() as session:
        # batch_size used to be accepted by this function but never reached
        # the model, so the caller's setting was silently ignored.
        model = MMvec(latent_dim=latent_dim,
                      u_scale=input_prior, v_scale=output_prior,
                      batch_size=batch_size,
                      learning_rate=learning_rate)
        model(session,
              train_microbes_coo, train_metabolites_df.values,
              test_microbes_coo, test_metabolites_df.values)
        model.fit(epoch=epochs, summary_interval=summary_interval)

        U, V = model.U, model.V
        # Augment the factors so that a single product also adds the biases:
        # [1 | Ubias | U] @ [Vbias ; 1 ; V] == Vbias + Ubias + U @ V.
        U_ = np.hstack(
            (np.ones((U.shape[0], 1)), model.Ubias.reshape(-1, 1), U))
        V_ = np.vstack(
            (model.Vbias.reshape(1, -1), np.ones((1, V.shape[1])), V))

        # A zero column is prepended so the width matches the metabolite
        # columns (presumably the first metabolite is the reference whose
        # coefficient is fixed at zero -- confirm against MMvec).
        ranks = pd.DataFrame(
            np.hstack((np.zeros((U.shape[0], 1)), U_ @ V_)),
            index=train_microbes_df.columns,
            columns=train_metabolites_df.columns)

        # Double-center: remove row means, then column means.
        ranks = ranks - ranks.mean(axis=1).values.reshape(-1, 1)
        ranks = ranks - ranks.mean(axis=0)

        u, s, v = svds(ranks.values, k=latent_dim)
        # svds does not return the singular values largest-first, so sort
        # the triplets explicitly; otherwise PC0 would be the weakest axis
        # and the eigenvalues would be listed in increasing order.
        order = np.argsort(s)[::-1]
        s, u, v = s[order], u[:, order], v[order, :]

        microbe_embed = u @ np.diag(s)
        metabolite_embed = v.T

        pc_ids = ['PC%d' % i for i in range(microbe_embed.shape[1])]
        features = pd.DataFrame(microbe_embed, columns=pc_ids,
                                index=train_microbes_df.columns)
        samples = pd.DataFrame(metabolite_embed, columns=pc_ids,
                               index=train_metabolites_df.columns)

        short_method_name = 'mmvec biplot'
        long_method_name = 'Multiomics mmvec biplot'
        eigvals = pd.Series(s, index=pc_ids)
        proportion_explained = pd.Series(s**2 / np.sum(s**2), index=pc_ids)
        biplot = OrdinationResults(
            short_method_name, long_method_name, eigvals,
            samples=samples, features=features,
            proportion_explained=proportion_explained)

    return ranks, biplot