def test_soils(self):
    """Fit a 1-dimensional MMvec model and check Microcoleus co-occurrences.

    The model is trained on the fixture's train/test tables for 1000
    epochs.  The test passes when every metabolite listed below has a
    strictly positive rank in the 'rplo 1 (Cyanobacteria)' row.
    """
    np.random.seed(1)
    tf.reset_default_graph()

    with tf.Graph().as_default(), tf.Session() as session:
        set_random_seed(0)
        model = MMvec(beta_1=0.8, beta_2=0.9, latent_dim=1,
                      learning_rate=1e-3)
        # Microbe tables are handed over as sparse COO matrices,
        # metabolite tables as dense arrays.
        model(session,
              coo_matrix(self.trainX.values), self.trainY.values,
              coo_matrix(self.testX.values), self.testY.values)
        model.fit(epoch=1000)

        ranks = pd.DataFrame(
            model.ranks(),
            index=self.microbes.ids(axis='observation'),
            columns=self.metabolites.ids(axis='observation'))

        microcoleus_metabolites = [
            '(3-methyladenine)', '7-methyladenine', '4-guanidinobutanoate',
            'uracil', 'xanthine', 'hypoxanthine', '(N6-acetyl-lysine)',
            'cytosine', 'N-acetylornithine', 'succinate', 'adenosine',
            'guanine', 'adenine',
        ]
        mprobs = ranks.loc['rplo 1 (Cyanobacteria)']

        # Every expected metabolite must have a positive rank.
        self.assertEqual(np.sum(mprobs.loc[microcoleus_metabolites] > 0),
                         len(microcoleus_metabolites))
def test_fit(self):
    """Fit a 2-dimensional MMvec model and compare it with the fixture truth.

    Pairwise distances of the learned ``model.U`` / ``model.V`` and the
    softmax of the learned ranks are compared, via Spearman correlation,
    with the same quantities built from the fixture's ``U``, ``V``,
    ``Ubias`` and ``Vbias``.  Each correlation must exceed 0.5 with a
    p-value below 0.05, and the cross-validation value must be below 500.
    """
    np.random.seed(1)
    tf.reset_default_graph()
    # Only the second dimension of trainX is needed: it sets the number of
    # rows in the zero column prepended to the expected ranks below.
    _, d1 = self.trainX.shape

    with tf.Graph().as_default(), tf.Session() as session:
        set_random_seed(0)
        model = MMvec(beta_1=0.8, beta_2=0.9, latent_dim=2)
        model(session,
              coo_matrix(self.trainX.values), self.trainY.values,
              coo_matrix(self.testX.values), self.testY.values)
        model.fit(epoch=1000)

        # Augment the fixture factors with ones/bias columns and rows so
        # that U_ @ V_ includes the bias terms.
        U_ = np.hstack(
            (np.ones((self.U.shape[0], 1)), self.Ubias, self.U))
        V_ = np.vstack(
            (self.Vbias, np.ones((1, self.V.shape[1])), self.V))

        u_r, u_p = spearmanr(pdist(model.U), pdist(self.U))
        v_r, v_p = spearmanr(pdist(model.V.T), pdist(self.V.T))

        res = softmax(model.ranks())
        exp = softmax(np.hstack((np.zeros((d1, 1)), U_ @ V_)))
        s_r, s_p = spearmanr(np.ravel(res), np.ravel(exp))

        self.assertGreater(u_r, 0.5)
        self.assertGreater(v_r, 0.5)
        self.assertGreater(s_r, 0.5)
        self.assertLess(u_p, 5e-2)
        self.assertLess(v_p, 5e-2)
        self.assertLess(s_p, 5e-2)

        # sanity check cross validation
        self.assertLess(model.cv.eval(), 500)
def test_cpu(self):
    """Smoke test: a long fit with a large batch size runs to completion.

    Builds a 2-dimensional MMvec model with ``batch_size=2000`` and fits
    it for 10000 epochs.  There are no assertions; the test only fails if
    building or fitting the model raises.  No ``device_name`` is passed,
    so the run uses whatever device MMvec selects by default.
    """
    print('CPU run')
    np.random.seed(1)
    tf.reset_default_graph()

    with tf.Graph().as_default(), tf.Session() as session:
        set_random_seed(0)
        model = MMvec(beta_1=0.8, beta_2=0.9, latent_dim=2,
                      batch_size=2000)
        model(session,
              coo_matrix(self.trainX.values), self.trainY.values,
              coo_matrix(self.testX.values), self.testY.values)
        model.fit(epoch=10000)
def paired_omics(microbes: biom.Table,
                 metabolites: biom.Table,
                 metadata: Metadata = None,
                 training_column: str = None,
                 num_testing_examples: int = 5,
                 min_feature_count: int = 10,
                 epochs: int = 100,
                 batch_size: int = 50,
                 latent_dim: int = 3,
                 input_prior: float = 1,
                 output_prior: float = 1,
                 learning_rate: float = 1e-3,
                 equalize_biplot: float = False,
                 arm_the_gpu: bool = False,
                 summary_interval: int = 60) -> (
                     pd.DataFrame, OrdinationResults, qiime2.Metadata
                 ):
    """Fit an MMvec model on paired tables and summarize the result.

    Parameters
    ----------
    microbes, metabolites : biom.Table
        Paired tables, split into train/test sets by ``split_tables``.
    metadata : Metadata, optional
        Converted to a dataframe and forwarded to ``split_tables``.
    training_column : str, optional
        Forwarded to ``split_tables`` as ``training_column``.
    num_testing_examples : int
        Forwarded to ``split_tables`` as ``num_test``.
    min_feature_count : int
        Forwarded to ``split_tables`` as ``min_samples``.
    epochs : int
        Forwarded to ``model.fit`` as ``epoch``.
    batch_size : int
        Forwarded to ``MMvec``.
    latent_dim : int
        Latent dimension of the model and number of SVD components kept
        for the biplot.  Values <= 0 fall back to one SVD component.
    input_prior, output_prior : float
        Forwarded to ``MMvec`` as ``u_scale`` and ``v_scale``.
    learning_rate : float
        Forwarded to ``MMvec``.
    equalize_biplot : float
        Used as a truthy flag.  When set, the square root of the singular
        values is applied to both embeddings; otherwise the singular
        values scale only the microbe embedding.
        NOTE(review): annotated ``float`` but used as a boolean; left
        unchanged in case the plugin framework reads the annotation.
    arm_the_gpu : bool
        Run on ``/device:GPU:0`` when true, otherwise on ``/cpu:0``.
    summary_interval : int
        Forwarded to ``model.fit``.

    Returns
    -------
    pd.DataFrame
        Transposed model ranks, with the index named ``featureid``.
    OrdinationResults
        Biplot built from the SVD of the column-centered ranks.
    qiime2.Metadata
        Per-iteration ``loss``, ``cross-validation`` and ``iteration``.
    """
    if metadata is not None:
        metadata = metadata.to_dataframe()

    # pick out the first GPU when requested, otherwise stay on the CPU
    device_name = '/device:GPU:0' if arm_the_gpu else '/cpu:0'

    # Note: there are a couple of biom -> pandas conversions taking
    # place here.  This is currently done on purpose, since we
    # haven't figured out how to handle sparse matrix multiplication
    # in the context of this algorithm.  That is a future consideration.
    res = split_tables(
        microbes, metabolites,
        metadata=metadata, training_column=training_column,
        num_test=num_testing_examples,
        min_samples=min_feature_count)

    (train_microbes_df, test_microbes_df,
     train_metabolites_df, test_metabolites_df) = res

    train_microbes_coo = coo_matrix(train_microbes_df.values)
    test_microbes_coo = coo_matrix(test_microbes_df.values)

    with tf.Graph().as_default(), tf.Session() as session:
        model = MMvec(
            latent_dim=latent_dim,
            u_scale=input_prior, v_scale=output_prior,
            batch_size=batch_size,
            device_name=device_name,
            learning_rate=learning_rate)
        model(session,
              train_microbes_coo, train_metabolites_df.values,
              test_microbes_coo, test_metabolites_df.values)

        loss, cv = model.fit(epoch=epochs, summary_interval=summary_interval)
        ranks = pd.DataFrame(model.ranks(),
                             index=train_microbes_df.columns,
                             columns=train_metabolites_df.columns)

    # fake it until you make it: a non-positive latent_dim still yields
    # a single-component decomposition.
    n_components = latent_dim if latent_dim > 0 else 1
    u, s, v = svds(ranks - ranks.mean(axis=0), k=n_components)

    ranks = ranks.T
    ranks.index.name = 'featureid'

    # Reverse the component order returned by svds (presumably so that the
    # largest singular value comes first).
    s = s[::-1]
    u = u[:, ::-1]
    v = v[::-1, :]

    if equalize_biplot:
        root_s = np.sqrt(np.diag(s))
        microbe_embed = u @ root_s
        metabolite_embed = v.T @ root_s
    else:
        microbe_embed = u @ np.diag(s)
        metabolite_embed = v.T

    pc_ids = ['PC%d' % i for i in range(microbe_embed.shape[1])]
    features = pd.DataFrame(
        microbe_embed, columns=pc_ids,
        index=train_microbes_df.columns)
    samples = pd.DataFrame(
        metabolite_embed, columns=pc_ids,
        index=train_metabolites_df.columns)

    short_method_name = 'mmvec biplot'
    long_method_name = 'Multiomics mmvec biplot'
    eigvals = pd.Series(s, index=pc_ids)
    proportion_explained = pd.Series(s**2 / np.sum(s**2), index=pc_ids)
    biplot = OrdinationResults(
        short_method_name, long_method_name, eigvals,
        samples=samples, features=features,
        proportion_explained=proportion_explained)

    convergence_stats = pd.DataFrame(
        {
            'loss': loss,
            'cross-validation': cv,
            'iteration': np.arange(len(loss)),
        }
    )
    convergence_stats.index.name = 'id'
    # Builtin types are used for the casts: the np.str / np.float / np.int
    # aliases no longer exist in current NumPy releases.
    convergence_stats.index = convergence_stats.index.astype(str)
    convergence_stats['loss'] = convergence_stats['loss'].astype(float)
    convergence_stats['cross-validation'] = (
        convergence_stats['cross-validation'].astype(float))
    convergence_stats['iteration'] = (
        convergence_stats['iteration'].astype(int))

    return ranks, biplot, qiime2.Metadata(convergence_stats)
def paired_omics(microbes: biom.Table,
                 metabolites: biom.Table,
                 metadata: Metadata = None,
                 training_column: str = None,
                 num_testing_examples: int = 5,
                 min_feature_count: int = 10,
                 epochs: int = 100,
                 batch_size: int = 50,
                 latent_dim: int = 3,
                 input_prior: float = 1,
                 output_prior: float = 1,
                 learning_rate: float = 0.001,
                 summary_interval: int = 60) -> (
                     pd.DataFrame, OrdinationResults):
    """Fit an MMvec model on paired tables and return ranks plus a biplot.

    Parameters
    ----------
    microbes, metabolites : biom.Table
        Paired tables, split into train/test sets by ``split_tables``.
    metadata : Metadata, optional
        Converted to a dataframe and forwarded to ``split_tables``.
    training_column : str, optional
        Forwarded to ``split_tables`` as ``training_column``.
    num_testing_examples : int
        Forwarded to ``split_tables`` as ``num_test``.
    min_feature_count : int
        Forwarded to ``split_tables`` as ``min_samples``.
    epochs : int
        Forwarded to ``model.fit`` as ``epoch``.
    batch_size : int
        Forwarded to ``MMvec``.
    latent_dim : int
        Latent dimension of the model and number of SVD components kept.
    input_prior, output_prior : float
        Forwarded to ``MMvec`` as ``u_scale`` and ``v_scale``.
    learning_rate : float
        Forwarded to ``MMvec``.
    summary_interval : int
        Forwarded to ``model.fit``.

    Returns
    -------
    pd.DataFrame
        Ranks rebuilt from the fitted factors and biases, centered
        across rows and then across columns.
    OrdinationResults
        Biplot built from the SVD of the centered ranks.
    """
    if metadata is not None:
        metadata = metadata.to_dataframe()

    # Note: there are a couple of biom -> pandas conversions taking
    # place here.  This is currently done on purpose, since we
    # haven't figured out how to handle sparse matrix multiplication
    # in the context of this algorithm.  That is a future consideration.
    res = split_tables(microbes, metabolites,
                       metadata=metadata,
                       training_column=training_column,
                       num_test=num_testing_examples,
                       min_samples=min_feature_count)

    (train_microbes_df, test_microbes_df,
     train_metabolites_df, test_metabolites_df) = res

    train_microbes_coo = coo_matrix(train_microbes_df.values)
    test_microbes_coo = coo_matrix(test_microbes_df.values)

    with tf.Graph().as_default(), tf.Session() as session:
        # batch_size is passed through so the caller's value takes effect.
        model = MMvec(latent_dim=latent_dim,
                      u_scale=input_prior, v_scale=output_prior,
                      batch_size=batch_size,
                      learning_rate=learning_rate)
        model(session,
              train_microbes_coo, train_metabolites_df.values,
              test_microbes_coo, test_metabolites_df.values)

        loss, cv = model.fit(epoch=epochs, summary_interval=summary_interval)

        # Augment the factors with ones/bias columns and rows so that the
        # product U_ @ V_ includes the bias terms.
        n_rows = model.U.shape[0]
        U_ = np.hstack((np.ones((n_rows, 1)),
                        model.Ubias.reshape(-1, 1),
                        model.U))
        V_ = np.vstack((model.Vbias.reshape(1, -1),
                        np.ones((1, model.V.shape[1])),
                        model.V))

        # A zero column is prepended before wrapping the product in a frame.
        ranks = pd.DataFrame(
            np.hstack((np.zeros((n_rows, 1)), U_ @ V_)),
            index=train_microbes_df.columns,
            columns=train_metabolites_df.columns)

    # Center across rows first, then across columns.
    ranks = ranks - ranks.mean(axis=1).values.reshape(-1, 1)
    ranks = ranks - ranks.mean(axis=0)

    u, s, v = svds(ranks, k=latent_dim)
    # Reverse the component order returned by svds (presumably so that the
    # largest singular value comes first).
    s = s[::-1]
    u = u[:, ::-1]
    v = v[::-1, :]

    microbe_embed = u @ np.diag(s)
    metabolite_embed = v.T

    pc_ids = ['PC%d' % i for i in range(microbe_embed.shape[1])]
    features = pd.DataFrame(microbe_embed, columns=pc_ids,
                            index=train_microbes_df.columns)
    samples = pd.DataFrame(metabolite_embed, columns=pc_ids,
                           index=train_metabolites_df.columns)

    short_method_name = 'mmvec biplot'
    long_method_name = 'Multiomics mmvec biplot'
    eigvals = pd.Series(s, index=pc_ids)
    proportion_explained = pd.Series(s**2 / np.sum(s**2), index=pc_ids)
    biplot = OrdinationResults(short_method_name, long_method_name, eigvals,
                               samples=samples, features=features,
                               proportion_explained=proportion_explained)

    return ranks, biplot