示例#1
0
    def test_soils(self):
        """Fit MMvec and check the ranks of a known microbe/metabolite set.

        After training on the fixture tables held by the test case, every
        metabolite in ``microcoleus_metabolites`` is expected to have a
        strictly positive rank for the microbe 'rplo 1 (Cyanobacteria)'.
        """
        np.random.seed(1)
        tf.reset_default_graph()

        with tf.Graph().as_default(), tf.Session() as session:
            set_random_seed(0)
            model = MMvec(beta_1=0.8, beta_2=0.9, latent_dim=1,
                          learning_rate=1e-3)
            model(session,
                  coo_matrix(self.trainX.values), self.trainY.values,
                  coo_matrix(self.testX.values), self.testY.values)
            model.fit(epoch=1000)

            ranks = pd.DataFrame(
                model.ranks(),
                index=self.microbes.ids(axis='observation'),
                columns=self.metabolites.ids(axis='observation'))

            # Each metabolite is listed once; a repeated ID would only be
            # checked twice without adding coverage.
            microcoleus_metabolites = [
                '(3-methyladenine)', '7-methyladenine', '4-guanidinobutanoate',
                'uracil', 'xanthine', 'hypoxanthine', '(N6-acetyl-lysine)',
                'cytosine', 'N-acetylornithine',
                'succinate', 'adenosine', 'guanine', 'adenine']
            mprobs = ranks.loc['rplo 1 (Cyanobacteria)']
            selected = mprobs.loc[microcoleus_metabolites]

            # Collect the offending IDs (non-positive or NaN ranks) so that a
            # failure names the metabolites instead of reporting bare counts.
            not_positive = selected[~(selected > 0)].index.tolist()
            self.assertEqual(not_positive, [])
示例#2
0
    def test_fit(self):
        """Fit MMvec and compare the result with the reference parameters.

        The pairwise distances of the fitted ``model.U`` / ``model.V`` and the
        softmax of the fitted ranks must be rank-correlated (Spearman
        r > 0.5, p < 0.05) with the same quantities derived from the
        reference ``self.U``, ``self.V``, ``self.Ubias`` and ``self.Vbias``.
        The cross-validation value is also checked against a loose bound.
        """
        np.random.seed(1)
        tf.reset_default_graph()
        # Only the second dimension of trainX is needed: it sizes the leading
        # zero column prepended to the expected ranks below.
        d1 = self.trainX.shape[1]

        with tf.Graph().as_default(), tf.Session() as session:
            set_random_seed(0)
            model = MMvec(beta_1=0.8, beta_2=0.9, latent_dim=2)
            model(session,
                  coo_matrix(self.trainX.values), self.trainY.values,
                  coo_matrix(self.testX.values), self.testY.values)
            model.fit(epoch=1000)

            # Augment the reference factors with their bias terms so that
            # U_ @ V_ includes both biases in addition to U @ V.
            U_ = np.hstack(
                (np.ones((self.U.shape[0], 1)), self.Ubias, self.U))
            V_ = np.vstack(
                (self.Vbias, np.ones((1, self.V.shape[1])), self.V))

            u_r, u_p = spearmanr(pdist(model.U), pdist(self.U))
            v_r, v_p = spearmanr(pdist(model.V.T), pdist(self.V.T))

            res = softmax(model.ranks())
            exp = softmax(np.hstack((np.zeros((d1, 1)), U_ @ V_)))
            s_r, s_p = spearmanr(np.ravel(res), np.ravel(exp))

            self.assertGreater(u_r, 0.5)
            self.assertGreater(v_r, 0.5)
            self.assertGreater(s_r, 0.5)
            self.assertLess(u_p, 5e-2)
            self.assertLess(v_p, 5e-2)
            self.assertLess(s_p, 5e-2)

            # sanity check cross validation
            self.assertLess(model.cv.eval(), 500)
示例#3
0
    def test_cpu(self):
        """Smoke test: run a long MMvec fit with a large batch size.

        No assertions are made on the result; the test passes as long as
        building and fitting the model does not raise.
        """
        print('CPU run')
        np.random.seed(1)
        tf.reset_default_graph()

        with tf.Graph().as_default(), tf.Session() as session:
            set_random_seed(0)
            model = MMvec(beta_1=0.8, beta_2=0.9, latent_dim=2,
                          batch_size=2000)
            model(session,
                  coo_matrix(self.trainX.values), self.trainY.values,
                  coo_matrix(self.testX.values), self.testY.values)
            model.fit(epoch=10000)
示例#4
0
def paired_omics(microbes: biom.Table,
                 metabolites: biom.Table,
                 metadata: Metadata = None,
                 training_column: str = None,
                 num_testing_examples: int = 5,
                 min_feature_count: int = 10,
                 epochs: int = 100,
                 batch_size: int = 50,
                 latent_dim: int = 3,
                 input_prior: float = 1,
                 output_prior: float = 1,
                 learning_rate: float = 1e-3,
                 equalize_biplot: float = False,
                 arm_the_gpu: bool = False,
                 summary_interval: int = 60) -> (
                     pd.DataFrame, OrdinationResults, qiime2.Metadata
                 ):
    """Fit an MMvec model on paired tables and summarise the result.

    Parameters
    ----------
    microbes, metabolites : biom.Table
        Paired tables, forwarded to ``split_tables``.
    metadata : Metadata, optional
        Converted with ``to_dataframe()`` and forwarded to ``split_tables``.
    training_column : str, optional
        Forwarded to ``split_tables``.
    num_testing_examples : int
        Forwarded to ``split_tables`` as ``num_test``.
    min_feature_count : int
        Forwarded to ``split_tables`` as ``min_samples``.
    epochs : int
        Passed to ``model.fit`` as ``epoch``.
    batch_size, latent_dim, learning_rate
        Passed to the ``MMvec`` constructor.
    input_prior, output_prior : float
        Passed to the ``MMvec`` constructor as ``u_scale`` / ``v_scale``.
    equalize_biplot
        Used as a truth value: when true, the square root of the singular
        values scales both embeddings; otherwise the singular values scale
        the microbe embedding only.
    arm_the_gpu : bool
        Selects ``'/device:GPU:0'`` instead of ``'/cpu:0'``.
    summary_interval : int
        Forwarded to ``model.fit``.

    Returns
    -------
    ranks : pd.DataFrame
        ``model.ranks()`` transposed, with the index named ``'featureid'``.
    biplot : OrdinationResults
        Built from a truncated SVD of the column-centered ranks.
    convergence_stats : qiime2.Metadata
        Columns ``'loss'``, ``'cross-validation'`` and ``'iteration'``.
    """
    if metadata is not None:
        metadata = metadata.to_dataframe()

    # pick out the first GPU when requested, otherwise stay on the CPU
    device_name = '/device:GPU:0' if arm_the_gpu else '/cpu:0'

    # Note: there are a couple of biom -> pandas conversions taking
    # place here.  This is currently done on purpose, since we
    # haven't figured out how to handle sparse matrix multiplication
    # in the context of this algorithm.  That is a future consideration.
    res = split_tables(
        microbes, metabolites,
        metadata=metadata, training_column=training_column,
        num_test=num_testing_examples,
        min_samples=min_feature_count)

    (train_microbes_df, test_microbes_df,
     train_metabolites_df, test_metabolites_df) = res

    train_microbes_coo = coo_matrix(train_microbes_df.values)
    test_microbes_coo = coo_matrix(test_microbes_df.values)

    with tf.Graph().as_default(), tf.Session() as session:
        model = MMvec(
            latent_dim=latent_dim,
            u_scale=input_prior, v_scale=output_prior,
            batch_size=batch_size,
            device_name=device_name,
            learning_rate=learning_rate)
        model(session,
              train_microbes_coo, train_metabolites_df.values,
              test_microbes_coo, test_metabolites_df.values)

        loss, cv = model.fit(epoch=epochs, summary_interval=summary_interval)
        ranks = pd.DataFrame(model.ranks(), index=train_microbes_df.columns,
                             columns=train_metabolites_df.columns)

        # svds needs at least one component, so fall back to k=1 when
        # latent_dim is not positive ("fake it until you make it").
        n_components = latent_dim if latent_dim > 0 else 1
        u, s, v = svds(ranks - ranks.mean(axis=0), k=n_components)

        ranks = ranks.T
        ranks.index.name = 'featureid'
        # Reverse the component order returned by svds.
        s = s[::-1]
        u = u[:, ::-1]
        v = v[::-1, :]
        if equalize_biplot:
            microbe_embed = u @ np.sqrt(np.diag(s))
            metabolite_embed = v.T @ np.sqrt(np.diag(s))
        else:
            microbe_embed = u @ np.diag(s)
            metabolite_embed = v.T

        pc_ids = ['PC%d' % i for i in range(microbe_embed.shape[1])]
        features = pd.DataFrame(
            microbe_embed, columns=pc_ids,
            index=train_microbes_df.columns)
        samples = pd.DataFrame(
            metabolite_embed, columns=pc_ids,
            index=train_metabolites_df.columns)
        short_method_name = 'mmvec biplot'
        long_method_name = 'Multiomics mmvec biplot'
        eigvals = pd.Series(s, index=pc_ids)
        proportion_explained = pd.Series(s**2 / np.sum(s**2), index=pc_ids)
        biplot = OrdinationResults(
            short_method_name, long_method_name, eigvals,
            samples=samples, features=features,
            proportion_explained=proportion_explained)

        its = np.arange(len(loss))
        convergence_stats = pd.DataFrame(
            {
                'loss': loss,
                'cross-validation': cv,
                'iteration': its
            }
        )

        # The builtin str/float/int replace the np.str/np.float/np.int
        # aliases, which were removed in NumPy 1.24; the resulting dtypes
        # are the same.
        convergence_stats.index.name = 'id'
        convergence_stats.index = convergence_stats.index.astype(str)

        convergence_stats['loss'] = convergence_stats['loss'].astype(float)
        convergence_stats['cross-validation'] = (
            convergence_stats['cross-validation'].astype(float))
        convergence_stats['iteration'] = (
            convergence_stats['iteration'].astype(int))

        return ranks, biplot, qiime2.Metadata(convergence_stats)
示例#5
0
def paired_omics(
        microbes: biom.Table,
        metabolites: biom.Table,
        metadata: Metadata = None,
        training_column: str = None,
        num_testing_examples: int = 5,
        min_feature_count: int = 10,
        epochs: int = 100,
        batch_size: int = 50,
        latent_dim: int = 3,
        input_prior: float = 1,
        output_prior: float = 1,
        learning_rate: float = 0.001,
        summary_interval: int = 60) -> (pd.DataFrame, OrdinationResults):
    """Fit an MMvec model on paired tables and return ranks plus a biplot.

    Parameters
    ----------
    microbes, metabolites : biom.Table
        Paired tables, forwarded to ``split_tables``.
    metadata : Metadata, optional
        Converted with ``to_dataframe()`` and forwarded to ``split_tables``.
    training_column : str, optional
        Forwarded to ``split_tables``.
    num_testing_examples : int
        Forwarded to ``split_tables`` as ``num_test``.
    min_feature_count : int
        Forwarded to ``split_tables`` as ``min_samples``.
    epochs : int
        Passed to ``model.fit`` as ``epoch``.
    batch_size, latent_dim, learning_rate
        Passed to the ``MMvec`` constructor.
    input_prior, output_prior : float
        Passed to the ``MMvec`` constructor as ``u_scale`` / ``v_scale``.
    summary_interval : int
        Forwarded to ``model.fit``.

    Returns
    -------
    ranks : pd.DataFrame
        Row- and column-centered ranks rebuilt from the fitted ``U``, ``V``
        and bias terms, with a leading column of zeros prepended before
        centering.
    biplot : OrdinationResults
        Built from a rank-``latent_dim`` truncated SVD of ``ranks``.
    """
    if metadata is not None:
        metadata = metadata.to_dataframe()

    # Note: there are a couple of biom -> pandas conversions taking
    # place here.  This is currently done on purpose, since we
    # haven't figured out how to handle sparse matrix multiplication
    # in the context of this algorithm.  That is a future consideration.
    res = split_tables(microbes,
                       metabolites,
                       metadata=metadata,
                       training_column=training_column,
                       num_test=num_testing_examples,
                       min_samples=min_feature_count)

    (train_microbes_df, test_microbes_df, train_metabolites_df,
     test_metabolites_df) = res

    train_microbes_coo = coo_matrix(train_microbes_df.values)
    test_microbes_coo = coo_matrix(test_microbes_df.values)

    with tf.Graph().as_default(), tf.Session() as session:
        # batch_size is forwarded explicitly; otherwise the caller's value
        # would be silently ignored in favour of the MMvec default.
        model = MMvec(latent_dim=latent_dim,
                      u_scale=input_prior,
                      v_scale=output_prior,
                      batch_size=batch_size,
                      learning_rate=learning_rate)
        model(session, train_microbes_coo, train_metabolites_df.values,
              test_microbes_coo, test_metabolites_df.values)

        loss, cv = model.fit(epoch=epochs, summary_interval=summary_interval)

        U, V = model.U, model.V

        # Augment the factors with the bias terms so that U_ @ V_ includes
        # both biases in addition to U @ V.
        U_ = np.hstack((np.ones(
            (model.U.shape[0], 1)), model.Ubias.reshape(-1, 1), U))
        V_ = np.vstack(
            (model.Vbias.reshape(1, -1), np.ones((1, model.V.shape[1])), V))

        ranks = pd.DataFrame(np.hstack((np.zeros(
            (model.U.shape[0], 1)), U_ @ V_)),
                             index=train_microbes_df.columns,
                             columns=train_metabolites_df.columns)

        # Center across rows first, then across columns, before the SVD.
        ranks = ranks - ranks.mean(axis=1).values.reshape(-1, 1)
        ranks = ranks - ranks.mean(axis=0)
        u, s, v = svds(ranks, k=latent_dim)
        # Reverse the component order returned by svds.
        s = s[::-1]
        u = u[:, ::-1]
        v = v[::-1, :]
        microbe_embed = u @ np.diag(s)
        metabolite_embed = v.T

        pc_ids = ['PC%d' % i for i in range(microbe_embed.shape[1])]
        features = pd.DataFrame(microbe_embed,
                                columns=pc_ids,
                                index=train_microbes_df.columns)
        samples = pd.DataFrame(metabolite_embed,
                               columns=pc_ids,
                               index=train_metabolites_df.columns)
        short_method_name = 'mmvec biplot'
        long_method_name = 'Multiomics mmvec biplot'
        eigvals = pd.Series(s, index=pc_ids)
        proportion_explained = pd.Series(s**2 / np.sum(s**2), index=pc_ids)
        biplot = OrdinationResults(short_method_name,
                                   long_method_name,
                                   eigvals,
                                   samples=samples,
                                   features=features,
                                   proportion_explained=proportion_explained)

        return ranks, biplot