def test_clr_inv(self):
    # clr_inv applied to the fixture must match the expected compositions.
    recovered = clr_inv(self.rdata1)
    npt.assert_allclose(recovered, self.ortho1)

    # Round trip: clr(clr_inv(x)) must give x back.
    round_trip = clr(clr_inv(self.rdata1))
    npt.assert_allclose(round_trip, self.rdata1)

    # make sure that inplace modification is not occurring
    clr_inv(self.rdata1)
    untouched = np.array(
        [[0.70710678, -0.70710678, 0., 0.],
         [0.40824829, 0.40824829, -0.81649658, 0.],
         [0.28867513, 0.28867513, 0.28867513, -0.8660254]])
    npt.assert_allclose(self.rdata1, untouched)
def test_clr_inv(self):
    # Forward check against the stored expectation.
    npt.assert_allclose(clr_inv(self.rdata1), self.ortho1)

    # clr should undo clr_inv on this fixture.
    back_in_clr_space = clr(clr_inv(self.rdata1))
    npt.assert_allclose(back_in_clr_space, self.rdata1)

    # make sure that inplace modification is not occurring
    clr_inv(self.rdata1)
    expected_rdata1 = np.array([
        [0.70710678, -0.70710678, 0., 0.],
        [0.40824829, 0.40824829, -0.81649658, 0.],
        [0.28867513, 0.28867513, 0.28867513, -0.8660254],
    ])
    npt.assert_allclose(self.rdata1, expected_rdata1)
def setUp(self):
    # Simulate a paired microbe/metabolite data set with fixed seeds.
    np.random.seed(1)
    simulated = random_multimodal(
        num_microbes=8, num_metabolites=8, num_samples=150,
        latent_dim=2, sigmaQ=2,
        microbe_total=1000, metabolite_total=10000, seed=1)
    (self.microbes, self.metabolites, self.X, self.B,
     self.U, self.Ubias, self.V, self.Vbias) = simulated

    # Both simulated tables are unpacked as two-dimensional; only the
    # number of microbe columns is needed further down.
    _, num_microbe_cols = self.microbes.shape
    _, _ = self.metabolites.shape

    # Re-wrap the tables as biom.Table objects (transposed values, with the
    # original columns passed first and the original index second).
    microbes_df = self.microbes
    self.microbes = biom.Table(
        microbes_df.values.T, microbes_df.columns, microbes_df.index)
    metabolites_df = self.metabolites
    self.metabolites = biom.Table(
        metabolites_df.values.T, metabolites_df.columns,
        metabolites_df.index)

    # Augment the factors with bias terms and columns/rows of ones, then
    # multiply them to obtain the expected log-ratio matrix.
    ones_col = np.ones((self.U.shape[0], 1))
    ones_row = np.ones((1, self.V.shape[1]))
    U_ = np.hstack((ones_col, self.Ubias, self.U))
    V_ = np.vstack((self.Vbias, ones_row, self.V))
    log_ratios = U_ @ V_

    # Prepend a zero column before mapping through clr_inv.
    zero_col = np.zeros((num_microbe_cols, 1))
    self.exp_ranks = clr_inv(np.hstack((zero_col, log_ratios)))
def test_fit(self):
    # Pin the NumPy and TensorFlow seeds and start from a clean graph.
    np.random.seed(1)
    tf.reset_default_graph()
    tf.set_random_seed(0)

    n_components = 2
    fitted_ranks, biplot = paired_omics(
        self.microbes, self.metabolites,
        epochs=1000, latent_dim=n_components,
        min_feature_count=1, learning_rate=0.1)
    fitted_ranks = clr_inv(fitted_ranks.T)

    # The fitted ranks must correlate with the simulated ones.
    rho, pvalue = spearmanr(np.ravel(fitted_ranks),
                            np.ravel(self.exp_ranks))
    self.assertGreater(rho, 0.5)
    self.assertLess(pvalue, 1e-2)

    # make sure the biplot is of the correct dimensions
    exp_sample_shape = np.array([self.microbes.shape[0], n_components])
    npt.assert_allclose(biplot.samples.shape, exp_sample_shape)
    exp_feature_shape = np.array([self.metabolites.shape[0], n_components])
    npt.assert_allclose(biplot.features.shape, exp_feature_shape)

    # make sure that the biplot has the correct ordering
    explained = biplot.proportion_explained
    self.assertGreater(explained[0], explained[1])
    eigvals = biplot.eigvals
    self.assertGreater(eigvals[0], eigvals[1])
def test_regression_results_residuals_projection(self):
    # Expected residuals per sample, written as balances and then mapped
    # through ilr_inv.
    resid_balances = {
        's1': [-0.986842, -0.236842],
        's2': [-0.065789, -1.815789],
        's3': [1.473684, 0.473684],
        's4': [1.394737, -1.105263],
        's5': [-1.065789, 1.184211],
        's6': [-1.144737, -0.394737],
        's7': [0.394737, 1.894737],
    }
    projected = {sample: ilr_inv(np.array(coords))
                 for sample, coords in resid_balances.items()}
    exp_resid = pd.DataFrame(projected, index=['a', 'b', 'c']).T

    # note that in the example, the basis is not strictly
    # equivalent to the tree
    basis = pd.DataFrame(clr_inv(_gram_schmidt_basis(3)),
                         index=['Y1', 'Y2'],
                         columns=['a', 'b', 'c'])
    res = submock(submodels=[self.model1, self.model2],
                  basis=basis, tree=self.tree,
                  balances=self.balances)
    res.fit()

    observed = res.residuals(project=True)
    pdt.assert_frame_equal(observed, exp_resid,
                           check_exact=False,
                           check_less_precise=True)
def test_regression_results_residuals(self):
    # Expected residuals in balance coordinates, one entry per sample.
    resid_by_sample = {
        's1': [-0.986842, -0.236842],
        's2': [-0.065789, -1.815789],
        's3': [1.473684, 0.473684],
        's4': [1.394737, -1.105263],
        's5': [-1.065789, 1.184211],
        's6': [-1.144737, -0.394737],
        's7': [0.394737, 1.894737],
    }
    exp_resid = pd.DataFrame(resid_by_sample, index=['Y1', 'Y2']).T

    gs_basis = clr_inv(_gram_schmidt_basis(3))
    basis = pd.DataFrame(gs_basis,
                         index=['Y1', 'Y2'],
                         columns=['a', 'b', 'c'])
    res = submock(submodels=[self.model1, self.model2],
                  basis=basis, tree=self.tree,
                  balances=self.balances)
    res.fit()

    pdt.assert_frame_equal(res.residuals(), exp_resid,
                           check_exact=False,
                           check_less_precise=True)
def test_regression_results_predict_projection(self):
    gs_basis = clr_inv(_gram_schmidt_basis(3))
    basis = pd.DataFrame(gs_basis,
                         index=['Y1', 'Y2'],
                         columns=['a', 'b', 'c'])
    res = submock(submodels=[self.model1, self.model2],
                  basis=basis, tree=self.tree,
                  balances=self.balances)
    res.fit()
    res_predict = res.predict(self.data[['X']], project=True)

    # Expected predictions, written as balances and mapped through ilr_inv.
    predicted_balances = {
        's1': [1.986842, 1.236842],
        's2': [3.065789, 3.815789],
        's3': [2.526316, 2.526316],
        's4': [3.605263, 5.105263],
        's5': [3.065789, 3.815789],
        's6': [4.144737, 6.394737],
        's7': [3.605263, 5.105263],
    }
    projected = {sample: ilr_inv(np.array(coords))
                 for sample, coords in predicted_balances.items()}
    exp_predict = pd.DataFrame(projected, index=['a', 'b', 'c']).T

    pdt.assert_frame_equal(res_predict, exp_predict)
def test_regression_results_predict_none(self):
    gs_basis = clr_inv(_gram_schmidt_basis(3))
    basis = pd.DataFrame(gs_basis,
                         index=['Y1', 'Y2'],
                         columns=['a', 'b', 'c'])
    res = submock(submodels=[self.model1, self.model2],
                  basis=basis, tree=self.tree,
                  balances=self.balances)
    res.fit()

    # predict() is called with no arguments here.
    res_predict = res.predict()

    predicted_by_sample = {
        's1': [1.986842, 1.236842],
        's2': [3.065789, 3.815789],
        's3': [2.526316, 2.526316],
        's4': [3.605263, 5.105263],
        's5': [3.065789, 3.815789],
        's6': [4.144737, 6.394737],
        's7': [3.605263, 5.105263],
    }
    exp_predict = pd.DataFrame(predicted_by_sample, index=['Y1', 'Y2']).T

    pdt.assert_frame_equal(res_predict, exp_predict)
def test_biplot(self):
    # Reference matrix: the coefficients centred in the simplex and mapped
    # back to clr coordinates.
    centered = centralize(clr_inv(self.beta))
    exp = clr(centered)

    res = regression_biplot(self.beta)
    self.assertIsInstance(res, OrdinationResults)

    # The product of the two embeddings should approximate the reference
    # matrix within the loose tolerances below.
    sample_embed = res.samples.values
    feature_embed = res.features.values.T
    reconstruction = sample_embed @ feature_embed
    npt.assert_allclose(reconstruction, np.array(exp),
                        atol=0.5, rtol=0.5)
def balance_basis(tree_node):
    """
    Build the balance basis implied by a bifurcating tree.

    This construction is commonly called a sequential binary partition
    [1]_.  Given a binary tree relating a list of features, it yields an
    orthonormal basis that can be used to compute the ilr transform.

    Parameters
    ----------
    tree_node : skbio.TreeNode
        Input bifurcating tree.  It must be strictly bifurcating
        (i.e. every internal node needs to have exactly 2 children).

    Returns
    -------
    basis : np.array
        A set of orthonormal bases in the Aitchison simplex corresponding
        to the tree.  The basis is ordered by the level order of the
        internal nodes.
    nodes : list, skbio.TreeNode
        List of tree nodes indicating the ordering in the basis.

    Raises
    ------
    ValueError
        The tree doesn't contain two branches.

    Examples
    --------
    >>> from gneiss.balances import balance_basis
    >>> from skbio import TreeNode
    >>> tree = u"((b,c)a, d)root;"
    >>> t = TreeNode.read([tree])
    >>> basis, nodes = balance_basis(t)
    >>> basis
    array([[ 0.18507216,  0.18507216,  0.62985567],
           [ 0.14002925,  0.57597535,  0.28399541]])

    Notes
    -----
    The tree must be strictly bifurcating, meaning that
    every internal node has exactly 2 children.

    See Also
    --------
    skbio.stats.composition.ilr

    References
    ----------
    .. [1] J.J. Egozcue and V. Pawlowsky-Glahn "Exploring Compositional
       Data with the CoDa-Dendrogram" (2011)
    """
    # _balance_basis returns the basis before clr_inv is applied, together
    # with the node ordering; clr_inv produces the returned basis.
    raw_basis, nodes = _balance_basis(tree_node)
    return clr_inv(raw_basis), nodes
def balance_basis(tree_node):
    """
    Derive an orthonormal balance basis from a bifurcating tree.

    The procedure is commonly referred to as a sequential binary
    partition [1]_.  Given a binary tree relating a list of features, the
    resulting orthonormal basis is the one used to calculate the ilr
    transform.

    Parameters
    ----------
    tree_node : skbio.TreeNode
        Input bifurcating tree.  Must be strictly bifurcating
        (i.e. every internal node needs to have exactly 2 children).

    Returns
    -------
    basis : np.array
        A set of orthonormal bases in the Aitchison simplex corresponding
        to the tree, ordered by the level order of the internal nodes.
    nodes : list, skbio.TreeNode
        List of tree nodes indicating the ordering in the basis.

    Raises
    ------
    ValueError
        The tree doesn't contain two branches.

    Examples
    --------
    >>> from gneiss.balances import balance_basis
    >>> from skbio import TreeNode
    >>> tree = u"((b,c)a, d)root;"
    >>> t = TreeNode.read([tree])
    >>> basis, nodes = balance_basis(t)
    >>> basis
    array([[0.18507216, 0.18507216, 0.62985567],
           [0.14002925, 0.57597535, 0.28399541]])

    Notes
    -----
    The tree must be strictly bifurcating, meaning that
    every internal node has exactly 2 children.

    See Also
    --------
    skbio.stats.composition.ilr

    References
    ----------
    .. [1] J.J. Egozcue and V. Pawlowsky-Glahn "Exploring Compositional
       Data with the CoDa-Dendrogram" (2011)
    """
    # Compute the basis and node ordering first, then pass the basis
    # through clr_inv before returning it.
    unclosed_basis, nodes = _balance_basis(tree_node)
    simplex_basis = clr_inv(unclosed_basis)
    return simplex_basis, nodes
def multinomial(table: biom.Table,
                metadata: Metadata,
                formula: str,
                training_column: str = None,
                num_random_test_examples: int = 10,
                epoch: int = 10,
                batch_size: int = 5,
                beta_prior: float = 1,
                learning_rate: float = 0.1,
                clipnorm: float = 10,
                min_sample_count: int = 10,
                min_feature_count: int = 10,
                summary_interval: int = 60) -> (pd.DataFrame):
    """Fit a multinomial regression and return clr-centred coefficients.

    The table and metadata are matched and filtered, split into training
    and testing sets, and a ``MultRegression`` model is fitted inside a
    fresh TensorFlow graph and session.

    Parameters
    ----------
    table : biom.Table
        Input feature table.
    metadata : Metadata
        Sample metadata; converted with ``to_dataframe()``.
    formula : str
        Passed to ``match_and_filter`` to build the design matrix.
    training_column : str, optional
        Passed to ``match_and_filter`` and ``split_training``.
    num_random_test_examples : int
        Passed to ``match_and_filter`` and ``split_training``.
    epoch : int
        Passed to ``model.fit``.
    batch_size, learning_rate, clipnorm
        Passed to the ``MultRegression`` constructor.
    beta_prior : float
        Passed to ``MultRegression`` as ``beta_mean``.
    min_sample_count, min_feature_count : int
        Passed to ``match_and_filter``.
    summary_interval : int
        Passed to ``model.fit``.

    Returns
    -------
    pd.DataFrame
        Coefficients indexed by observation id, with one column per
        design-matrix column.
    """
    # load metadata and tables
    metadata = metadata.to_dataframe()

    # match them
    matched = match_and_filter(table, metadata, formula,
                               training_column, num_random_test_examples,
                               min_sample_count, min_feature_count)
    table, metadata, design = matched

    # convert to dense representation
    dense_table = table.to_dataframe().to_dense().T

    # split up training and testing
    trainX, testX, trainY, testY = split_training(
        dense_table, metadata, design,
        training_column, num_random_test_examples)

    model = MultRegression(learning_rate=learning_rate,
                           clipnorm=clipnorm,
                           beta_mean=beta_prior,
                           batch_size=batch_size,
                           save_path=None)
    with tf.Graph().as_default(), tf.Session() as session:
        model(session, trainX, trainY, testX, testY)
        model.fit(epoch=epoch,
                  summary_interval=summary_interval,
                  checkpoint_interval=None)

    covariate_ids = np.array(design.columns)
    feature_ids = table.ids(axis='observation')

    # Prepend a zero column to the fitted coefficients, then round-trip
    # through clr_inv/clr.
    padded = np.hstack((np.zeros((model.p, 1)), model.B))
    beta_ = clr(clr_inv(padded))

    return pd.DataFrame(beta_.T, columns=covariate_ids, index=feature_ids)
def test_fit(self):
    tf.set_random_seed(0)

    # Name the metadata index before wrapping it in qiime2.Metadata.
    sample_md = self.md
    sample_md.name = 'sampleid'
    sample_md = qiime2.Metadata(sample_md)

    # Expected coefficients: zero-padded, then round-tripped through
    # clr_inv/clr.
    padded_beta = np.hstack((np.zeros((2, 1)), self.beta.T))
    exp_beta = clr(clr_inv(padded_beta))

    res_beta = multinomial(table=self.table, metadata=sample_md,
                           formula="X", epoch=50000)

    npt.assert_allclose(exp_beta, res_beta.T, atol=0.5, rtol=0.5)
def test_fit_float_summary_interval(self):
    tf.set_random_seed(0)
    multregression = songbird_plugin.actions['multinomial']

    sample_md = self.md
    sample_md.name = 'sampleid'
    sample_md = qiime2.Metadata(sample_md)

    # See issue #31
    padded_beta = np.hstack((np.zeros((2, 1)), self.beta.T))
    exp_beta = clr(clr_inv(padded_beta))

    q2_table = qiime2.Artifact.import_data('FeatureTable[Frequency]',
                                           self.table)

    # summary_interval is deliberately a float here.
    q2_results = multregression(
        table=q2_table, metadata=sample_md,
        min_sample_count=0, min_feature_count=0,
        formula="X", epochs=1000,
        summary_interval=0.5,
    )
    q2_res_beta, q2_res_stats, q2_res_biplot = q2_results

    # try-except is for helpful error message if q2-coercion fails
    try:
        res_biplot = q2_res_biplot.view(OrdinationResults)
    except Exception:
        raise AssertionError('res_biplot unable to be coerced to '
                             'OrdinationResults')
    try:
        res_beta = q2_res_beta.view(pd.DataFrame)
    except Exception:
        raise AssertionError('res_beta unable to be coerced to '
                             'pd.DataFrame')
    try:
        res_stats = q2_res_stats.view(qiime2.Metadata)
    except Exception:
        raise AssertionError('res_stats unable to be coerced to '
                             'qiime2.Metadata')

    # The product of the biplot embeddings should approximate the
    # returned coefficients.
    sample_embed = res_biplot.samples.values
    feature_embed = res_biplot.features.values.T
    npt.assert_allclose(sample_embed @ feature_embed, res_beta.values,
                        atol=0.5, rtol=0.5)

    npt.assert_allclose(exp_beta, res_beta.T, atol=0.6, rtol=0.6)

    num_stat_rows = len(res_stats.to_dataframe().index)
    self.assertGreater(num_stat_rows, 1)
def setUp(self):
    # Shared fixtures: a small data set with two responses (Y1, Y2) and one
    # covariate (X), one OLS model per response, and a three-tip tree.
    self.pickle_fname = "test.pickle"

    sample_ids = ['s1', 's2', 's3', 's4', 's5', 's6', 's7']
    rows = [[1, 1, 1],
            [3, 2, 3],
            [4, 3, 2],
            [5, 4, 4],
            [2, 5, 3],
            [3, 6, 5],
            [4, 7, 4]]
    self.data = pd.DataFrame(rows, index=sample_ids,
                             columns=['Y1', 'Y2', 'X'])

    self.model1 = smf.ols(formula="Y1 ~ X", data=self.data)
    self.model2 = smf.ols(formula="Y2 ~ X", data=self.data)

    self.tree = TreeNode.read(['((a,b)Y1, c)Y2;'])

    # Only the first element returned by balance_basis is used here.
    tree_basis = balance_basis(self.tree)[0]
    self.basis = pd.DataFrame(clr_inv(tree_basis),
                              columns=['a', 'b', 'c'],
                              index=['Y1', 'Y2'])

    self.balances = pd.DataFrame(self.data[['Y1', 'Y2']],
                                 index=self.data.index,
                                 columns=['Y1', 'Y2'])
def regression_biplot(coefficients: pd.DataFrame) -> skbio.OrdinationResults:
    """Build a biplot from regression coefficients via an SVD.

    The coefficients are mapped through ``clr_inv``, centred with
    ``centralize`` and mapped back with ``clr``; the resulting matrix is
    then decomposed with ``np.linalg.svd``.

    Parameters
    ----------
    coefficients : pd.DataFrame
        Regression coefficients.  Its index labels the ``samples`` table of
        the result and its columns label the ``features`` table.

    Returns
    -------
    skbio.OrdinationResults
        ``eigvals`` holds the singular values, ``samples`` the left
        singular vectors scaled by the singular values, ``features`` the
        right singular vectors, and ``proportion_explained`` the share of
        the total squared singular values carried by each component.
    """
    coefs = clr(centralize(clr_inv(coefficients)))
    u, s, v = np.linalg.svd(coefs)
    num_components = len(s)
    pc_ids = ['PC%d' % i for i in range(num_components)]

    # svd returns full square factors by default; keep only the leading
    # columns that pair with a singular value.
    samples = pd.DataFrame(u[:, :num_components] @ np.diag(s),
                           columns=pc_ids, index=coefficients.index)
    features = pd.DataFrame(v.T[:, :num_components],
                            columns=pc_ids, index=coefficients.columns)

    short_method_name = 'regression_biplot'
    long_method_name = 'Multinomial regression biplot'
    eigvals = pd.Series(s, index=pc_ids)

    # The variance captured by a component is proportional to the square
    # of its singular value, so normalise the squares rather than the raw
    # singular values.
    squared = eigvals ** 2
    proportion_explained = squared / squared.sum()

    return OrdinationResults(short_method_name, long_method_name, eigvals,
                             samples=samples, features=features,
                             proportion_explained=proportion_explained)
def test_regression_results_coefficient_projection(self):
    # Expected coefficients, written as balances and mapped through
    # ilr_inv.
    intercept_balances = np.array([[1.447368, -0.052632]])
    slope_balances = np.array([[0.539474, 1.289474]])
    exp_coef = pd.DataFrame(
        {'Intercept': ilr_inv(intercept_balances),
         'X': ilr_inv(slope_balances)},
        index=['a', 'b', 'c'])

    # note that in the example, the basis is not strictly
    # equivalent to the tree
    gs_basis = clr_inv(_gram_schmidt_basis(3))
    basis = pd.DataFrame(gs_basis,
                         index=['Y1', 'Y2'],
                         columns=['a', 'b', 'c'])
    res = submock(submodels=[self.model1, self.model2],
                  basis=basis, tree=self.tree,
                  balances=self.balances)
    res.fit()

    observed = res.coefficients(project=True)
    pdt.assert_frame_equal(observed, exp_coef,
                           check_exact=False,
                           check_less_precise=True)
def test_regression_results_predict_extrapolate(self):
    gs_basis = clr_inv(_gram_schmidt_basis(3))
    basis = pd.DataFrame(gs_basis,
                         index=['Y1', 'Y2'],
                         columns=['a', 'b', 'c'])
    res = submock(submodels=[self.model1, self.model2],
                  basis=basis, tree=self.tree,
                  balances=self.balances)
    res.fit()

    # Covariate values larger than any X value in the fixture data.
    new_index = ['k1', 'k2', 'k3']
    extrapolate = pd.DataFrame({'X': [8, 9, 10]}, index=new_index)
    res_predict = res.predict(extrapolate)

    expected_by_sample = {
        'k1': [5.76315789, 10.26315789],
        'k2': [6.30263158, 11.55263158],
        'k3': [6.84210526, 12.84210526],
    }
    exp_predict = pd.DataFrame(expected_by_sample, index=['Y1', 'Y2']).T

    pdt.assert_frame_equal(res_predict, exp_predict)
def phylogenetic_basis(tree_node):
    """
    Build the orthonormal basis implied by a phylogenetic tree.

    Parameters
    ----------
    tree_node : skbio.TreeNode
        Phylogenetic tree.  Must be a strictly bifurcating tree.

    Returns
    -------
    basis : np.array
        A set of orthonormal bases in the Aitchison simplex corresponding
        to the phylogenetic tree, ordered by the level order of the
        internal nodes.
    nodes : list, skbio.TreeNode
        List of tree nodes indicating the ordering in the basis.

    Raises
    ------
    ValueError
        The tree doesn't contain two branches.

    Examples
    --------
    >>> from canvas.phylogeny import phylogenetic_basis
    >>> from skbio import TreeNode
    >>> tree = u"((b,c)a, d)root;"
    >>> t = TreeNode.read([tree])
    >>> basis, nodes = phylogenetic_basis(t)
    >>> basis
    array([[ 0.62985567,  0.18507216,  0.18507216],
           [ 0.28399541,  0.57597535,  0.14002925]])

    Notes
    -----
    The tree must be strictly bifurcating, meaning that
    every internal node has exactly 2 children.
    """
    # Compute the basis and node ordering, then apply clr_inv to the basis.
    raw_basis, nodes = _balance_basis(tree_node)
    return clr_inv(raw_basis), nodes
def cross_validation(md, beta, gamma, data, k=50):
    """ Computes two cross validation metrics

    1) An error between observed and predicted counts
    2) A rank correlation over the most abundant entries of each sample

    Predictions are obtained by scaling ``clr_inv(md @ beta + gamma)`` by
    each sample's total count.

    Parameters
    ----------
    md : np.array
        Design matrix
    beta : np.array
        Regression coefficients
    gamma : np.array
        Regression intercepts
    data : np.array
        Dense matrix of counts. Samples are rows and features are columns.
    k : int
        Top k ranks to compare

    Returns
    -------
    mse : float
        Error across the matrix.  As implemented this is the cityblock
        distance between the observed and predicted rows, averaged over
        samples and divided by the number of features.
    mrc : float
        Mean rank correlation: the Spearman correlation between observed
        and predicted values, computed per sample on the ``k`` largest
        observed entries and then averaged across samples.
    """
    num_samples, num_features = data.shape[0], data.shape[1]

    # Per-sample totals, as a column so they broadcast across features.
    depths = data.sum(axis=1).reshape(-1, 1)
    pred = np.multiply(depths, clr_inv(md @ beta + gamma))

    row_errors = [cityblock(data[i], pred[i]) for i in range(num_samples)]
    mse = np.mean(row_errors) / num_features

    rank_corrs = []
    for i in range(num_samples):
        # Positions of the k largest observed values in this sample.
        top_k = np.argsort(data[i, :])[-k:]
        result = spearmanr(data[i, top_k], pred[i, top_k])
        rank_corrs.append(result.correlation)
    mrc = np.mean(rank_corrs)

    return mse, mrc
def test_fit(self):
    tf.set_random_seed(0)

    sample_md = self.md
    sample_md.name = 'sampleid'
    sample_md = qiime2.Metadata(sample_md)

    # Expected coefficients: zero-padded, then round-tripped through
    # clr_inv/clr.
    padded_beta = np.hstack((np.zeros((2, 1)), self.beta.T))
    exp_beta = clr(clr_inv(padded_beta))

    fit_results = multinomial(table=self.table, metadata=sample_md,
                              min_sample_count=0, min_feature_count=0,
                              formula="X", epochs=1000)
    res_beta, res_stats, res_biplot = fit_results

    # test biplot
    self.assertIsInstance(res_biplot, OrdinationResults)
    sample_embed = res_biplot.samples.values
    feature_embed = res_biplot.features.values.T
    npt.assert_allclose(sample_embed @ feature_embed, res_beta.values,
                        atol=0.5, rtol=0.5)

    npt.assert_allclose(exp_beta, res_beta.T, atol=0.6, rtol=0.6)

    num_stat_rows = len(res_stats.to_dataframe().index)
    self.assertGreater(num_stat_rows, 1)
def multinomial(
        table: biom.Table,
        metadata: Metadata,
        formula: str,
        training_column: str = None,
        num_random_test_examples: int = 5,
        epochs: int = 1000,
        batch_size: int = 5,
        differential_prior: float = 1,
        learning_rate: float = 1e-3,
        clipnorm: float = 10,
        min_sample_count: int = 1000,
        min_feature_count: int = 10,
        summary_interval: int = 60
) -> (pd.DataFrame, qiime2.Metadata, skbio.OrdinationResults):
    """Fit a multinomial regression and summarise the result.

    The table and metadata are matched and filtered, split into training
    and testing sets, and a ``MultRegression`` model is fitted inside a
    fresh TensorFlow graph and session.

    Parameters
    ----------
    table : biom.Table
        Input feature table.
    metadata : Metadata
        Sample metadata; converted with ``to_dataframe()``.
    formula : str
        Passed to ``match_and_filter`` to build the design matrix.
    training_column : str, optional
        Passed to ``split_training``.
    num_random_test_examples : int
        Passed to ``split_training``.
    epochs : int
        Passed to ``model.fit``.
    batch_size, learning_rate, clipnorm
        Passed to the ``MultRegression`` constructor.
    differential_prior : float
        Passed to ``MultRegression`` as ``beta_mean``.
    min_sample_count, min_feature_count : int
        Passed to ``match_and_filter``.
    summary_interval : int
        Passed to ``model.fit``.

    Returns
    -------
    differentials : pd.DataFrame
        Coefficients indexed by observation id, with one column per
        design-matrix column.
    convergence_stats : qiime2.Metadata
        The loss, cross-validation and iteration series returned by
        ``model.fit``.
    biplot : skbio.OrdinationResults
        SVD-based biplot of the differentials, or an empty result when
        there is only a single coefficient column.
    """
    # load metadata and tables
    metadata = metadata.to_dataframe()

    # match them
    table, metadata, design = match_and_filter(
        table, metadata, formula,
        min_sample_count, min_feature_count)

    # convert to dense representation
    dense_table = table.to_dataframe().to_dense().T

    # split up training and testing
    trainX, testX, trainY, testY = split_training(
        dense_table, metadata, design,
        training_column, num_random_test_examples)

    model = MultRegression(learning_rate=learning_rate,
                           clipnorm=clipnorm,
                           beta_mean=differential_prior,
                           batch_size=batch_size,
                           save_path=None)
    with tf.Graph().as_default(), tf.Session() as session:
        model(session, trainX, trainY, testX, testY)
        loss, cv, its = model.fit(epochs=epochs,
                                  summary_interval=summary_interval,
                                  checkpoint_interval=None)

    md_ids = np.array(design.columns)
    obs_ids = table.ids(axis='observation')

    # Prepend a zero column to the fitted coefficients, then round-trip
    # through clr_inv/clr.
    beta_ = clr(clr_inv(np.hstack((np.zeros((model.p, 1)), model.B))))

    differentials = pd.DataFrame(
        beta_.T, columns=md_ids, index=obs_ids,
    )

    # NOTE: the 'loglikehood' spelling is a runtime column name and is
    # kept as-is.
    convergence_stats = pd.DataFrame({
        'loglikehood': loss,
        'cross-validation': cv,
        'iteration': its
    })
    convergence_stats.index.name = 'id'

    # The builtin types are used instead of the np.str / np.float / np.int
    # aliases, which were removed from NumPy; the aliases pointed at these
    # same builtins, so the resulting dtypes are unchanged.
    convergence_stats.index = convergence_stats.index.astype(str)
    for column, dtype in (('loglikehood', float),
                          ('cross-validation', float),
                          ('iteration', int)):
        convergence_stats[column] = convergence_stats[column].astype(dtype)

    # regression biplot
    if differentials.shape[-1] > 1:
        u, s, v = np.linalg.svd(differentials)
        pc_ids = ['PC%d' % i for i in range(len(s))]
        # svd returns full square factors by default; keep only the
        # leading columns that pair with a singular value.
        samples = pd.DataFrame(u[:, :len(s)] @ np.diag(s),
                               columns=pc_ids, index=differentials.index)
        features = pd.DataFrame(v.T[:, :len(s)],
                                columns=pc_ids, index=differentials.columns)
        short_method_name = 'regression_biplot'
        long_method_name = 'Multinomial regression biplot'
        eigvals = pd.Series(s, index=pc_ids)
        proportion_explained = eigvals**2 / (eigvals**2).sum()
        biplot = OrdinationResults(
            short_method_name, long_method_name, eigvals,
            samples=samples, features=features,
            proportion_explained=proportion_explained)
    else:
        # this is to handle the edge case with only intercepts
        biplot = OrdinationResults('', '', pd.Series(), pd.DataFrame())

    return differentials, qiime2.Metadata(convergence_stats), biplot
basetmp_sub = base_truth.loc[( rank_, power_, depth_, ), :].copy().T # sub sampled subtmp_sub = subtmp.copy() #meta on cluster meta = np.array([1] * int(subtmp.shape[0] / 2) + [2] * int(subtmp.shape[0] / 2)).T meta = pd.DataFrame(meta, index=subtmp.index, columns=['group']) # test KL with rcl X_sparse = rclr().fit_transform(subtmp_sub.copy()) U, s, V = OptSpace(iteration=1000).fit_transform(X_sparse) clr_res = clr_inv(np.dot(np.dot(U, s), V.T)) # use just kl_div here because already closed kl_clr = entropy(closure(basetmp_sub).T, clr_res.T).mean() results[(rank_, power_, depth_, 'rclr', 'KL-Div')] = [kl_clr] # test KL without rclr X_spn = np.array(subtmp_sub.copy()).astype(float) X_spn[X_spn == 0] = np.nan U_, s_, V_ = OptSpace(iteration=1000).fit_transform(X_spn) res_raw = np.dot(np.dot(U_, s_), V_.T) res_raw[res_raw <= 0] = 1 kl_raw = entropy(closure(basetmp_sub).T, closure(res_raw).T).mean() results[(rank_, power_, depth_, 'Raw Counts', 'KL-Div')] = [kl_raw] # f-stat resfclr = permanova(DistanceMatrix(distance.cdist(U, U)),
def band_table(num_samples, num_features, tree=None,
               low=2, high=10, sigma=2, alpha=6, seed=0):
    """ Generates a simulated table of counts.

    Each organism is modeled as a Gaussian distribution.  Then counts
    are simulated using a Poisson distribution.

    Parameters
    ----------
    num_samples : int
        Number of samples to simulate
    num_features : int
        Number of features to simulate
    tree : skbio.TreeNode
        Tree used as a scaffold for the ilr transform.
        If None, then the gram_schmidt_basis will be used.
    low : float
        Smallest gradient value.
    high : float
        Largest gradient value.
    sigma : float
        Variance of each species distribution
    alpha : int
        Global count bias.  This bias is added to every cell in the matrix.
    seed : int or np.random.RandomState
        Random seed.  An existing ``RandomState`` is used directly; any
        other value is used to construct a new one.

    Returns
    -------
    biom.Table
        Biom representation of the count table.
    pd.DataFrame
        DataFrame containing relevant metadata.
    beta : np.array
        Regression parameter estimates.
    theta : np.array
        Bias per sample.
    gamma : np.array
        Regression intercepts (the first row of the fitted coefficients).
    """
    # Honour the documented contract: accept a ready-made RandomState
    # instead of trying to seed a new generator with it.
    if isinstance(seed, np.random.RandomState):
        state = seed
    else:
        state = np.random.RandomState(seed)

    # measured gradient values for each sample
    gradient = np.linspace(low, high, num_samples)
    # optima for features (i.e. optimal ph for species)
    mu = np.linspace(low, high, num_features)
    sigma = np.array([sigma] * num_features)
    # construct species distributions
    table = chain_interactions(gradient, mu, sigma)
    samp_ids = ['S%d' % i for i in range(num_samples)]

    # obtain basis required to convert from balances to proportions.
    if tree is None:
        basis = _gram_schmidt_basis(num_features)
        feat_ids = ['F%d' % i for i in range(num_features)]
        table = pd.DataFrame(table, index=samp_ids, columns=feat_ids)
    else:
        feat_ids = [n.name for n in tree.tips()]
        table = pd.DataFrame(table, index=samp_ids, columns=feat_ids)
        basis = sparse_balance_basis(tree)[0].todense()

    # construct balances from gaussian distribution.
    # this will be necessary when refitting parameters later.
    Y = ilr(table, basis=clr_inv(basis))
    X = gradient.reshape(-1, 1)
    X = np.hstack((np.ones(len(X)).reshape(-1, 1), X.reshape(-1, 1)))
    pY, resid, B = ols(Y, X)
    gamma = B[0]
    beta = B[1].reshape(1, -1)
    # parameter estimates
    r = beta.shape[1]
    # Normal distribution to simulate linear regression
    M = np.eye(r)
    # Generate covariance matrix from inverse wishart
    Sigma = invwishart.rvs(df=r + 2, scale=M.dot(M.T), random_state=state)
    w, v = eigsh(Sigma, k=2)
    # Low rank covariance matrix
    # NOTE(review): sim_L is never read afterwards; the eigsh call is kept
    # so that inputs it rejects still fail the same way. Confirm whether
    # this can be dropped.
    sim_L = (v @ np.diag(w)).T

    # sample
    y = X.dot(B)
    num_rows = y.shape[0]
    Ys = np.vstack(
        [state.multivariate_normal(y[i, :], Sigma) for i in range(num_rows)])
    Yp = Ys @ basis
    # calculate bias terms
    theta = -np.log(np.exp(Yp).sum(axis=1)) + alpha

    # poisson sample the entries
    # np.vstack needs a real sequence; generators are deprecated and are
    # rejected by newer NumPy releases, so build a list.
    table = np.vstack(
        [state.poisson(np.exp(Yp[i, :] + theta[i]))
         for i in range(num_rows)]).T

    table = Table(table, feat_ids, samp_ids)
    metadata = pd.DataFrame({'G': gradient}, index=samp_ids)
    return table, metadata, beta, theta, gamma
def mmvec(microbes: biom.Table,
          metabolites: biom.Table,
          metadata: Metadata = None,
          training_column: str = None,
          num_testing_examples: int = 5,
          min_feature_count: int = 10,
          epochs: int = 100,
          batch_size: int = 50,
          latent_dim: int = 3,
          input_prior: float = 1,
          output_prior: float = 1,
          learning_rate: float = 0.001,
          summary_interval: int = 60) -> (pd.DataFrame, OrdinationResults):
    """Fit an MMvec model and return the rank matrix and its biplot.

    Parameters
    ----------
    microbes, metabolites : biom.Table
        Paired input tables; split into training and testing sets with
        ``split_tables``.
    metadata : Metadata, optional
        Converted with ``to_dataframe()`` when given and passed to
        ``split_tables``.
    training_column : str, optional
        Passed to ``split_tables``.
    num_testing_examples : int
        Passed to ``split_tables`` as ``num_test``.
    min_feature_count : int
        Passed to ``split_tables`` as ``min_samples``.
    epochs : int
        Passed to ``model.fit`` as ``epoch``.
    batch_size : int
        Not referenced in this function body.
    latent_dim : int
        Latent dimension of the model and number of SVD components kept.
    input_prior, output_prior : float
        Passed to ``MMvec`` as ``u_scale`` and ``v_scale``.
    learning_rate : float
        Passed to ``MMvec``.
    summary_interval : int
        Passed to ``model.fit``.

    Returns
    -------
    ranks : pd.DataFrame
        Column-centred log-ratio matrix, indexed by the training microbe
        columns with the training metabolite columns as columns.
    biplot : OrdinationResults
        Truncated-SVD biplot of ``ranks``.
    """
    if metadata is not None:
        metadata = metadata.to_dataframe()

    # Note: there are a couple of biom -> pandas conversions taking
    # place here.  This is currently done on purpose, since we
    # haven't figured out how to handle sparse matrix multiplication
    # in the context of this algorithm.  That is a future consideration.
    split = split_tables(
        microbes, metabolites,
        metadata=metadata, training_column=training_column,
        num_test=num_testing_examples,
        min_samples=min_feature_count)
    (train_microbes_df, test_microbes_df,
     train_metabolites_df, test_metabolites_df) = split

    train_microbes_coo = coo_matrix(train_microbes_df.values)
    test_microbes_coo = coo_matrix(test_microbes_df.values)

    with tf.Graph().as_default(), tf.Session() as session:
        model = MMvec(latent_dim=latent_dim,
                      u_scale=input_prior, v_scale=output_prior,
                      learning_rate=learning_rate)
        model(session,
              train_microbes_coo, train_metabolites_df.values,
              test_microbes_coo, test_metabolites_df.values)

        loss, cv = model.fit(epoch=epochs, summary_interval=summary_interval)

        U, V = model.U, model.V
        num_rows = model.U.shape[0]

        # Augment the factors with bias terms and columns/rows of ones.
        U_ = np.hstack(
            (np.ones((num_rows, 1)), model.Ubias.reshape(-1, 1), U))
        V_ = np.vstack(
            (model.Vbias.reshape(1, -1), np.ones((1, model.V.shape[1])), V))

        # Prepend a zero column, then round-trip through clr_inv/clr.
        padded = np.hstack((np.zeros((num_rows, 1)), U_ @ V_))
        ranks = pd.DataFrame(clr(clr_inv(padded)),
                             index=train_microbes_df.columns,
                             columns=train_metabolites_df.columns)
        ranks = ranks - ranks.mean(axis=0)

        # Truncated SVD of the centred ranks gives both embeddings.
        u, s, v = svds(ranks, k=latent_dim)
        microbe_embed = u @ np.diag(s)
        metabolite_embed = v.T

        pc_ids = ['PC%d' % i for i in range(microbe_embed.shape[1])]
        # The microbe embedding goes to `features` and the metabolite
        # embedding to `samples`.
        features = pd.DataFrame(microbe_embed, columns=pc_ids,
                                index=train_microbes_df.columns)
        samples = pd.DataFrame(metabolite_embed, columns=pc_ids,
                               index=train_metabolites_df.columns)

        short_method_name = 'mmvec biplot'
        long_method_name = 'Multiomics mmvec biplot'
        eigvals = pd.Series(s, index=pc_ids)
        proportion_explained = pd.Series(s**2 / np.sum(s**2), index=pc_ids)
        biplot = OrdinationResults(
            short_method_name, long_method_name, eigvals,
            samples=samples, features=features,
            proportion_explained=proportion_explained)

        return ranks, biplot