def test__balance_basis_base_case(self):
    """A two-tip tree yields a single balance contrasting the two tips."""
    newick = u"(a,b);"
    root = TreeNode.read([newick])

    # The only balance weights the two tips equally with opposite signs.
    weight = np.sqrt(1. / 2)
    expected_basis = np.array([[-weight, weight]])
    expected_keys = [root.name]

    observed_basis, observed_keys = _balance_basis(root)

    npt.assert_allclose(expected_basis, observed_basis)
    self.assertListEqual(expected_keys, observed_keys)
def test__balance_basis_unbalanced(self):
    """A three-tip caterpillar tree yields two balances, root first."""
    newick = u"((a,b)c, d);"
    root = TreeNode.read([newick])

    outer = np.sqrt(1. / 6)
    inner = np.sqrt(1. / 2)
    expected_basis = np.array([
        # balance keyed by the root
        [-outer, -outer, np.sqrt(2. / 3)],
        # balance keyed by the root's first child
        [-inner, inner, 0],
    ])
    expected_keys = [root.name, root[0].name]

    observed_basis, observed_keys = _balance_basis(root)

    npt.assert_allclose(expected_basis, observed_basis)
    self.assertListEqual(expected_keys, observed_keys)
def test__balance_basis_unbalanced(self):
    """Check the basis rows and keys produced for the tree ((a,b)c, d)."""
    root = TreeNode.read([u"((a,b)c, d);"])

    # Expected row for the balance keyed by the root.
    root_row = [-np.sqrt(1. / 6), -np.sqrt(1. / 6), np.sqrt(2. / 3)]
    # Expected row for the balance keyed by the root's first child.
    child_row = [-np.sqrt(1. / 2), np.sqrt(1. / 2), 0]
    want_basis = np.array([root_row, child_row])
    want_keys = [root.name, root[0].name]

    got_basis, got_keys = _balance_basis(root)

    npt.assert_allclose(want_basis, got_basis)
    self.assertListEqual(want_keys, got_keys)
def ilr_phylogenetic_differential(
        differential: pd.DataFrame,
        tree: skbio.TreeNode) -> (pd.DataFrame, skbio.TreeNode):
    """Project differentials onto the balances of a phylogenetic tree.

    Parameters
    ----------
    differential : pd.DataFrame
        Differentials; transposed before being matched against the tree
        tips, so features are presumably on the rows -- confirm against
        callers.
    tree : skbio.TreeNode
        Tree relating the features.  It is copied and the copy is
        bifurcated; the input object itself is not bifurcated here.

    Returns
    -------
    pd.DataFrame
        Balances with one row per internal node of the returned tree
        (index name ``'featureid'``).
    skbio.TreeNode
        The tip-matched tree with renamed internal nodes that was used to
        build the basis.
    """
    t = tree.copy()
    t.bifurcate()
    diff, _tree = match_tips(differential.T, t)
    _tree = rename_internal_nodes(_tree)

    # Column labels of the basis, taken in level order from the same tree
    # that is handed to `_balance_basis`.
    in_nodes = [n.name for n in _tree.levelorder() if not n.is_tip()]
    basis = _balance_basis(_tree)[0]
    basis = pd.DataFrame(basis.T, index=diff.columns, columns=in_nodes)

    diff_balances = (diff @ basis).T
    diff_balances.index.name = 'featureid'

    # Return the tree whose internal node names label `diff_balances`.
    # Previously the pre-matching copy `t` was returned, which need not
    # carry those names; this also agrees with
    # `ilr_phylogenetic_ordination`, which returns `_tree`.
    return diff_balances, _tree
def multinomial_bioms(k, D, N, M, min_sv=0.11, max_sv=5.0, sigma_sq=0.1):
    """ Simulates biom tables from multinomial.

    Parameters
    ----------
    k : int
       Number of latent dimensions.
    D : int
       Number of microbes.
    N : int
       Number of samples.
    M : int
       Average sequencing depth (mean of the Poisson depth draws).
    min_sv : float
       Smallest value of the evenly spaced spectrum `eigs`.
    max_sv : float
       Largest value of the evenly spaced spectrum `eigs`.
    sigma_sq : float
       Noise variance.  It is subtracted from `eigs` before a square
       root is taken, so values above `min_sv` produce NaNs in `W`.

    Returns
    -------
    dict of np.array
       Ground truth parameters, keyed by 'sigma', 'W', 'Psi', 'tree',
       'eta', 'z', 'Y', 'depths', 'eigs' and 'eigvectors'.
    """
    # Loadings: k columns of a random (D - 1) x (D - 1) orthogonal matrix,
    # scaled by the noise-corrected spectrum.
    eigs = min_sv + (max_sv - min_sv) * np.linspace(0, 1, k)
    eigvectors = ortho_group.rvs(D - 1)[:, :k]
    W = eigvectors @ np.diag(np.sqrt(eigs - sigma_sq))
    sigma = np.sqrt(sigma_sq)

    # Latent factors and their noisy projection.
    z = np.random.normal(size=(N, k))
    eta = np.random.normal(z @ W.T, sigma).astype(np.float32)

    # Map the latent coordinates to proportions through a random tree basis.
    tree = random_linkage(D)
    Psi = _balance_basis(tree)[0]
    prob = closure(np.exp(eta @ Psi))

    # One multinomial draw per sample at a Poisson-distributed depth.
    depths = np.random.poisson(M, size=N)
    Y = np.vstack([
        np.random.multinomial(depth, p) for depth, p in zip(depths, prob)
    ])

    return {
        'sigma': sigma,
        'W': W,
        'Psi': Psi,
        'tree': tree,
        'eta': eta,
        'z': z,
        'Y': Y,
        'depths': depths,
        'eigs': eigs,
        'eigvectors': eigvectors,
    }
def ilr_phylogenetic_ordination(
        table: pd.DataFrame,
        tree: skbio.TreeNode,
        pseudocount: float = 0.5,
        top_k_var: int = 10,
        clades: list = None
) -> (OrdinationResults, skbio.TreeNode, pd.DataFrame):
    """Compute phylogenetic ILR balances and wrap them as an ordination.

    Parameters
    ----------
    table : pd.DataFrame
        Abundance table; its columns are matched against the tree tips.
    tree : skbio.TreeNode
        Tree relating the features.  It is copied and the copy is
        bifurcated before use.
    pseudocount : float
        Value forwarded to `add_pseudocount` (no `clades` given) or to
        `_fast_ilr` (`clades` given) before the log transform.
    top_k_var : int
        When `clades` is not given, number of highest-variance balances
        to keep.
    clades : list, optional
        When given, only its first element is read; it is split on commas
        and the pieces are passed to `_fast_ilr` (presumably internal node
        names -- confirm against callers).

    Returns
    -------
    OrdinationResults
        Selected balances as `samples`, an identity matrix indexed by the
        selected clades as `features`, balance variances as `eigvals`.
    skbio.TreeNode
        The tip-matched tree with renamed internal nodes.
    pd.DataFrame
        Basis restricted to the selected clades (index name
        ``'featureid'``).
    """
    t = tree.copy()
    t.bifurcate()
    _table, _tree = match_tips(table, t)
    _tree = rename_internal_nodes(_tree)

    if not clades:
        # No clades requested: compute every balance, keep the top-k by
        # variance.
        in_nodes = [n.name for n in _tree.levelorder() if not n.is_tip()]
        basis = _balance_basis(_tree)[0]
        _table = add_pseudocount(_table, pseudocount)
        basis = pd.DataFrame(basis.T, index=_table.columns, columns=in_nodes)
        balances = np.log(_table) @ basis
        var = balances.var(axis=0).sort_values(ascending=False)
        clades = var.index[:top_k_var]
        balances = balances[clades]
        basis = basis[clades]
    else:
        clades = clades[0].split(',')
        # Forward the caller's pseudocount; it used to be hard-coded to 0.5
        # here, so a non-default value was silently ignored in this branch.
        balances, basis = _fast_ilr(_tree, _table, clades,
                                    pseudocount=pseudocount)
        var = balances.var(axis=0).sort_values(ascending=False)

    balances.index.name = 'sampleid'

    # feature metadata
    eigvals = var
    prop = var[clades] / var.sum()
    ordination = OrdinationResults(
        short_method_name='ILR',
        long_method_name='Phylogenetic Isometric Log Ratio Transform',
        samples=balances,
        features=pd.DataFrame(np.eye(len(clades)), index=clades),
        eigvals=eigvals,
        proportion_explained=prop)
    basis.index.name = 'featureid'
    return ordination, _tree, basis
def multinomial_batch_bioms(k, D, N, M, C=2,
                            min_sv=0.11, max_sv=5.0, sigma_sq=0.1):
    """ Simulates biom tables from multinomial with batch effects

    Parameters
    ----------
    k : int
       Number of latent dimensions.
    D : int
       Number of microbes.
    N : int
       Number of samples.
    M : int
       Average sequencing depth (mean of the Poisson depth draws).
    C : int
       Number of batches.
    min_sv : float
       Smallest value of the evenly spaced spectrum `eigs`.
    max_sv : float
       Largest value of the evenly spaced spectrum `eigs`.
    sigma_sq : float
       Noise variance.  It is subtracted from `eigs` before a square
       root is taken, so values above `min_sv` produce NaNs in `W`.

    Returns
    -------
    dict of np.array
       Ground truth parameters, keyed by 'sigma', 'W', 'Psi', 'tree',
       'eta', 'z', 'Y', 'alpha', 'alphaILR', 'B', 'batch_idx', 'depths',
       'eigs' and 'eigvectors'.
    """
    # Loadings: k columns of a random (D - 1) x (D - 1) orthogonal matrix,
    # scaled by the noise-corrected spectrum.
    eigs = min_sv + (max_sv - min_sv) * np.linspace(0, 1, k)
    eigvectors = ortho_group.rvs(D - 1)[:, :k]
    W = eigvectors @ np.diag(np.sqrt(eigs - sigma_sq))
    sigma = np.sqrt(sigma_sq)

    # Latent factors and their noisy projection.
    z = np.random.normal(size=(N, k))
    eta = np.random.normal(z @ W.T, sigma).astype(np.float32)

    # Create ILR basis
    tree = random_linkage(D)
    Psi = _balance_basis(tree)[0]

    # add batch effects
    alpha = np.abs(np.random.normal(0, 0.5, size=D))
    # variances must always be positive
    alphaILR = np.abs(Psi) @ alpha
    batch_mean = np.zeros(D - 1)
    B = np.random.multivariate_normal(batch_mean, np.diag(alphaILR), size=C)
    batch_idx = np.random.randint(C, size=N)
    # Shift every sample by the effect of the batch it was assigned to.
    eta = np.vstack([
        row + B[batch] for row, batch in zip(eta, batch_idx)
    ])

    # Convert latent variables to observed counts
    prob = closure(np.exp(eta @ Psi))
    depths = np.random.poisson(M, size=N)
    Y = np.vstack([
        np.random.multinomial(depth, p) for depth, p in zip(depths, prob)
    ])

    return {
        'sigma': sigma,
        'W': W,
        'Psi': Psi,
        'tree': tree,
        'eta': eta,
        'z': z,
        'Y': Y,
        'alpha': alpha,
        'alphaILR': alphaILR,
        'B': B,
        'batch_idx': batch_idx,
        'depths': depths,
        'eigs': eigs,
        'eigvectors': eigvectors,
    }