def test_summary_head(self):
    A = np.array  # aliasing for the sake of pep8
    table = pd.DataFrame({
        's1': ilr_inv(A([1., 3.])),
        's2': ilr_inv(A([2., 2.])),
        's3': ilr_inv(A([1., 3.])),
        's4': ilr_inv(A([3., 4.])),
        's5': ilr_inv(A([1., 5.]))},
        index=['a', 'b', 'c']).T
    tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
    metadata = pd.DataFrame({
        'lame': [1, 2, 1, 4, 1],
        'real': [1, 2, 3, 4, 5]
    }, index=['s1', 's2', 's3', 's4', 's5'])

    np.random.seed(0)
    self.maxDiff = None
    model = ols('real', table, metadata, tree)
    model.fit()

    fname = get_data_path('exp_ols_results2.txt')
    res = str(model.summary(ndim=1))
    with open(fname, 'r') as fh:
        exp = fh.read()
    self.assertEqual(res, exp)
def test_ilr_inv_basis_one_dimension_error(self):
    basis = clr(np.array([[0.80442968, 0.19557032]]))
    table = np.array([[np.log(1/10)*np.sqrt(1/2),
                       np.log(1.14141414 / 9.90909091)*np.sqrt(1/2),
                       np.log(1.28282828 / 9.81818182)*np.sqrt(1/2),
                       np.log(1.42424242 / 9.72727273)*np.sqrt(1/2),
                       np.log(1.56565657 / 9.63636364)*np.sqrt(1/2)]]).T
    with self.assertRaises(ValueError):
        ilr_inv(table, basis=basis)
def _regression(y, X, basis=None):
    """ Performs a simplicial ordinary least squares on a set of
    compositions and a response variable.

    Parameters
    ----------
    y : numpy.ndarray, float
        a matrix of proportions where rows correspond to samples and
        columns correspond to features.
    X : numpy.ndarray, float
        independent variable
    basis : numpy.ndarray, optional
        orthonormal basis used for the ilr transform.  If not specified,
        the default Gram-Schmidt basis is used.

    Returns
    -------
    predict: numpy.ndarray, float
        a predicted matrix of proportions where rows correspond to samples
        and columns correspond to features.
    b: numpy.ndarray, float
        a matrix of estimated coefficient compositions
    resid: numpy.ndarray, float
        a matrix of compositional residuals
    r2: float
        coefficient of determination
    """
    y = np.atleast_2d(y)
    X = np.atleast_2d(X)

    # Need to add constant for intercept
    r, c = X.shape

    y_ = ilr(y, basis=basis)
    # Now perform least squares to calculate unknown coefficients
    inv = np.linalg.pinv(np.dot(X.T, X))
    cross = np.dot(inv, X.T)
    b_ = np.dot(cross, y_)
    predict_ = np.dot(X, b_)
    resid = (y_ - predict_)
    sst = (y_ - y_.mean(axis=0))
    r2 = 1 - ((resid**2).sum() / (sst**2).sum())

    if len(b_.shape) == 1:
        b_ = np.atleast_2d(b_).T
    b = ilr_inv(b_)
    if len(predict_.shape) == 1:
        predict_ = np.atleast_2d(predict_).T
    predict = ilr_inv(predict_)
    if len(resid.shape) == 1:
        resid = np.atleast_2d(resid).T
    resid = ilr_inv(resid)
    return predict, b, resid, r2
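# Hedged usage sketch for `_regression` above: it assumes `_regression` is in
# scope (it is module-private, so the import location is an assumption) and
# that `ilr`/`ilr_inv` come from skbio.stats.composition as elsewhere in this
# module.  The data below are illustrative only.
import numpy as np
from skbio.stats.composition import closure

# five samples, three proportions, and a design matrix with an intercept
Y = closure(np.array([[1., 2., 3.],
                      [2., 2., 2.],
                      [3., 2., 1.],
                      [4., 2., 1.],
                      [5., 2., 1.]]))
X = np.column_stack([np.ones(5), np.arange(5.)])
predict, b, resid, r2 = _regression(Y, X)  # default Gram-Schmidt basis
print(r2)  # coefficient of determination in ilr space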
def setUp(self):
    A = np.array  # aliasing for the sake of pep8
    self.table = pd.DataFrame({
        's1': ilr_inv(A([1., 1.])),
        's2': ilr_inv(A([1., 2.])),
        's3': ilr_inv(A([1., 3.])),
        's4': ilr_inv(A([1., 4.])),
        's5': ilr_inv(A([1., 5.]))},
        index=['a', 'b', 'c']).T
    self.tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
    self.unannotated_tree = TreeNode.read(['(c, (b,a));'])
    self.metadata = pd.DataFrame({
        'lame': [1, 1, 1, 1, 1],
        'real': [1, 2, 3, 4, 5]
    }, index=['s1', 's2', 's3', 's4', 's5'])
def compositional_noise(cov, nsamp, rng=None):
    """ This is multiplicative noise applied across the entire dataset.
    The noise is assumed to be Gaussian in the simplex.

    Parameters
    ----------
    cov: array_like
        Covariance matrix for the normal distribution in ilr space.
        This is assumed to be in the default gram-schmidt orthonormal basis.
    nsamp: int
        Number of samples to generate
    rng: np.random.RandomState
        Numpy random state.

    Returns
    -------
    np.array:
        A matrix of probabilities where there are `n` rows and `m` columns
        where `n` corresponds to the number of samples and `m` corresponds
        to the number of species.
    """
    if rng is None:
        rng = RandomState(0)
    dist = multivariate_normal.rvs(cov=cov, size=nsamp, random_state=rng)
    return ilr_inv(dist)
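# Hedged usage sketch for `compositional_noise` above, assuming the module
# imports `RandomState` from numpy.random, `multivariate_normal` from
# scipy.stats and `ilr_inv` from skbio.stats.composition.  Values are
# illustrative only.
import numpy as np
from numpy.random import RandomState

cov = np.diag([0.1, 0.1])  # 2-dimensional covariance in ilr space
noise = compositional_noise(cov, nsamp=4, rng=RandomState(42))
# `noise` has shape (4, 3): four samples over three "species", each row
# summing to 1; multiply it elementwise into a (4, 3) table of proportions
# to perturb that table.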
def test_regression_results_residuals_projection(self):
    tree = TreeNode.read([r'(c, (a, b)Y2)Y1;'])
    basis, _ = balance_basis(tree)
    exp_resid = pd.DataFrame(
        {
            's1': [-0.986842, -0.236842],
            's2': [-0.065789, -1.815789],
            's3': [1.473684, 0.473684],
            's4': [1.394737, -1.105263],
            's5': [-1.065789, 1.184211],
            's6': [-1.144737, -0.394737],
            's7': [0.394737, 1.894737]
        }, index=['Y1', 'Y2']).T
    exp_resid = pd.DataFrame(
        ilr_inv(exp_resid, basis),
        index=['s1', 's2', 's3', 's4', 's5', 's6', 's7'],
        columns=['c', 'a', 'b'])
    submodels = [self.model1, self.model2]
    res = submock(Y=self.balances, Xs=None)
    submock.submodels = submodels
    res.fit()
    res_resid = res.residuals(tree).sort_index()
    pdt.assert_frame_equal(res_resid, exp_resid,
                           check_exact=False,
                           check_less_precise=True)
def coefficients(self, tree=None):
    """ Returns coefficients from fit.

    Parameters
    ----------
    tree : skbio.TreeNode, optional
        The tree used to perform the ilr transformation.  If this
        is specified, then the coefficients will be represented as
        proportions.  Otherwise, if this is not specified, the
        coefficients will be represented as balances. (default: None).

    Returns
    -------
    pd.DataFrame
        A table of coefficients where rows are covariates, and the
        columns are balances.  If `tree` is specified, then the columns
        are proportions.
    """
    if not self._fitted:
        raise ValueError(('Model not fitted - coefficients not calculated. '
                          'See `fit()`'))
    coef = self._beta
    if tree is not None:
        basis, _ = balance_basis(tree)
        c = ilr_inv(coef.values, basis=basis)
        ids = [n.name for n in tree.tips()]
        return pd.DataFrame(c, columns=ids, index=coef.index)
    else:
        return coef
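# Hedged usage sketch for `coefficients(tree=...)` above, following the
# `ols(formula, table, metadata, tree)` fitting pattern used by the tests in
# this file (gneiss-style API); the table, metadata and tree are illustrative.
import numpy as np
import pandas as pd
from skbio import TreeNode
from skbio.stats.composition import ilr_inv

tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
table = pd.DataFrame({'s%d' % i: ilr_inv(np.array([1., float(i)]))
                      for i in range(1, 6)},
                     index=['a', 'b', 'c']).T
metadata = pd.DataFrame({'real': [1, 2, 3, 4, 5]},
                        index=['s1', 's2', 's3', 's4', 's5'])

model = ols('real', table, metadata, tree)  # assumes gneiss' `ols`
model.fit()
print(model.coefficients())           # rows: covariates, columns: balances
print(model.coefficients(tree=tree))  # columns become proportions c, b, a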
def coefficients(self, tree=None):
    """ Returns coefficients from fit.

    Parameters
    ----------
    tree : skbio.TreeNode, optional
        The tree used to perform the ilr transformation.  If this
        is specified, then the coefficients will be represented as
        proportions.  Otherwise, if this is not specified, the
        coefficients will be represented as balances. (default: None).

    Returns
    -------
    pd.DataFrame
        A table of coefficients where rows are covariates, and the
        columns are balances.  If `tree` is specified, then the columns
        are proportions.
    """
    coef = pd.DataFrame()
    for r in self.results:
        c = r.params
        c.name = r.model.endog_names
        coef = coef.append(c)

    if tree is not None:
        basis, _ = balance_basis(tree)
        c = ilr_inv(coef.values.T, basis=basis).T
        return pd.DataFrame(c, index=[n.name for n in tree.tips()],
                            columns=coef.columns)
    else:
        return coef.T
def partition_microbes(num_microbes, sigmaQ, microbe_in, state):
    """ Split up a single microbe's abundances into multiple strains.

    Parameters
    ----------
    num_microbes : int
        Number of strains to be represented
    sigmaQ : float
        The variance of the multivariate distribution
    microbe_in : np.array
        The input abundances for a single species
    state : numpy random state
        Random number generator

    Returns
    -------
    microbes_out : np.array
        Multiple strain abundances.
    """
    num_samples = len(microbe_in)
    a = state.multivariate_normal(
        mean=np.zeros(num_microbes - 1),
        cov=np.diag([sigmaQ] * (num_microbes - 1)),
        size=num_samples)
    microbe_partition = ilr_inv(a)
    microbes_out = np.multiply(microbe_partition,
                               microbe_in.reshape(-1, 1))
    return microbes_out
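# Hedged usage sketch for `partition_microbes` above; the abundances and
# random state are illustrative.
import numpy as np

microbe_in = np.array([100., 50., 10.])  # one species across three samples
strains = partition_microbes(num_microbes=4, sigmaQ=0.5,
                             microbe_in=microbe_in,
                             state=np.random.RandomState(0))
# `strains` has shape (3, 4): each row splits one sample's abundance across
# four strains, and each row still sums to the original abundance.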
def test_regression_results_coefficient_projection(self):
    exp_coef = pd.DataFrame(
        {
            'Intercept': ilr_inv(np.array([[1.447368, -0.052632]])),
            'X': ilr_inv(np.array([[0.539474, 1.289474]]))
        }, index=['Z1', 'Z2', 'Z3'])
    feature_names = ['Z1', 'Z2', 'Z3']
    basis = _gram_schmidt_basis(3)
    res = RegressionResults(self.results, basis=basis,
                            feature_names=feature_names)
    pdt.assert_frame_equal(res.coefficients(project=True), exp_coef,
                           check_exact=False,
                           check_less_precise=True)
def test_regression_results_residuals_projection(self):
    A = np.array  # aliasing np.array for the sake of pep8
    exp_resid = pd.DataFrame(
        {
            's1': ilr_inv(A([-0.986842, -0.236842])),
            's2': ilr_inv(A([-0.065789, -1.815789])),
            's3': ilr_inv(A([1.473684, 0.473684])),
            's4': ilr_inv(A([1.394737, -1.105263])),
            's5': ilr_inv(A([-1.065789, 1.184211])),
            's6': ilr_inv(A([-1.144737, -0.394737])),
            's7': ilr_inv(A([0.394737, 1.894737]))
        }, index=['a', 'b', 'c']).T
    # note that in the example, the basis is not strictly
    # equivalent to the tree
    basis = pd.DataFrame(clr_inv(_gram_schmidt_basis(3)),
                         index=['Y1', 'Y2'],
                         columns=['a', 'b', 'c'])
    submodels = [self.model1, self.model2]
    res = submock(submodels=submodels, basis=basis, tree=self.tree,
                  balances=self.balances)
    res.fit()
    pdt.assert_frame_equal(res.residuals(project=True), exp_resid,
                           check_exact=False,
                           check_less_precise=True)
def test_regression_results_predict_projection(self):
    basis = pd.DataFrame(clr_inv(_gram_schmidt_basis(3)),
                         index=['Y1', 'Y2'],
                         columns=['a', 'b', 'c'])
    submodels = [self.model1, self.model2]
    res = submock(submodels=submodels, basis=basis, tree=self.tree,
                  balances=self.balances)
    res.fit()
    res_predict = res.predict(self.data[['X']], project=True)

    A = np.array  # aliasing np.array for the sake of pep8
    exp_predict = pd.DataFrame(
        {
            's1': ilr_inv(A([1.986842, 1.236842])),
            's2': ilr_inv(A([3.065789, 3.815789])),
            's3': ilr_inv(A([2.526316, 2.526316])),
            's4': ilr_inv(A([3.605263, 5.105263])),
            's5': ilr_inv(A([3.065789, 3.815789])),
            's6': ilr_inv(A([4.144737, 6.394737])),
            's7': ilr_inv(A([3.605263, 5.105263]))
        }, index=['a', 'b', 'c']).T

    pdt.assert_frame_equal(res_predict, exp_predict)
def test_regression_results_coefficient_projection(self):
    exp_coef = pd.DataFrame(
        {'Intercept': ilr_inv(np.array([[1.447368, -0.052632]])),
         'X': ilr_inv(np.array([[0.539474, 1.289474]]))},
        index=['a', 'b', 'c'])
    # note that in the example, the basis is not strictly
    # equivalent to the tree
    basis = pd.DataFrame(clr_inv(_gram_schmidt_basis(3)),
                         index=['Y1', 'Y2'],
                         columns=['a', 'b', 'c'])
    submodels = [self.model1, self.model2]
    res = submock(submodels=submodels, basis=basis, tree=self.tree,
                  balances=self.balances)
    res.fit()
    pdt.assert_frame_equal(res.coefficients(project=True), exp_coef,
                           check_exact=False,
                           check_less_precise=True)
def test_mixedlm_balances(self):
    np.random.seed(6241)
    n = 1600
    exog = np.random.normal(size=(n, 2))
    groups = np.kron(np.arange(n // 16), np.ones(16))

    # Build up the random error vector
    errors = 0

    # The random effects
    exog_re = np.random.normal(size=(n, 2))
    slopes = np.random.normal(size=(n // 16, 2))
    slopes = np.kron(slopes, np.ones((16, 1))) * exog_re
    errors += slopes.sum(1)

    # First variance component
    errors += np.kron(2 * np.random.normal(size=n // 4), np.ones(4))

    # Second variance component
    errors += np.kron(2 * np.random.normal(size=n // 2), np.ones(2))

    # iid errors
    errors += np.random.normal(size=n)

    endog = exog.sum(1) + errors

    df = pd.DataFrame(index=range(n))
    df["y1"] = endog
    df["y2"] = endog + 2 * 2
    df["groups"] = groups
    df["x1"] = exog[:, 0]
    df["x2"] = exog[:, 1]

    tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
    iv = ilr_inv(df[["y1", "y2"]].values)
    table = pd.DataFrame(iv, columns=['a', 'b', 'c'])
    metadata = df[['x1', 'x2', 'groups']]

    res = mixedlm("x1 + x2", table, metadata, tree, groups="groups")
    exp_pvalues = pd.DataFrame(
        [[4.923122e-236, 3.180390e-40, 3.972325e-35, 3.568599e-30],
         [9.953418e-02, 3.180390e-40, 3.972325e-35, 3.568599e-30]],
        index=['Y1', 'Y2'],
        columns=['Intercept', 'Intercept RE', 'x1', 'x2'])

    pdt.assert_frame_equal(res.pvalues, exp_pvalues,
                           check_less_precise=True)

    exp_coefficients = pd.DataFrame(
        [[4.211451, -0.305906, 1.022008, 0.924873],
         [0.211451, -0.305906, 1.022008, 0.924873]],
        columns=['Intercept', 'Intercept RE', 'x1', 'x2'],
        index=['Y1', 'Y2'])

    pdt.assert_frame_equal(res.coefficients(), exp_coefficients,
                           check_less_precise=True)
def test_ols_ilr_inv_test(self):
    model = ols('x1 + x2', self.Y, self.X)
    model.fit()
    basis, _ = balance_basis(self.tree)

    # test pvalues
    exp = pd.DataFrame({'y1': self.r1_.pvalues, 'y2': self.r2_.pvalues})
    pdt.assert_frame_equal(model.pvalues, exp)

    # test coefficients
    exp = pd.DataFrame({'y1': self.r1_.params, 'y2': self.r2_.params})
    exp = pd.DataFrame(ilr_inv(exp, basis),
                       columns=['c', 'b', 'a'],
                       index=self.X.columns)
    res = model.coefficients(tree=self.tree)
    pdt.assert_frame_equal(res, exp)

    # test residuals
    exp = pd.DataFrame({'y1': self.r1_.resid, 'y2': self.r2_.resid},
                       index=self.Y.index)
    exp = pd.DataFrame(ilr_inv(exp, basis),
                       index=self.Y.index,
                       columns=['c', 'b', 'a'])
    res = model.residuals(tree=self.tree)
    pdt.assert_frame_equal(res, exp)

    # test prediction
    exp = pd.DataFrame({'y1': self.r1_.predict(),
                        'y2': self.r2_.predict()},
                       index=self.Y.index)
    exp = pd.DataFrame(ilr_inv(exp, basis),
                       index=self.Y.index,
                       columns=['c', 'b', 'a'])
    res = model.predict(tree=self.tree)
    pdt.assert_frame_equal(res, exp)
def test_ilr_basis_isomorphism(self):
    # tests to make sure that the isomorphism holds
    # with the introduction of the basis.
    basis = np.array([[0.80442968, 0.19557032]])
    table = np.array([[np.log(1 / 10) * np.sqrt(1 / 2),
                       np.log(1.14141414 / 9.90909091) * np.sqrt(1 / 2),
                       np.log(1.28282828 / 9.81818182) * np.sqrt(1 / 2),
                       np.log(1.42424242 / 9.72727273) * np.sqrt(1 / 2),
                       np.log(1.56565657 / 9.63636364) * np.sqrt(1 / 2)]]).T
    res = ilr(ilr_inv(table, basis=basis), basis=basis)
    npt.assert_allclose(res, table.squeeze())

    table = np.array([[1., 10.],
                      [1.14141414, 9.90909091],
                      [1.28282828, 9.81818182],
                      [1.42424242, 9.72727273],
                      [1.56565657, 9.63636364]])

    res = ilr_inv(np.atleast_2d(ilr(table, basis=basis)).T, basis=basis)
    npt.assert_allclose(res, closure(table.squeeze()))
def predict(self, X=None, project=False, **kwargs):
    """ Performs a prediction based on model.

    Parameters
    ----------
    X : pd.DataFrame, optional
        Input table of covariates, where columns are covariates, and
        rows are samples.  If not specified, then the fitted values
        calculated from training the model will be returned.
    project : bool, optional
        Specifies if predictions should be projected back into
        the Aitchison simplex [1]_.  If false, the predictions will be
        represented as balances (default: False).
    **kwargs : dict
        Other arguments to be passed into the model prediction.

    Returns
    -------
    pd.DataFrame
        A table of predicted values where rows are samples, and the
        columns are either balances or proportions, depending on the
        value of `project`.

    References
    ----------
    .. [1] Aitchison, J. "A concise guide to compositional data analysis,
       CDA work." Girona 24 (2003): 73-81.
    """
    self._check_projection(project)

    prediction = pd.DataFrame()
    for m in self.results:
        # check if X is none.
        p = pd.Series(m.predict(X, **kwargs))
        p.name = m.model.endog_names
        if X is not None:
            p.index = X.index
        else:
            p.index = m.fittedvalues.index
        prediction = prediction.append(p)
    if project:
        # `check=False`, due to a problem with error handling
        # addressed here https://github.com/biocore/scikit-bio/pull/1396
        # This will need to be fixed here:
        # https://github.com/biocore/gneiss/issues/34
        proj_prediction = ilr_inv(prediction.values.T, basis=self.basis,
                                  check=False)
        return pd.DataFrame(proj_prediction,
                            columns=self.feature_names,
                            index=prediction.columns)
    return prediction.T
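# Hedged sketch of the `predict(..., project=True)` pattern exercised by the
# tests in this file: fit one statsmodels OLS per balance, wrap the fits in
# `RegressionResults`, and project predictions back onto proportions.  The
# import path of `_gram_schmidt_basis` and the data are assumptions.
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from skbio.stats.composition import _gram_schmidt_basis

data = pd.DataFrame({'X': [1., 2., 3., 4., 5.],
                     'Y1': [1.1, 2.0, 2.9, 4.2, 5.1],
                     'Y2': [0.9, 2.1, 3.1, 3.8, 5.0]},
                    index=['s1', 's2', 's3', 's4', 's5'])
fits = [smf.ols('Y1 ~ X', data=data).fit(),
        smf.ols('Y2 ~ X', data=data).fit()]
res = RegressionResults(fits, basis=_gram_schmidt_basis(3),
                        feature_names=['a', 'b', 'c'])
balances = res.predict(data[['X']])                   # columns Y1, Y2
proportions = res.predict(data[['X']], project=True)  # columns a, b, c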
def setUp(self):
    A = np.array  # aliasing for the sake of pep8
    self.table = pd.DataFrame({
        's1': ilr_inv(A([1., 1.])),
        's2': ilr_inv(A([1., 2.])),
        's3': ilr_inv(A([1., 3.])),
        's4': ilr_inv(A([1., 4.])),
        's5': ilr_inv(A([1., 5.]))},
        index=['a', 'b', 'c']).T
    self.tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
    self.unannotated_tree = TreeNode.read(['(c, (b,a));'])
    self.metadata = pd.DataFrame({
        'lame': [1, 1, 1, 1, 1],
        'real': [1, 2, 3, 4, 5]
    }, index=['s1', 's2', 's3', 's4', 's5'])

    np.random.seed(0)
    n = 15
    a = np.array([1, 4.2, 5.3, -2.2, 8])
    x1 = np.linspace(.01, 0.1, n)
    x2 = np.logspace(0, 0.01, n)
    x3 = np.exp(np.linspace(0, 0.01, n))
    x4 = x1 ** 2

    self.x = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'x4': x4})
    y = (a[0] + a[1]*x1 + a[2]*x2 + a[3]*x3 + a[4]*x4 +
         np.random.normal(size=n))
    sy = np.vstack((y, y/10)).T
    self.y = pd.DataFrame(ilr_inv(sy), columns=['a', 'b', 'c'])
    self.t2 = TreeNode.read([r"((a,b)n,c);"])
def coefficients(self, project=False):
    """ Returns coefficients from fit.

    Parameters
    ----------
    project : bool, optional
        Specifies if coefficients should be projected back into
        the Aitchison simplex [1]_.  If false, the coefficients will be
        represented as balances (default: False).

    Returns
    -------
    pd.DataFrame
        A table of values where columns are coefficients, and the index
        is either balances or proportions, depending on the value of
        `project`.

    Raises
    ------
    ValueError:
        Cannot perform projection into Aitchison simplex if `basis`
        is not specified.
    ValueError:
        Cannot perform projection into Aitchison simplex if
        `feature_names` is not specified.

    References
    ----------
    .. [1] Aitchison, J. "A concise guide to compositional data analysis,
       CDA work." Girona 24 (2003): 73-81.
    """
    self._check_projection(project)
    coef = pd.DataFrame()
    for r in self.results:
        c = r.params
        c.name = r.model.endog_names
        coef = coef.append(c)
    if project:
        # `check=False`, due to a problem with error handling
        # addressed here https://github.com/biocore/scikit-bio/pull/1396
        # This will need to be fixed here:
        # https://github.com/biocore/gneiss/issues/34
        c = ilr_inv(coef.values.T, basis=self.basis, check=False).T
        return pd.DataFrame(c, index=self.feature_names,
                            columns=coef.columns)
    else:
        return coef
def test_ilr_inv_basis(self):
    exp = closure(np.array([[1., 10.],
                            [1.14141414, 9.90909091],
                            [1.28282828, 9.81818182],
                            [1.42424242, 9.72727273],
                            [1.56565657, 9.63636364]]))
    basis = np.array([[0.80442968, 0.19557032]])
    table = np.array([[np.log(1/10)*np.sqrt(1/2),
                       np.log(1.14141414 / 9.90909091)*np.sqrt(1/2),
                       np.log(1.28282828 / 9.81818182)*np.sqrt(1/2),
                       np.log(1.42424242 / 9.72727273)*np.sqrt(1/2),
                       np.log(1.56565657 / 9.63636364)*np.sqrt(1/2)]]).T
    res = ilr_inv(table, basis=basis)
    npt.assert_allclose(res, exp)
def setUp(self): self.results = "results" if not os.path.exists(self.results): os.mkdir(self.results) self.balances = pd.DataFrame( { 'a': [-2, -1, 0, 1, 2], 'b': [-2, 0, 0, 0, 0] }, index=['a1', 'a2', 'a3', 'a4', 'a5']) self.tree = TreeNode.read([r'((k, q)d, ((x, y)a, z)b)c;']) self.taxonomy = pd.DataFrame( [['foo;barf;a;b;c;d;e', 1], ['foo;bark;f;g;h;i;j', 1], ['foo;bark;f;g;h;w;j', 1], ['nom;tu;k;l;m;n;o', 0.9], ['nom;tu;k;l;m;t;o', 0.9]], columns=['Taxon', 'Confidence'], index=['x', 'y', 'z', 'k', 'q']) self.balances = pd.DataFrame( [[1, 2, 3, 4, 5, 6, 7], [-3.1, -2.9, -3, 3, 2.9, 3.2, 3.1], [1, 1, 1, 1, 1, 1, 1], [3, 2, 1, 0, -1, -2, -3]], index=['d', 'a', 'b', 'c'], columns=['s1', 's2', 's3', 's4', 's5', 's6', 's7']).T basis, _ = balance_basis(self.tree) self.table = pd.DataFrame( ilr_inv(self.balances, basis), columns=['x', 'y', 'z', 'k', 'q'], index=['s1', 's2', 's3', 's4', 's5', 's6', 's7']) index = pd.Index(['s1', 's2', 's3', 's4', 's5', 's6', 's7'], name='id') self.categorical = CategoricalMetadataColumn( pd.Series(['a', 'a', 'a', 'b', 'b', 'b', 'b'], index=index, name='categorical')) self.multi_categorical = CategoricalMetadataColumn( pd.Series(['a', 'a', 'c', 'b', 'b', 'b', 'c'], index=index, name='multi_categorical')) self.partial_numerical_categorical = CategoricalMetadataColumn( pd.Series(['1', '1', '1', '2', '2', '2', 'a'], index=index, name='multi_categorical')) self.full_numerical_categorical = CategoricalMetadataColumn( pd.Series(['1', '1', '1.0', '2', '2', '2.0', '3'], index=index, name='numerical_categorical')) self.continuous = NumericMetadataColumn( pd.Series(np.arange(7), index=index, name='continuous'))
def residuals(self, project=False):
    """ Returns calculated residuals.

    Parameters
    ----------
    project : bool, optional
        Specifies if residuals should be projected back into
        the Aitchison simplex [1]_.  If false, the residuals will be
        represented as balances (default: False).

    Returns
    -------
    pd.DataFrame
        A table of values where rows are samples, and the columns
        are either balances or proportions, depending on the value of
        `project`.

    References
    ----------
    .. [1] Aitchison, J. "A concise guide to compositional data analysis,
       CDA work." Girona 24 (2003): 73-81.
    """
    self._check_projection(project)
    resid = pd.DataFrame()
    for r in self.results:
        err = r.resid
        err.name = r.model.endog_names
        resid = resid.append(err)
    if project:
        # `check=False`, due to a problem with error handling
        # addressed here https://github.com/biocore/scikit-bio/pull/1396
        # This will need to be fixed here:
        # https://github.com/biocore/gneiss/issues/34
        proj_resid = ilr_inv(resid.values.T, basis=self.basis,
                             check=False).T
        return pd.DataFrame(proj_resid,
                            index=self.feature_names,
                            columns=resid.columns).T
    else:
        return resid.T
def predict(self, X=None, tree=None, **kwargs):
    """ Performs a prediction based on model.

    Parameters
    ----------
    X : pd.DataFrame, optional
        Input table of covariates, where columns are covariates, and
        rows are samples.  If not specified, then the fitted values
        calculated from training the model will be returned.
    tree : skbio.TreeNode, optional
        The tree used to perform the ilr transformation.  If this
        is specified, then the prediction will be represented as
        proportions.  Otherwise, if this is not specified, the prediction
        will be represented as balances. (default: None).
    **kwargs : dict
        Other arguments to be passed into the model prediction.

    Returns
    -------
    pd.DataFrame
        A table of predicted values where rows are samples, and the
        columns are balances.  If `tree` is specified, then the columns
        are proportions.
    """
    prediction = pd.DataFrame()
    for m in self.results:
        # check if X is none.
        p = pd.Series(m.predict(X, **kwargs))
        p.name = m.model.endog_names
        if X is not None:
            p.index = X.index
        else:
            p.index = m.fittedvalues.index
        prediction = prediction.append(p)
    if tree is not None:
        basis, _ = balance_basis(tree)
        proj_prediction = ilr_inv(prediction.values.T, basis=basis)
        return pd.DataFrame(proj_prediction,
                            columns=[n.name for n in tree.tips()],
                            index=prediction.columns)
    else:
        return prediction.T
def residuals(self, tree=None):
    """ Returns calculated residuals from fit.

    Parameters
    ----------
    tree : skbio.TreeNode, optional
        The tree used to perform the ilr transformation.  If this
        is specified, then the residuals will be represented as
        proportions.  Otherwise, if this is not specified, the residuals
        will be represented as balances. (default: None).

    Returns
    -------
    pd.DataFrame
        A table of residuals where rows are samples, and the columns
        are balances.  If `tree` is specified, then the columns are
        proportions.

    References
    ----------
    .. [1] Aitchison, J. "A concise guide to compositional data analysis,
       CDA work." Girona 24 (2003): 73-81.
    """
    resid = pd.DataFrame()
    for r in self.results:
        err = r.resid
        err.name = r.model.endog_names
        resid = resid.append(err)

    if tree is not None:
        basis, _ = balance_basis(tree)
        proj_resid = ilr_inv(resid.values.T, basis=basis).T
        return pd.DataFrame(proj_resid,
                            index=[n.name for n in tree.tips()],
                            columns=resid.columns).T
    else:
        return resid.T
def predict(self, X=None, tree=None, **kwargs):
    """ Performs a prediction based on model.

    Parameters
    ----------
    X : pd.DataFrame, optional
        Input table of covariates, where columns are covariates, and
        rows are samples.  If not specified, then the fitted values
        calculated from training the model will be returned.
    tree : skbio.TreeNode, optional
        The tree used to perform the ilr transformation.  If this
        is specified, then the prediction will be represented as
        proportions.  Otherwise, if this is not specified, the prediction
        will be represented as balances. (default: None).
    **kwargs : dict
        Other arguments to be passed into the model prediction.

    Returns
    -------
    pd.DataFrame
        A table of predicted values where rows are samples, and the
        columns are balances.  If `tree` is specified, then the columns
        are proportions.
    """
    if not self._fitted:
        raise ValueError(('Model not fitted - coefficients not calculated. '
                          'See `fit()`'))
    if X is None:
        X = self.design_matrices

    prediction = X.dot(self._beta)
    if tree is not None:
        basis, _ = balance_basis(tree)
        proj_prediction = ilr_inv(prediction.values, basis=basis)
        ids = [n.name for n in tree.tips()]
        return pd.DataFrame(proj_prediction,
                            columns=ids,
                            index=prediction.index)
    else:
        return prediction
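# Hedged usage sketch for `predict(tree=...)` above (and the analogous
# `residuals(tree=...)`), following the `ols(formula, table, metadata, tree)`
# workflow used by the tests in this file; data are illustrative.
import numpy as np
import pandas as pd
from skbio import TreeNode
from skbio.stats.composition import ilr_inv

tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
table = pd.DataFrame({'s%d' % i: ilr_inv(np.array([1., float(i)]))
                      for i in range(1, 6)},
                     index=['a', 'b', 'c']).T
metadata = pd.DataFrame({'real': [1, 2, 3, 4, 5]},
                        index=['s1', 's2', 's3', 's4', 's5'])

model = ols('real', table, metadata, tree)     # assumes gneiss' `ols`
model.fit()
fitted_balances = model.predict()              # columns Y1, Y2
fitted_proportions = model.predict(tree=tree)  # columns c, b, a (tree tips)
resid_proportions = model.residuals(tree=tree)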
def test_ilr_inv(self):
    mat = closure(self.cdata7)
    npt.assert_array_almost_equal(ilr_inv(ilr(mat)), mat)

    npt.assert_allclose(ilr_inv(np.identity(3)), self.ortho1,
                        rtol=1e-04, atol=1e-06)

    with self.assertRaises(ValueError):
        ilr_inv(self.cdata1, basis=self.cdata1)

    # make sure that inplace modification is not occurring
    ilr_inv(self.cdata1)
    npt.assert_allclose(self.cdata1,
                        np.array([[2, 2, 6],
                                  [4, 4, 2]]))
def residuals(self, tree=None):
    """ Returns calculated residuals from fit.

    Parameters
    ----------
    tree : skbio.TreeNode, optional
        The tree used to perform the ilr transformation.  If this
        is specified, then the residuals will be represented as
        proportions.  Otherwise, if this is not specified, the residuals
        will be represented as balances. (default: None).

    Returns
    -------
    pd.DataFrame
        A table of residuals where rows are samples, and the columns
        are balances.  If `tree` is specified, then the columns are
        proportions.

    References
    ----------
    .. [1] Aitchison, J. "A concise guide to compositional data analysis,
       CDA work." Girona 24 (2003): 73-81.
    """
    if not self._fitted:
        raise ValueError(('Model not fitted - coefficients not calculated. '
                          'See `fit()`'))
    resid = self._resid
    if tree is not None:
        basis, _ = balance_basis(tree)
        proj_resid = ilr_inv(resid.values, basis=basis)
        ids = [n.name for n in tree.tips()]
        return pd.DataFrame(proj_resid,
                            columns=ids,
                            index=resid.index)
    else:
        return resid
def setUp(self):
    np.random.seed(6241)
    n = 1600
    exog = np.random.normal(size=(n, 2))
    groups = np.kron(np.arange(n // 16), np.ones(16))

    # Build up the random error vector
    errors = 0

    # The random effects
    exog_re = np.random.normal(size=(n, 2))
    slopes = np.random.normal(size=(n // 16, 2))
    slopes = np.kron(slopes, np.ones((16, 1))) * exog_re
    errors += slopes.sum(1)

    # First variance component
    errors += np.kron(2 * np.random.normal(size=n // 4), np.ones(4))

    # Second variance component
    errors += np.kron(2 * np.random.normal(size=n // 2), np.ones(2))

    # iid errors
    errors += np.random.normal(size=n)

    endog = exog.sum(1) + errors

    df = pd.DataFrame(index=range(n))
    df["y1"] = endog
    df["y2"] = endog + 2 * 2
    df["groups"] = groups
    df["x1"] = exog[:, 0]
    df["x2"] = exog[:, 1]

    self.tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
    iv = ilr_inv(df[["y1", "y2"]].values)
    self.table = pd.DataFrame(iv, columns=['a', 'b', 'c'])
    self.metadata = df[['x1', 'x2', 'groups']]

    self.results = "results"
    os.mkdir(self.results)
def test_ols_empty_metadata_error(self):
    A = np.array  # aliasing for the sake of pep8
    table = pd.DataFrame({
        'k1': ilr_inv(A([1., 1.])),
        'k2': ilr_inv(A([1., 2.])),
        'k3': ilr_inv(A([1., 3.])),
        'k4': ilr_inv(A([1., 4.])),
        'k5': ilr_inv(A([1., 5.])),
        'k6': ilr_inv(A([1., 5.]))},
        index=['a', 'b', 'c']).T
    tree = TreeNode.read(['((c,d),(b,a)Y2)Y1;'])
    metadata = pd.DataFrame({
        'lame': [1, 1, 1, 1, 1],
        'real': [1, 2, 3, 4, 5]
    }, index=['s1', 's2', 's3', 's4', 's5'])
    with self.assertRaises(ValueError):
        ols('real + lame', table, metadata, tree)
def test_regression_results_residuals_projection(self):
    A = np.array  # aliasing np.array for the sake of pep8
    exp_resid = pd.DataFrame(
        {
            's1': ilr_inv(A([-0.986842, -0.236842])),
            's2': ilr_inv(A([-0.065789, -1.815789])),
            's3': ilr_inv(A([1.473684, 0.473684])),
            's4': ilr_inv(A([1.394737, -1.105263])),
            's5': ilr_inv(A([-1.065789, 1.184211])),
            's6': ilr_inv(A([-1.144737, -0.394737])),
            's7': ilr_inv(A([0.394737, 1.894737]))
        }, index=['Z1', 'Z2', 'Z3']).T
    feature_names = ['Z1', 'Z2', 'Z3']
    basis = _gram_schmidt_basis(3)
    res = RegressionResults(self.results, basis=basis,
                            feature_names=feature_names)
    pdt.assert_frame_equal(res.residuals(project=True), exp_resid,
                           check_exact=False,
                           check_less_precise=True)
def test_regression_results_predict_projection(self):
    feature_names = ['Z1', 'Z2', 'Z3']
    basis = _gram_schmidt_basis(3)
    model = RegressionResults(self.results, basis=basis,
                              feature_names=feature_names)
    res_predict = model.predict(self.data[['X']], project=True)

    A = np.array  # aliasing np.array for the sake of pep8
    exp_predict = pd.DataFrame(
        {
            's1': ilr_inv(A([1.986842, 1.236842])),
            's2': ilr_inv(A([3.065789, 3.815789])),
            's3': ilr_inv(A([2.526316, 2.526316])),
            's4': ilr_inv(A([3.605263, 5.105263])),
            's5': ilr_inv(A([3.065789, 3.815789])),
            's6': ilr_inv(A([4.144737, 6.394737])),
            's7': ilr_inv(A([3.605263, 5.105263]))
        }, index=feature_names).T

    pdt.assert_frame_equal(res_predict, exp_predict)