def test_ols_immutable(self):
    # Check that building a model with ``ols`` leaves its inputs untouched,
    # including when the metadata carries extra rows (indexed from 100
    # upward) in addition to the rows of ``self.X``.
    #
    # The expected values are real copies taken *before* the call; comparing
    # an object against an alias of itself would pass whatever ``ols`` did.
    exp_table = self.Y.copy()
    x = pd.DataFrame(self.X.values, columns=self.X.columns,
                     index=range(100, 100 + len(self.X.index)))
    metadata = pd.concat((self.X, x))
    exp_metadata = metadata.copy()

    # Hand the extended metadata to ``ols``; if it is never passed in, the
    # metadata assertion below cannot fail.
    ols('x1 + x2', self.Y, metadata)

    self.assertEqual(str(self.Y), str(exp_table))
    self.assertEqual(str(metadata), str(exp_metadata))
def test_ols_immutable(self):
    # Check that building a model with ``ols`` leaves its inputs untouched,
    # including when the metadata carries extra rows (indexed from 100
    # upward) in addition to the rows of ``self.X``.
    #
    # The expected values are real copies taken *before* the call; comparing
    # an object against an alias of itself would pass whatever ``ols`` did.
    exp_table = self.Y.copy()
    x = pd.DataFrame(self.X.values, columns=self.X.columns,
                     index=range(100, 100 + len(self.X.index)))
    metadata = pd.concat((self.X, x))
    exp_metadata = metadata.copy()

    # Hand the extended metadata to ``ols``; if it is never passed in, the
    # metadata assertion below cannot fail.
    ols('x1 + x2', self.Y, metadata)

    self.assertEqual(str(self.Y), str(exp_table))
    self.assertEqual(str(metadata), str(exp_metadata))
def test_ess(self):
    # ``model.ess`` must equal, per response column, the ``ess`` attribute
    # of the reference results ``self.r1_`` and ``self.r2_``.
    fitted = ols('x1 + x2', self.Y, self.X)
    fitted.fit()

    expected = pd.Series({
        'y1': self.r1_.ess,
        'y2': self.r2_.ess,
    })
    pdt.assert_series_equal(fitted.ess, expected)
def test_mse(self):
    # ``model.mse`` must equal, per response column, the ``mse_resid``
    # attribute of the reference results ``self.r1_`` and ``self.r2_``.
    fitted = ols('x1 + x2', self.Y, self.X)
    fitted.fit()

    expected = pd.Series({
        'y1': self.r1_.mse_resid,
        'y2': self.r2_.mse_resid,
    })
    pdt.assert_series_equal(fitted.mse, expected)
def test_tvalues(self):
    # ``model.tvalues`` must equal, column by column, the ``tvalues`` of
    # the reference results ``self.r1_`` and ``self.r2_``.
    fitted = ols('x1 + x2', self.Y, self.X)
    fitted.fit()

    expected = pd.DataFrame({
        'y1': self.r1_.tvalues,
        'y2': self.r2_.tvalues,
    })
    pdt.assert_frame_equal(fitted.tvalues, expected)
def test_visualization(self):
    # Run ``ols_summary`` on a fitted model and check that each expected
    # output file exists in ``self.results`` and that the index page
    # contains the expected headings.
    fitted = ols(formula="x1 + x2 + x3 + x4",
                 table=self.y, metadata=self.x)
    fitted.fit()
    ols_summary(self.results, fitted, tree=self.t2)

    # Tabular outputs, checked in the order they were originally asserted
    for csv_name in ('pvalues.csv', 'coefficients.csv',
                     'predicted.csv', 'residuals.csv'):
        csv_fp = os.path.join(self.results, csv_name)
        self.assertTrue(os.path.exists(csv_fp))

    index_fp = os.path.join(self.results, 'index.html')
    self.assertTrue(os.path.exists(index_fp))

    with open(index_fp, 'r') as fh:
        html = fh.read()

    expected_snippets = (
        '<h1>Simplicial Linear Regression Summary</h1>',
        '<th>Coefficients</th>\n',
        '<th>Predicted Balances</th>\n',
        '<th>Residuals</th>\n',
    )
    for snippet in expected_snippets:
        self.assertIn(snippet, html)
def test_ols_rename(self):
    # Fit against ``self.unannotated_tree`` and check that coefficients and
    # residuals are reported under the labels 'y0' and 'y1'.
    model = ols('real', self.table, self.metadata, self.unannotated_tree)
    model.fit()

    observed_coef = model.coefficients()
    expected_coef = pd.DataFrame(
        {'Intercept': [0, 1.00], 'real': [1.0, 0]},
        index=['y0', 'y1'])
    # NOTE(review): ``check_less_precise`` is deprecated in newer pandas in
    # favour of ``rtol``/``atol``; kept as-is to preserve behaviour.
    pdt.assert_frame_equal(observed_coef, expected_coef,
                           check_exact=False,
                           check_less_precise=True)

    # Double check to make sure the fit is perfect
    self.assertAlmostEqual(model.r2, 1)

    # Double check to make sure residuals are zero
    samples = ['s1', 's2', 's3', 's4', 's5']
    zero_resid = pd.DataFrame([[0., 0.] for _ in samples],
                              index=samples,
                              columns=['y0', 'y1'])
    pdt.assert_frame_equal(zero_resid, model.residuals())
def test_ols_immutable(self):
    # Fit a model with a tree, then check that the input table still prints
    # the same as an independently built copy and that the model's tree
    # prints the same as the expected tree.
    A = np.array  # aliasing for the sake of pep8

    def build_table():
        # Samples s1..s6; the second coordinate is 1, 2, 3, 4, 5, 5
        columns = {}
        for pos, second in enumerate((1., 2., 3., 4., 5., 5.), start=1):
            columns['s%d' % pos] = ilr_inv(A([1., second]))
        return pd.DataFrame(columns, index=['a', 'b', 'c']).T

    table = build_table()
    exp_table = build_table()

    tree = TreeNode.read(['((c,d),(b,a));'])
    exp_tree = TreeNode.read(['((b,a)y1,c)y0;\n'])

    metadata = pd.DataFrame(
        {'lame': [1, 1, 1, 1, 1],
         'real': [1, 2, 3, 4, 5]},
        index=['s1', 's2', 's3', 's4', 's5'])

    model = ols('real + lame', table, metadata, tree)
    model.fit()

    self.assertEqual(str(table), str(exp_table))
    self.assertEqual(str(exp_tree), str(model.tree))
def test_mse(self):
    # The fitted model's ``mse`` must match the stored reference value to
    # four decimal places.
    expected_mse = 0.79228890379010453

    model = ols(formula="x1 + x2 + x3 + x4",
                table=self.y,
                metadata=self.x,
                tree=self.t2)
    model.fit()

    self.assertAlmostEqual(model.mse, expected_mse, places=4)
def test_summary_head(self):
    # The text of ``model.summary(ndim=1)`` must match the stored
    # 'exp_ols_results2.txt' exactly.
    A = np.array  # aliasing for the sake of pep8
    self.maxDiff = None  # show the full diff if the summaries differ

    balances = (
        ('s1', [1., 3.]),
        ('s2', [2., 2.]),
        ('s3', [1., 3.]),
        ('s4', [3., 4.]),
        ('s5', [1., 5.]),
    )
    columns = {}
    for sample, coords in balances:
        columns[sample] = ilr_inv(A(coords))
    table = pd.DataFrame(columns, index=['a', 'b', 'c']).T

    tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
    metadata = pd.DataFrame(
        {'lame': [1, 2, 1, 4, 1],
         'real': [1, 2, 3, 4, 5]},
        index=['s1', 's2', 's3', 's4', 's5'])

    np.random.seed(0)
    model = ols('real', table, metadata, tree)
    model.fit()

    observed = str(model.summary(ndim=1))
    with open(get_data_path('exp_ols_results2.txt'), 'r') as fh:
        expected = fh.read()
    self.assertEqual(observed, expected)
def test_loo(self):
    # Compare the table returned by ``res.loo()`` (columns 'mse' and
    # 'pred_err', one row per sample in ``self.y``) against stored
    # reference values.
    res = ols(formula="x1 + x2 + x3 + x4",
              table=self.y, metadata=self.x, tree=self.t2)
    res.fit()
    exp_loo = pd.DataFrame(
        [[0.66953263510975791, 10.994700550912553],
         [0.69679777354984163, 2.3613911713947062],
         [0.84934173316473072, 0.4057812892157881],
         [0.6990546679957772, 2.2872776593899351],
         [0.72855466737125463, 1.7615637744849277],
         [0.55998953661859308, 3.617823652256889],
         [0.81787392852582308, 0.72395497360494043],
         [0.8653549732546999, 0.17706927499520822],
         [0.86983181933002329, 0.1216027316667969],
         [0.87779006612352628, 0.028600627330344405],
         [0.86591226075609384, 0.16724511075065476],
         [0.7787232221539, 1.2820054843437292],
         [0.88032413856094505, 3.4113910096200831e-06],
         [0.83195133809800792, 0.62276589277034022],
         [0.85352707356786695, 1.4038585971691198]],
        columns=['mse', 'pred_err'],
        index=self.y.index)

    # ``np.float`` was only an alias of the builtin ``float`` and no longer
    # exists in recent NumPy releases; the builtin gives the same dtype.
    res_loo = res.loo().astype(float)

    # Precision issues: a frame-level comparison proved too strict here, so
    # the values are compared with explicit tolerances instead.
    npt.assert_allclose(exp_loo, res_loo, atol=1e-3, rtol=1e-3)
def test_ols_missing_metadata(self):
    # Extend both inputs with extra samples (ids from 100 upward) whose
    # metadata is entirely NaN. The predictions must still equal the
    # reference predictions indexed by the original ``self.Y`` samples.
    y = pd.DataFrame(self.Y.values, columns=self.Y.columns,
                     index=range(100, 100 + len(self.Y.index)))
    table = pd.concat((self.Y, y))

    ids = np.arange(100, 100 + len(self.X.index))
    x = pd.DataFrame([[np.nan] * len(self.X.columns)] * len(ids),
                     columns=self.X.columns, index=ids)
    metadata = pd.concat((self.X, x))

    model = ols('x1 + x2', table, metadata)
    model.fit()

    # test prediction
    exp = pd.DataFrame({
        'y1': self.r1_.predict(),
        'y2': self.r2_.predict()
    }, index=self.Y.index)
    res = model.predict()
    pdt.assert_frame_equal(res, exp)
def test_percent_explained(self):
    # ``percent_explained()`` must return the stored per-balance values
    # for 'y0' and 'y1'.
    model = ols(formula="x1 + x2 + x3 + x4",
                table=self.y,
                metadata=self.x,
                tree=self.t2)
    model.fit()

    expected = pd.Series({'y0': 0.009901, 'y1': 0.990099})
    observed = model.percent_explained()
    pdt.assert_series_equal(observed, expected)
def test_lovo(self):
    # Compare the table returned by ``res.lovo()`` (columns 'mse' and
    # 'Rsquared', one row per model term) against stored reference values.
    res = ols(formula="x1 + x2 + x3 + x4",
              table=self.y, metadata=self.x, tree=self.t2)
    res.fit()
    exp_lovo = pd.DataFrame(
        [[0.799364, 0.978214],
         [0.799363, 0.097355],
         [0.799368, 0.0973498],
         [0.799364, 0.097354],
         [0.799361, 0.0973575]],
        columns=['mse', 'Rsquared'],
        index=['Intercept', 'x1', 'x2', 'x3', 'x4'])

    # ``np.float`` was only an alias of the builtin ``float`` and no longer
    # exists in recent NumPy releases; the builtin gives the same dtype.
    res_lovo = res.lovo().astype(float)

    # NOTE(review): ``check_less_precise`` is deprecated in newer pandas in
    # favour of ``rtol``/``atol``; left unchanged so the comparison
    # tolerance stays exactly as it was on the pandas version in use.
    pdt.assert_frame_equal(exp_lovo, res_lovo, check_less_precise=True)
def test_ols_test(self):
    # Check each statistic exposed by the fitted model against the
    # reference results ``self.r1_`` (column 'y1') and ``self.r2_`` ('y2').
    model = ols('x1 + x2', self.Y, self.X)
    model.fit()
    samples = self.Y.index

    # test pvalues
    exp_pvalues = pd.DataFrame({'y1': self.r1_.pvalues,
                                'y2': self.r2_.pvalues})
    pdt.assert_frame_equal(model.pvalues, exp_pvalues)

    # test coefficients
    exp_coefs = pd.DataFrame({'y1': self.r1_.params,
                              'y2': self.r2_.params})
    pdt.assert_frame_equal(model.coefficients(), exp_coefs)

    # test residuals
    exp_resid = pd.DataFrame({'y1': self.r1_.resid,
                              'y2': self.r2_.resid},
                             index=samples)
    pdt.assert_frame_equal(model.residuals(), exp_resid)

    # test prediction
    exp_pred = pd.DataFrame({'y1': self.r1_.predict(),
                             'y2': self.r2_.predict()},
                            index=samples)
    pdt.assert_frame_equal(model.predict(), exp_pred)

    # make a small prediction
    new_ids = ['f1', 'f2']
    fx = pd.DataFrame([[1, 1, 1],
                       [1, 1, 2]],
                      columns=['Intercept', 'x1', 'x2'],
                      index=new_ids)
    rp1 = self.r1_.predict([[1, 1, 1], [1, 1, 2]])
    rp2 = self.r2_.predict([[1, 1, 1], [1, 1, 2]])
    exp_small = pd.DataFrame({'y1': rp1, 'y2': rp2}, index=new_ids)
    pdt.assert_frame_equal(model.predict(X=fx), exp_small)

    # test r2
    self.assertAlmostEqual(model.r2, 0.21981627865598752)
def test_ols_test(self):
    # Check each statistic exposed by the fitted model against the
    # reference results ``self.r1_`` (column 'y1') and ``self.r2_`` ('y2').
    model = ols('x1 + x2', self.Y, self.X)
    model.fit()
    samples = self.Y.index

    # test pvalues
    exp_pvalues = pd.DataFrame({'y1': self.r1_.pvalues,
                                'y2': self.r2_.pvalues})
    pdt.assert_frame_equal(model.pvalues, exp_pvalues)

    # test coefficients
    exp_coefs = pd.DataFrame({'y1': self.r1_.params,
                              'y2': self.r2_.params})
    pdt.assert_frame_equal(model.coefficients(), exp_coefs)

    # test residuals
    exp_resid = pd.DataFrame({'y1': self.r1_.resid,
                              'y2': self.r2_.resid},
                             index=samples)
    pdt.assert_frame_equal(model.residuals(), exp_resid)

    # test prediction
    exp_pred = pd.DataFrame({'y1': self.r1_.predict(),
                             'y2': self.r2_.predict()},
                            index=samples)
    pdt.assert_frame_equal(model.predict(), exp_pred)

    # make a small prediction
    new_ids = ['f1', 'f2']
    fx = pd.DataFrame([[1, 1, 1],
                       [1, 1, 2]],
                      columns=['Intercept', 'x1', 'x2'],
                      index=new_ids)
    rp1 = self.r1_.predict([[1, 1, 1], [1, 1, 2]])
    rp2 = self.r2_.predict([[1, 1, 1], [1, 1, 2]])
    exp_small = pd.DataFrame({'y1': rp1, 'y2': rp2}, index=new_ids)
    pdt.assert_frame_equal(model.predict(X=fx), exp_small)

    # test r2
    self.assertAlmostEqual(model.r2, 0.21981627865598752)
def test_ols_zero_error(self):
    # A table made up entirely of zeros must make model construction or
    # fitting raise ``ValueError``.
    sample_ids = ['s1', 's2', 's3', 's4', 's5', 's6']
    zero_columns = {}
    for sample in sample_ids:
        zero_columns[sample] = [0, 0, 0]
    table = pd.DataFrame(zero_columns, index=['a', 'b', 'c']).T

    tree = TreeNode.read(['((c,d),(b,a)Y2)Y1;'])
    metadata = pd.DataFrame(
        {'lame': [1, 1, 1, 1, 1],
         'real': [1, 2, 3, 4, 5]},
        index=['s1', 's2', 's3', 's4', 's5'])

    with self.assertRaises(ValueError):
        model = ols('real + lame', table, metadata, tree)
        model.fit()
def test_ols_empty_metadata_error(self):
    # The table samples (k1..k6) and the metadata samples (s1..s5) share no
    # ids, so model construction or fitting must raise ``ValueError``.
    A = np.array  # aliasing for the sake of pep8

    columns = {}
    for pos, second in enumerate((1., 2., 3., 4., 5., 5.), start=1):
        columns['k%d' % pos] = ilr_inv(A([1., second]))
    table = pd.DataFrame(columns, index=['a', 'b', 'c']).T

    tree = TreeNode.read(['((c,d),(b,a)Y2)Y1;'])
    metadata = pd.DataFrame(
        {'lame': [1, 1, 1, 1, 1],
         'real': [1, 2, 3, 4, 5]},
        index=['s1', 's2', 's3', 's4', 's5'])

    with self.assertRaises(ValueError):
        model = ols('real + lame', table, metadata, tree)
        model.fit()
def test_ols_ilr_inv_test(self):
    # When a tree is supplied, coefficients, residuals and predictions must
    # equal the reference results passed through ``ilr_inv`` with the basis
    # derived from ``self.tree``.
    model = ols('x1 + x2', self.Y, self.X)
    model.fit()
    basis, _ = balance_basis(self.tree)

    def inverted(frame, index):
        # Apply ``ilr_inv`` and label the result like the model output
        return pd.DataFrame(ilr_inv(frame, basis),
                            index=index,
                            columns=['c', 'b', 'a'])

    samples = self.Y.index

    # test pvalues
    exp_pvalues = pd.DataFrame({'y1': self.r1_.pvalues,
                                'y2': self.r2_.pvalues})
    pdt.assert_frame_equal(model.pvalues, exp_pvalues)

    # test coefficients
    ref_coefs = pd.DataFrame({'y1': self.r1_.params,
                              'y2': self.r2_.params})
    exp_coefs = inverted(ref_coefs, self.X.columns)
    pdt.assert_frame_equal(model.coefficients(tree=self.tree), exp_coefs)

    # test residuals
    ref_resid = pd.DataFrame({'y1': self.r1_.resid,
                              'y2': self.r2_.resid},
                             index=samples)
    exp_resid = inverted(ref_resid, samples)
    pdt.assert_frame_equal(model.residuals(tree=self.tree), exp_resid)

    # test prediction
    ref_pred = pd.DataFrame({'y1': self.r1_.predict(),
                             'y2': self.r2_.predict()},
                            index=samples)
    exp_pred = inverted(ref_pred, samples)
    pdt.assert_frame_equal(model.predict(tree=self.tree), exp_pred)
def test_ols_ilr_inv_test(self):
    # When a tree is supplied, coefficients, residuals and predictions must
    # equal the reference results passed through ``ilr_inv`` with the basis
    # derived from ``self.tree``.
    model = ols('x1 + x2', self.Y, self.X)
    model.fit()
    basis, _ = balance_basis(self.tree)

    def inverted(frame, index):
        # Apply ``ilr_inv`` and label the result like the model output
        return pd.DataFrame(ilr_inv(frame, basis),
                            index=index,
                            columns=['c', 'b', 'a'])

    samples = self.Y.index

    # test pvalues
    exp_pvalues = pd.DataFrame({'y1': self.r1_.pvalues,
                                'y2': self.r2_.pvalues})
    pdt.assert_frame_equal(model.pvalues, exp_pvalues)

    # test coefficients
    ref_coefs = pd.DataFrame({'y1': self.r1_.params,
                              'y2': self.r2_.params})
    exp_coefs = inverted(ref_coefs, self.X.columns)
    pdt.assert_frame_equal(model.coefficients(tree=self.tree), exp_coefs)

    # test residuals
    ref_resid = pd.DataFrame({'y1': self.r1_.resid,
                              'y2': self.r2_.resid},
                             index=samples)
    exp_resid = inverted(ref_resid, samples)
    pdt.assert_frame_equal(model.residuals(tree=self.tree), exp_resid)

    # test prediction
    ref_pred = pd.DataFrame({'y1': self.r1_.predict(),
                             'y2': self.r2_.predict()},
                            index=samples)
    exp_pred = inverted(ref_pred, samples)
    pdt.assert_frame_equal(model.predict(tree=self.tree), exp_pred)
def test_ols_missing_metadata(self):
    # Extend both inputs with extra samples (ids from 100 upward) whose
    # metadata is entirely NaN. The predictions must still equal the
    # reference predictions indexed by the original ``self.Y`` samples.
    y = pd.DataFrame(self.Y.values, columns=self.Y.columns,
                     index=range(100, 100 + len(self.Y.index)))
    table = pd.concat((self.Y, y))

    ids = np.arange(100, 100 + len(self.X.index))
    x = pd.DataFrame([[np.nan] * len(self.X.columns)] * len(ids),
                     columns=self.X.columns, index=ids)
    metadata = pd.concat((self.X, x))

    model = ols('x1 + x2', table, metadata)
    model.fit()

    # test prediction
    exp = pd.DataFrame({'y1': self.r1_.predict(),
                        'y2': self.r2_.predict()},
                       index=self.Y.index)
    res = model.predict()
    pdt.assert_frame_equal(res, exp)
def test_write(self):
    # Pickle a fitted model and confirm that the file is actually written.
    import os  # local import so the block does not rely on file imports

    fname = 'ols.pickle'

    def _remove_pickle():
        # Guarded: a no-op when something else already removed the file
        if os.path.exists(fname):
            os.remove(fname)

    # Register the cleanup before writing so the artifact does not linger
    # in the working directory, even if the assertion below fails.
    self.addCleanup(_remove_pickle)

    res = ols(formula="x1 + x2 + x3 + x4",
              table=self.y, metadata=self.x, tree=self.t2)
    res.fit()
    res.write_pickle(fname)

    self.assertTrue(os.path.exists(fname))
def test_visualization(self):
    # Run ``ols_summary`` on a fitted model, read back the CSV files it
    # writes into ``self.results`` and compare them (1e-2 tolerances) with
    # stored reference values; then check the headings of the index page.
    model = ols(formula="x1 + x2 + x3 + x4",
                table=self.y, metadata=self.x, tree=self.t2)
    model.fit()
    ols_summary(self.results, model)

    def load(csv_name):
        # Read one summary table, using its first column as the index
        return pd.read_csv(os.path.join(self.results, csv_name),
                           index_col=0)

    pvals = load('pvalues.csv')
    coefs = load('coefficients.csv')
    pred = load('predicted.csv')
    resid = load('residuals.csv')

    # Expected columns are listed in row order 0..14
    exp_pred = pd.DataFrame({
        'y0': [-0.53375121547306381,
               -0.56479853016207482,
               -0.56287346890240741,
               -0.54189204731941831,
               -0.51324876614124992,
               -0.48580516711594918,
               -0.46588315729838481,
               -0.45726500901030648,
               -0.46118573050287187,
               -0.47632066813918106,
               -0.49878455279984207,
               -0.52212577764307233,
               -0.53732163007547018,
               -0.53276780094653364,
               -0.49427170901103434],
        'y1': [-5.3374587490426801,
               -5.6479395254526388,
               -5.628692727739514,
               -5.4188785121568728,
               -5.1324342565916066,
               -4.8580135254968413,
               -4.6587877057054454,
               -4.5725985939212412,
               -4.6118058093989722,
               -4.7631647231182699,
               -4.9877959410043564,
               -5.2212158195661642,
               -5.3731686226401827,
               -5.3276265175104554,
               -4.942667506421965],
    })
    npt.assert_allclose(exp_pred.values, pred.values,
                        rtol=1e-2, atol=1e-2)

    exp_coefs = pd.DataFrame({
        'Intercept': {'y0': 6880999561.7020159,
                      'y1': 68809995617.020004},
        'x1': {'y0': 676465286.62179089,
               'y1': 6764652866.2178936},
        'x2': {'y0': 610204064.32702351,
               'y1': 6102040643.2702208},
        'x3': {'y0': -7497970910.8040514,
               'y1': -74979709108.040298},
        'x4': {'y0': 26313750.43187603,
               'y1': 263137504.31875956},
    })
    npt.assert_allclose(exp_coefs.values, coefs.values,
                        rtol=1e-2, atol=1e-2)

    exp_resid = pd.DataFrame({
        'y0': [-0.05693401912370244,
               0.10695167979147802,
               0.043549965263444679,
               -0.10733300196780859,
               -0.10239623711106705,
               0.15082282297327071,
               -0.065724199795309968,
               0.031846373671398198,
               0.026929958766511719,
               -0.013407601943539682,
               0.031553548285065736,
               -0.080353914812739569,
               -0.00012764772886153519,
               0.054894158986049046,
               -0.02024886341379073],
        'y1': [-0.56939359692498392,
               1.0694710217466721,
               0.4354576913498871,
               -1.0733719807153905,
               -1.0240157759315673,
               1.5081900840700544,
               -0.65728586523150234,
               0.318412240532159,
               0.26924809203537148,
               -0.13411797770893941,
               0.31548589585659403,
               -0.80358110499195856,
               -0.0013241554031324654,
               0.54889009790560728,
               -0.20253821782628822],
    })
    npt.assert_allclose(exp_resid.values, resid.values,
                        rtol=1e-2, atol=1e-2)

    exp_pvals = pd.DataFrame({
        'Intercept': {'y0': 0.3193097383026624,
                      'y1': 0.31931029350376261},
        'x1': {'y0': 0.31931130074025166,
               'y1': 0.31931185594151867},
        'x2': {'y0': 0.31929793802591028,
               'y1': 0.3192984932257481},
        'x3': {'y0': 0.31930876472902192,
               'y1': 0.31930931993001832},
        'x4': {'y0': 0.31931786743864193,
               'y1': 0.31931842264061172},
    })
    npt.assert_allclose(exp_pvals.values, pvals.values,
                        rtol=1e-2, atol=1e-2)

    index_fp = os.path.join(self.results, 'index.html')
    self.assertTrue(os.path.exists(index_fp))

    with open(index_fp, 'r') as fh:
        html = fh.read()

    expected_snippets = (
        '<h1>Simplicial Linear Regression Summary</h1>',
        '<th>Relative importance</th>',
        '<th>Cross Validation</th>',
        '<th>Coefficients</th>\n',
        '<th>Raw Balances</th>\n',
        '<th>Predicted Proportions</th>\n',
        '<th>Residuals</th>\n',
    )
    for snippet in expected_snippets:
        self.assertIn(snippet, html)
def test_lovo(self):
    # ``model.lovo()`` must reproduce the table stored in 'lovo2.csv'.
    fitted = ols('x1 + x2', self.Y, self.X)
    fitted.fit()

    expected = pd.read_csv(get_data_path('lovo2.csv'), index_col=0)
    observed = fitted.lovo()
    pdt.assert_frame_equal(observed, expected)