def test_parser(config):
    """Check SystemFormulaParser data access, eval_env and re-parsing.

    The frames returned through ``parser.data`` must match the ones exposed
    by the ``exog``/``dependent``/``endog``/``instruments`` properties, and a
    second parser built from ``parser.formula`` must reproduce every
    non-``None`` frame.
    """
    fmla, model, interface = config
    parser = SystemFormulaParser(fmla, joined, eval_env=5)
    orig_data = parser.data
    assert isinstance(orig_data, OrderedDict)

    # eval_env must round-trip through its setter.
    assert parser.eval_env == 5
    parser.eval_env = 4
    assert parser.eval_env == 4

    components = (
        ('exog', parser.exog),
        ('dependent', parser.dependent),
        ('endog', parser.endog),
        ('instruments', parser.instruments),
    )
    for label, equation in orig_data.items():
        for name, frames in components:
            assert_frame_equal(frames[label], equation[name])

    for label in parser.equation_labels:
        assert label in orig_data

    # Re-parse from the parser's own formula and compare equation by equation.
    reparsed = SystemFormulaParser(parser.formula, joined, eval_env=5).data
    for label, first in orig_data.items():
        second = reparsed[label]
        for name in first:
            frame = first[name]
            if frame is not None:
                assert_frame_equal(frame, second[name])
def test_parser(data, formula, effects):
    """Check PanelFormulaParser data, eval_env and effect detection.

    Parameters
    ----------
    data
        Fixture exposing ``x`` and ``y``; the test body only runs when
        ``data.y`` is a DataFrame.
    formula : str
        Base formula; effect terms are appended to it inside the test.
    effects : bool
        Whether ``EntityEffects`` and ``TimeEffects`` are appended.
    """
    if not isinstance(data.y, pd.DataFrame):
        return
    if effects:
        formula += ' + EntityEffects + TimeEffects'
    # Work on a copy so that adding the 'y' column does not mutate the
    # fixture's own ``x`` frame.
    joined = data.x.copy()
    joined['y'] = data.y
    parser = PanelFormulaParser(formula, joined)
    dep, exog = parser.data
    assert_frame_equal(parser.dependent, dep)
    assert_frame_equal(parser.exog, exog)

    # eval_env must round-trip through its setter.
    parser.eval_env = 3
    assert parser.eval_env == 3
    parser.eval_env = 2
    assert parser.eval_env == 2

    assert parser.entity_effect == ('EntityEffects' in formula)
    assert parser.time_effect == ('TimeEffects' in formula)

    # FixedEffects on top of the other effect terms must be rejected;
    # on its own it must switch on the entity effect.
    formula += ' + FixedEffects '
    if effects:
        with pytest.raises(ValueError):
            PanelFormulaParser(formula, joined)
    else:
        parser = PanelFormulaParser(formula, joined)
        assert parser.entity_effect
def assert_results_equal(res1, res2, test_fit=True, test_df=True):
    """Assert that two result objects agree on their shared leading parameters.

    Parameter-indexed quantities are compared on the first ``n`` entries,
    where ``n`` is the smaller of the two parameter counts.

    Parameters
    ----------
    res1, res2
        Result objects exposing ``params``, ``pvalues``, ``tstats``, ``cov``,
        ``conf_int()``, ``s2``, ``resids`` and ``wresids``.
    test_fit : bool
        Also compare ``rsquared``, ``total_ss``, ``resid_ss`` and ``model_ss``.
    test_df : bool
        Also compare ``df_model`` and ``df_resid``.
    """
    n = min(res1.params.shape[0], res2.params.shape[0])

    for name in ('params', 'pvalues', 'tstats'):
        assert_series_equal(getattr(res1, name).iloc[:n],
                            getattr(res2, name).iloc[:n])
    assert_frame_equal(res1.cov.iloc[:n, :n], res2.cov.iloc[:n, :n])
    assert_frame_equal(res1.conf_int().iloc[:n], res2.conf_int().iloc[:n])
    assert_allclose(res1.s2, res2.s2)

    # Residual differences are scaled by the larger standard deviation and
    # shifted to 1 so the comparison is effectively a relative one.
    for name in ('resids', 'wresids'):
        first = getattr(res1, name)
        second = getattr(res2, name)
        scale = max(first.std(), second.std())
        delta = 1 + (first.values - second.values) / scale
        assert_allclose(delta, np.ones_like(delta))

    if test_df:
        for name in ('df_model', 'df_resid'):
            assert_allclose(getattr(res1, name), getattr(res2, name))

    if test_fit:
        for name in ('rsquared', 'total_ss', 'resid_ss', 'model_ss'):
            assert_allclose(getattr(res1, name), getattr(res2, name))
def test_demean_missing_alt_types(data):
    """Compare PanelData.demean with a groupby-based demeaning of its frame.

    Rows flagged by ``isnull`` are dropped first; entity demeaning is checked
    against index level 0 and time demeaning against index level 1.
    """
    panel = PanelData(data.x)
    panel.drop(panel.isnull)

    for group, level in (('entity', 0), ('time', 1)):
        demeaned = panel.demean(group)
        grouped = panel.dataframe.groupby(level=level)
        expected = grouped.transform(lambda s: s - s.mean())
        assert_frame_equal(demeaned.dataframe, expected)
def test_existing_panel_data():
    """A PanelData built from another PanelData must expose an equal frame."""
    # NOTE(review): relies on pd.Panel being available in the installed
    # pandas -- confirm against the supported pandas versions.
    n, t, k = 11, 7, 3
    values = np.random.random((k, t, n))
    dates = pd.date_range('12-31-1999', periods=t)
    variables = ['var.{0}'.format(i) for i in range(1, k + 1)]
    entities = ['entities.{0}'.format(i) for i in range(1, n + 1)]
    panel = pd.Panel(values, items=variables, major_axis=dates,
                     minor_axis=entities)

    original = PanelData(panel)
    rewrapped = PanelData(original)
    assert_frame_equal(original.dataframe, rewrapped.dataframe)
def test_predict_formula(data, model_and_func, formula):
    """Predicting from explicit exog/endog must match predicting from data.

    Both predictions are also required to agree with the fitted values of the
    formula-built model.
    """
    model, _ = model_and_func
    res = model.from_formula(formula, data).fit()

    exog_columns = ['Intercept', 'x3', 'x4', 'x5']
    endog_columns = ['x1', 'x2']
    from_arrays = res.predict(data[exog_columns], data[endog_columns])
    from_data = res.predict(data=data)

    assert_frame_equal(from_arrays, from_data)
    assert_allclose(res.fitted_values, from_arrays)
def test_numpy_1d(self):
    """Wrap a 1-D ndarray in IVData and check it is exposed as one column."""
    x = np.empty(10)
    handler = IVData(x)
    as_column = x[:, None]

    assert handler.ndim == 2
    assert handler.cols == ['x']
    assert handler.rows == list(np.arange(10))
    assert_equal(handler.ndarray, as_column)

    expected = pd.DataFrame(as_column, columns=handler.cols,
                            index=handler.rows)
    assert_frame_equal(handler.pandas, expected)
    assert handler.shape == (10, 1)
def test_string_cat_equiv(self):
    """A string column and its categorical version must give the same IVData frame."""
    letters = pd.Series(['a', 'b', 'a', 'b', 'c', 'd', 'a', 'b'])
    numbers = pd.Series(np.arange(8.0))
    fruits = pd.Series(['apple', 'banana', 'apple', 'banana',
                        'cherry', 'date', 'apple', 'banana'])
    frame = pd.DataFrame({'string': letters,
                          'number': numbers,
                          'other_string': fruits})
    from_strings = IVData(frame)

    # Same data, but with the 'string' column stored as a categorical.
    cat_frame = frame.copy()
    cat_frame['string'] = cat_frame['string'].astype('category')
    from_categorical = IVData(cat_frame)

    assert_frame_equal(from_strings.pandas, from_categorical.pandas)
def test_numpy_2d(self):
    """Wrap a 2-D ndarray in IVData and check names, labels, shape and views."""
    x = np.empty((10, 2))
    handler = IVData(x)

    assert handler.ndim == x.ndim
    assert handler.cols == ['x.0', 'x.1']
    assert handler.rows == list(np.arange(10))
    assert_equal(handler.ndarray, x)

    expected = pd.DataFrame(x, columns=handler.cols, index=handler.rows)
    assert_frame_equal(handler.pandas, expected)
    assert handler.shape == (10, 2)
    # labels maps axis number -> labels along that axis.
    assert handler.labels == {0: handler.rows, 1: handler.cols}
def test_categorical_no_conversion(self):
    """IVData with convert_dummies=False must keep the Series as one column."""
    dates = pd.date_range('2017-01-01', periods=10)
    cat = pd.Categorical(['a', 'b', 'a', 'b', 'a', 'a', 'b', 'c', 'c', 'a'])
    # NOTE(review): the data is passed as a dict together with an explicit
    # index, so pandas will align the dict key against ``dates``; the Series
    # may therefore not actually hold the categorical values -- confirm this
    # is the intended input.
    series = pd.Series({'cat': cat}, index=dates, name='cat')
    handler = IVData(series, convert_dummies=False)

    assert handler.ndim == 2
    assert handler.shape == (10, 1)
    assert handler.cols == ['cat']
    assert handler.rows == list(dates)

    expected = pd.DataFrame(series)
    assert_frame_equal(handler.pandas, expected)
def test_existing_datahandler(self):
    """IVData built from an IVData must be a distinct but equivalent object."""
    values = np.empty((10, 2))
    dates = pd.date_range('2017-01-01', periods=10)
    frame = pd.DataFrame(values, columns=['a', 'b'], index=dates)

    first = IVData(frame)
    second = IVData(first)

    assert first is not second
    assert first.cols == second.cols
    assert first.rows == second.rows
    assert_equal(first.ndarray, second.ndarray)
    assert first.ndim == second.ndim
    assert_frame_equal(first.pandas, second.pandas)
def test_pandas_df_numeric(self):
    """Wrap a numeric DataFrame in IVData and check labels, values and shape."""
    x = np.empty((10, 2))
    dates = pd.date_range('2017-01-01', periods=10)
    frame = pd.DataFrame(x, columns=['a', 'b'], index=dates)
    handler = IVData(frame)

    assert handler.ndim == 2
    assert handler.cols == list(frame.columns)
    assert handler.rows == list(frame.index)
    assert_equal(handler.ndarray, x)

    expected = pd.DataFrame(x, columns=handler.cols, index=handler.rows)
    assert_frame_equal(handler.pandas, expected)
    assert handler.shape == (10, 2)
def test_pandas_series_numeric(self):
    """Wrap a named numeric Series in IVData and check the one-column result."""
    x = np.empty(10)
    dates = pd.date_range('2017-01-01', periods=10)
    series = pd.Series(x, name='charlie', index=dates)
    handler = IVData(series)
    as_column = x[:, None]

    assert handler.ndim == 2
    assert handler.cols == [series.name]
    assert handler.rows == list(series.index)
    assert_equal(handler.ndarray, as_column)

    expected = pd.DataFrame(as_column, columns=handler.cols,
                            index=handler.rows)
    assert_frame_equal(handler.pandas, expected)
    assert handler.shape == (10, 1)
def test_predict(config):
    """Predictions from data and from the model's own equations must agree.

    The test returns early when any formula contains ``[`` and the model is
    neither IVSystemGMM nor IV3SLS.
    """
    fmla, model, _ = config
    handles_brackets = model in (IVSystemGMM, IV3SLS)
    if any('[' in fmla[key] and not handles_brackets for key in fmla):
        return

    mod = model.from_formula(fmla, joined)
    res = mod.fit()

    as_dict = res.predict(data=joined)
    assert isinstance(as_dict, dict)

    from_data = res.predict(data=joined, dataframe=True)
    from_equations = res.predict(equations=mod._equations, dataframe=True)
    assert_frame_equal(from_data, from_equations)
def test_mean_missing(data):
    """Compare PanelData.mean with groupby means of the underlying frame.

    Rows flagged by ``isnull`` are dropped first. The expected frame is
    reordered to the panel's own entity/time labels and its columns name is
    cleared before comparison.
    """
    panel = PanelData(data.x)
    panel.drop(panel.isnull)

    cases = (('entity', 0, 'entities'), ('time', 1, 'time'))
    for group, level, labels_attr in cases:
        result = panel.mean(group)
        expected = panel.dataframe.groupby(level=level).mean()
        expected = expected.loc[getattr(panel, labels_attr)]
        expected.columns.name = None
        assert_frame_equal(result, expected)
def test_original_unmodified(data):
    """Fitting PanelOLS must leave the caller's y, x and weights unchanged.

    Copies taken before the fit are compared with the inputs afterwards, using
    a comparison suited to the input container type. For pandas inputs the
    check is repeated with the ``PanelData(...).dataframe`` frames.
    """
    pre_y = data.y.copy()
    pre_x = data.x.copy()
    pre_w = data.w.copy()
    PanelOLS(data.y, data.x, weights=data.w).fit(debiased=True)

    if isinstance(data.y, (pd.DataFrame, pd.Panel)):
        pairs = ((data.y, pre_y), (data.x, pre_x), (data.w, pre_w))
        for after, before in pairs:
            if isinstance(before, pd.DataFrame):
                compare = assert_frame_equal
            else:
                compare = assert_panel_equal
            compare(before, after)

        # Repeat the check on the frames produced by PanelData, with the
        # y and x index levels renamed.
        frame_y = PanelData(data.y).dataframe
        frame_x = PanelData(data.x).dataframe
        frame_y.index.names = ['firm', 'period']
        frame_x.index.names = ['firm', 'period']
        frame_w = PanelData(data.w).dataframe
        pre_y = frame_y.copy()
        pre_x = frame_x.copy()
        pre_w = frame_w.copy()
        PanelOLS(frame_y, frame_x, weights=frame_w).fit(debiased=True)
        assert_frame_equal(frame_w, pre_w)
        assert_frame_equal(frame_y, pre_y)
        assert_frame_equal(frame_x, pre_x)
    elif isinstance(data.y, xr.DataArray):
        xr.testing.assert_identical(data.y, pre_y)
        xr.testing.assert_identical(data.w, pre_w)
        xr.testing.assert_identical(data.x, pre_x)
    else:
        assert_allclose(data.y, pre_y)
        assert_allclose(data.x, pre_x)
        assert_allclose(data.w, pre_w)
def test_count(data):
    """Compare PanelData.count with groupby counts of the underlying frame.

    Rows flagged by ``isnull`` are dropped first. The expected frame is
    reordered to the panel's own entity/time labels, its columns name is
    cleared and it is cast to int64 before comparison.
    """
    panel = PanelData(data.x)
    panel.drop(panel.isnull)

    cases = (('entity', 0, 'entities'), ('time', 1, 'time'))
    for group, level, labels_attr in cases:
        counts = panel.count(group)
        expected = panel.dataframe.groupby(level=level).count()
        expected = expected.loc[getattr(panel, labels_attr)]
        expected.columns.name = None
        expected = expected.astype(np.int64)
        assert_frame_equal(counts, expected)
def test_fitted(data):
    """Fitted values must equal x @ params per equation and system-wide.

    Each equation's fitted values are checked against the matching column of
    the system-level ``fitted_values`` and against a direct product of the
    model's regressors with the estimated parameters.
    """
    mod = IVSystemGMM(data.eqns)
    res = mod.fit()

    columns = []
    for i, label in enumerate(res.equations):
        equation = res.equations[label]
        system_fitted = res.fitted_values[label].copy()
        system_fitted.name = 'fitted_values'
        assert_series_equal(equation.fitted_values, system_fitted)

        direct = mod._x[i] @ equation.params.values
        columns.append(direct[:, None])
        assert_allclose(equation.fitted_values, direct, atol=1e-8)

    stacked = np.concatenate(columns, axis=1)
    # ``i`` still holds the index of the last equation visited by the loop.
    expected = pd.DataFrame(stacked,
                            index=mod._dependent[i].pandas.index,
                            columns=list(res.equations))
    assert_frame_equal(expected, res.fitted_values)
def test_demean_missing_alt_types(data):
    """Compare PanelData.demean with a groupby-based demeaning of its frame.

    Index and column type checks are only enforced when ``data.x`` is a
    DataFrame or an ndarray.
    """
    strict = isinstance(data.x, (pd.DataFrame, np.ndarray))
    type_checks = {'check_index_type': strict, 'check_column_type': strict}

    panel = PanelData(data.x)
    panel.drop(panel.isnull)

    for group, level in (('entity', 0), ('time', 1)):
        demeaned = panel.demean(group)
        grouped = panel.dataframe.groupby(level=level)
        expected = grouped.transform(lambda s: s - s.mean())
        assert_frame_equal(demeaned.dataframe, expected, **type_checks)
def test_numpy_3d():
    """Wrap a 3-D ndarray in PanelData and check dimensions and both views.

    The expected frame is built from a pd.Panel carrying the default
    ``x.<i>`` variable names, integer time labels and ``entity.<i>`` labels.
    """
    # NOTE(review): relies on pd.Panel being available in the installed
    # pandas -- confirm against the supported pandas versions.
    n, t, k = 11, 7, 3
    x = np.random.random((k, t, n))
    dh = PanelData(x)

    assert_equal(x, dh.values3d)
    assert dh.nentity == n
    assert dh.nobs == t
    assert dh.nvar == k
    assert_equal(np.reshape(x.T, (n * t, k)), dh.values2d)

    entity_labels = ['entity.{0}'.format(i) for i in range(n)]
    time_labels = list(range(t))
    var_labels = ['x.{0}'.format(i) for i in range(k)]
    expected = pd.Panel(np.reshape(x, (k, t, n)), items=var_labels,
                        major_axis=time_labels, minor_axis=entity_labels)
    expected_frame = expected.swapaxes(1, 2).to_frame()
    expected_frame.index.levels[0].name = 'entity'
    expected_frame.index.levels[1].name = 'time'
    assert_frame_equal(dh.dataframe, expected_frame)
def test_xarray_1d(self):
    """Wrap 1-D DataArrays in IVData, without and with a time coordinate."""
    values = np.random.randn(10)
    as_column = values[:, None]

    # No coordinates: rows are expected to be 0..9.
    plain = IVData(xr.DataArray(values), 'some_variable')
    assert_equal(plain.ndarray, as_column)
    assert plain.rows == list(np.arange(10))
    assert plain.cols == ['some_variable.0']
    expected = pd.DataFrame(values, columns=plain.cols, index=plain.rows)
    assert_frame_equal(expected, plain.pandas)

    # Dated coordinate: rows are expected to follow the date index.
    dates = pd.date_range('2017-01-01', periods=10)
    dated = IVData(xr.DataArray(values, [('time', dates)]), 'some_variable')
    assert_equal(dated.ndarray, as_column)
    assert_series_equal(pd.Series(dated.rows), pd.Series(list(dates)))
    assert dated.cols == ['some_variable.0']
    expected = pd.DataFrame(as_column, columns=dated.cols, index=dated.rows)
    assert_frame_equal(expected, dated.pandas)
def test_pandas_panel():
    """Wrap a pd.Panel in PanelData and check dimensions and both views."""
    # NOTE(review): relies on pd.Panel being available in the installed
    # pandas -- confirm against the supported pandas versions.
    n, t, k = 11, 7, 3
    values = np.random.random((k, t, n))
    dates = pd.date_range('12-31-1999', periods=t)
    variables = ['var.{0}'.format(i) for i in range(1, k + 1)]
    entities = ['entities.{0}'.format(i) for i in range(1, n + 1)]
    panel = pd.Panel(values, items=variables, major_axis=dates,
                     minor_axis=entities)
    dh = PanelData(panel)

    assert dh.nentity == n
    assert dh.nobs == t
    assert dh.nvar == k
    assert_equal(dh.values3d, panel.values)

    expected_2d = np.reshape(panel.swapaxes(0, 2).values, (n * t, k))
    assert_equal(dh.values2d, expected_2d)

    expected_frame = panel.swapaxes(1, 2).to_frame()
    expected_frame.index.levels[0].name = 'entity'
    expected_frame.index.levels[1].name = 'time'
    assert_frame_equal(dh.dataframe, expected_frame)
def test_xarray_2d(self):
    """Wrap 2-D DataArrays in IVData, without and with named coordinates."""
    values = np.random.randn(10, 2)

    # No coordinates: default row numbers and 'x.<i>' column names.
    plain = IVData(xr.DataArray(values))
    assert_equal(plain.ndarray, values)
    assert plain.rows == list(np.arange(10))
    assert plain.cols == ['x.0', 'x.1']
    expected = pd.DataFrame(values, columns=plain.cols, index=plain.rows)
    assert_frame_equal(expected, plain.pandas)

    # Named coordinates: rows follow the dates, columns the variable names.
    dates = pd.date_range('2017-01-01', periods=10)
    coords = [('time', dates), ('variables', ['apple', 'banana'])]
    labelled = IVData(xr.DataArray(values, coords))
    assert_equal(labelled.ndarray, values)
    assert_series_equal(pd.Series(labelled.rows), pd.Series(list(dates)))
    assert labelled.cols == ['apple', 'banana']
    expected = pd.DataFrame(values, columns=labelled.cols,
                            index=labelled.rows)
    assert_frame_equal(expected, labelled.pandas)
def test_predict_partial(config):
    """Predicting with a subset or trimmed set of equations must stay consistent.

    Three variations are compared with the full prediction from ``joined``:
    all equations but the first, the same with the first equation given
    ``None`` regressors, and all equations with empty-column entries removed.
    The test returns early when any formula contains ``[`` and the model is
    neither IVSystemGMM nor IV3SLS.
    """
    fmla, model, _ = config
    handles_brackets = model in (IVSystemGMM, IV3SLS)
    if any('[' in fmla[key] and not handles_brackets for key in fmla):
        return

    mod = model.from_formula(fmla, joined)
    res = mod.fit()
    labels = list(mod._equations.keys())
    first, remaining = labels[0], labels[1:]

    # 1) Every equation except the first.
    subset = AttrDict()
    for label in remaining:
        subset[label] = mod._equations[label]
    pred = res.predict(equations=subset, dataframe=True)
    for label in mod._equations:
        assert (label in pred) == (label in subset)
    full = res.predict(data=joined, dataframe=True)
    assert_frame_equal(full[pred.columns], pred)

    # 2) Same subset, plus the first equation with no regressors supplied.
    subset = AttrDict()
    for label in remaining:
        subset[label] = mod._equations[label]
    subset[first] = {'exog': None, 'endog': None}
    pred3 = res.predict(equations=subset, dataframe=True)
    assert_frame_equal(full[pred3.columns], pred3)

    # 3) All equations, keeping only entries that have at least one column.
    trimmed = AttrDict()
    for label in mod._equations:
        trimmed[label] = {
            name: value
            for name, value in mod._equations[label].items()
            if value.shape[1] > 0
        }
    pred4 = res.predict(equations=trimmed, dataframe=True)
    assert_frame_equal(full, pred4)
def test_predict(missing_data):
    """Exercise SUR result prediction across its output options.

    Covers fitted values and idiosyncratic residuals, returned per equation
    or as DataFrames, and checks that ``missing=True`` yields as many rows as
    the first equation's dependent variable.
    """
    res = SUR(missing_data).fit()

    # Per-equation output: fitted values only (the defaults).
    pred = res.predict()
    for label in pred:
        assert_series_equal(pred[label].iloc[:, 0],
                            res.equations[label].fitted_values,
                            check_names=False)

    # Per-equation output: residuals only.
    pred = res.predict(fitted=False, idiosyncratic=True)
    for label in pred:
        assert_series_equal(pred[label].iloc[:, 0],
                            res.equations[label].resids,
                            check_names=False)

    # Per-equation output: both components requested.
    pred = res.predict(fitted=True, idiosyncratic=True)
    assert isinstance(pred, dict)
    for label in res.equations:
        assert label in pred

    # DataFrame output: one component at a time.
    pred = res.predict(dataframe=True)
    assert isinstance(pred, DataFrame)
    assert_frame_equal(pred, res.fitted_values)

    pred = res.predict(fitted=False, idiosyncratic=True, dataframe=True)
    assert isinstance(pred, DataFrame)
    assert_frame_equal(pred, res.resids)

    # DataFrame output with both components comes back as a dict of frames.
    pred = res.predict(fitted=True, idiosyncratic=True, dataframe=True)
    assert isinstance(pred, dict)
    assert 'fitted_values' in pred
    assert_frame_equal(pred['fitted_values'], res.fitted_values)
    assert 'idiosyncratic' in pred
    assert_frame_equal(pred['idiosyncratic'], res.resids)

    # missing=True must return one row per original observation.
    first_label = list(missing_data.keys())[0]
    nobs = missing_data[first_label]['dependent'].shape[0]
    pred = res.predict(fitted=True, idiosyncratic=False, dataframe=True,
                       missing=True)
    assert pred.shape[0] == nobs

    pred = res.predict(fitted=True, idiosyncratic=True, missing=True)
    for label in pred:
        assert pred[label].shape[0] == nobs
def test_predict_formula_function(data, model_and_func):
    """Predictions must agree when the formula applies functions to regressors.

    The exogenous regressors are rebuilt by hand (``sigmoid`` and ``np.exp``
    applied to the relevant columns) and the resulting predictions are
    compared with data-based predictions, for both the model class and its
    function-style counterpart.

    Parameters
    ----------
    data
        Frame holding the columns referenced by the formula.
    model_and_func
        Pair of (model class, formula-based constructor function).
    """
    model, func = model_and_func
    fmla = 'y ~ 1 + sigmoid(x3) + x4 + [x1 + x2 ~ z1 + z2 + z3] + np.exp(x5)'
    mod = model.from_formula(fmla, data)
    res = mod.fit()

    exog = [data[['Intercept']], sigmoid(data[['x3']]), data[['x4']],
            np.exp(data[['x5']])]
    # ``axis`` is passed by keyword: newer pandas no longer accepts it as a
    # positional argument to concat.
    exog = pd.concat(exog, axis=1)
    endog = data[['x1', 'x2']]

    pred = res.predict(exog, endog)
    pred2 = res.predict(data=data)
    assert_frame_equal(pred, pred2)
    assert_allclose(res.fitted_values, pred)

    # The function-style interface must give the same predictions.
    res2 = func(fmla, data).fit()
    pred3 = res2.predict(exog, endog)
    pred4 = res2.predict(data=data)
    assert_frame_equal(pred, pred3)
    assert_frame_equal(pred, pred4)
def assert_frame_similar(result, expected):
    """Assert two frames are equal once every cell is overwritten with 0.0.

    Copies of both frames are zeroed and then compared with
    ``assert_frame_equal``, so the cell values of the inputs do not take part
    in the comparison. The inputs themselves are not modified.
    """
    zeroed = []
    for frame in (result, expected):
        blank = frame.copy()
        blank.iloc[:, :] = 0.0
        zeroed.append(blank)
    assert_frame_equal(zeroed[0], zeroed[1])
def test_series_multiindex(mi_df):
    """PanelData from a one-column frame and from that column as a Series must match."""
    single_column_frame = mi_df.iloc[:, [0]]
    single_column_series = mi_df.iloc[:, 0]
    from_df = PanelData(single_column_frame)
    from_series = PanelData(single_column_series)
    assert_frame_equal(from_df.dataframe, from_series.dataframe)
def test_non_traded_risk_free(data, non_traded_model):
    """Formula-built and directly-built risk-free models must be equivalent.

    Checked twice: once with the full formula, and once with the factors-only
    formula plus explicitly supplied portfolios.
    """

    def check_equivalent(formula_mod, direct_mod, formula):
        # Fit both models, then compare inputs, estimates and formula state.
        formula_res = formula_mod.fit()
        direct_res = direct_mod.fit()
        assert_frame_equal(formula_mod.factors.pandas,
                           direct_mod.factors.pandas)
        assert_frame_equal(formula_mod.portfolios.pandas,
                           direct_mod.portfolios.pandas)
        assert_frame_equal(formula_res.params, direct_res.params)
        assert formula_mod.formula == formula
        assert direct_mod.formula is None

    check_equivalent(
        non_traded_model.from_formula(FORMULA, data.joined, risk_free=True),
        non_traded_model(data.portfolios, data.factors, risk_free=True),
        FORMULA,
    )
    check_equivalent(
        non_traded_model.from_formula(FORMULA_FACTORS, data.joined,
                                      portfolios=data.portfolios,
                                      risk_free=True),
        non_traded_model(data.portfolios, data.factors, risk_free=True),
        FORMULA_FACTORS,
    )
def test_series_multiindex(panel):
    """PanelData from a one-column frame and from that column as a Series must match.

    The multi-index frame is derived from ``panel`` with
    ``filter_observations=False``.
    """
    swapped = panel.swapaxes(1, 2)
    mi = swapped.to_frame(filter_observations=False)
    from_df = PanelData(mi.iloc[:, [0]])
    from_series = PanelData(mi.iloc[:, 0])
    assert_frame_equal(from_df.dataframe, from_series.dataframe)