def setup_class(cls):
    d = macrodata.load_pandas().data
    # growth rates
    d['gs_l_realinv'] = 400 * np.log(d['realinv']).diff()
    d['gs_l_realgdp'] = 400 * np.log(d['realgdp']).diff()
    d['lint'] = d['realint'].shift(1)
    d['tbilrate'] = d['tbilrate'].shift(1)

    d = d.dropna()
    cls.d = d
    endogg = d['gs_l_realinv']
    exogg = add_constant(d[['gs_l_realgdp', 'lint']])
    exogg2 = add_constant(d[['gs_l_realgdp', 'tbilrate']])
    exogg3 = add_constant(d[['gs_l_realgdp']])

    res_ols = OLS(endogg, exogg).fit()
    res_ols2 = OLS(endogg, exogg2).fit()
    res_ols3 = OLS(endogg, exogg3).fit()

    cls.res = res_ols
    cls.res2 = res_ols2
    cls.res3 = res_ols3
    cls.endog = cls.res.model.endog
    cls.exog = cls.res.model.exog
def setup_class(cls):
    mod1 = cls.model_cls(endog, exog, **cls.mod_kwargs)
    cls.res1 = mod1.fit(disp=False, **cls.fit_kwargs)
    cls.res1b = mod1.fit(disp=False, **cls.fit_kwargs)

    mod2 = OLS(endog, exog)
    cls.res2 = mod2.fit(disp=False, **cls.fit_kwargs)
def test_regression_with_tuples(self):
    i = pd.Series([1, 2, 3, 4] * 10, name="i")
    y = pd.Series([1, 2, 3, 4, 5] * 8, name="y")
    x = pd.Series([1, 2, 3, 4, 5, 6, 7, 8] * 5, name="x")

    df = pd.DataFrame(index=i.index)
    df = df.join(i)
    endo = df.join(y)
    exo = df.join(x)
    endo_groups = endo.groupby("i")
    exo_groups = exo.groupby("i")
    exo_Df = exo_groups.agg([np.sum, np.max])
    endo_Df = endo_groups.agg([np.sum, np.max])
    reg = OLS(exo_Df[[("x", "sum")]], endo_Df).fit()

    interesting_lines = []
    with warnings.catch_warnings():
        # Catch omni-normality warning, not interesting here
        warnings.simplefilter("ignore")
        for line in str(reg.summary()).splitlines():
            if "('" in line:
                interesting_lines.append(line[:38])

    desired = ["Dep. Variable: ('x', 'sum') ",
               "('y', 'sum') 1.4595 0.209 ",
               "('y', 'amax') 0.2432 0.035 "]

    assert sorted(desired) == sorted(interesting_lines)
def test_regularized_weights(self):
    np.random.seed(1432)
    exog1 = np.random.normal(size=(100, 3))
    endog1 = exog1[:, 0] + exog1[:, 1] + np.random.normal(size=100)
    exog2 = np.random.normal(size=(100, 3))
    endog2 = exog2[:, 0] + exog2[:, 1] + np.random.normal(size=100)

    exog_a = np.vstack((exog1, exog1, exog2))
    endog_a = np.concatenate((endog1, endog1, endog2))

    # Should be equivalent to exog_a, endog_a.
    exog_b = np.vstack((exog1, exog2))
    endog_b = np.concatenate((endog1, endog2))
    wgts = np.ones(200)
    wgts[0:100] = 2
    sigma = np.diag(1 / wgts)

    # TODO: parametrize?
    for L1_wt in [0, 0.5, 1]:
        for alpha in [0, 1]:
            mod1 = OLS(endog_a, exog_a)
            rslt1 = mod1.fit_regularized(L1_wt=L1_wt, alpha=alpha)

            mod2 = WLS(endog_b, exog_b, weights=wgts)
            rslt2 = mod2.fit_regularized(L1_wt=L1_wt, alpha=alpha)

            mod3 = GLS(endog_b, exog_b, sigma=sigma)
            rslt3 = mod3.fit_regularized(L1_wt=L1_wt, alpha=alpha)

            assert_almost_equal(rslt1.params, rslt2.params, decimal=3)
            assert_almost_equal(rslt1.params, rslt3.params, decimal=3)
def setup_class(cls):
    mod1 = cls.model_cls(endog, exog, **cls.mod_kwargs)
    cls.res1 = mod1.fit(disp=False, **cls.fit_kwargs)
    cls.res1b = mod1.fit(cov_type='nw-panel', cov_kwds=cls.cov_kwds)

    mod2 = OLS(endog, exog)
    cls.res2 = mod2.fit(disp=False, **cls.fit_kwargs)
def setup_class(cls):
    R = np.zeros(7)
    R[4:6] = [1, -1]

    data = datasets.longley.load(as_pandas=False)
    data.exog = add_constant(data.exog, prepend=False)
    res1 = OLS(data.endog, data.exog).fit()
    cls.Ttest1 = res1.t_test(R)
def setup_class(cls):
    data = datasets.longley.load(as_pandas=False)
    res1 = OLS(data.endog, data.exog).fit()
    cls.res1 = res1
    # cls.res2.wresid = res1.wresid  # workaround hack

    res_qr = OLS(data.endog, data.exog).fit(method="qr")
    cls.res_qr = res_qr
def test_const_indicator():
    np.random.seed(12345)
    X = np.random.randint(0, 3, size=30)
    X = categorical(X, drop=True)
    y = np.dot(X, [1., 2., 3.]) + np.random.normal(size=30)

    modc = OLS(y, add_constant(X[:, 1:], prepend=True)).fit()
    mod = OLS(y, X, hasconst=True).fit()
    assert_almost_equal(modc.rsquared, mod.rsquared, 12)
def setup_class(cls):
    mod1 = cls.model_cls(endog, exog, **cls.mod_kwargs)
    cls.res1 = mod1.fit(**cls.fit_kwargs)

    mod2 = OLS(endog, exog)
    # check kernel as string
    kwds2 = {'kernel': 'uniform', 'maxlags': 2}
    cls.res2 = mod2.fit(cov_type=cls.cov_type, cov_kwds=kwds2)
def test_summary_as_latex():
    # GH#734
    dta = datasets.longley.load_pandas()
    X = dta.exog
    X["constant"] = 1
    y = dta.endog
    with warnings.catch_warnings(record=True):
        res = OLS(y, X).fit()
        table = res.summary().as_latex()

    # replace the date and time
    table = re.sub(r"(?<=\n\\textbf\{Date:\} &).+?&",
                   r" Sun, 07 Apr 2013 &", table)
    table = re.sub(r"(?<=\n\\textbf\{Time:\} &).+?&",
                   r" 13:46:07 &", table)

    expected = textwrap.dedent("""
        \\begin{center}
        \\begin{tabular}{lclc}
        \\toprule
        \\textbf{Dep. Variable:} & TOTEMP & \\textbf{ R-squared: } & 0.995 \\\\
        \\textbf{Model:} & OLS & \\textbf{ Adj. R-squared: } & 0.992 \\\\
        \\textbf{Method:} & Least Squares & \\textbf{ F-statistic: } & 330.3 \\\\
        \\textbf{Date:} & Sun, 07 Apr 2013 & \\textbf{ Prob (F-statistic):} & 4.98e-10 \\\\
        \\textbf{Time:} & 13:46:07 & \\textbf{ Log-Likelihood: } & -109.62 \\\\
        \\textbf{No. Observations:} & 16 & \\textbf{ AIC: } & 233.2 \\\\
        \\textbf{Df Residuals:} & 9 & \\textbf{ BIC: } & 238.6 \\\\
        \\textbf{Df Model:} & 6 & \\textbf{ } & \\\\
        \\bottomrule
        \\end{tabular}
        \\begin{tabular}{lcccccc}
        & \\textbf{coef} & \\textbf{std err} & \\textbf{t} & \\textbf{P$>$$|$t$|$} & \\textbf{[0.025} & \\textbf{0.975]} \\\\
        \\midrule
        \\textbf{GNPDEFL} & 15.0619 & 84.915 & 0.177 & 0.863 & -177.029 & 207.153 \\\\
        \\textbf{GNP} & -0.0358 & 0.033 & -1.070 & 0.313 & -0.112 & 0.040 \\\\
        \\textbf{UNEMP} & -2.0202 & 0.488 & -4.136 & 0.003 & -3.125 & -0.915 \\\\
        \\textbf{ARMED} & -1.0332 & 0.214 & -4.822 & 0.001 & -1.518 & -0.549 \\\\
        \\textbf{POP} & -0.0511 & 0.226 & -0.226 & 0.826 & -0.563 & 0.460 \\\\
        \\textbf{YEAR} & 1829.1515 & 455.478 & 4.016 & 0.003 & 798.788 & 2859.515 \\\\
        \\textbf{constant} & -3.482e+06 & 8.9e+05 & -3.911 & 0.004 & -5.5e+06 & -1.47e+06 \\\\
        \\bottomrule
        \\end{tabular}
        \\begin{tabular}{lclc}
        \\textbf{Omnibus:} & 0.749 & \\textbf{ Durbin-Watson: } & 2.559 \\\\
        \\textbf{Prob(Omnibus):} & 0.688 & \\textbf{ Jarque-Bera (JB): } & 0.684 \\\\
        \\textbf{Skew:} & 0.420 & \\textbf{ Prob(JB): } & 0.710 \\\\
        \\textbf{Kurtosis:} & 2.434 & \\textbf{ Cond. No. } & 4.86e+09 \\\\
        \\bottomrule
        \\end{tabular}
        %\\caption{OLS Regression Results}
        \\end{center}

        Warnings: \\newline
        [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. \\newline
        [2] The condition number is large, 4.86e+09. This might indicate that there are \\newline
        strong multicollinearity or other numerical problems.""").strip()  # noqa:E501

    assert_equal(table, expected)
def setup_class(cls):
    # check kernel specified as string
    mod1 = cls.model_cls(endog, exog, **cls.mod_kwargs)
    cls.res1 = mod1.fit(disp=False, **cls.fit_kwargs)

    mod2 = OLS(endog, exog)
    cls.res2 = mod2.fit(disp=False, cov_type=cls.cov_type,
                        cov_kwds={'maxlags': 2})
def test_conf_int_single_regressor():
    # GH#706 single-regressor model (i.e. no intercept) with 1D exog;
    # conf_int should still be returned as a DataFrame
    y = pd.Series(np.random.randn(10))
    x = pd.Series(np.ones(10))
    res = OLS(y, x).fit()
    conf_int = res.conf_int()

    assert conf_int.shape == (1, 2)
    assert isinstance(conf_int, pd.DataFrame)
def setup_class(cls):
    mod1 = cls.model_cls(endog, exog, **cls.mod_kwargs)
    cls.res1 = mod1.fit(disp=False, **cls.fit_kwargs)

    mod2 = OLS(endog, exog)
    cls.res2 = mod2.fit(disp=False, **cls.fit_kwargs)
    # for debugging
    cls.res3 = mod2.fit(cov_type=cls.cov_type, cov_kwds={'maxlags': 2})
def setup_class(cls):
    data = datasets.longley.load(as_pandas=False)
    data.exog = add_constant(data.exog, prepend=False)
    res1 = OLS(data.endog, data.exog).fit()

    R2 = [[0, 1, -1, 0, 0, 0, 0],
          [0, 0, 0, 0, 1, -1, 0]]
    cls.Ftest1 = res1.f_test(R2)

    hyp = 'x2 = x3, x5 = x6'
    cls.NewFtest1 = res1.f_test(hyp)
def setup_class(cls):
    data = datasets.longley.load(as_pandas=False)
    data.exog = add_constant(data.exog, prepend=False)
    res1 = OLS(data.endog, data.exog).fit()

    R = np.array([[0, 1, 1, 0, 0, 0, 0],
                  [0, 1, 0, 1, 0, 0, 0],
                  [0, 1, 0, 0, 0, 0, 0],
                  [0, 0, 0, 0, 1, 0, 0],
                  [0, 0, 0, 0, 0, 1, 0]])
    q = np.array([0, 0, 0, 1, 0])
    cls.Ftest1 = res1.f_test((R, q))
def test_outlier_influence_funcs(reset_randomstate):
    x = add_constant(np.random.randn(10, 2))
    y = x.sum(1) + np.random.randn(10)
    res = OLS(y, x).fit()

    out_05 = oi.summary_table(res)
    # GH#3344: Check alpha has an effect
    out_01 = oi.summary_table(res, alpha=0.01)
    assert np.all(out_01[1][:, 6] <= out_05[1][:, 6])
    assert np.all(out_01[1][:, 7] >= out_05[1][:, 7])

    res2 = OLS(y, x[:, 0]).fit()
    oi.summary_table(res2, alpha=0.05)
    infl = res2.get_influence()
    infl.summary_table()
def setup(self):
    model = OLS(self.res1.model.endog, self.res1.model.exog)
    res_ols = model.fit(cov_type='cluster',
                        cov_kwds=dict(groups=self.groups,
                                      use_correction=False,
                                      use_t=False,
                                      df_correction=True))
    self.res3 = self.res1
    self.res1 = res_ols
    self.bse_robust = res_ols.bse
    self.cov_robust = res_ols.cov_params()

    cov1 = sw.cov_cluster(self.res1, self.groups, use_correction=False)
    se1 = sw.se_cov(cov1)
    self.bse_robust2 = se1
    self.cov_robust2 = cov1
def setup_class(cls):
    np.random.seed(54321)
    cls.endog_n_ = np.random.uniform(0, 20, size=30)
    cls.endog_n_one = cls.endog_n_[:, None]
    cls.exog_n_ = np.random.uniform(0, 20, size=30)
    cls.exog_n_one = cls.exog_n_[:, None]
    cls.degen_exog = cls.exog_n_one[:-1]

    cls.mod1 = OLS(cls.endog_n_one, cls.exog_n_one)
    cls.mod1.df_model += 1
    cls.res1 = cls.mod1.fit()

    # Note that these are created for every subclass...
    # A little extra overhead probably
    cls.mod2 = OLS(cls.endog_n_one, cls.exog_n_one)
    cls.mod2.df_model += 1
    cls.res2 = cls.mod2.fit()
def test_missing(self):
    data = datasets.longley.load(as_pandas=False)
    data.exog = add_constant(data.exog, prepend=False)
    data.endog[[3, 7, 14]] = np.nan

    mod = OLS(data.endog, data.exog, missing='drop')
    assert mod.endog.shape[0] == 13
    assert mod.exog.shape[0] == 13
def test_influence_wrapped():
    d = macrodata.load_pandas().data
    # growth rates
    gs_l_realinv = 400 * np.log(d['realinv']).diff().dropna()
    gs_l_realgdp = 400 * np.log(d['realgdp']).diff().dropna()
    lint = d['realint'][:-1]

    # re-index these because they will not conform to lint
    gs_l_realgdp.index = lint.index
    gs_l_realinv.index = lint.index

    data = dict(const=np.ones_like(lint), lint=lint, lrealgdp=gs_l_realgdp)
    # order is important
    exog = pd.DataFrame(data, columns=['const', 'lrealgdp', 'lint'])

    res = OLS(gs_l_realinv, exog).fit()

    # basic results (cov.scaled and cov.unscaled) have already been tested
    # TODO: check that above is correct;
    # comment is (roughly) copied from upstream
    infl = oi.OLSInfluence(res)

    # smoke test just to make sure it works, results separately tested
    df = infl.summary_frame()
    assert isinstance(df, pd.DataFrame)

    # this test is slow
    path = os.path.join(cur_dir, "results", "influence_lsdiag_R.json")
    with open(path, 'r') as fp:
        lsdiag = json.load(fp)

    c0, c1 = infl.cooks_distance  # TODO: what's c1, is it pvalues? -ss

    # NOTE: we get a hard-coded 5 decimals with pandas testing
    assert_almost_equal(c0, lsdiag['cooks'], 14)
    assert_almost_equal(infl.hat_matrix_diag, lsdiag['hat'], 14)
    assert_almost_equal(infl.resid_studentized_internal,
                        lsdiag['std.res'], 14)

    # slow
    dffits, dffth = infl.dffits
    assert_almost_equal(dffits, lsdiag['dfits'], 14)
    assert_almost_equal(infl.resid_studentized_external,
                        lsdiag['stud.res'], 14)

    fn = os.path.join(cur_dir, "results", "influence_measures_R.csv")
    infl_r = pd.read_csv(fn, index_col=0)
    # not used yet:
    # fn = os.path.join(cur_dir, "results", "influence_measures_bool_R.csv")
    # conv = lambda s: 1 if s == 'TRUE' else 0
    # infl_bool_r = pd.read_csv(fn, index_col=0,
    #                           converters=dict(zip(lrange(7), [conv] * 7)))
    infl_r2 = np.asarray(infl_r)

    # TODO: finish wrapping this stuff
    assert_almost_equal(infl.dfbetas, infl_r2[:, :3], decimal=13)
    assert_almost_equal(infl.cov_ratio, infl_r2[:, 4], decimal=14)
def test_formula_missing_cat():
    # GH#805
    dta = datasets.grunfeld.load_pandas().data
    dta.loc[dta.index[0], 'firm'] = np.nan

    formula = 'value ~ invest + capital + firm + year'
    mod = OLS.from_formula(formula=formula, data=dta.dropna())
    res = mod.fit()

    mod2 = OLS.from_formula(formula=formula, data=dta)
    res2 = mod2.fit()

    assert_almost_equal(res.params.values, res2.params.values)

    with pytest.raises(PatsyError):
        OLS.from_formula(formula, data=dta, missing='raise')
def setup_class(cls):
    d2 = macrodata.load_pandas().data
    g_gdp = 400 * np.diff(np.log(d2['realgdp'].values))
    g_inv = 400 * np.diff(np.log(d2['realinv'].values))
    exogg = add_constant(np.c_[g_gdp, d2['realint'][:-1].values],
                         prepend=False)
    cls.res1 = OLS(g_inv, exogg).fit()
def setup_class(cls):
    # TODO: Test HAC method
    X = np.random.randn(100, 3)
    b = np.ones((3, 1))
    e = np.random.randn(100, 1)
    y = np.dot(X, b) + e

    # Cases?
    # Homoskedastic
    # HC0
    cls.res1_full = OLS(y, X).fit()
    cls.res1_restricted = OLS(y, X[:, 0]).fit()

    cls.res2_full = cls.res1_full.get_robustcov_results('HC0')
    cls.res2_restricted = cls.res1_restricted.get_robustcov_results('HC0')

    cls.X = X
    cls.Y = y
def setup_class(cls):
    data = datasets.longley.load(as_pandas=False)
    data.exog = add_constant(data.exog, prepend=False)
    cls.res1 = OLS(data.endog, data.exog).fit()

    R = np.identity(7)
    cls.Ttest = cls.res1.t_test(R)

    hyp = 'x1 = 0, x2 = 0, x3 = 0, x4 = 0, x5 = 0, x6 = 0, const = 0'
    cls.NewTTest = cls.res1.t_test(hyp)
def test_recursive_residuals(self):
    reccumres_standardize = np.array([
        -2.151, -3.748, -3.114, -3.096, -1.865, -2.230, -1.194, -3.500,
        -3.638, -4.447, -4.602, -4.631, -3.999, -4.830, -5.429, -5.435,
        -6.554, -8.093, -8.567, -7.532, -7.079, -8.468, -9.320, -12.256,
        -11.932, -11.454, -11.690, -11.318, -12.665, -12.842, -11.693,
        -10.803, -12.113, -12.109, -13.002, -11.897, -10.787, -10.159,
        -9.038, -9.007, -8.634, -7.552, -7.153, -6.447, -5.183, -3.794,
        -3.511, -3.979, -3.236, -3.793, -3.699, -5.056, -5.724, -4.888,
        -4.309, -3.688, -3.918, -3.735, -3.452, -2.086, -6.520, -7.959,
        -6.760, -6.855, -6.032, -4.405, -4.123, -4.075, -3.235, -3.115,
        -3.131, -2.986, -1.813, -4.824, -4.424, -4.796, -4.000, -3.390,
        -4.485, -4.669, -4.560, -3.834, -5.507, -3.792, -2.427, -1.756,
        -0.354, 1.150, 0.586, 0.643, 1.773, -0.830, -0.388, 0.517, 0.819,
        2.240, 3.791, 3.187, 3.409, 2.431, 0.668, 0.957, -0.928, 0.327,
        -0.285, -0.625, -2.316, -1.986, -0.744, -1.396, -1.728, -0.646,
        -2.602, -2.741, -2.289, -2.897, -1.934, -2.532, -3.175, -2.806,
        -3.099, -2.658, -2.487, -2.515, -2.224, -2.416, -1.141, 0.650,
        -0.947, 0.725, 0.439, 0.885, 2.419, 2.642, 2.745, 3.506, 4.491,
        5.377, 4.624, 5.523, 6.488, 6.097, 5.390, 6.299, 6.656, 6.735,
        8.151, 7.260, 7.846, 8.771, 8.400, 8.717, 9.916, 9.008, 8.910,
        8.294, 8.982, 8.540, 8.395, 7.782, 7.794, 8.142, 8.362, 8.400,
        7.850, 7.643, 8.228, 6.408, 7.218, 7.699, 7.895, 8.725, 8.938,
        8.781, 8.350, 9.136, 9.056, 10.365, 10.495, 10.704, 10.784,
        10.275, 10.389, 11.586, 11.033, 11.335, 11.661, 10.522, 10.392,
        10.521, 10.126, 9.428, 9.734, 8.954, 9.949, 10.595, 8.016, 6.636,
        6.975])

    rr = diagnostic.recursive_olsresiduals(self.res, skip=3, alpha=0.95)
    # extra zero in front
    np.testing.assert_equal(np.round(rr[5][1:], 3), reccumres_standardize)
    assert_almost_equal(rr[3][4:], np.diff(reccumres_standardize), 3)
    assert_almost_equal(rr[4][3:].std(ddof=1), 10.7242, decimal=4)

    # regression number, visually checked with graph from gretl
    ub0 = np.array([13.37318571, 13.50758959, 13.64199346, 13.77639734,
                    13.91080121])
    ub1 = np.array([39.44753774, 39.58194162, 39.7163455, 39.85074937,
                    39.98515325])
    lb, ub = rr[6]
    assert_almost_equal(ub[:5], ub0, decimal=7)
    assert_almost_equal(lb[:5], -ub0, decimal=7)
    assert_almost_equal(ub[-5:], ub1, decimal=7)
    assert_almost_equal(lb[-5:], -ub1, decimal=7)

    # test a few values with explicit OLS
    endog = self.res.model.endog
    exog = self.res.model.exog
    params = []
    ypred = []
    for i in range(3, 10):
        resi = OLS(endog[:i], exog[:i]).fit()
        ypred.append(resi.model.predict(resi.params, exog[i]))
        params.append(resi.params)

    assert_almost_equal(rr[2][3:10], ypred, decimal=12)
    assert_almost_equal(rr[0][3:10], endog[3:10] - ypred, decimal=12)
    assert_almost_equal(rr[1][2:9], params, decimal=12)
def test_norm_resid_zero_variance(self):
    with warnings.catch_warnings(record=True):
        y = self.res1.model.endog
        res = OLS(y, y).fit()
        assert_allclose(res.scale, 0, atol=1e-20)
        assert_allclose(res.wresid, res.resid_pearson, atol=5e-11)
def test_rsquared_adj_overfit(self):
    # Test that if df_resid = 0, rsquared_adj is np.nan.
    # This is a regression test for user issue GH#868
    with warnings.catch_warnings(record=True):
        x = np.random.randn(5)
        y = np.random.randn(5, 6)
        results = OLS(x, y).fit()
        rsquared_adj = results.rsquared_adj
        assert_equal(rsquared_adj, np.nan)
def setup_class(cls):
    super(TestGLS_large_data, cls).setup_class()

    nobs = 1000
    y = np.random.randn(nobs, 1)
    X = np.random.randn(nobs, 20)
    sigma = np.ones_like(y)

    cls.gls_res = GLS(y, X, sigma=sigma).fit()
    cls.gls_res_scalar = GLS(y, X, sigma=1).fit()
    cls.gls_res_none = GLS(y, X).fit()
    cls.ols_res = OLS(y, X).fit()
def setup_class(cls):
    nobs, k_exog = 100, 5
    np.random.seed(987125)
    x = np.random.randn(nobs, k_exog - 1)
    x = add_constant(x)

    y_true = x.sum(1) / 2
    y = y_true + 2 * np.random.randn(nobs)

    cls.endog = y
    cls.exog = x
    cls.idx_p_uc = np.array(cls.idx_uc)
    cls.exogc = xc = x[:, cls.idx_uc]

    mod_ols_c = OLS(y - 0.5 * x[:, 1], xc)
    mod_ols_c.exog_names[:] = ['const', 'x2', 'x3', 'x4']
    cls.mod2 = mod_ols_c
    cls.res2 = cls.mod2.fit(**cls.fit_kwargs)

    cls.init()
def setup_class(cls):
    data = datasets.longley.load(as_pandas=False)
    data.exog = add_constant(data.exog, prepend=False)

    ols_res = OLS(data.endog, data.exog).fit()
    gls_res = GLS(data.endog, data.exog).fit()
    gls_res_scalar = GLS(data.endog, data.exog, sigma=1)

    cls.endog = data.endog
    cls.exog = data.exog
    cls.res1 = gls_res
    cls.res2 = ols_res
    cls.res3 = gls_res_scalar  # TODO: Do something with this?