def test_from_formula():
    """Check that a formula-built RecursiveLS yields the same params as OLS."""
    formula = 'cpi ~ m1'
    rls_results = RecursiveLS.from_formula(formula, data=dta).fit()

    # The RLS parameter estimates are expected to agree with those of a
    # plain OLS fit of the same formula on the same data.
    ols_results = OLS.from_formula(formula, data=dta).fit()
    assert_allclose(rls_results.params, ols_results.params)
def test_escaped_variable_name():
    """Check that an underscore in a variable name is escaped in LaTeX only."""
    # Rename 'cpi' column to 'CPI_' so the endog name ends in an underscore.
    frame = macrodata.load().data
    frame.rename(columns={'cpi': 'CPI_'}, inplace=True)

    results = OLS.from_formula('CPI_ ~ 1 + np.log(realgdp)', data=frame).fit()

    # LaTeX output must carry the backslash-escaped name, text output the
    # plain one.
    latex_output = results.summary().as_latex()
    assert 'CPI\\_' in latex_output
    text_output = results.summary().as_text()
    assert 'CPI_' in text_output
def test_from_formula():
    """Check formula-built RecursiveLS against OLS, expecting a ValueWarning."""
    formula = 'cpi ~ m1'

    # Building the model is expected to emit a ValueWarning whose message
    # matches "No frequency information".
    with pytest.warns(ValueWarning, match="No frequency information"):
        rls_model = RecursiveLS.from_formula(formula, data=dta)
    rls_results = rls_model.fit()

    # Test the RLS estimates against OLS estimates
    ols_results = OLS.from_formula(formula, data=dta).fit()
    assert_allclose(rls_results.params, ols_results.params)
def setup_class(cls):
    """Fit GLM and OLS on the Duncan data and store their influence objects."""
    from .test_diagnostic import get_duncan_data

    endog, exog, labels = get_duncan_data()
    column_names = ['y', 'const', 'var1', 'var2']
    frame = pd.DataFrame(
        np.column_stack((endog, exog)),
        columns=column_names,
        index=labels,
    )

    # Both models use the same formula; "- 1" drops the implicit intercept
    # because the data already carries an explicit `const` column.
    formula = 'y ~ const + var1 + var2 - 1'
    glm_results = GLM.from_formula(formula, frame).fit()
    ols_results = OLS.from_formula(formula, frame).fit()

    cls.infl1 = ols_results.get_influence()
    cls.infl0 = glm_results.get_influence()
def setup_class(cls):
    """Build a seeded random data set and fit OLS on two of three regressors.

    Stores the rounded data frame as ``cls.data`` and the fitted results as
    ``cls.res``.
    """
    nobs = 30
    np.random.seed(987128)
    # Draw order matters for reproducibility: regressors first, then noise.
    regressors = np.random.randn(nobs, 3)
    noise = np.random.randn(nobs)
    response = regressors.sum(1) + noise

    row_labels = ['obs%02d' % i for i in range(nobs)]
    values = np.round(np.column_stack((response, regressors)), 4)

    # add one extra column to check that it doesn't matter
    cls.data = pd.DataFrame(
        values,
        columns=['y', 'var1', 'var2', 'var3'],
        index=row_labels,
    )
    cls.res = OLS.from_formula('y ~ var1 + var2', data=cls.data).fit()
def test_missing_formula_predict():
    """Smoke test: predict works after fitting a formula on data with a NaN."""
    # see 2171
    nsample = 30
    grid = np.linspace(0, 10, nsample)
    # Append a single missing value so the frame has nsample + 1 rows.
    frame = pandas.DataFrame({'x': np.concatenate((grid, np.array([np.nan])))})

    coefficients = np.array([1, 0.1])
    noise = np.random.normal(size=nsample + 1)
    frame['y'] = coefficients[0] + coefficients[1] * frame['x'] + noise

    results = OLS.from_formula('y ~ x', data=frame).fit()
    # Predict on all rows except the last one (the row holding the NaN).
    results.predict(exog=frame[:-1])
def test_missing_formula_predict():
    """Smoke test: predict works after fitting a formula on data with a NaN."""
    # see 2171
    nsample = 30
    observed = pandas.DataFrame({'x': np.linspace(0, 10, nsample)})
    missing_row = pandas.DataFrame({'x': np.array([np.nan])})
    # The combined frame has nsample + 1 rows, the last one holding the NaN.
    frame = pandas.concat([observed, missing_row])

    coefficients = np.array([1, 0.1])
    noise = np.random.normal(size=nsample+1)
    frame['y'] = coefficients[0] + coefficients[1] * frame['x'] + noise

    results = OLS.from_formula('y ~ x', data=frame).fit()
    # Predict on every row except the final (missing) one.
    predicted = results.predict(exog=frame[:-1])
def env_corr(self, env_vars, coeff_plot=False, qq_plot=False):
    """
    Determine correlations with environmental/non-discretionary variables
    using an ordinary least squares (OLS) regression of ``Efficiency`` on
    the environmental variables. The regression summary is printed.

    Tobit will be implemented when available upstream in statsmodels.

    Takes:
        env_vars: A pandas dataframe of environmental variables
        coeff_plot: if true, draw a seaborn coefficient plot of the same
            formula (requires matplotlib and a seaborn version that still
            provides ``coefplot``)
        qq_plot: if true, draw a QQ plot of the regression residuals
            (requires matplotlib)

    Returns:
        corr_res: the statsmodels results instance obtained by fitting the
            OLS model.

    Note that there can be no spaces in the variables' names.
    """
    from statsmodels.regression.linear_model import OLS

    env_data = _to_dataframe(env_vars)
    corr_data = env_data.join(self['Efficiency'])

    # Build the formula from the columns that actually end up in corr_data
    # (i.e. those of the converted frame), and build it only once so the
    # regression and the coefficient plot cannot drift apart.
    # NOTE(review): assumes _to_dataframe yields string column names -
    # confirm against its definition.
    formula = "Efficiency ~ " + " + ".join(env_data.columns)
    corr_res = OLS.from_formula(formula, corr_data).fit()

    # Plotting libraries are imported only when a plot is requested, so the
    # regression itself does not depend on them being importable.
    if coeff_plot or qq_plot:
        import matplotlib.pyplot as plt

    #plot coeffs
    if coeff_plot:
        # NOTE(review): coefplot appears to be absent from recent seaborn
        # releases - confirm against the installed version.
        from seaborn import coefplot
        coefplot(formula, data=corr_data)
        plt.xticks(rotation=45, ha='right')
        plt.title('Regression coefficients and standard errors')

    #plot qq of residuals
    if qq_plot:
        from statsmodels.graphics.gofplots import qqplot
        qqplot(corr_res.resid, line='s')
        plt.title('Distribution of residuals')

    print(corr_res.summary())

    return corr_res
def test_outlier_test():
    """Compare Bonferroni outlier-test output on the Duncan data with stored
    reference values, for both array-based and formula-based OLS fits."""
    endog, exog, labels = get_duncan_data()
    array_results = OLS(endog, exog).fit()

    # Reference values, listed in order of increasing unadjusted p-value.
    expected_rstudent = [
        3.1345185839, -2.3970223990, 2.0438046359, -1.9309187757, 1.8870465798,
        -1.7604905300, -1.7040324156, 1.6024285876, -1.4332485037, -1.1044851583,
        1.0688582315, 1.0185271840, -0.9024219332, -0.9023876471, -0.8830953936,
        0.8265782334, 0.8089220547, 0.7682770197, 0.7319491074, -0.6665962829,
        0.5227352794, -0.5135016547, 0.5083881518, 0.4999224372, -0.4980818221,
        -0.4759717075, -0.4293565820, -0.4114056499, -0.3779540862, 0.3556874030,
        0.3409200462, 0.3062248646, 0.3038999429, -0.3030815773, -0.1873387893,
        0.1738050251, 0.1424246593, -0.1292266025, 0.1272066463, -0.0798902878,
        0.0788467222, 0.0722556991, 0.0505098280, 0.0233215136, 0.0007112055,
    ]
    expected_unadj_p = [
        0.003177202, 0.021170298, 0.047432955, 0.060427645, 0.066248120,
        0.085783008, 0.095943909, 0.116738318, 0.159368890, 0.275822623,
        0.291386358, 0.314400295, 0.372104049, 0.372122040, 0.382333561,
        0.413260793, 0.423229432, 0.446725370, 0.468363101, 0.508764039,
        0.603971990, 0.610356737, 0.613905871, 0.619802317, 0.621087703,
        0.636621083, 0.669911674, 0.682917818, 0.707414459, 0.723898263,
        0.734904667, 0.760983108, 0.762741124, 0.763360242, 0.852319039,
        0.862874018, 0.887442197, 0.897810225, 0.899398691, 0.936713197,
        0.937538115, 0.942749758, 0.959961394, 0.981506948, 0.999435989,
    ]
    raw_bonf_p = [
        0.1429741, 0.9526634, 2.1344830, 2.7192440, 2.9811654,
        3.8602354, 4.3174759, 5.2532243, 7.1716001, 12.4120180,
        13.1123861, 14.1480133, 16.7446822, 16.7454918, 17.2050103,
        18.5967357, 19.0453245, 20.1026416, 21.0763395, 22.8943818,
        27.1787396, 27.4660532, 27.6257642, 27.8911043, 27.9489466,
        28.6479487, 30.1460253, 30.7313018, 31.8336506, 32.5754218,
        33.0707100, 34.2442399, 34.3233506, 34.3512109, 38.3543568,
        38.8293308, 39.9348989, 40.4014601, 40.4729411, 42.1520939,
        42.1892152, 42.4237391, 43.1982627, 44.1678127, 44.9746195,
    ]
    # The stored Bonferroni values exceed 1; cap them at 1 before comparing.
    expected_bonf_p = np.array(raw_bonf_p)
    expected_bonf_p[expected_bonf_p > 1] = 1

    expected_labels = [
        "minister", "reporter", "contractor", "insurance.agent", "machinist",
        "store.clerk", "conductor", "factory.owner", "mail.carrier",
        "streetcar.motorman", "carpenter", "coal.miner", "bartender",
        "bookkeeper", "soda.clerk", "chemist", "RR.engineer", "professor",
        "electrician", "gas.stn.attendant", "auto.repairman", "watchman",
        "banker", "machine.operator", "dentist", "waiter", "shoe.shiner",
        "welfare.worker", "plumber", "physician", "pilot", "engineer",
        "accountant", "lawyer", "undertaker", "barber", "store.manager",
        "truck.driver", "cook", "janitor", "policeman", "architect",
        "teacher", "taxi.driver", "author",
    ]

    expected = np.c_[expected_rstudent, expected_unadj_p, expected_bonf_p]

    # Array-based fit: labels have to be passed explicitly.
    outliers = oi.outlier_test(array_results, method='b', labels=labels,
                               order=True)
    np.testing.assert_almost_equal(outliers.values, expected, 7)
    np.testing.assert_equal(outliers.index.tolist(), expected_labels)  # pylint: disable-msg=E1103

    frame = pd.DataFrame(
        np.column_stack((endog, exog)),
        columns='y const var1 var2'.split(),
        index=labels,
    )

    # check `order` with pandas bug in #3971
    formula_results = OLS.from_formula('y ~ const + var1 + var2 - 0',
                                       frame).fit()

    ordered = oi.outlier_test(formula_results, method='b', order=True)
    assert_almost_equal(ordered.values, expected, 7)
    assert_equal(ordered.index.tolist(), expected_labels)

    # Sorting the unordered result by hand must give the same table.
    hand_sorted = formula_results.outlier_test(method='b')
    hand_sorted = hand_sorted.sort_values(['unadj_p'], ascending=True)
    assert_almost_equal(hand_sorted.values, expected, 7)
    assert_equal(hand_sorted.index.tolist(), expected_labels)
    assert_array_equal(ordered.index, hand_sorted.index)

    # additional keywords in method
    via_method = formula_results.outlier_test(method='b', order=True)
    assert_equal(via_method.index.tolist(), expected_labels)

    with_cutoff = formula_results.outlier_test(method='b', order=True,
                                               cutoff=0.15)
    assert_equal(with_cutoff.index.tolist(), expected_labels[:1])
def test_outlier_test():
    """Check the Bonferroni outlier test on the Duncan data.

    Stored reference values are compared with the results for an OLS fit on
    plain arrays and for an OLS fit built from a formula and a DataFrame.
    """
    endog, exog, labels = get_duncan_data()
    fit_from_arrays = OLS(endog, exog).fit()

    # Stored reference columns: studentized residuals, unadjusted p-values
    # and Bonferroni-adjusted p-values.
    ref_rstudent = [
        3.1345185839, -2.3970223990, 2.0438046359,
        -1.9309187757, 1.8870465798, -1.7604905300,
        -1.7040324156, 1.6024285876, -1.4332485037,
        -1.1044851583, 1.0688582315, 1.0185271840,
        -0.9024219332, -0.9023876471, -0.8830953936,
        0.8265782334, 0.8089220547, 0.7682770197,
        0.7319491074, -0.6665962829, 0.5227352794,
        -0.5135016547, 0.5083881518, 0.4999224372,
        -0.4980818221, -0.4759717075, -0.4293565820,
        -0.4114056499, -0.3779540862, 0.3556874030,
        0.3409200462, 0.3062248646, 0.3038999429,
        -0.3030815773, -0.1873387893, 0.1738050251,
        0.1424246593, -0.1292266025, 0.1272066463,
        -0.0798902878, 0.0788467222, 0.0722556991,
        0.0505098280, 0.0233215136, 0.0007112055,
    ]
    ref_unadj_p = [
        0.003177202, 0.021170298, 0.047432955,
        0.060427645, 0.066248120, 0.085783008,
        0.095943909, 0.116738318, 0.159368890,
        0.275822623, 0.291386358, 0.314400295,
        0.372104049, 0.372122040, 0.382333561,
        0.413260793, 0.423229432, 0.446725370,
        0.468363101, 0.508764039, 0.603971990,
        0.610356737, 0.613905871, 0.619802317,
        0.621087703, 0.636621083, 0.669911674,
        0.682917818, 0.707414459, 0.723898263,
        0.734904667, 0.760983108, 0.762741124,
        0.763360242, 0.852319039, 0.862874018,
        0.887442197, 0.897810225, 0.899398691,
        0.936713197, 0.937538115, 0.942749758,
        0.959961394, 0.981506948, 0.999435989,
    ]
    ref_bonf_p = np.array([
        0.1429741, 0.9526634, 2.1344830,
        2.7192440, 2.9811654, 3.8602354,
        4.3174759, 5.2532243, 7.1716001,
        12.4120180, 13.1123861, 14.1480133,
        16.7446822, 16.7454918, 17.2050103,
        18.5967357, 19.0453245, 20.1026416,
        21.0763395, 22.8943818, 27.1787396,
        27.4660532, 27.6257642, 27.8911043,
        27.9489466, 28.6479487, 30.1460253,
        30.7313018, 31.8336506, 32.5754218,
        33.0707100, 34.2442399, 34.3233506,
        34.3512109, 38.3543568, 38.8293308,
        39.9348989, 40.4014601, 40.4729411,
        42.1520939, 42.1892152, 42.4237391,
        43.1982627, 44.1678127, 44.9746195,
    ])
    # Entries above 1 are replaced by 1 before the comparison.
    ref_bonf_p[ref_bonf_p > 1] = 1

    ref_labels = [
        "minister", "reporter", "contractor",
        "insurance.agent", "machinist", "store.clerk",
        "conductor", "factory.owner", "mail.carrier",
        "streetcar.motorman", "carpenter", "coal.miner",
        "bartender", "bookkeeper", "soda.clerk",
        "chemist", "RR.engineer", "professor",
        "electrician", "gas.stn.attendant", "auto.repairman",
        "watchman", "banker", "machine.operator",
        "dentist", "waiter", "shoe.shiner",
        "welfare.worker", "plumber", "physician",
        "pilot", "engineer", "accountant",
        "lawyer", "undertaker", "barber",
        "store.manager", "truck.driver", "cook",
        "janitor", "policeman", "architect",
        "teacher", "taxi.driver", "author",
    ]

    ref_table = np.c_[ref_rstudent, ref_unadj_p, ref_bonf_p]

    table_arrays = oi.outlier_test(
        fit_from_arrays, method='b', labels=labels, order=True)
    np.testing.assert_almost_equal(table_arrays.values, ref_table, 7)
    np.testing.assert_equal(table_arrays.index.tolist(), ref_labels)  # pylint: disable-msg=E1103

    duncan_frame = pd.DataFrame(np.column_stack((endog, exog)),
                                columns='y const var1 var2'.split(),
                                index=labels)

    # check `order` with pandas bug in #3971
    fit_from_formula = OLS.from_formula(
        'y ~ const + var1 + var2 - 0', duncan_frame).fit()

    table_ordered = oi.outlier_test(fit_from_formula, method='b', order=True)
    assert_almost_equal(table_ordered.values, ref_table, 7)
    assert_equal(table_ordered.index.tolist(), ref_labels)

    # Unordered output sorted afterwards must match the ordered output.
    table_sorted = fit_from_formula.outlier_test(method='b')
    table_sorted = table_sorted.sort_values(['unadj_p'], ascending=True)
    assert_almost_equal(table_sorted.values, ref_table, 7)
    assert_equal(table_sorted.index.tolist(), ref_labels)
    assert_array_equal(table_ordered.index, table_sorted.index)

    # additional keywords in method
    table_method = fit_from_formula.outlier_test(method='b', order=True)
    assert_equal(table_method.index.tolist(), ref_labels)

    table_cutoff = fit_from_formula.outlier_test(
        method='b', order=True, cutoff=0.15)
    assert_equal(table_cutoff.index.tolist(), ref_labels[:1])
def setup_class(cls):
    """Create the TreatmentEffect instance stored as ``cls.teff``.

    The outcome model is an (unfitted) OLS model built from a formula on
    ``dta_cat``; the treatment indicator is the ``mbsmoke_`` column.
    """
    outcome_formula = 'bweight ~ prenatal1_ + mmarried_ + mage + fbaby_'
    outcome_model = OLS.from_formula(outcome_formula, dta_cat)
    treatment_indicator = np.asarray(dta_cat['mbsmoke_'])
    cls.teff = TreatmentEffect(
        outcome_model,
        treatment_indicator,
        results_select=res_probit,
    )