def test_f_score_with_covars_and_normalized_design_withcovar(random_state=0):
    """
    This test has a statsmodels dependence. There seems to be no simple,
    alternative way to perform an F-test on a linear model including
    covariates.
    """
    try:
        from statsmodels.regression.linear_model import OLS
    except ImportError:
        warnings.warn("Statsmodels is required to run this test")
        raise nose.SkipTest

    rng = check_random_state(random_state)

    ### Normalized data
    n_samples = 50

    # generate data
    var1 = np.ones((n_samples, 1)) / np.sqrt(n_samples)  # normalized
    var2 = rng.randn(n_samples, 1)
    var2 = var2 / np.sqrt(np.sum(var2 ** 2, 0))  # normalize
    covars = np.eye(n_samples, 3)  # covars is orthogonal
    covars[3] = -1  # covars is orthogonal to var1
    covars = orthonormalize_matrix(covars)

    # own f_score
    f_val_own = _f_score_with_covars_and_normalized_design(var1, var2,
                                                           covars)[0]

    # statsmodels f_score
    test_matrix = np.array([[1.0, 0.0, 0.0, 0.0]])
    statsmodels_ols = OLS(var2, np.hstack((var1, covars))).fit()
    f_val_statsmodels = statsmodels_ols.f_test(test_matrix).fvalue[0]
    assert_array_almost_equal(f_val_own, f_val_statsmodels)
def test_regularized_weights(self): np.random.seed(1432) exog1 = np.random.normal(size=(100, 3)) endog1 = exog1[:, 0] + exog1[:, 1] + np.random.normal(size=100) exog2 = np.random.normal(size=(100, 3)) endog2 = exog2[:, 0] + exog2[:, 1] + np.random.normal(size=100) exog_a = np.vstack((exog1, exog1, exog2)) endog_a = np.concatenate((endog1, endog1, endog2)) # Should be equivalent to exog_a, endog_a. exog_b = np.vstack((exog1, exog2)) endog_b = np.concatenate((endog1, endog2)) wgts = np.ones(200) wgts[0:100] = 2 sigma = np.diag(1/wgts) for L1_wt in 0, 0.5, 1: for alpha in 0, 1: mod1 = OLS(endog_a, exog_a) rslt1 = mod1.fit_regularized(L1_wt=L1_wt, alpha=alpha) mod2 = WLS(endog_b, exog_b, weights=wgts) rslt2 = mod2.fit_regularized(L1_wt=L1_wt, alpha=alpha) mod3 = GLS(endog_b, exog_b, sigma=sigma) rslt3 = mod3.fit_regularized(L1_wt=L1_wt, alpha=alpha) assert_almost_equal(rslt1.params, rslt2.params, decimal=3) assert_almost_equal(rslt1.params, rslt3.params, decimal=3)
def fit_dlogM_mw(tab, sfrsd_tab, mltype='ring', mlb='i'): merge_tab = t.join(tab, sfrsd_tab, 'plateifu') is_agn = m.mask_from_maskbits(merge_tab['mngtarg3'], [1, 2, 3, 4]) mlb_ix = totalmass.StellarMass.bands_ixs[mlb] absmag_sun_mlb = totalmass.StellarMass.absmag_sun[mlb_ix] logmass_in_ifu = merge_tab['mass_in_ifu'].to(u.dex(u.Msun)) logmass_in_ifu_lw = merge_tab['ml_fluxwt'] + merge_tab['ifu_absmag'][:, mlb_ix].to( u.dex(m.bandpass_sol_l_unit), totalmass.bandpass_flux_to_solarunits(absmag_sun_mlb)) merge_tab['dlogmass_lw'] = logmass_in_ifu - logmass_in_ifu_lw ha_corr = np.exp(merge_tab['mean_atten_mwtd'] * (6563 / 5500)**-1.3) sfrsd = merge_tab['sigma_sfr'] * ha_corr * u.Msun / u.yr / u.pc**2 mass_pca = merge_tab['mass_in_ifu'] + merge_tab['outer_mass_{}'.format(mltype)] ssfrsd = sfrsd / mass_pca merge_tab['log_ssfrsd'] = ssfrsd.to(u.dex(ssfrsd.unit)) merge_tab['log_ssfrsd'][~np.isfinite(merge_tab['log_ssfrsd'])] = np.nan * merge_tab['log_ssfrsd'].unit ols = OLS( endog=np.array(merge_tab['dlogmass_lw'][~is_agn]), exog=sm_add_constant( t.Table(merge_tab['mean_atten_mwtd', 'std_atten_mwtd', 'log_ssfrsd'])[~is_agn].to_pandas(), prepend=False), hasconst=True, missing='drop') olsfit = ols.fit() return olsfit
def test_regression_with_tuples(self):
    i = pandas.Series([1, 2, 3, 4] * 10, name="i")
    y = pandas.Series([1, 2, 3, 4, 5] * 8, name="y")
    x = pandas.Series([1, 2, 3, 4, 5, 6, 7, 8] * 5, name="x")

    df = pandas.DataFrame(index=i.index)
    df = df.join(i)
    endo = df.join(y)
    exo = df.join(x)
    endo_groups = endo.groupby("i")
    exo_groups = exo.groupby("i")
    exo_df = exo_groups.agg([np.sum, np.max])
    endo_df = endo_groups.agg([np.sum, np.max])
    reg = OLS(exo_df[[("x", "sum")]], endo_df).fit()
    interesting_lines = []
    import warnings
    with warnings.catch_warnings():
        # Catch the omni-normal test warning, not interesting here
        warnings.simplefilter("ignore")
        for line in str(reg.summary()).splitlines():
            if "_" in line:
                interesting_lines.append(line[:38])

    desired = ["Dep. Variable: x_sum ",
               "y_sum 1.4595 0.209 ",
               "y_amax 0.2432 0.035 "]

    assert_equal(sorted(desired), sorted(interesting_lines))
def reset_ramsey(res, degree=5): '''Ramsey's RESET specification test for linear models This is a general specification test, for additional non-linear effects in a model. Notes ----- The test fits an auxiliary OLS regression where the design matrix, exog, is augmented by powers 2 to degree of the fitted values. Then it performs an F-test whether these additional terms are significant. If the p-value of the f-test is below a threshold, e.g. 0.1, then this indicates that there might be additional non-linear effects in the model and that the linear model is mis-specified. References ---------- http://en.wikipedia.org/wiki/Ramsey_RESET_test ''' order = degree + 1 k_vars = res.model.exog.shape[1] #vander without constant and x: y_fitted_vander = np.vander(res.fittedvalues, order)[:, :-2] #drop constant exog = np.column_stack((res.model.exog, y_fitted_vander)) res_aux = OLS(res.model.endog, exog).fit() #r_matrix = np.eye(degree, exog.shape[1], k_vars) r_matrix = np.eye(degree-1, exog.shape[1], k_vars) #df1 = degree - 1 #df2 = exog.shape[0] - degree - res.df_model (without constant) return res_aux.f_test(r_matrix) #, r_matrix, res_aux
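# --- Usage sketch for reset_ramsey above (illustrative, not part of the
# original module): a quadratic signal fitted with a linear model should be
# flagged by the RESET F-test. The data and names here are assumptions.
import numpy as np
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant

rng = np.random.RandomState(0)
x = rng.uniform(0, 2, size=200)
y = 1.0 + 2.0 * x ** 2 + rng.standard_normal(200)  # true model is quadratic
res_linear = OLS(y, add_constant(x)).fit()         # deliberately mis-specified
ftest = reset_ramsey(res_linear, degree=3)         # augments with yhat^2, yhat^3
print(ftest.fvalue, ftest.pvalue)                  # small p-value => non-linearity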
def setupClass(cls): R = np.zeros(7) R[4:6] = [1,-1] data = longley.load() data.exog = add_constant(data.exog, prepend=False) res1 = OLS(data.endog, data.exog).fit() cls.Ttest1 = res1.t_test(R)
def test_regularized(self): import os from . import glmnet_r_results cur_dir = os.path.dirname(os.path.abspath(__file__)) data = np.loadtxt(os.path.join(cur_dir, "results", "lasso_data.csv"), delimiter=",") tests = [x for x in dir(glmnet_r_results) if x.startswith("rslt_")] for test in tests: vec = getattr(glmnet_r_results, test) n = vec[0] p = vec[1] L1_wt = float(vec[2]) lam = float(vec[3]) params = vec[4:].astype(np.float64) endog = data[0:int(n), 0] exog = data[0:int(n), 1:(int(p)+1)] endog = endog - endog.mean() endog /= endog.std(ddof=1) exog = exog - exog.mean(0) exog /= exog.std(0, ddof=1) mod = OLS(endog, exog) rslt = mod.fit_regularized(L1_wt=L1_wt, alpha=lam) assert_almost_equal(rslt.params, params, decimal=3) # Smoke test for summary smry = rslt.summary()
def test_permuted_ols_intercept_statsmodels_withcovar(random_state=0):
    """
    This test has a statsmodels dependence. There seems to be no simple,
    alternative way to perform an F-test on a linear model including
    covariates.
    """
    try:
        from statsmodels.regression.linear_model import OLS
    except ImportError:
        warnings.warn("Statsmodels is required to run this test")
        raise nose.SkipTest

    rng = check_random_state(random_state)

    # design parameters
    n_samples = 50

    # create design
    target_var = rng.randn(n_samples, 1)
    tested_var = np.ones((n_samples, 1))
    confounding_vars = rng.randn(n_samples, 2)

    # statsmodels OLS
    ols = OLS(target_var, np.hstack((tested_var, confounding_vars))).fit()
    fvals = ols.f_test([[1.0, 0.0, 0.0]]).fvalue

    # permuted OLS
    _, orig_scores, _ = permuted_ols(tested_var, target_var, confounding_vars,
                                     n_perm=0, random_state=random_state)
    # same thing but with model_intercept=True to check it has no effect
    _, orig_scores_addintercept, _ = permuted_ols(
        tested_var, target_var, confounding_vars, model_intercept=True,
        n_perm=0, random_state=random_state)
    assert_array_almost_equal(fvals, orig_scores, decimal=6)
    assert_array_almost_equal(orig_scores, orig_scores_addintercept, decimal=6)
def test_repeat_partition(): # tests that if we use identical partitions the average is the same # as the estimate for the full data np.random.seed(435265) N = 200 p = 10 m = 1 beta = np.random.normal(size=p) beta = beta * np.random.randint(0, 2, p) X = np.random.normal(size=(N, p)) y = X.dot(beta) + np.random.normal(size=N) def _rep_data_gen(endog, exog, partitions): """partitions data""" n_exog = exog.shape[0] n_part = np.ceil(n_exog / partitions) ii = 0 while ii < n_exog: yield endog, exog ii += int(n_part) nv_mod = DistributedModel(m, estimation_method=_est_regularized_naive, join_method=_join_naive) fitOLSnv = nv_mod.fit(_rep_data_gen(y, X, m), fit_kwds={"alpha": 0.1}) ols_mod = OLS(y, X) fitOLS = ols_mod.fit_regularized(alpha=0.1) assert_allclose(fitOLSnv.params, fitOLS.params)
def setup_class(cls): cls.cov_type = 'cluster' mod1 = GLM(endog, exog, family=families.Gaussian()) cls.res1 = mod1.fit(cov_type='cluster', cov_kwds=dict(groups=group)) mod2 = OLS(endog, exog) cls.res2 = mod2.fit(cov_type='cluster', cov_kwds=dict(groups=group))
def setup_class(cls): cls.cov_type = 'HC0' mod1 = GLM(endog, exog, family=families.Gaussian()) cls.res1 = mod1.fit(cov_type='HC0') mod2 = OLS(endog, exog) cls.res2 = mod2.fit(cov_type='HC0')
def setupClass(cls): data = longley.load() data.exog = add_constant(data.exog, prepend=False) res1 = OLS(data.endog, data.exog).fit() R2 = [[0,1,-1,0,0,0,0],[0, 0, 0, 0, 1, -1, 0]] cls.Ftest1 = res1.f_test(R2) hyp = 'x2 = x3, x5 = x6' cls.NewFtest1 = res1.f_test(hyp)
def test_filter(): # Basic test for filtering mod = RecursiveLS(endog, exog) res = mod.filter() # Test the RLS estimates against OLS estimates mod_ols = OLS(endog, exog) res_ols = mod_ols.fit() assert_allclose(res.params, res_ols.params)
def qqline(ax, line, x=None, y=None, dist=None, fmt='r-'):
    """
    Plot a reference line for a qqplot.

    Parameters
    ----------
    ax : matplotlib axes instance
        The axes on which to plot the line
    line : str {'45','r','s','q'}
        Options for the reference line to which the data is compared.:

        - '45' - 45-degree line
        - 's' - standardized line, the expected order statistics are
          scaled by the standard deviation of the given sample and have
          the mean added to them
        - 'r' - A regression line is fit
        - 'q' - A line is fit through the quartiles.
        - None - By default no reference line is added to the plot.

    x : array
        X data for plot. Not needed if line is '45'.
    y : array
        Y data for plot. Not needed if line is '45'.
    dist : scipy.stats.distribution
        A scipy.stats distribution, needed if line is 'q'.

    Notes
    -----
    There is no return value. The line is plotted on the given `ax`.
    """
    if line == '45':
        # zip returns an iterator on Python 3, so materialize it before
        # indexing
        end_pts = list(zip(ax.get_xlim(), ax.get_ylim()))
        end_pts[0] = min(end_pts[0])
        end_pts[1] = max(end_pts[1])
        ax.plot(end_pts, end_pts, fmt)
        ax.set_xlim(end_pts)
        ax.set_ylim(end_pts)
        return # does this have any side effects?
    if x is None and y is None:
        raise ValueError("If line is not 45, x and y cannot be None.")
    elif line == 'r':
        # could use ax.lines[0].get_xdata(), get_ydata(),
        # but don't know axes are 'clean'
        y = OLS(y, add_constant(x)).fit().fittedvalues
        ax.plot(x, y, fmt)
    elif line == 's':
        m, b = y.std(), y.mean()
        ref_line = x * m + b
        ax.plot(x, ref_line, fmt)
    elif line == 'q':
        _check_for_ppf(dist)
        q25 = stats.scoreatpercentile(y, 25)
        q75 = stats.scoreatpercentile(y, 75)
        theoretical_quartiles = dist.ppf([0.25, 0.75])
        m = (q75 - q25) / np.diff(theoretical_quartiles)
        b = q25 - m * theoretical_quartiles[0]
        ax.plot(x, m * x + b, fmt)
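# --- Usage sketch for qqline (illustrative): overlay a quartile line and a
# 45-degree reference on a hand-rolled normal Q-Q plot. Assumes the module's
# own imports (np, stats, OLS, add_constant, _check_for_ppf) are in scope.
import matplotlib.pyplot as plt

rng = np.random.RandomState(0)
sample = np.sort(rng.standard_normal(100))
theoretical = stats.norm.ppf((np.arange(100) + 0.5) / 100.0)

fig, ax = plt.subplots()
ax.plot(theoretical, sample, 'bo')
qqline(ax, 'q', x=theoretical, y=sample, dist=stats.norm)  # quartile line
qqline(ax, '45')                                           # 45-degree line
plt.show()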
def test_conf_int_single_regressor(): # GH#706 single-regressor model (i.e. no intercept) with 1D exog # should get passed to DataFrame for conf_int y = pandas.Series(np.random.randn(10)) x = pandas.Series(np.ones(10)) res = OLS(y, x).fit() conf_int = res.conf_int() np.testing.assert_equal(conf_int.shape, (1, 2)) np.testing.assert_(isinstance(conf_int, pandas.DataFrame))
def test_706(): # make sure one regressor pandas Series gets passed to DataFrame # for conf_int. y = pandas.Series(np.random.randn(10)) x = pandas.Series(np.ones(10)) res = OLS(y,x).fit() conf_int = res.conf_int() np.testing.assert_equal(conf_int.shape, (1, 2)) np.testing.assert_(isinstance(conf_int, pandas.DataFrame))
def test_summary_as_latex(): # GH#734 import re dta = longley.load_pandas() X = dta.exog X["constant"] = 1 y = dta.endog res = OLS(y, X).fit() with pytest.warns(UserWarning): table = res.summary().as_latex() # replace the date and time table = re.sub("(?<=\n\\\\textbf\\{Date:\\} &).+?&", " Sun, 07 Apr 2013 &", table) table = re.sub("(?<=\n\\\\textbf\\{Time:\\} &).+?&", " 13:46:07 &", table) expected = """\\begin{center} \\begin{tabular}{lclc} \\toprule \\textbf{Dep. Variable:} & TOTEMP & \\textbf{ R-squared: } & 0.995 \\\\ \\textbf{Model:} & OLS & \\textbf{ Adj. R-squared: } & 0.992 \\\\ \\textbf{Method:} & Least Squares & \\textbf{ F-statistic: } & 330.3 \\\\ \\textbf{Date:} & Sun, 07 Apr 2013 & \\textbf{ Prob (F-statistic):} & 4.98e-10 \\\\ \\textbf{Time:} & 13:46:07 & \\textbf{ Log-Likelihood: } & -109.62 \\\\ \\textbf{No. Observations:} & 16 & \\textbf{ AIC: } & 233.2 \\\\ \\textbf{Df Residuals:} & 9 & \\textbf{ BIC: } & 238.6 \\\\ \\textbf{Df Model:} & 6 & \\textbf{ } & \\\\ \\bottomrule \\end{tabular} \\begin{tabular}{lcccccc} & \\textbf{coef} & \\textbf{std err} & \\textbf{t} & \\textbf{P$> |$t$|$} & \\textbf{[0.025} & \\textbf{0.975]} \\\\ \\midrule \\textbf{GNPDEFL} & 15.0619 & 84.915 & 0.177 & 0.863 & -177.029 & 207.153 \\\\ \\textbf{GNP} & -0.0358 & 0.033 & -1.070 & 0.313 & -0.112 & 0.040 \\\\ \\textbf{UNEMP} & -2.0202 & 0.488 & -4.136 & 0.003 & -3.125 & -0.915 \\\\ \\textbf{ARMED} & -1.0332 & 0.214 & -4.822 & 0.001 & -1.518 & -0.549 \\\\ \\textbf{POP} & -0.0511 & 0.226 & -0.226 & 0.826 & -0.563 & 0.460 \\\\ \\textbf{YEAR} & 1829.1515 & 455.478 & 4.016 & 0.003 & 798.788 & 2859.515 \\\\ \\textbf{constant} & -3.482e+06 & 8.9e+05 & -3.911 & 0.004 & -5.5e+06 & -1.47e+06 \\\\ \\bottomrule \\end{tabular} \\begin{tabular}{lclc} \\textbf{Omnibus:} & 0.749 & \\textbf{ Durbin-Watson: } & 2.559 \\\\ \\textbf{Prob(Omnibus):} & 0.688 & \\textbf{ Jarque-Bera (JB): } & 0.684 \\\\ \\textbf{Skew:} & 0.420 & \\textbf{ Prob(JB): } & 0.710 \\\\ \\textbf{Kurtosis:} & 2.434 & \\textbf{ Cond. No. } & 4.86e+09 \\\\ \\bottomrule \\end{tabular} %\\caption{OLS Regression Results} \\end{center} Warnings: \\newline [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. \\newline [2] The condition number is large, 4.86e+09. This might indicate that there are \\newline strong multicollinearity or other numerical problems.""" assert_equal(table, expected)
def setup_class(cls): cls.cov_type = 'HAC' kwds={'maxlags':2} mod1 = GLM(endog, exog, family=families.Gaussian()) cls.res1 = mod1.fit(cov_type='HAC', cov_kwds=kwds) mod2 = OLS(endog, exog) cls.res2 = mod2.fit(cov_type='HAC', cov_kwds=kwds)
def test_empty_model(self): np.random.seed(742) n = 100 endog = np.random.normal(size=n) exog = np.random.normal(size=(n, 3)) model = OLS(endog, exog) result = model.fit_regularized(alpha=1000) assert_equal(result.params, 0.)
def test_outlier_influence_funcs(): # smoke test x = add_constant(np.random.randn(10, 2)) y = x.sum(1) + np.random.randn(10) res = OLS(y, x).fit() oi.summary_table(res, alpha=0.05) res2 = OLS(y, x[:, 0]).fit() oi.summary_table(res2, alpha=0.05) infl = res2.get_influence() infl.summary_table()
def setupClass(cls): data = longley.load() data.exog = add_constant(data.exog, prepend=False) res1 = OLS(data.endog, data.exog).fit() R = np.array([[0,1,1,0,0,0,0], [0,1,0,1,0,0,0], [0,1,0,0,0,0,0], [0,0,0,0,1,0,0], [0,0,0,0,0,1,0]]) q = np.array([0,0,0,1,0]) cls.Ftest1 = res1.f_test((R,q))
def setup_class(cls): cls.cov_type = 'HAC' # check kernel specified as string kwds = {'kernel': 'bartlett', 'maxlags': 2} mod1 = GLM(endog, exog, family=families.Gaussian()) cls.res1 = mod1.fit(cov_type='HAC', cov_kwds=kwds) mod2 = OLS(endog, exog) kwds2 = {'maxlags': 2} cls.res2 = mod2.fit(cov_type='HAC', cov_kwds=kwds2)
def test_regularized_refit(): n = 100 p = 5 np.random.seed(3132) xmat = np.random.normal(size=(n, p)) yvec = xmat.sum(1) + np.random.normal(size=n) model1 = OLS(yvec, xmat) result1 = model1.fit_regularized(alpha=2., L1_wt=0.5, refit=True) model2 = OLS(yvec, xmat) result2 = model2.fit_regularized(alpha=2., L1_wt=0.5, refit=True) assert_allclose(result1.params, result2.params) assert_allclose(result1.bse, result2.bse)
def setup_class(cls):
    cls.cov_type = 'HAC'

    kwds = {'kernel': sw.weights_uniform, 'maxlags': 2}
    mod1 = GLM(endog, exog, family=families.Gaussian())
    cls.res1 = mod1.fit(cov_type='HAC', cov_kwds=kwds)

    # check kernel as string
    mod2 = OLS(endog, exog)
    kwds2 = {'kernel': 'uniform', 'maxlags': 2}
    cls.res2 = mod2.fit(cov_type='HAC', cov_kwds=kwds2)
def test_regularized_options(): n = 100 p = 5 np.random.seed(3132) xmat = np.random.normal(size=(n, p)) yvec = xmat.sum(1) + np.random.normal(size=n) model1 = OLS(yvec - 1, xmat) result1 = model1.fit_regularized(alpha=1., L1_wt=0.5) model2 = OLS(yvec, xmat, offset=1) result2 = model2.fit_regularized(alpha=1., L1_wt=0.5, start_params=np.zeros(5)) assert_allclose(result1.params, result2.params)
def setup_class(cls): cls.cov_type = 'HAC' kwds={'kernel':sw.weights_uniform, 'maxlags':2} mod1 = GLM(endog, exog, family=families.Gaussian()) cls.res1 = mod1.fit(cov_type='HAC', cov_kwds=kwds) mod2 = OLS(endog, exog) cls.res2 = mod2.fit(cov_type='HAC', cov_kwds=kwds) #for debugging cls.res3 = mod2.fit(cov_type='HAC', cov_kwds={'maxlags':2})
def test_summary(): # test 734 import re dta = longley.load_pandas() X = dta.exog X["constant"] = 1 y = dta.endog with warnings.catch_warnings(record=True): res = OLS(y, X).fit() table = res.summary().as_latex() # replace the date and time table = re.sub("(?<=\n\\\\textbf\{Date:\} &).+?&", " Sun, 07 Apr 2013 &", table) table = re.sub("(?<=\n\\\\textbf\{Time:\} &).+?&", " 13:46:07 &", table) expected = """\\begin{center} \\begin{tabular}{lclc} \\toprule \\textbf{Dep. Variable:} & TOTEMP & \\textbf{ R-squared: } & 0.995 \\\\ \\textbf{Model:} & OLS & \\textbf{ Adj. R-squared: } & 0.992 \\\\ \\textbf{Method:} & Least Squares & \\textbf{ F-statistic: } & 330.3 \\\\ \\textbf{Date:} & Sun, 07 Apr 2013 & \\textbf{ Prob (F-statistic):} & 4.98e-10 \\\\ \\textbf{Time:} & 13:46:07 & \\textbf{ Log-Likelihood: } & -109.62 \\\\ \\textbf{No. Observations:} & 16 & \\textbf{ AIC: } & 233.2 \\\\ \\textbf{Df Residuals:} & 9 & \\textbf{ BIC: } & 238.6 \\\\ \\textbf{Df Model:} & 6 & \\textbf{ } & \\\\ \\bottomrule \\end{tabular} \\begin{tabular}{lccccc} & \\textbf{coef} & \\textbf{std err} & \\textbf{t} & \\textbf{P$>$$|$t$|$} & \\textbf{[95.0\\% Conf. Int.]} \\\\ \\midrule \\textbf{GNPDEFL} & 15.0619 & 84.915 & 0.177 & 0.863 & -177.029 207.153 \\\\ \\textbf{GNP} & -0.0358 & 0.033 & -1.070 & 0.313 & -0.112 0.040 \\\\ \\textbf{UNEMP} & -2.0202 & 0.488 & -4.136 & 0.003 & -3.125 -0.915 \\\\ \\textbf{ARMED} & -1.0332 & 0.214 & -4.822 & 0.001 & -1.518 -0.549 \\\\ \\textbf{POP} & -0.0511 & 0.226 & -0.226 & 0.826 & -0.563 0.460 \\\\ \\textbf{YEAR} & 1829.1515 & 455.478 & 4.016 & 0.003 & 798.788 2859.515 \\\\ \\textbf{constant} & -3.482e+06 & 8.9e+05 & -3.911 & 0.004 & -5.5e+06 -1.47e+06 \\\\ \\bottomrule \\end{tabular} \\begin{tabular}{lclc} \\textbf{Omnibus:} & 0.749 & \\textbf{ Durbin-Watson: } & 2.559 \\\\ \\textbf{Prob(Omnibus):} & 0.688 & \\textbf{ Jarque-Bera (JB): } & 0.684 \\\\ \\textbf{Skew:} & 0.420 & \\textbf{ Prob(JB): } & 0.710 \\\\ \\textbf{Kurtosis:} & 2.434 & \\textbf{ Cond. No. } & 4.86e+09 \\\\ \\bottomrule \\end{tabular} %\\caption{OLS Regression Results} \\end{center}""" assert_equal(table, expected)
def setup_class(cls): cls.cov_type = 'hac-groupsum' # time index is just made up to have a test case time = np.tile(np.arange(7), 5)[:-1] mod1 = GLM(endog, exog, family=families.Gaussian()) kwds = dict(time=pd.Series(time), # check for #3606 maxlags=2, use_correction='hac', df_correction=False) cls.res1 = mod1.fit(cov_type='hac-groupsum', cov_kwds=kwds) cls.res1b = mod1.fit(cov_type='nw-groupsum', cov_kwds=kwds) mod2 = OLS(endog, exog) cls.res2 = mod2.fit(cov_type='hac-groupsum', cov_kwds=kwds)
def test_regularized_refit(): n = 100 p = 5 np.random.seed(3132) xmat = np.random.normal(size=(n, p)) # covariates 0 and 2 matter yvec = xmat[:, 0] + xmat[:, 2] + np.random.normal(size=n) model1 = OLS(yvec, xmat) result1 = model1.fit_regularized(alpha=2., L1_wt=0.5, refit=True) model2 = OLS(yvec, xmat[:, [0, 2]]) result2 = model2.fit() ii = [0, 2] assert_allclose(result1.params[ii], result2.params) assert_allclose(result1.bse[ii], result2.bse)
def setup_class(cls): cls.cov_type = 'hac-panel' # time index is just made up to have a test case groups = np.repeat(np.arange(5), 7)[:-1] mod1 = GLM(endog.copy(), exog.copy(), family=families.Gaussian()) kwds = dict(groups=pd.Series(groups), # check for #3606 maxlags=2, kernel=sw.weights_uniform, use_correction='hac', df_correction=False) cls.res1 = mod1.fit(cov_type='hac-panel', cov_kwds=kwds) mod2 = OLS(endog, exog) cls.res2 = mod2.fit(cov_type='hac-panel', cov_kwds=kwds)
print(sm_res_adf)
print(my_res_adf['adfstat'])
print("%0.4f" % my_res_adf['adfstat'])

# ===== STABILITY CHECK =====
print(key, np.abs(my_res_adf['roots']))
print("passes stability check: {0}".format(is_stable(my_res_adf['roots'])))

from statsmodels.regression.linear_model import OLS

Y = y.diff()[1:]  # must remove first element from array which is nan
X = pd.concat([x.diff()[1:], e_t_hat.shift(1)[1:]], axis=1)
X_c = add_constant(X)

sm_res_ecm = OLS(Y, X).fit()      # fit without constant
sm_res_ecm_c = OLS(Y, X_c).fit()  # fit with constant
sm_res_ecm_c.summary2()
sm_res_ecm.summary2()

# ====== FIT TO OU PROCESS ======
# My implementations
from analysis import my_AR  # AR(p) model

# Import statsmodels equivalents to validate results
from statsmodels.tsa.ar_model import AR

# Run AR(1) model with constant term with e_t_hat as endogenous variable
my_res_ar = my_AR(endog=e_t_hat, maxlag=1, trend='c')
    2000, ))
T = T[::-1]

# For percentiles 1, 5 and 10, regress on a constant, and powers of 1/T
out = []
for cv in critical_values:
    num_ex = results.shape[2]
    loc = np.where(percentiles == cv)[0][0]
    lhs = np.squeeze(results[loc, :, :])
    # Adjust for effective sample size; this is what the lookup code uses
    tau = np.ones((num_ex, 1)).dot(T[None, :]) - 1.0
    tau = tau.T
    lhs = lhs.ravel()
    tau = tau.ravel()

    tau = tau[:, None]
    n = lhs.shape[0]
    rhs = np.ones((n, 1))
    rhs = np.hstack((rhs, 1.0 / tau))
    rhs = np.hstack((rhs, (1.0 / tau) ** 2.0))
    rhs = np.hstack((rhs, (1.0 / tau) ** 3.0))
    res = OLS(lhs, rhs).fit()
    res.params[np.abs(res.tvalues) < 1.96] = 0.0
    out.append(res.params)

adf_z_cv_approx[t] = np.array(out)

print("from numpy import array")
print("")
print("adf_z_cv_approx = " + str(adf_z_cv_approx))
def setup_class(cls): data = longley.load(as_pandas=False) data.exog = add_constant(data.exog, prepend=False) cls.res1 = OLS(data.endog, data.exog).fit() cls.res2 = WLS(data.endog, data.exog).fit()
def notyet_atst(): d = macrodata.load().data realinv = d['realinv'] realgdp = d['realgdp'] realint = d['realint'] endog = realinv exog = add_constant(np.c_[realgdp, realint]) res_ols1 = OLS(endog, exog).fit() #growth rates gs_l_realinv = 400 * np.diff(np.log(d['realinv'])) gs_l_realgdp = 400 * np.diff(np.log(d['realgdp'])) lint = d['realint'][:-1] tbilrate = d['tbilrate'][:-1] endogg = gs_l_realinv exogg = add_constant(np.c_[gs_l_realgdp, lint]) exogg2 = add_constant(np.c_[gs_l_realgdp, tbilrate]) res_ols = OLS(endogg, exogg).fit() res_ols2 = OLS(endogg, exogg2).fit() #the following were done accidentally with res_ols1 in R, #with original Greene data params = np.array( [-272.3986041341653, 0.1779455206941112, 0.2149432424658157]) cov_hac_4 = np.array([ 1321.569466333051, -0.2318836566017612, 37.01280466875694, -0.2318836566017614, 4.602339488102263e-05, -0.0104687835998635, 37.012804668757, -0.0104687835998635, 21.16037144168061 ]).reshape(3, 3, order='F') cov_hac_10 = np.array([ 2027.356101193361, -0.3507514463299015, 54.81079621448568, -0.350751446329901, 6.953380432635583e-05, -0.01268990195095196, 54.81079621448564, -0.01268990195095195, 22.92512402151113 ]).reshape(3, 3, order='F') #goldfeld-quandt het_gq_greater = dict(statistic=13.20512768685082, df1=99, df2=98, pvalue=1.246141976112324e-30, distr='f') het_gq_less = dict(statistic=13.20512768685082, df1=99, df2=98, pvalue=1.) het_gq_2sided = dict(statistic=13.20512768685082, df1=99, df2=98, pvalue=1.246141976112324e-30, distr='f') #goldfeld-quandt, fraction = 0.5 het_gq_greater_2 = dict(statistic=87.1328934692124, df1=48, df2=47, pvalue=2.154956842194898e-33, distr='f') gq = smsdia.het_goldfeldquandt(endog, exog, split=0.5) compare_t_est(gq, het_gq_greater, decimal=(13, 14)) assert_equal(gq[-1], 'increasing') harvey_collier = dict(stat=2.28042114041313, df=199, pvalue=0.02364236161988260, distr='t') #hc = harvtest(fm, order.by=ggdp , data = list()) harvey_collier_2 = dict(stat=0.7516918462158783, df=199, pvalue=0.4531244858006127, distr='t')
def plot_ccpr(results, exog_idx, ax=None):
    """Plot CCPR against one regressor.

    Generates a CCPR (component and component-plus-residual) plot.

    Parameters
    ----------
    results : result instance
        A regression results instance.
    exog_idx : {int, str}
        Exogenous, explanatory variable. If string is given, it should be
        the variable name that you want to use, and you can use arbitrary
        translations as with a formula.
    ax : Matplotlib AxesSubplot instance, optional
        If given, it is used to plot in instead of a new figure being
        created.

    Returns
    -------
    fig : Figure
        If `ax` is None, the created figure. Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    plot_ccpr_grid : Creates CCPR plot for multiple regressors in a plot
        grid.

    Notes
    -----
    The CCPR plot provides a way to judge the effect of one regressor on the
    response variable by taking into account the effects of the other
    independent variables. The partial residuals plot is defined as
    Residuals + B_i*X_i versus X_i. The component adds the B_i*X_i versus
    X_i to show where the fitted line would lie. Care should be taken if X_i
    is highly correlated with any of the other independent variables. If
    this is the case, the variance evident in the plot will be an
    underestimate of the true variance.

    Examples
    --------
    Using the state crime dataset plot the effect of the rate of single
    households ('single') on the murder rate while accounting for high school
    graduation rate ('hs_grad'), percentage of people in an urban area, and
    rate of poverty ('poverty').

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.formula.api as smf

    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single',
    ...                   data=crime_data.data).fit()
    >>> sm.graphics.plot_ccpr(results, 'single')
    >>> plt.show()

    .. plot:: plots/graphics_regression_ccpr.py

    References
    ----------
    http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ccpr.htm
    """
    fig, ax = utils.create_mpl_ax(ax)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    x1 = results.model.exog[:, exog_idx]
    #namestr = ' for %s' % self.name if self.name else ''
    x1beta = x1 * results.params[exog_idx]
    ax.plot(x1, x1beta + results.resid, 'o')
    from statsmodels.tools.tools import add_constant
    mod = OLS(x1beta, add_constant(x1)).fit()
    params = mod.params
    fig = abline_plot(*params, ax=ax)
    #ax.plot(x1, x1beta, '-')
    ax.set_title('Component and component plus residual plot')
    ax.set_ylabel("Residual + %s*beta_%d" % (exog_name, exog_idx))
    ax.set_xlabel("%s" % exog_name)

    return fig
def __slope_ols(x): x = x[~np.isnan(x)] xs = 2 * (x - min(x)) / (max(x) - min(x)) - 1 m = OLS(xs, np.vander(np.linspace(-1, 1, len(xs)), 2)).fit() return m.params[0]
r_2016 = r_shares.loc[r_shares['Year'] == 2016].copy()

# unify county/city names by capitalizing all
acs['County/City'] = acs['County/City'].str.upper()
r_2016['County/City'] = r_2016['County/City'].str.upper()

# join dataframes
r_acs = pd.merge(acs, r_2016, on='County/City')

sbn.pairplot(
    r_acs,
    vars=['Household Income', 'Medicare Coverage', 'Foreign Born', 'R_SHARE'])
fig2 = plt.gcf()
fig2.savefig('pairplot.png', bbox_inches='tight')

from statsmodels.regression.linear_model import OLS

# define variables for regression
X = r_acs[['Household Income', 'Medicare Coverage', 'Foreign Born']].copy()
X['Intercept'] = 1
y = r_acs['R_SHARE']

# run regression
reg = OLS(y, X)
fit = reg.fit()
fit.summary()

# save coefficients
pd.DataFrame(fit.params).to_csv('coefficients.csv')
def get_tvalue_with_alternative_library(tested_vars, target_vars, covars=None):
    """Utility function to compute tvalues with linalg or statsmodels

    Massively univariate linear model (= each target is considered
    independently).

    Parameters
    ----------
    tested_vars: array-like, shape=(n_samples, n_regressors)
      Tested variates, the associated coefficient of which are to be tested
      independently with a t-test, resulting in as many t-values.

    target_vars: array-like, shape=(n_samples, n_targets)
      Target variates, to be approximated with a linear combination of the
      tested variates and the confounding variates.

    covars: array-like, shape=(n_samples, n_confounds)
      Confounding variates, to be fitted but not to be tested

    Returns
    -------
    t-values: np.ndarray, shape=(n_targets, n_regressors)

    """
    ### set up design
    n_samples, n_regressors = tested_vars.shape
    n_targets = target_vars.shape[1]
    if covars is not None:
        n_covars = covars.shape[1]
        design_matrix = np.hstack((tested_vars, covars))
    else:
        n_covars = 0
        design_matrix = tested_vars
    mask_covars = np.ones(n_regressors + n_covars, dtype=bool)
    mask_covars[:n_regressors] = False
    test_matrix = np.array([[1.] + [0.] * n_covars])

    ### t-values computation
    try:  # try with statsmodels if available (more concise)
        from statsmodels.regression.linear_model import OLS
        t_values = np.empty((n_targets, n_regressors))
        for i in range(n_targets):
            current_target = target_vars[:, i].reshape((-1, 1))
            for j in range(n_regressors):
                current_tested_mask = mask_covars.copy()
                current_tested_mask[j] = True
                current_design_matrix = design_matrix[:, current_tested_mask]
                ols_fit = OLS(current_target, current_design_matrix).fit()
                t_values[i, j] = np.ravel(ols_fit.t_test(test_matrix).tvalue)
    except ImportError:  # fall back on linalg if statsmodels is not available
        from numpy import linalg
        lost_dof = n_covars + 1  # fit all tested variates independently
        t_values = np.empty((n_targets, n_regressors))
        for i in range(n_regressors):
            current_tested_mask = mask_covars.copy()
            current_tested_mask[i] = True
            current_design_matrix = design_matrix[:, current_tested_mask]
            invcov = linalg.pinv(current_design_matrix)
            normalized_cov = np.dot(invcov, invcov.T)
            t_val_denom_aux = np.diag(
                np.dot(test_matrix, np.dot(normalized_cov, test_matrix.T)))
            t_val_denom_aux = t_val_denom_aux.reshape((-1, 1))
            for j in range(n_targets):
                current_target = target_vars[:, j].reshape((-1, 1))
                res_lstsq = linalg.lstsq(current_design_matrix, current_target)
                residuals = (current_target
                             - np.dot(current_design_matrix, res_lstsq[0]))
                t_val_num = np.dot(test_matrix, res_lstsq[0])
                t_val_denom = np.sqrt(
                    np.sum(residuals ** 2, 0)
                    / float(n_samples - lost_dof) * t_val_denom_aux)
                t_values[j, i] = np.ravel(t_val_num / t_val_denom)

    return t_values
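# --- Usage sketch (illustrative): exercise the utility above on random data.
# With 3 tested regressors, 2 confounds and 4 targets, the returned array has
# shape (n_targets, n_regressors). Assumes numpy is imported as np.
rng = np.random.RandomState(42)
tested_vars = rng.randn(50, 3)
target_vars = rng.randn(50, 4)
covars = rng.randn(50, 2)

t_values = get_tvalue_with_alternative_library(tested_vars, target_vars,
                                               covars)
print(t_values.shape)  # (4, 3)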
def setupClass(cls): data = longley.load() data.exog = add_constant(data.exog, prepend=False) cls.res1 = OLS(data.endog, data.exog).fit() R = np.identity(7)[:-1, :] cls.Ftest = cls.res1.f_test(R)
def local_fdr(zscores, null_proportion=1.0, null_pdf=None, deg=7, nbins=30): """ Calculate local FDR values for a list of Z-scores. Parameters ---------- zscores : array_like A vector of Z-scores null_proportion : float The assumed proportion of true null hypotheses null_pdf : function mapping reals to positive reals The density of null Z-scores; if None, use standard normal deg : int The maximum exponent in the polynomial expansion of the density of non-null Z-scores nbins : int The number of bins for estimating the marginal density of Z-scores. Returns ------- fdr : array_like A vector of FDR values References ---------- B Efron (2008). Microarrays, Empirical Bayes, and the Two-Groups Model. Statistical Science 23:1, 1-22. Examples -------- Basic use (the null Z-scores are taken to be standard normal): >>> from statsmodels.stats.multitest import local_fdr >>> import numpy as np >>> zscores = np.random.randn(30) >>> fdr = local_fdr(zscores) Use a Gaussian null distribution estimated from the data: >>> null = EmpiricalNull(zscores) >>> fdr = local_fdr(zscores, null_pdf=null.pdf) """ from statsmodels.genmod.generalized_linear_model import GLM from statsmodels.genmod.generalized_linear_model import families from statsmodels.regression.linear_model import OLS # Bins for Poisson modeling of the marginal Z-score density minz = min(zscores) maxz = max(zscores) bins = np.linspace(minz, maxz, nbins) # Bin counts zhist = np.histogram(zscores, bins)[0] # Bin centers zbins = (bins[:-1] + bins[1:]) / 2 # The design matrix at bin centers dmat = np.vander(zbins, deg + 1) # Use this to get starting values for Poisson regression md = OLS(np.log(1 + zhist), dmat).fit() # Poisson regression md = GLM(zhist, dmat, family=families.Poisson()).fit(start_params=md.params) # The design matrix for all Z-scores dmat_full = np.vander(zscores, deg + 1) # The height of the estimated marginal density of Z-scores, # evaluated at every observed Z-score. fz = md.predict(dmat_full) / (len(zscores) * (bins[1] - bins[0])) # The null density. if null_pdf is None: f0 = np.exp(-0.5 * zscores**2) / np.sqrt(2 * np.pi) else: f0 = null_pdf(zscores) # The local FDR values fdr = null_proportion * f0 / fz fdr = np.clip(fdr, 0, 1) return fdr
def test_influence_wrapped(): from pandas import DataFrame from pandas.util.testing import assert_series_equal d = macrodata.load_pandas().data #growth rates gs_l_realinv = 400 * np.log(d['realinv']).diff().dropna() gs_l_realgdp = 400 * np.log(d['realgdp']).diff().dropna() lint = d['realint'][:-1] # re-index these because they won't conform to lint gs_l_realgdp.index = lint.index gs_l_realinv.index = lint.index data = dict(const=np.ones_like(lint), lint=lint, lrealgdp=gs_l_realgdp) #order is important exog = DataFrame(data, columns=['const', 'lrealgdp', 'lint']) res = OLS(gs_l_realinv, exog).fit() #basic # already tested #assert_almost_equal(lsdiag['cov.scaled'], # res.cov_params().values.ravel(), decimal=14) #assert_almost_equal(lsdiag['cov.unscaled'], # res.normalized_cov_params.values.ravel(), decimal=14) infl = oi.OLSInfluence(res) # smoke test just to make sure it works, results separately tested df = infl.summary_frame() assert_(isinstance(df, DataFrame)) #this test is slow path = os.path.join(cur_dir, "results", "influence_lsdiag_R.json") with open(path, "r") as fp: lsdiag = json.load(fp) c0, c1 = infl.cooks_distance #TODO: what's c1, it's pvalues? -ss #NOTE: we get a hard-cored 5 decimals with pandas testing assert_almost_equal(c0, lsdiag['cooks'], 14) assert_almost_equal(infl.hat_matrix_diag, (lsdiag['hat']), 14) assert_almost_equal(infl.resid_studentized_internal, lsdiag['std.res'], 14) #slow: dffits, dffth = infl.dffits assert_almost_equal(dffits, lsdiag['dfits'], 14) assert_almost_equal(infl.resid_studentized_external, lsdiag['stud.res'], 14) import pandas fn = os.path.join(cur_dir, "results/influence_measures_R.csv") infl_r = pandas.read_csv(fn, index_col=0) conv = lambda s: 1 if s == 'TRUE' else 0 fn = os.path.join(cur_dir, "results/influence_measures_bool_R.csv") #not used yet: #infl_bool_r = pandas.read_csv(fn, index_col=0, # converters=dict(zip(lrange(7),[conv]*7))) infl_r2 = np.asarray(infl_r) #TODO: finish wrapping this stuff assert_almost_equal(infl.dfbetas, infl_r2[:, :3], decimal=13) assert_almost_equal(infl.cov_ratio, infl_r2[:, 4], decimal=14)
def fit(self):
    """estimate the model and compute the Anova table

    Returns
    -------
    AnovaResults instance
    """
    y = self.data[self.depvar].values

    # Construct OLS endog and exog from string using patsy
    within = ['C(%s, Sum)' % i for i in self.within]
    subject = 'C(%s, Sum)' % self.subject
    factors = within + [subject]
    x = patsy.dmatrix('*'.join(factors), data=self.data)
    term_slices = x.design_info.term_name_slices
    for key in term_slices:
        ind = np.array([False] * x.shape[1])
        ind[term_slices[key]] = True
        term_slices[key] = np.array(ind)
    term_exclude = [':'.join(factors)]
    ind = _not_slice(term_slices, term_exclude, x.shape[1])
    x = x[:, ind]

    # Fit OLS
    model = OLS(y, x)
    results = model.fit()
    if model.rank < x.shape[1]:
        raise ValueError('Independent variables are collinear.')
    for i in term_exclude:
        term_slices.pop(i)
    for key in term_slices:
        term_slices[key] = term_slices[key][ind]
    params = results.params
    df_resid = results.df_resid
    ssr = results.ssr

    columns = ['F Value', 'Num DF', 'Den DF', 'Pr > F']
    anova_table = pd.DataFrame(np.zeros((0, 4)), columns=columns)

    for key in term_slices:
        if self.subject not in key and key != 'Intercept':
            # Independent variables are orthogonal
            ssr1, df_resid1 = _ssr_reduced_model(
                y, x, term_slices, params, [key])
            df1 = df_resid1 - df_resid
            msm = (ssr1 - ssr) / df1
            if (key == ':'.join(factors[:-1]) or
                    (key + ':' + subject not in term_slices)):
                mse = ssr / df_resid
                df2 = df_resid
            else:
                ssr1, df_resid1 = _ssr_reduced_model(
                    y, x, term_slices, params, [key + ':' + subject])
                df2 = df_resid1 - df_resid
                mse = (ssr1 - ssr) / df2
            F = msm / mse
            p = stats.f.sf(F, df1, df2)
            term = key.replace('C(', '').replace(', Sum)', '')
            anova_table.loc[term, 'F Value'] = F
            anova_table.loc[term, 'Num DF'] = df1
            anova_table.loc[term, 'Den DF'] = df2
            anova_table.loc[term, 'Pr > F'] = p

    return AnovaResults(anova_table)
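# --- Usage sketch (illustrative): the fit() above matches the interface of a
# repeated-measures ANOVA class such as statsmodels' AnovaRM, assumed here as
# the entry point. The design must be balanced: each subject sees every
# within-factor level exactly once.
import numpy as np
import pandas as pd
from statsmodels.stats.anova import AnovaRM

rng = np.random.RandomState(0)
df = pd.DataFrame({
    'subject': np.repeat(np.arange(10), 2),
    'cond': np.tile(['a', 'b'], 10),
    'rt': rng.standard_normal(20),
})
res = AnovaRM(df, depvar='rt', subject='subject', within=['cond']).fit()
print(res.anova_table)  # F Value / Num DF / Den DF / Pr > F per within term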
def hacked_gct(x, maxlag, addconst=True, verbose=True): #from scipy import stats x = np.asarray(x) if x.shape[0] <= 3 * maxlag + int(addconst): raise ValueError( "Insufficient observations. Maximum allowable " "lag is {0}".format(int((x.shape[0] - int(addconst)) / 3) - 1)) resli = {} for mlg in range(1, maxlag + 1): result = {} if verbose: print('\nGranger Causality') print('number of lags (no zero)', mlg) mxlg = mlg # create lagmat of both time series dta = lagmat2ds(x, mxlg, trim='both', dropex=1) #add constant if addconst: '''dtaown = add_constant(dta[:, 1:(mxlg + 1)], prepend=False)''' dtajoint = add_constant(dta[:, 1:], prepend=False) else: raise NotImplementedError('Not Implemented') #dtaown = dta[:, 1:mxlg] #dtajoint = dta[:, 1:] # Run ols on both models without and with lags of second variable '''res2down = OLS(dta[:, 0], dtaown).fit()''' res2down = 'skipped' res2djoint = OLS(dta[:, 0], dtajoint).fit() #print results #for ssr based tests see: #http://support.sas.com/rnd/app/examples/ets/granger/index.htm #the other tests are made-up ''' # Granger Causality test using ssr (F statistic) fgc1 = ((res2down.ssr - res2djoint.ssr) / res2djoint.ssr / mxlg * res2djoint.df_resid) if verbose: print('ssr based F test: F=%-8.4f, p=%-8.4f, df_denom=%d,' ' df_num=%d' % (fgc1, stats.f.sf(fgc1, mxlg, res2djoint.df_resid), res2djoint.df_resid, mxlg)) result['ssr_ftest'] = (fgc1, stats.f.sf(fgc1, mxlg, res2djoint.df_resid), res2djoint.df_resid, mxlg) # Granger Causality test using ssr (ch2 statistic) fgc2 = res2down.nobs * (res2down.ssr - res2djoint.ssr) / res2djoint.ssr if verbose: print('ssr based chi2 test: chi2=%-8.4f, p=%-8.4f, ' 'df=%d' % (fgc2, stats.chi2.sf(fgc2, mxlg), mxlg)) result['ssr_chi2test'] = (fgc2, stats.chi2.sf(fgc2, mxlg), mxlg) #likelihood ratio test pvalue: lr = -2 * (res2down.llf - res2djoint.llf) if verbose: print('likelihood ratio test: chi2=%-8.4f, p=%-8.4f, df=%d' % (lr, stats.chi2.sf(lr, mxlg), mxlg)) result['lrtest'] = (lr, stats.chi2.sf(lr, mxlg), mxlg) ''' # F test that all lag coefficients of exog are zero rconstr = np.column_stack((np.zeros( (mxlg, mxlg)), np.eye(mxlg, mxlg), np.zeros((mxlg, 1)))) ftres = res2djoint.f_test(rconstr) if verbose: print('parameter F test: F=%-8.4f, p=%-8.4f, df_denom=%d,' ' df_num=%d' % (ftres.fvalue, ftres.pvalue, ftres.df_denom, ftres.df_num)) result['params_ftest'] = (np.squeeze(ftres.fvalue)[()], np.squeeze(ftres.pvalue)[()], ftres.df_denom, ftres.df_num) resli[mxlg] = (result, [res2down, res2djoint, rconstr]) return resli
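# --- Usage sketch (illustrative): run the hacked Granger test above on a
# pair of series where the second column drives the first. Assumes the
# snippet's own imports (np, lagmat2ds, add_constant, OLS) are in scope.
rng = np.random.RandomState(0)
n = 300
x2 = rng.standard_normal(n)
x1 = np.zeros(n)
for t in range(1, n):
    x1[t] = 0.5 * x1[t - 1] + 0.8 * x2[t - 1] + 0.1 * rng.standard_normal()

resli = hacked_gct(np.column_stack((x1, x2)), maxlag=2, verbose=False)
result, _ = resli[2]  # results for 2 lags
print(result['params_ftest'])  # (F, p, df_denom, df_num); p should be tiny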
def plot_partregress(endog, exog_i, exog_others, data=None, title_kwargs={},
                     obs_labels=True, label_kwargs={}, ax=None,
                     ret_coords=False, **kwargs):
    """Plot partial regression for a single regressor.

    Parameters
    ----------
    endog : {ndarray, str}
        The endogenous or response variable. If string is given, you can use
        arbitrary translations as with a formula.
    exog_i : {ndarray, str}
        The exogenous, explanatory variable. If string is given, you can use
        arbitrary translations as with a formula.
    exog_others : {ndarray, list[str]}
        Other exogenous, explanatory variables. If a list of strings is
        given, each item is a term in formula. You can use arbitrary
        translations as with a formula. The effect of these variables will
        be removed by OLS regression.
    data : DataFrame, dict, or recarray
        Some kind of data structure with names if the other variables are
        given as strings.
    title_kwargs : dict
        Keyword arguments to pass on for the title. The key to control the
        fonts is fontdict.
    obs_labels : bool or array_like
        Whether or not to annotate the plot points with their observation
        labels. If obs_labels is a boolean, the point labels will try to do
        the right thing. First it will try to use the index of data, then
        fall back to the index of exog_i. Alternatively, you may give an
        array-like object corresponding to the observation numbers.
    label_kwargs : dict
        Keyword arguments that control annotate for the observation labels.
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure
        being created.
    ret_coords : bool
        If True will return the coordinates of the points in the plot. You
        can use this to add your own annotations.
    **kwargs
        The keyword arguments passed to plot for the points.

    Returns
    -------
    fig : Figure
        If `ax` is None, the created figure. Otherwise the figure to which
        `ax` is connected.
    coords : list, optional
        If ret_coords is True, return a tuple of arrays (x_coords, y_coords).

    Notes
    -----
    The slope of the fitted line is that of `exog_i` in the full multiple
    regression. The individual points can be used to assess the influence
    of points on the estimated coefficient.

    See Also
    --------
    plot_partregress_grid : Plot partial regression for a set of regressors.

    Examples
    --------
    Load the Statewide Crime data set and plot partial regression of the rate
    of high school graduation (hs_grad) on the murder rate (murder). The
    effects of the percent of the population living in urban areas (urban),
    below the poverty line (poverty), and in a single person household
    (single) are removed by OLS regression.

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt
    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> sm.graphics.plot_partregress(endog='murder', exog_i='hs_grad',
    ...                              exog_others=['urban', 'poverty', 'single'],
    ...                              data=crime_data.data, obs_labels=False)
    >>> plt.show()

    .. plot:: plots/graphics_regression_partregress.py

    More detailed examples can be found in the Regression Plots notebook
    on the examples page.
    """
    #NOTE: there is no interaction between possible missing data and
    #obs_labels yet, so this will need to be tweaked a bit for this case
    fig, ax = utils.create_mpl_ax(ax)

    # strings, use patsy to transform to data
    if isinstance(endog, str):
        endog = dmatrix(endog + "-1", data)

    if isinstance(exog_others, str):
        RHS = dmatrix(exog_others, data)
    elif isinstance(exog_others, list):
        RHS = "+".join(exog_others)
        RHS = dmatrix(RHS, data)
    else:
        RHS = exog_others
    RHS_isempty = False
    if isinstance(RHS, np.ndarray) and RHS.size == 0:
        RHS_isempty = True
    elif isinstance(RHS, pd.DataFrame) and RHS.empty:
        RHS_isempty = True
    if isinstance(exog_i, str):
        exog_i = dmatrix(exog_i + "-1", data)

    # all arrays or pandas-like
    if RHS_isempty:
        ax.plot(endog, exog_i, 'o', **kwargs)
        fitted_line = OLS(endog, exog_i).fit()
        x_axis_endog_name = 'x' if isinstance(exog_i, np.ndarray) else exog_i.name
        y_axis_endog_name = 'y' if isinstance(
            endog, np.ndarray) else endog.design_info.column_names[0]
    else:
        res_yaxis = OLS(endog, RHS).fit()
        res_xaxis = OLS(exog_i, RHS).fit()
        xaxis_resid = res_xaxis.resid
        yaxis_resid = res_yaxis.resid
        x_axis_endog_name = res_xaxis.model.endog_names
        y_axis_endog_name = res_yaxis.model.endog_names
        ax.plot(xaxis_resid, yaxis_resid, 'o', **kwargs)
        fitted_line = OLS(yaxis_resid, xaxis_resid).fit()

    fig = abline_plot(0, fitted_line.params[0], color='k', ax=ax)

    if x_axis_endog_name == 'y':  # for no names regression will just get a y
        x_axis_endog_name = 'x'  # this is misleading, so use x
    ax.set_xlabel("e(%s | X)" % x_axis_endog_name)
    ax.set_ylabel("e(%s | X)" % y_axis_endog_name)
    ax.set_title('Partial Regression Plot', **title_kwargs)

    #NOTE: if we want to get super fancy, we could annotate if a point is
    #clicked using this widget
    #http://stackoverflow.com/questions/4652439/
    #is-there-a-matplotlib-equivalent-of-matlabs-datacursormode/
    #4674445#4674445
    if obs_labels is True:
        if data is not None:
            obs_labels = data.index
        elif hasattr(exog_i, "index"):
            obs_labels = exog_i.index
        else:
            obs_labels = res_xaxis.model.data.row_labels
        #NOTE: row_labels can be None.
        #Maybe we should fix this to never be the case.
        if obs_labels is None:
            obs_labels = lrange(len(exog_i))

    if obs_labels is not False:  # could be array_like
        if len(obs_labels) != len(exog_i):
            raise ValueError("obs_labels does not match length of exog_i")
        label_kwargs.update(dict(ha="center", va="bottom"))
        ax = utils.annotate_axes(lrange(len(obs_labels)), obs_labels,
                                 lzip(res_xaxis.resid, res_yaxis.resid),
                                 [(0, 5)] * len(obs_labels), "x-large", ax=ax,
                                 **label_kwargs)

    if ret_coords:
        return fig, (res_xaxis.resid, res_yaxis.resid)
    else:
        return fig
def test_norm_resid_zero_variance(self): with warnings.catch_warnings(record=True): y = self.res1.model.endog res = OLS(y, y).fit() assert_allclose(res.scale, 0, atol=1e-20) assert_allclose(res.wresid, res.resid_pearson, atol=5e-11)
nobs = 100
lb, ub = -1, 2
x = np.linspace(lb, ub, nobs)
x = np.sin(x)
exog = x[:, None] ** np.arange(order + 1)
y_true = exog.sum(1)
y = y_true + sigma_noise * np.random.randn(nobs)

#xind = np.argsort(x)
pmod = smoothers.PolySmoother(2, x)
pmod.fit(y)  # no return
y_pred = pmod.predict(x)
error = y - y_pred
mse = (error * error).mean()
print(mse)
res_ols = OLS(y, exog[:, :3]).fit()
print(np.squeeze(pmod.coef) - res_ols.params)

weights = np.ones(nobs)
weights[:nobs // 3] = 0.1
weights[-nobs // 5:] = 2

pmodw = smoothers.PolySmoother(2, x)
pmodw.fit(y, weights=weights)  # no return
y_predw = pmodw.predict(x)
error = y - y_predw
mse = (error * error).mean()
print(mse)
res_wls = WLS(y, exog[:, :3], weights=weights).fit()
print(np.squeeze(pmodw.coef) - res_wls.params)
def setup_class(cls): data = longley.load(as_pandas=False) data.exog = add_constant(data.exog, prepend=False) cls.endog = data.endog cls.exog = data.exog cls.ols_model = OLS(data.endog, data.exog)
z = y_true #alias check d = x y = y_true + sigma_noise * np.random.randn(nobs) example = 1 if example == 1: m = AdditiveModel(d) m.fit(y) y_pred = m.results.predict(d) for ss in m.smoothers: print(ss.params) res_ols = OLS(y, exog_reduced).fit() print(res_ols.params) #assert_almost_equal(y_pred, res_ols.fittedvalues, 3) if example > 0: import matplotlib.pyplot as plt plt.figure() plt.plot(exog) y_pred = m.results.mu # + m.results.alpha #m.results.predict(d) plt.figure() plt.subplot(2, 2, 1) plt.plot(y, '.', alpha=0.25) plt.plot(y_true, 'k-', label='true')
def linear_regression_orp(db, test_swp, nb): from statsmodels.regression.linear_model import OLS random_seed = 0 random.seed(random_seed) # Load data db.reset_index(inplace=True, drop=True) # X_train, y_train, index, cr = orp_db_generator(db, nb = 80) filename = "orp_80_biggest2" # np_save_training_data(filename,X_train, y_train) # print('saved') X_train, y_train = np_read_training_data(filename) X_test, y_test, index2, created = orp_db_generator(test_swp, nb=0, bool=True) from scipy.stats import norm # Build the quantile def ols_quantile(m, X, q): # m: OLS statsmodels model. # X: X matrix. # q: Quantile. mean_pred = m.predict(X) se = np.sqrt(m.scale) print(se) return mean_pred + norm.ppf(q) * se model = OLS(y_train[:].astype(float), X_train.astype(float)) model = model.fit() print('model fitted') predictions = pd.DataFrame(y_test) predictions['lower'] = ols_quantile(model, X_test.astype(float), 0.48) predictions['upper'] = ols_quantile(model, X_test.astype(float), 0.52) # Displays the main metrics predictions = predictions.set_index(pd.DatetimeIndex(created[0].values)) predictions.rename(columns={0: 'measures'}, inplace=True) axes = predictions.plot(style='.-', color=['blue', 'red', 'green', 'red']) print("Mean absolute error " + str( mean_absolute_error(y_pred=(predictions['lower'] + predictions['upper']) / 2, y_true=y_test))) print("Quantile loss {}".format( full_quantile_loss(y_test, predictions['lower'], predictions['upper'], alpha=0.05))) predictions.rename(columns={'measures': 0}, inplace=True) print("Coverage {}".format(coverage(predictions))) predictions.rename(columns={0: 'measures'}, inplace=True) # Count the number of value under the min interval bound and above the upper interval bound zz1 = np.greater_equal(y_test, predictions['upper']) zz2 = np.greater_equal(predictions['lower'], y_test) print(sum(zz1)) print(sum(zz2)) anomalies = [x or y for (x, y) in zip(zz1, zz2)] # anomalies = int(anomalies) anomalies = list(map(int, anomalies)) anomalies = [element * 400 for element in anomalies] # print(anomalies) # anomalies = np.asarray(anomalies) anomalies = pd.DataFrame(anomalies) # print(anomalies[0].sum()) # Displays a star for each anomaly anomalies = anomalies.set_index(pd.DatetimeIndex(created[0].values)) anomalies.plot(color='r', marker="*", linewidth=0, ax=axes) predictions.rename(columns={'measures': 0}, inplace=True) zz1 = np.greater_equal(predictions[0], predictions['upper']) zz2 = np.greater_equal(predictions['lower'], predictions[0]) deriv = [x or y for (x, y) in zip(zz1, zz2)] filename = "lin_swp_orp_" + str(nb) np.save(filename, deriv) filename = "lin_index_orp_" + str(nb) np.save(filename, index2) predictions.rename(columns={0: 'measures'}, inplace=True) return predictions
def fit(self, maxlag=None, method='cmle', ic=None, trend='c',
        transparams=True, start_params=None, solver='lbfgs', maxiter=35,
        full_output=1, disp=1, callback=None, **kwargs):
    """
    Fit the unconditional maximum likelihood of an AR(p) process.

    Parameters
    ----------
    maxlag : int
        If `ic` is None, then maxlag is the lag length used in fit. If
        `ic` is specified then maxlag is the highest lag order used to
        select the correct lag order. If maxlag is None, the default is
        round(12*(nobs/100.)**(1/4.))
    method : str {'cmle', 'mle'}, optional
        cmle - Conditional maximum likelihood using OLS
        mle - Unconditional (exact) maximum likelihood. See `solver` and
        the Notes.
    ic : str {'aic','bic','hqic','t-stat'}
        Criterion used for selecting the optimal lag length.
        aic - Akaike Information Criterion
        bic - Bayes Information Criterion
        t-stat - Based on last lag
        hqic - Hannan-Quinn Information Criterion
        If any of the information criteria are selected, the lag length
        which results in the lowest value is selected. If t-stat, the
        model starts with maxlag and drops a lag until the highest lag
        has a t-stat that is significant at the 95 % level.
    trend : str {'c','nc'}
        Whether to include a constant or not.
        'c' - include constant.
        'nc' - no constant.

    The options below can be specified only when method is 'mle'.

    transparams : bool, optional
        Whether or not to transform the parameters to ensure stationarity.
        Uses the transformation suggested in Jones (1980).
    start_params : array_like, optional
        A first guess on the parameters. Default is cmle estimates.
    solver : str or None, optional
        Solver to be used if method is 'mle'. The default is 'lbfgs'
        (limited memory Broyden-Fletcher-Goldfarb-Shanno). Other choices
        are 'bfgs', 'newton' (Newton-Raphson), 'nm' (Nelder-Mead), 'cg' -
        (conjugate gradient), 'ncg' (non-conjugate gradient), and
        'powell'.
    maxiter : int, optional
        The maximum number of function evaluations. Default is 35.
    tol : float
        The convergence tolerance. Default is 1e-08.
    full_output : bool, optional
        If True, all output from solver will be available in the Results
        object's mle_retvals attribute. Output is dependent on the solver.
        See Notes for more information.
    disp : bool, optional
        If True, convergence information is output.
    callback : function, optional
        Called after each iteration as callback(xk) where xk is the
        current parameter vector.
    kwargs
        See Notes for keyword arguments that can be passed to fit.

    References
    ----------
    Jones, R.H. 1980 "Maximum likelihood fitting of ARMA models to time
        series with missing observations." `Technometrics`. 22.3. 389-95.
See Also -------- statsmodels.base.model.LikelihoodModel.fit """ method = method.lower() if method not in ['cmle', 'yw', 'mle']: raise ValueError("Method %s not recognized" % method) self.method = method self.trend = trend self.transparams = transparams nobs = len(self.endog) # overwritten if method is 'cmle' endog = self.endog if maxlag is None: maxlag = int(round(12 * (nobs / 100.)**(1 / 4.))) k_ar = maxlag # stays this if ic is None # select lag length if ic is not None: ic = ic.lower() if ic not in ['aic', 'bic', 'hqic', 't-stat']: raise ValueError("ic option %s not understood" % ic) k_ar = self.select_order(k_ar, ic, trend, method) self.k_ar = k_ar # change to what was chosen by ic # redo estimation for best lag # make LHS Y = endog[k_ar:, :] # make lagged RHS X = self._stackX(k_ar, trend) # sets self.k_trend k_trend = self.k_trend self.exog_names = util.make_lag_names(self.endog_names, k_ar, k_trend) self.Y = Y self.X = X if method == "cmle": # do OLS arfit = OLS(Y, X).fit() params = arfit.params self.nobs = nobs - k_ar self.sigma2 = arfit.ssr / arfit.nobs # needed for predict fcasterr elif method == "mle": solver = solver.lower() self.nobs = nobs if start_params is None: start_params = OLS(Y, X).fit().params else: if len(start_params) != k_trend + k_ar: raise ValueError("Length of start params is %d. There" " are %d parameters." % (len(start_params), k_trend + k_ar)) start_params = self._invtransparams(start_params) if solver == 'lbfgs': kwargs.setdefault('pgtol', 1e-8) kwargs.setdefault('factr', 1e2) kwargs.setdefault('m', 12) kwargs.setdefault('approx_grad', True) mlefit = super(AR, self).fit(start_params=start_params, method=solver, maxiter=maxiter, full_output=full_output, disp=disp, callback=callback, **kwargs) params = mlefit.params if self.transparams: params = self._transparams(params) self.transparams = False # turn off now for other results # don't use yw, because we can't estimate the constant #elif method == "yw": # params, omega = yule_walker(endog, order=maxlag, # method="mle", demean=False) # # how to handle inference after Yule-Walker? # self.params = params #TODO: don't attach here # self.omega = omega pinv_exog = np.linalg.pinv(X) normalized_cov_params = np.dot(pinv_exog, pinv_exog.T) arfit = ARResults(self, params, normalized_cov_params) if method == 'mle' and full_output: arfit.mle_retvals = mlefit.mle_retvals arfit.mle_settings = mlefit.mle_settings return ARResultsWrapper(arfit)
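# --- Usage sketch (illustrative) for the legacy AR.fit shown above (newer
# statsmodels releases replace AR with AutoReg): simulate an AR(2) series and
# let BIC pick the lag order under conditional MLE.
import numpy as np
from statsmodels.tsa.ar_model import AR

rng = np.random.RandomState(0)
y = np.zeros(500)
for t in range(2, 500):
    y[t] = 0.6 * y[t - 1] - 0.3 * y[t - 2] + rng.standard_normal()

ar_mod = AR(y)
ar_res = ar_mod.fit(maxlag=8, ic='bic', trend='c', method='cmle')
print(ar_mod.k_ar)    # lag order selected by BIC
print(ar_res.params)  # constant followed by AR coefficients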
def calc_ols_rsquared(df, idx): return OLS(df.iloc[:, idx], df.loc[:, np.arange(df.shape[1]) != idx]).fit().rsquared
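# --- Usage sketch (illustrative): calc_ols_rsquared regresses column idx on
# all other columns; 1 / (1 - R^2) is the classic variance inflation factor.
# Assumes OLS is imported as in the surrounding module.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.standard_normal((100, 4)))
df[3] = df[0] + df[1] + 0.1 * rng.standard_normal(100)  # near-collinear column

r2 = calc_ols_rsquared(df, 3)
print(r2, 1.0 / (1.0 - r2))  # high R^2 => large VIF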
def setupClass(cls): super(TestNxNxOne, cls).setupClass() cls.mod2 = OLS(cls.endog_n_, cls.exog_n_one) cls.mod2.df_model += 1 cls.res2 = cls.mod2.fit()
def setup_class(cls): data = stackloss.load(as_pandas=False) data.exog = add_constant(data.exog) cls.res1 = OLS(data.endog, data.exog).fit() cls.res2 = RegressionResults()
def factor_alpha_beta(factor_data,
                      returns=None,
                      demeaned=True,
                      group_adjust=False,
                      equal_weight=False):
    """
    Compute the alpha (excess returns) and beta (market exposure) of a
    factor. A regression is run with the period wise factor universe mean
    return as the independent variable and mean period wise return from a
    portfolio weighted by factor values as the dependent variable.

    Parameters
    ----------
    factor_data : pd.DataFrame - MultiIndex
        A MultiIndex DataFrame indexed by date (level 0) and asset (level 1),
        containing the values for a single alpha factor, forward returns for
        each period, the factor quantile/bin that factor value belongs to,
        and (optionally) the group the asset belongs to.
        - See full explanation in utils.get_clean_factor_and_forward_returns
    returns : pd.DataFrame, optional
        Period wise factor returns. If this is None then it will be computed
        with 'factor_returns' function and the passed flags: 'demeaned',
        'group_adjust', 'equal_weight'
    demeaned : bool
        Control how to build factor returns used for alpha/beta computation
        -- see performance.factor_returns for a full explanation
    group_adjust : bool
        Control how to build factor returns used for alpha/beta computation
        -- see performance.factor_returns for a full explanation
    equal_weight : bool, optional
        Control how to build factor returns used for alpha/beta computation
        -- see performance.factor_returns for a full explanation

    Returns
    -------
    alpha_beta : pd.DataFrame
        A DataFrame with rows 'Ann. alpha' and 'beta' and one column per
        forward-returns period.
    """
    if returns is None:
        returns = \
            factor_returns(factor_data, demeaned, group_adjust, equal_weight)

    universe_ret = factor_data.groupby(level='date')[
        utils.get_forward_returns_columns(factor_data.columns)] \
        .mean().loc[returns.index]

    if isinstance(returns, pd.Series):
        returns.name = universe_ret.columns.values[0]
        returns = pd.DataFrame(returns)

    alpha_beta = pd.DataFrame()
    for period in returns.columns.values:
        x = universe_ret[period].values
        y = returns[period].values
        x = add_constant(x)

        reg_fit = OLS(y, x).fit()
        try:
            alpha, beta = reg_fit.params
        except ValueError:
            alpha_beta.loc['Ann. alpha', period] = np.nan
            alpha_beta.loc['beta', period] = np.nan
        else:
            freq_adjust = pd.Timedelta('252Days') / pd.Timedelta(period)

            alpha_beta.loc['Ann. alpha', period] = \
                (1 + alpha) ** freq_adjust - 1
            alpha_beta.loc['beta', period] = beta

    return alpha_beta
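# --- Minimal standalone sketch (illustrative) of the core regression above:
# per-period portfolio returns regressed on universe mean returns, with the
# intercept annualized exactly as factor_alpha_beta does for a '1D' period.
# The simulated series are assumptions, not alphalens data.
import numpy as np
import pandas as pd
from statsmodels.api import OLS, add_constant

rng = np.random.RandomState(0)
universe_ret = 0.001 + 0.01 * rng.standard_normal(250)
factor_ret = 0.0005 + 1.2 * universe_ret + 0.005 * rng.standard_normal(250)

reg_fit = OLS(factor_ret, add_constant(universe_ret)).fit()
alpha, beta = reg_fit.params
freq_adjust = pd.Timedelta('252Days') / pd.Timedelta('1D')
print((1 + alpha) ** freq_adjust - 1, beta)  # annualized alpha, beta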
def ax_regress(ax, x, vector, display='equation', pos_xy=[0.1, 0.9],
               args_pt={'ls': '-'}, args_ln={'color': 'k'},
               args_ci={'color': 'k', 'alpha': 0.2}, args_tx={'color': 'k'}):
    """
    Plot a scatter of the data together with the fitted trend line and the
    confidence interval of the fit.

    Parameters
    ----------
    ax : matplotlib.pyplot.axis
    x : 1-d array
        The x-values in the regression.
    vector : 1-d array
        The y-values in the regression.
    display : None or str
        If None, no annotation is displayed. If 'equation', display the
        regression equation. If 'pearson', display the Pearson correlation.
    pos_xy : [float, float]
        The position to place the annotation, in normalized axis units.
    args_pt, args_ln, args_ci, args_tx : dict
        Keyword arguments passed to the scatter plot, regression line,
        confidence interval of the regression line, and annotation text
        plotting functions, respectively.
    """
    # Drop pairs where either value is missing.
    temp = (~np.isnan(vector)) & (~np.isnan(x))
    x = x[temp]
    vector = vector[temp]

    ax.plot(x, vector, **args_pt)

    reg = OLS(vector, add_constant(x)).fit()
    ax.plot(x, x * reg.params[1] + reg.params[0], **args_ln)
    _, predict_ci_low, predict_ci_upp = wls_prediction_std(
        reg, exog=reg.model.exog, weights=np.ones(len(reg.model.exog)))
    x_ind = np.argsort(x)
    ax.fill_between(x[x_ind], predict_ci_low[x_ind], predict_ci_upp[x_ind],
                    interpolate=True, **args_ci)

    if display == 'equation':
        ax.text(pos_xy[0], pos_xy[1],
                ppf(reg.params[1], reg.params[0],
                    reg.pvalues[1], reg.pvalues[0]),
                transform=ax.transAxes, **args_tx)
    elif display == 'pearson':
        r, pval = pearsonr(x, vector)
        ax.text(pos_xy[0], pos_xy[1], ('%.3f' % r) + ppp(pval),
                transform=ax.transAxes, **args_tx)
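# A hedged usage sketch for ax_regress on synthetic data. The `ppf`/`ppp`
# annotation helpers referenced above are assumed to live elsewhere in this
# module, so display=None is used here to sidestep them; the module is also
# assumed to import OLS, add_constant, wls_prediction_std and pearsonr.
import numpy as np
import matplotlib.pyplot as plt

rng = np.random.RandomState(0)
x = np.linspace(0, 10, 50)
y = 2.0 * x + 1.0 + rng.randn(50)

fig, ax = plt.subplots()
ax_regress(ax, x, y, display=None, args_pt={'ls': '', 'marker': 'o'})
plt.show()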
def detrend(y, order):
    # order == -1 means no detrending; otherwise remove a polynomial trend of
    # the given order by regressing y on a Vandermonde basis over [-1, 1]
    # (which includes a constant column) and returning the residuals.
    if order == -1:
        return y
    return OLS(y, np.vander(np.linspace(-1, 1, len(y)), order + 1)).fit().resid
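# Small check (made-up data) of detrend: removing a linear trend (order=1)
# from a trending series should leave roughly centered noise. Assumes the
# module imports `np` and `OLS`, as detrend's body implies.
import numpy as np

rng = np.random.RandomState(0)
y = 0.5 * np.arange(100) + rng.randn(100)

resid = detrend(y, order=1)
print(np.round(resid.mean(), 6))  # ~0: the fitted trend absorbed the slope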
def perform_regional_gwas_helper(
        outfile, pheno_and_covars_fname, shared_covars_fname,
        untransformed_phenotypes_fname, get_genotype_iter, phenotype,
        binary, region, runtype, conditional_covars_fname=None):

    outfile.write("chrom\tpos\talleles\tlocus_filtered\t"
                  f"p_{phenotype}\tcoeff_{phenotype}\t")
    if binary != 'logistic':
        outfile.write(f'se_{phenotype}\tR^2\t')
    else:
        outfile.write("unused_col\tunused_col\t")
    outfile.flush()

    n_loci = 0
    batch_time = 0
    batch_size = 50
    total_time = 0

    pheno_specific_covars = np.load(pheno_and_covars_fname)
    shared_covars = np.load(shared_covars_fname)
    covars = utils.merge_arrays(pheno_specific_covars, shared_covars)
    if conditional_covars_fname:
        gt_covars = np.load(conditional_covars_fname)
        covars = utils.merge_arrays(covars, gt_covars)

    # order samples according to order in genetics files
    bgen_samples = sample_utils.get_all_samples()
    assert len(bgen_samples) == 487409
    samples_array = np.array(bgen_samples, dtype=float).reshape(-1, 1)
    merge = utils.merge_arrays(samples_array, covars)
    unfiltered_samples = ~np.isnan(merge[:, 1])

    outcome = merge[unfiltered_samples, 1].copy()
    covars = merge[unfiltered_samples, :]
    covars = (covars - np.mean(covars, axis=0)) / np.std(covars, axis=0)
    covars[:, 1] = 1  # reuse the column that was the outcome as the intercept

    ori_phenotypes = np.load(untransformed_phenotypes_fname)
    ori_phenotypes = utils.merge_arrays(samples_array, ori_phenotypes)[:, 1]
    ori_phenotypes = ori_phenotypes[unfiltered_samples]

    # first yield is special
    genotype_iter = get_genotype_iter(unfiltered_samples)
    extra_detail_fields = next(genotype_iter)
    outfile.write('\t'.join(extra_detail_fields) + '\t')

    if not binary:
        stat = 'mean'
    else:
        stat = 'fraction'

    outfile.write(f'{stat}_{phenotype}_per_single_dosage\t'
                  '0.05_significance_CI\t'
                  '5e-8_significance_CI')
    if runtype == 'strs':
        outfile.write('\ttotal_subset_dosage_per_summed_gt\t'
                      f'{stat}_{phenotype}_per_paired_dosage\t'
                      '0.05_significance_CI\t'
                      '5e-8_significance_CI')
    outfile.write('\n')
    outfile.flush()

    start_time = time.time()
    for dosage_gts, unique_alleles, chrom, pos, locus_filtered, locus_details \
            in genotype_iter:
        assert len(locus_details) == len(extra_detail_fields)

        covars[:, 0] = np.nan  # reuse the column that was the ids as the genotypes
        n_loci += 1
        allele_names = ','.join(list(unique_alleles.astype(str)))
        outfile.write(f"{chrom}\t{pos}\t{allele_names}\t")
        if locus_filtered:
            outfile.write(f'{locus_filtered}\t1\tnan\tnan\tnan\t')
            outfile.write('\t'.join(locus_details))
            if runtype == 'strs':
                outfile.write('\tnan' * 6 + '\n')
            else:
                outfile.write('\tnan' * 3 + '\n')
            outfile.flush()
            continue
        else:
            outfile.write('False\t')

        if runtype == 'strs':
            gts = np.sum([
                _len * np.sum(dosages, axis=1)
                for _len, dosages in dosage_gts.items()
            ], axis=0)
        else:
            gts = dosage_gts[:, 1] + 2 * dosage_gts[:, 2]
        std = np.std(gts)
        gts = (gts - np.mean(gts)) / std
        covars[:, 0] = gts

        if not binary or binary == 'linear':
            # run the regression
            model = OLS(outcome, covars, missing='drop')
            reg_result = model.fit()
            pval = reg_result.pvalues[0]
            coef = reg_result.params[0]
            se = reg_result.bse[0]
            rsquared = reg_result.rsquared
            outfile.write(f"{pval:.2e}\t{coef/std}\t{se/std}\t{rsquared}\t")
        else:
            model = sm.GLM(outcome, covars, missing='drop',
                           family=sm.families.Binomial())
            reg_result = model.fit()
            pval = reg_result.pvalues[0]
            coef = reg_result.params[0]
            outfile.write(f'{pval:.2e}\t{coef/std}\tnan\tnan\t')

        outfile.write('\t'.join(locus_details) + '\t')

        if runtype == 'strs':
            single_dosages = {}
            paired_dosages = {}
            for len1 in unique_alleles:
                for len2 in unique_alleles:
                    if len1 > len2:
                        continue
                    if len1 != len2:
                        dosages = (
                            dosage_gts[len1][:, 0] * dosage_gts[len2][:, 1] +
                            dosage_gts[len1][:, 1] * dosage_gts[len2][:, 0])
                    else:
                        dosages = dosage_gts[len1][:, 0] * dosage_gts[len1][:, 1]
                    if np.sum(dosages) <= 0:
                        continue
                    summed_len = round(len1 + len2, 2)
                    if summed_len not in single_dosages:
                        single_dosages[summed_len] = dosages
                    else:
                        single_dosages[summed_len] += dosages
                    minlen = min(len1, len2)
                    maxlen = max(len1, len2)
                    paired_dosages[(minlen, maxlen)] = dosages

            single_dosage_stat = {}
            single_dosage_95_CI = {}
            single_dosage_GWAS_CI = {}
            paired_dosage_stat = {}
            paired_dosage_95_CI = {}
            paired_dosage_GWAS_CI = {}
            if not binary:
                for _len, dosages in single_dosages.items():
                    if len(np.unique(ori_phenotypes[dosages != 0])) <= 1:
                        continue
                    mean_stats = statsmodels.stats.weightstats.DescrStatsW(
                        ori_phenotypes, weights=dosages)
                    single_dosage_stat[_len] = mean_stats.mean
                    single_dosage_95_CI[_len] = mean_stats.tconfint_mean()
                    single_dosage_GWAS_CI[_len] = mean_stats.tconfint_mean(5e-8)
                for _len, dosages in paired_dosages.items():
                    if len(np.unique(ori_phenotypes[dosages != 0])) <= 1:
                        continue
                    mean_stats = statsmodels.stats.weightstats.DescrStatsW(
                        ori_phenotypes, weights=dosages)
                    paired_dosage_stat[_len] = mean_stats.mean
                    paired_dosage_95_CI[_len] = mean_stats.tconfint_mean()
                    paired_dosage_GWAS_CI[_len] = mean_stats.tconfint_mean(5e-8)
            else:
                for _len, dosages in single_dosages.items():
                    if not np.any(dosages != 0):
                        continue
                    p, lower, upper = weighted_binom_conf.weighted_binom_conf(
                        dosages, ori_phenotypes, 0.05)
                    single_dosage_stat[_len] = p
                    single_dosage_95_CI[_len] = (lower, upper)
                    _, lower_gwas, upper_gwas = \
                        weighted_binom_conf.weighted_binom_conf(
                            dosages, ori_phenotypes, 5e-8)
                    single_dosage_GWAS_CI[_len] = (lower_gwas, upper_gwas)
                for _len, dosages in paired_dosages.items():
                    if not np.any(dosages != 0):
                        continue
                    p, lower, upper = weighted_binom_conf.weighted_binom_conf(
                        dosages, ori_phenotypes, 0.05)
                    paired_dosage_stat[_len] = p
                    paired_dosage_95_CI[_len] = (lower, upper)
                    _, lower_gwas, upper_gwas = \
                        weighted_binom_conf.weighted_binom_conf(
                            dosages, ori_phenotypes, 5e-8)
                    paired_dosage_GWAS_CI[_len] = (lower_gwas, upper_gwas)

            outfile.write(
                load_and_filter_genotypes.dict_str(single_dosage_stat) + '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(single_dosage_95_CI) + '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(single_dosage_GWAS_CI) + '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(
                    {key: np.sum(arr)
                     for key, arr in single_dosages.items()}) + '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(paired_dosage_stat) + '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(paired_dosage_95_CI) + '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(paired_dosage_GWAS_CI) + '\n')
        else:
            single_dosage_stat = {}
            single_dosage_95_CI = {}
            single_dosage_GWAS_CI = {}
            if not binary:
                for alt_count in range(3):
                    mean_stats = statsmodels.stats.weightstats.DescrStatsW(
                        ori_phenotypes, weights=dosage_gts[:, alt_count])
                    single_dosage_stat[alt_count] = mean_stats.mean
                    single_dosage_95_CI[alt_count] = mean_stats.tconfint_mean()
                    single_dosage_GWAS_CI[alt_count] = \
                        mean_stats.tconfint_mean(5e-8)
            else:
                for alt_count in range(3):
                    p, lower, upper = weighted_binom_conf.weighted_binom_conf(
                        dosage_gts[:, alt_count], ori_phenotypes, 0.05)
                    single_dosage_stat[alt_count] = p
                    single_dosage_95_CI[alt_count] = (lower, upper)
                    _, lower_gwas, upper_gwas = \
                        weighted_binom_conf.weighted_binom_conf(
                            dosage_gts[:, alt_count], ori_phenotypes, 5e-8)
                    single_dosage_GWAS_CI[alt_count] = (lower_gwas, upper_gwas)
            outfile.write(
                load_and_filter_genotypes.dict_str(single_dosage_stat) + '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(single_dosage_95_CI) + '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(single_dosage_GWAS_CI) + '\n')
        outfile.flush()

        duration = time.time() - start_time
        total_time += duration
        batch_time += duration
        if n_loci % batch_size == 0:
            print(f"time/locus (last {batch_size}): "
                  f"{batch_time/batch_size}s\n"
                  f"time/locus ({n_loci} total loci): {total_time/n_loci}s\n",
                  flush=True)
            batch_time = 0
        start_time = time.time()

    if n_loci > 0:
        print(f"Done.\nTotal loci: {n_loci}\nTotal time: {total_time}s\n"
              f"time/locus: {total_time/n_loci}s\n", flush=True)
    else:
        print(f"No variants found in the region {region}\n", flush=True)
def qqline(ax, line, x=None, y=None, dist=None, fmt="r-", **lineoptions):
    """
    Plot a reference line for a qqplot.

    Parameters
    ----------
    ax : matplotlib axes instance
        The axes on which to plot the line
    line : str {"45","r","s","q"}
        Options for the reference line to which the data is compared.:

        - "45" - 45-degree line
        - "s" - standardized line, the expected order statistics are scaled
          by the standard deviation of the given sample and have the mean
          added to them
        - "r" - A regression line is fit
        - "q" - A line is fit through the quartiles.
        - None - By default no reference line is added to the plot.

    x : ndarray
        X data for plot. Not needed if line is "45".
    y : ndarray
        Y data for plot. Not needed if line is "45".
    dist : scipy.stats.distribution
        A scipy.stats distribution, needed if line is "q".
    fmt : str, optional
        Line format string passed to `plot`.
    **lineoptions
        Additional arguments to be passed to the `plot` command.

    Notes
    -----
    There is no return value. The line is plotted on the given `ax`.

    Examples
    --------
    Import the food expenditure dataset. Plot annual food expenditure on
    x-axis and household income on y-axis. Use qqline to add regression line
    into the plot.

    >>> import statsmodels.api as sm
    >>> import numpy as np
    >>> import matplotlib.pyplot as plt
    >>> from statsmodels.graphics.gofplots import qqline
    >>> foodexp = sm.datasets.engel.load()
    >>> x = foodexp.exog
    >>> y = foodexp.endog
    >>> ax = plt.subplot(111)
    >>> plt.scatter(x, y)
    >>> ax.set_xlabel(foodexp.exog_name[0])
    >>> ax.set_ylabel(foodexp.endog_name)
    >>> qqline(ax, "r", x, y)
    >>> plt.show()

    .. plot:: plots/graphics_gofplots_qqplot_qqline.py
    """
    lineoptions = lineoptions.copy()
    for ls in ("-", "--", "-.", ":"):
        if ls in fmt:
            lineoptions.setdefault("linestyle", ls)
            fmt = fmt.replace(ls, "")
            break
    for marker in (".", ",", "o", "v", "^", "<", ">", "1", "2", "3", "4",
                   "8", "s", "p", "P", "*", "h", "H", "+", "x", "X", "D",
                   "d", "|", "_"):
        if marker in fmt:
            lineoptions.setdefault("marker", marker)
            fmt = fmt.replace(marker, "")
            break
    if fmt:
        lineoptions.setdefault("color", fmt)

    if line == "45":
        end_pts = lzip(ax.get_xlim(), ax.get_ylim())
        end_pts[0] = min(end_pts[0])
        end_pts[1] = max(end_pts[1])
        ax.plot(end_pts, end_pts, **lineoptions)
        ax.set_xlim(end_pts)
        ax.set_ylim(end_pts)
        return  # does this have any side effects?
    if x is None or y is None:
        raise ValueError("If line is not 45, x and y cannot be None.")
    x = np.array(x)
    y = np.array(y)
    if line == "r":
        # could use ax.lines[0].get_xdata(), get_ydata(),
        # but don't know axes are "clean"
        y = OLS(y, add_constant(x)).fit().fittedvalues
        ax.plot(x, y, **lineoptions)
    elif line == "s":
        m, b = np.std(y), np.mean(y)
        ref_line = x * m + b
        ax.plot(x, ref_line, **lineoptions)
    elif line == "q":
        _check_for(dist, "ppf")
        q25 = stats.scoreatpercentile(y, 25)
        q75 = stats.scoreatpercentile(y, 75)
        theoretical_quartiles = dist.ppf([0.25, 0.75])
        m = (q75 - q25) / np.diff(theoretical_quartiles)
        b = q25 - m * theoretical_quartiles[0]
        ax.plot(x, m * x + b, **lineoptions)
def start(self):
    self.print_arguments()

    print("### STEP 1 ###")
    print("Loading genotype data and dataset info.")
    geno_df = self.load_file(self.geno_path, header=0, index_col=0,
                             nrows=self.nrows)
    dataset_mask = np.ones(geno_df.shape[1], dtype=bool)
    if self.std_path is not None:
        std_df = self.load_file(self.std_path, header=0, index_col=None)

        # Validate that the input data matches.
        self.validate_data(std_df=std_df, geno_df=geno_df)

        # Filter on datasets.
        if self.datasets is not None:
            print("Filtering datasets.")
            dataset_mask = std_df["dataset"].isin(self.datasets).to_numpy()
            std_df = std_df.loc[dataset_mask, :]
            geno_df = geno_df.loc[:, dataset_mask]
    else:
        # Create sample-to-dataset file with all the samples having the
        # same dataset.
        std_df = pd.DataFrame({"sample": geno_df.columns, "dataset": "None"})

    print("Checking dataset sample sizes")
    dataset_sample_counts = list(zip(*np.unique(std_df.iloc[:, 1],
                                                return_counts=True)))
    dataset_sample_counts.sort(key=lambda x: -x[1])
    datasets = [x[0] for x in dataset_sample_counts]
    max_dataset_length = np.max([len(str(dataset[0]))
                                 for dataset in dataset_sample_counts])
    for dataset, sample_size in dataset_sample_counts:
        print("\t{:{}s} {:,} samples".format(dataset, max_dataset_length,
                                             sample_size))
    if dataset_sample_counts[-1][1] <= 1:
        print("\t One or more datasets have a smaller sample size than "
              "recommended. Consider excluding these.")
        exit()

    # Construct dataset df.
    dataset_df = self.construct_dataset_df(std_df=std_df)

    print("Calculating genotype call rate per dataset")
    geno_df, call_rate_df = self.calculate_call_rate(geno_df=geno_df,
                                                     std_df=std_df,
                                                     datasets=datasets)
    call_rate_n_skipped = (call_rate_df.min(axis=1) < self.call_rate).sum()
    if call_rate_n_skipped > 0:
        print("\t{:,} eQTLs have had dataset(s) filled with NaN values due "
              "to call rate threshold".format(call_rate_n_skipped))

    print("Calculating genotype stats for inclusion criteria")
    cr_keep_mask = ~(geno_df == self.genotype_na).all(axis=1).to_numpy(dtype=bool)
    geno_stats_df = pd.DataFrame(np.nan, index=geno_df.index,
                                 columns=["N", "NaN", "0", "1", "2",
                                          "min GS", "HW pval", "allele1",
                                          "allele2", "MA", "MAF"])
    geno_stats_df["N"] = 0
    geno_stats_df["NaN"] = geno_df.shape[1]
    geno_stats_df.loc[cr_keep_mask, :] = self.calculate_genotype_stats(
        df=geno_df.loc[cr_keep_mask, :])

    # Checking which eQTLs pass the requirements
    n_keep_mask = (geno_stats_df.loc[:, "N"] >= 6).to_numpy()
    hwpval_keep_mask = (geno_stats_df.loc[:, "HW pval"] >= self.hw_pval).to_numpy()
    maf_keep_mask = (geno_stats_df.loc[:, "MAF"] > self.maf).to_numpy()
    combined_keep_mask = n_keep_mask & hwpval_keep_mask & maf_keep_mask
    geno_n_skipped = np.size(combined_keep_mask) - np.sum(combined_keep_mask)
    if geno_n_skipped > 0:
        print("\t{:,} eQTL(s) failed the sample size threshold".format(
            np.size(n_keep_mask) - np.sum(n_keep_mask)))
        print("\t{:,} eQTL(s) failed the Hardy-Weinberg p-value threshold".format(
            np.size(hwpval_keep_mask) - np.sum(hwpval_keep_mask)))
        print("\t{:,} eQTL(s) failed the MAF threshold".format(
            np.size(maf_keep_mask) - np.sum(maf_keep_mask)))
        print("\t----------------------------------------")
        print("\t{:,} eQTL(s) are discarded in total".format(geno_n_skipped))

    # Add mask to genotype stats data frame.
    geno_stats_df["mask"] = 0
    geno_stats_df.loc[combined_keep_mask, "mask"] = 1

    self.save_file(df=call_rate_df,
                   outpath=os.path.join(self.outdir, "call_rate.txt.gz"))
    self.save_file(df=geno_stats_df,
                   outpath=os.path.join(self.outdir, "geno_stats.txt.gz"))

    del call_rate_df, geno_stats_df

    if geno_n_skipped == self.nrows:
        print("Error, no valid eQTLs.")
        exit()

    print("")

    ########################################################################

    print("### STEP 2 ###")
    print("Loading other data.")
    alle_df = self.load_file(self.alle_path, header=0, index_col=0,
                             nrows=self.nrows)
    expr_df = self.load_file(self.expr_path, header=0, index_col=0,
                             nrows=self.nrows)
    cova_df = self.load_file(self.cova_path, header=0, index_col=0)

    # Transpose if need be. We want samples always as columns.
    if cova_df.shape[0] == np.size(dataset_mask):
        print("\t Transposing covariate matrix.")
        cova_df = cova_df.T

    # Filter the datasets.
    if dataset_mask is not None:
        expr_df = expr_df.loc[:, dataset_mask]
        cova_df = cova_df.loc[:, dataset_mask]

    # Select eQTL rows that meet requirements.
    geno_df = geno_df.loc[combined_keep_mask, :]
    alle_df = alle_df.loc[combined_keep_mask, :]
    expr_df = expr_df.loc[combined_keep_mask, :]

    print("\tValidating input.")
    self.validate_data(std_df=std_df, geno_df=geno_df, alle_df=alle_df,
                       expr_df=expr_df, cova_df=cova_df)
    print("", flush=True)

    del std_df

    ########################################################################

    print("### STEP 3 ###")
    print("Pre-processing data.")
    # Add the allele assessed column.
    alle_df["AlleleAssessed"] = alle_df["Alleles"].str.split(
        "/", n=None, expand=True)[1]
    alle_df.drop(["AltAllele"], axis=1, inplace=True)
    alle_df.reset_index(drop=True, inplace=True)

    # Convert to numpy for speed.
    geno_m = geno_df.to_numpy(np.float64)
    expr_m = expr_df.to_numpy(np.float64)
    dataset_m = dataset_df.to_numpy(np.uint8)
    cova_m = cova_df.to_numpy(np.float64)

    # Replace missing values with nan
    geno_m[geno_m == self.genotype_na] = np.nan
    cova_m[cova_m == self.covariate_na] = np.nan

    # Save properties.
    snps = list(geno_df.index)
    genes = list(expr_df.index)
    covariates = list(cova_df.index)
    datasets = list(dataset_df.columns)
    del geno_df, expr_df, dataset_df, cova_df

    # Print info.
    n_eqtls = geno_m.shape[0]
    n_samples = geno_m.shape[1]
    n_covariates = cova_m.shape[0]
    print("Summary stats:")
    print("\tN-eQTLs: {:,}".format(n_eqtls))
    print("\tN-samples: {:,}".format(n_samples))
    print("\tN-covariates: {:,}".format(n_covariates))
    print("\tN-datasets: {:,}".format(len(datasets)))
    print("", flush=True)

    ########################################################################

    print("### STEP 4 ###")
    print("Analyzing eQTLs.")

    # Initializing output matrices / arrays.
    ieqtl_results = {cov: np.empty((n_eqtls, 14), dtype=np.float64)
                     for cov in covariates}

    # Start loop.
    start_time = int(time.time())
    last_print_time = None
    for eqtl_index in range(n_eqtls):
        # Print update for user.
        now_time = int(time.time())
        if n_eqtls > 1 and (last_print_time is None or
                            (now_time - last_print_time) >= self.print_interval or
                            eqtl_index == (n_eqtls - 1)):
            print("\t[{}] {:,}/{:,} eQTLs analysed [{:.2f}%]".format(
                time.strftime('%H:%M:%S', time.gmtime(now_time - start_time)),
                eqtl_index, (n_eqtls - 1),
                (100 / (n_eqtls - 1)) * eqtl_index), flush=True)
            last_print_time = now_time

        # Get the genotype.
        genotype = geno_m[eqtl_index, :]

        for cov_index, cov in enumerate(covariates):
            # Get the covariate.
            covariate = cova_m[cov_index, :]

            # Construct the mask to remove missing values.
            mask = np.logical_and(~np.isnan(genotype), ~np.isnan(covariate))
            n = np.sum(mask)

            # Create the matrix.
            X = np.empty((n, 4), np.float32)
            X[:, 0] = 1
            X[:, 1] = genotype[mask]
            X[:, 2] = cova_m[cov_index, mask]
            X[:, 3] = X[:, 1] * X[:, 2]

            # Get the expression.
            y = expr_m[eqtl_index, mask]

            # Check if there is variance on each column. Also check
            # if each column is unique.
            if (np.min(np.std(X[:, 1:], axis=0)) == 0) or \
                    (np.unique(X, axis=1).shape[1] != 4):
                ieqtl_results[cov][eqtl_index, :] = np.array([n] + [np.nan] * 13)
                continue

            if self.dataset_correct:
                # Correct expression for dataset differences.
                dataset_subset_m = dataset_m[mask, :].copy()
                dataset_subset_m = dataset_subset_m[
                    :, np.sum(dataset_subset_m, axis=0) > 0]
                corr_m = np.hstack((
                    X[:, [0]],
                    dataset_subset_m[:, 1:],
                    dataset_subset_m * genotype[mask][:, np.newaxis]))
                y = OLS(y, corr_m).fit().resid

            # First calculate the rss for the matrix minus the interaction
            # term.
            rss_null = self.calc_rss(y=y,
                                     y_hat=self.fit_and_predict(X=X[:, :3], y=y))

            # Calculate the rss for the interaction model.
            inv_m = self.inverse(X)
            betas = self.fit(X=X, y=y, inv_m=inv_m)
            rss_alt = self.calc_rss(y=y, y_hat=self.predict(X=X, betas=betas))
            std = self.calc_std(rss=rss_alt, n=n, df=4, inv_m=inv_m)

            # Calculate interaction p-value.
            p_value = self.calc_p_value(rss1=rss_null, rss2=rss_alt,
                                        df1=3, df2=4, n=n)

            # Calculate the t-values.
            t_values = betas / std

            # Save results.
            ieqtl_results[cov][eqtl_index, :] = np.hstack(
                (np.array([n]), betas, std, t_values, np.array([p_value])))

    print("", flush=True)

    ########################################################################

    print("### STEP 5 ###")
    print("Saving results.")
    for covariate in covariates:
        print(" {}:".format(covariate))

        # Convert to pandas data frame.
        df = pd.DataFrame(ieqtl_results[covariate],
                          columns=["N",
                                   "beta-intercept", "beta-genotype",
                                   "beta-covariate", "beta-interaction",
                                   "std-intercept", "std-genotype",
                                   "std-covariate", "std-interaction",
                                   "tvalue-intercept", "tvalue-genotype",
                                   "tvalue-covariate", "tvalue-interaction",
                                   "p-value"])
        df = pd.concat([alle_df, df], axis=1)
        df.insert(0, "ProbeName", genes)
        df.insert(0, "SNPName", snps)
        df["FDR"] = np.nan
        df.loc[~df["p-value"].isnull(), "FDR"] = multitest.multipletests(
            df.loc[~df["p-value"].isnull(), "p-value"], method='fdr_bh')[1]
        print("\t{:,} ieQTLs (p-value <0.05)".format(
            df.loc[df["p-value"] < 0.05, :].shape[0]))
        print("\t{:,} ieQTLs (BH-FDR <0.05)".format(
            df.loc[df["FDR"] < 0.05, :].shape[0]))

        # Save.
        self.save_file(df=df,
                       outpath=os.path.join(
                           self.outdir,
                           "{}_InteractionResults.txt.gz".format(
                               covariate.replace(" ", ""))),
                       index=False)

        # tmp.
        df["chr"] = [int(x.split(":")[0]) for x in df["SNPName"]]
        tested_counts = df["chr"].value_counts()
        signif_counts = df.loc[df["FDR"] < 0.05, "chr"].value_counts()
        print("")
        print(" Hits per chromosome:")
        for i in range(1, 23):
            n_tested = 0
            if i in tested_counts.index:
                n_tested = tested_counts[i]
            n_signif = 0
            if i in signif_counts.index:
                n_signif = signif_counts[i]
            perc = 0
            if n_tested > 0:
                perc = (100 / n_tested) * n_signif
            print("\t{}: {:,} / {:,} [{:.2f}%]".format(i, n_signif,
                                                       n_tested, perc))
        print("", flush=True)
def setupClass(cls):
    data = longley.load()
    data.exog = add_constant(data.exog, prepend=False)
    cls.res1 = GLS(data.endog, data.exog).fit()
    cls.res2 = OLS(data.endog, data.exog).fit()
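# The pairing above relies on GLS with its default sigma (None, i.e. an
# identity covariance) reducing to OLS. A minimal sketch on synthetic data:
import numpy as np
import statsmodels.api as sm

rng = np.random.RandomState(0)
X = sm.add_constant(rng.randn(50, 2), prepend=False)
y = X @ np.array([1.0, -2.0, 0.5]) + rng.randn(50)

np.testing.assert_allclose(sm.GLS(y, X).fit().params,
                           sm.OLS(y, X).fit().params)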