def evaluate(self, use="rmse", ad=True, check_VIF=False, exclude=True):
    use = use.lower()
    self.eval = use
    if use == "r2":
        metric = abs(self.r2) - 1.0
    elif use == "r2a":
        metric = abs(self.results.rsquared_adj) - 1.0
    elif use == "rmse":
        metric = self.rmse
    elif use == "press":
        r = smo.OLSInfluence(self.results)
        metric = r.ess_press
    elif use == "aic":
        metric = self.aic
    elif use == "caic":
        # small-sample corrected AIC: AICc = AIC + (2k^2 + 2k) / (n - k - 1)
        k = self.data.shape[1] - 1
        n = self.results.nobs
        metric = self.aic + ((2*(k*k) + 2*k) / (n - k - 1))
    elif use == "bic":
        metric = self.bic
    else:
        metric = self.mse

    if ad:
        if self.anderson_p < 0.05:
            if exclude:
                metric = float("inf")
            else:
                metric = 10000
    if check_VIF:
        if not self.evaluate_VIF():
            if exclude:
                metric = float("inf")
            else:
                metric = 10000  # Allows the model to stay on the list but lets better models get added.

    return metric
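# The method above delegates the collinearity check to self.evaluate_VIF(),
# which is not shown here. A minimal sketch of what such a helper could look
# like (an assumption, not the original implementation): it would live on the
# same class and flag the model when any non-constant regressor has a
# variance inflation factor above the conventional cutoff of 10.
from statsmodels.stats.outliers_influence import variance_inflation_factor

def evaluate_VIF(self, threshold=10.0):
    """Return True when no regressor exceeds the VIF threshold (sketch only)."""
    exog = self.results.model.exog
    # column 0 is assumed to be the constant, so it is skipped
    return all(variance_inflation_factor(exog, j) <= threshold
               for j in range(1, exog.shape[1]))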
def test_influence(self):
    res = self.res

    # this test is slow
    infl = oi.OLSInfluence(res)

    try:
        import json
    except ImportError:
        raise SkipTest

    fp = open(os.path.join(cur_dir, "results/influence_lsdiag_R.json"))
    lsdiag = json.load(fp)

    # basic
    assert_almost_equal(lsdiag['cov.scaled'],
                        res.cov_params().ravel(), decimal=14)
    assert_almost_equal(lsdiag['cov.unscaled'],
                        res.normalized_cov_params.ravel(), decimal=14)

    c0, c1 = infl.cooks_distance  # TODO: what's c1
    assert_almost_equal(c0, lsdiag['cooks'], decimal=14)
    assert_almost_equal(infl.hat_matrix_diag, lsdiag['hat'], decimal=14)
    assert_almost_equal(infl.resid_studentized_internal,
                        lsdiag['std.res'], decimal=14)

    # slow:
    # infl._get_all_obs()  # slow, nobs estimation loop, called implicitly
    dffits, dffth = infl.dffits
    assert_almost_equal(dffits, lsdiag['dfits'], decimal=14)
    assert_almost_equal(infl.resid_studentized_external,
                        lsdiag['stud.res'], decimal=14)

    import pandas
    fn = os.path.join(cur_dir, "results/influence_measures_R.csv")
    infl_r = pandas.read_csv(fn, index_col=0)
    conv = lambda s: 1 if s == 'TRUE' else 0
    fn = os.path.join(cur_dir, "results/influence_measures_bool_R.csv")
    # not used yet:
    # infl_bool_r = pandas.read_csv(fn, index_col=0,
    #                               converters=dict(zip(range(7), [conv]*7)))

    infl_r2 = np.asarray(infl_r)
    assert_almost_equal(infl.dfbetas, infl_r2[:, :3], decimal=13)
    assert_almost_equal(infl.cov_ratio, infl_r2[:, 4], decimal=14)
    # duplicates
    assert_almost_equal(dffits, infl_r2[:, 3], decimal=14)
    assert_almost_equal(c0, infl_r2[:, 5], decimal=14)
    assert_almost_equal(infl.hat_matrix_diag, infl_r2[:, 6], decimal=14)

    # Note: for dffits, R uses a threshold around 0.36, mine: dffits[1]=0.24373
    # TODO: finish and check thresholds and pvalues
def armonic(t, m, f, merr):
    # Design matrix holding the first four harmonics of frequency f.
    ws = pd.DataFrame({
        'x': m,
        'y1': np.sin(2 * np.pi * t * f),
        'y2': np.cos(2 * np.pi * t * f),
        'y3': np.sin(4 * np.pi * t * f),
        'y4': np.cos(4 * np.pi * t * f),
        'y5': np.sin(6 * np.pi * t * f),
        'y6': np.cos(6 * np.pi * t * f),
        'y7': np.sin(8 * np.pi * t * f),
        'y8': np.cos(8 * np.pi * t * f)
    })
    weights = pd.Series(merr)
    # Weighted least squares without an intercept ('-1'). Note: `sm.wls` with
    # a formula string is the formula interface, so `sm` here must refer to
    # statsmodels.formula.api, not statsmodels.api.
    wls_fit = sm.wls('x ~ y1+y2+y3+y4+y5+y6+y7+y8-1', data=ws,
                     weights=1 / weights).fit()
    pred = wls_fit.predict()
    r = m - pred

    # Amplitude of each harmonic from its sine/cosine coefficients.
    A = np.zeros(4)
    PH = np.zeros(4)
    A[0] = np.sqrt(wls_fit.params[0]**2 + wls_fit.params[1]**2)
    A[1] = np.sqrt(wls_fit.params[2]**2 + wls_fit.params[3]**2)
    A[2] = np.sqrt(wls_fit.params[4]**2 + wls_fit.params[5]**2)
    A[3] = np.sqrt(wls_fit.params[6]**2 + wls_fit.params[7]**2)

    # Phases relative to the first harmonic; PH[0] is zero by construction.
    PH[0] = np.arctan2(wls_fit.params[1], wls_fit.params[0]) - (
        1 * f / f) * np.arctan2(wls_fit.params[1], wls_fit.params[0])
    PH[1] = np.arctan2(wls_fit.params[3], wls_fit.params[2]) - (
        2 * f / f) * np.arctan2(wls_fit.params[1], wls_fit.params[0])
    PH[2] = np.arctan2(wls_fit.params[5], wls_fit.params[4]) - (
        3 * f / f) * np.arctan2(wls_fit.params[1], wls_fit.params[0])
    PH[3] = np.arctan2(wls_fit.params[7], wls_fit.params[6]) - (
        4 * f / f) * np.arctan2(wls_fit.params[1], wls_fit.params[0])

    # Influence diagnostics: flag points whose DFFITS exceeds the returned
    # threshold or whose Cook's distance p-value is below 0.05.
    influence = inf.OLSInfluence(wls_fit)
    dffits = influence.dffits
    cook = influence.cooks_distance
    leverage = influence.hat_matrix_diag
    inf1 = np.where(dffits[0] > dffits[1])
    inf2 = np.where(cook[1] < 0.05)
    inffin = np.concatenate((inf1, inf2), axis=1)

    return pred, r, A, PH, inffin
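# A hypothetical usage sketch (not from the original source): recover the
# amplitude of a noisy sinusoid of known frequency f. Assumes the module
# imports used by armonic (np, pd, the formula-api `sm`, and `inf` for
# statsmodels.stats.outliers_influence) are in place.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    t = np.sort(rng.uniform(0.0, 10.0, 200))   # observation times
    f = 0.7                                    # known frequency
    merr = np.full(t.shape, 0.05)              # per-point magnitude errors
    m = 1.5 * np.sin(2 * np.pi * t * f) + rng.normal(0.0, 0.05, t.size)

    pred, r, A, PH, inffin = armonic(t, m, f, merr)
    print("amplitudes:", A)    # A[0] should be close to 1.5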
def test_all(self):

    d = macrodata.load().data
    #import datasetswsm.greene as g
    #d = g.load('5-1')

    #growth rates
    gs_l_realinv = 400 * np.diff(np.log(d['realinv']))
    gs_l_realgdp = 400 * np.diff(np.log(d['realgdp']))

    #simple diff, not growthrate, I want heteroscedasticity later for testing
    endogd = np.diff(d['realinv'])
    exogd = add_constant(np.c_[np.diff(d['realgdp']), d['realint'][:-1]])

    endogg = gs_l_realinv
    exogg = add_constant(np.c_[gs_l_realgdp, d['realint'][:-1]])

    res_ols = OLS(endogg, exogg).fit()
    #print res_ols.params

    mod_g1 = GLSAR(endogg, exogg, rho=-0.108136)
    res_g1 = mod_g1.fit()
    #print res_g1.params

    mod_g2 = GLSAR(endogg, exogg, rho=-0.108136)   #-0.1335859) from R
    res_g2 = mod_g2.iterative_fit(maxiter=5)
    #print res_g2.params

    rho = -0.108136

    #      coefficient  std. error  t-ratio  p-value    95% CONFIDENCE INTERVAL
    partable = np.array([
        [-9.50990,  0.990456, -9.602, 3.65e-018, -11.4631,  -7.55670],   # ***
        [ 4.37040,  0.208146, 21.00,  2.93e-052,   3.95993,  4.78086],   # ***
        [-0.579253, 0.268009, -2.161, 0.0319,     -1.10777, -0.0507346]]) # **

    #Statistics based on the rho-differenced data:
    result_gretl_g1 = dict(
        endog_mean=("Mean dependent var",   3.113973),
        endog_std=("S.D. dependent var",   18.67447),
        ssr=("Sum squared resid",       22530.90),
        mse_resid_sqrt=("S.E. of regression", 10.66735),
        rsquared=("R-squared",           0.676973),
        rsquared_adj=("Adjusted R-squared", 0.673710),
        fvalue=("F(2, 198)",           221.0475),
        f_pvalue=("P-value(F)",          3.56e-51),
        resid_acf1=("rho",              -0.003481),
        dw=("Durbin-Watson",             1.993858))

    #fstatistic, p-value, df1, df2
    reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
    reset_2 = [7.268492, 0.00762, 1, 198, "f"]
    reset_3 = [5.248951, 0.023, 1, 198, "f"]
    #LM-statistic, p-value, df
    arch_4 = [7.30776, 0.120491, 4, "chi2"]

    #multicollinearity
    vif = [1.002, 1.002]
    cond_1norm = 6862.0664
    determinant = 1.0296049e+009
    reciprocal_condition_number = 0.013819244

    #Chi-square(2): test-statistic, pvalue, df
    normality = [20.2792, 3.94837e-005, 2]

    #tests
    res = res_g1  #with rho from Gretl

    #basic
    assert_almost_equal(res.params, partable[:, 0], 4)
    assert_almost_equal(res.bse, partable[:, 1], 6)
    assert_almost_equal(res.tvalues, partable[:, 2], 2)

    assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
    #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
    #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
    #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
    assert_almost_equal(np.sqrt(res.mse_resid),
                        result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
    assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=4)
    assert_approx_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1],
                        significant=2)
    #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

    #arch
    #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
    sm_arch = smsdia.het_arch(res.wresid, maxlag=4)
    assert_almost_equal(sm_arch[0], arch_4[0], decimal=4)
    assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

    #tests
    res = res_g2  #with estimated rho

    #estimated lag coefficient
    assert_almost_equal(res.model.rho, rho, decimal=3)

    #basic
    assert_almost_equal(res.params, partable[:, 0], 4)
    assert_almost_equal(res.bse, partable[:, 1], 3)
    assert_almost_equal(res.tvalues, partable[:, 2], 2)

    assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
    #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
    #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
    #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
    assert_almost_equal(np.sqrt(res.mse_resid),
                        result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
    assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=0)
    assert_almost_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], decimal=6)
    #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

    c = oi.reset_ramsey(res, degree=2)
    compare_ftest(c, reset_2, decimal=(2, 4))
    c = oi.reset_ramsey(res, degree=3)
    compare_ftest(c, reset_2_3, decimal=(2, 4))

    #arch
    #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
    sm_arch = smsdia.het_arch(res.wresid, maxlag=4)
    assert_almost_equal(sm_arch[0], arch_4[0], decimal=1)
    assert_almost_equal(sm_arch[1], arch_4[1], decimal=2)

    '''
    Performing iterative calculation of rho...

                     ITER       RHO        ESS
                       1     -0.10734   22530.9
                       2     -0.10814   22530.9

    Model 4: Cochrane-Orcutt, using observations 1959:3-2009:3 (T = 201)
    Dependent variable: ds_l_realinv
    rho = -0.108136

                     coefficient   std. error   t-ratio    p-value
      -------------------------------------------------------------
      const           -9.50990      0.990456    -9.602    3.65e-018 ***
      ds_l_realgdp     4.37040      0.208146    21.00     2.93e-052 ***
      realint_1       -0.579253     0.268009    -2.161    0.0319    **

    Statistics based on the rho-differenced data:

    Mean dependent var   3.113973   S.D. dependent var   18.67447
    Sum squared resid    22530.90   S.E. of regression   10.66735
    R-squared            0.676973   Adjusted R-squared   0.673710
    F(2, 198)            221.0475   P-value(F)           3.56e-51
    rho                 -0.003481   Durbin-Watson        1.993858
    '''

    '''
    RESET test for specification (squares and cubes)
    Test statistic: F = 5.219019,
    with p-value = P(F(2,197) > 5.21902) = 0.00619

    RESET test for specification (squares only)
    Test statistic: F = 7.268492,
    with p-value = P(F(1,198) > 7.26849) = 0.00762

    RESET test for specification (cubes only)
    Test statistic: F = 5.248951,
    with p-value = P(F(1,198) > 5.24895) = 0.023
    '''

    '''
    Test for ARCH of order 4

                 coefficient   std. error   t-ratio   p-value
      --------------------------------------------------------
      alpha(0)   97.0386       20.3234       4.775    3.56e-06  ***
      alpha(1)    0.176114      0.0714698    2.464    0.0146    **
      alpha(2)   -0.0488339     0.0724981   -0.6736   0.5014
      alpha(3)   -0.0705413     0.0737058   -0.9571   0.3397
      alpha(4)    0.0384531     0.0725763    0.5298   0.5968

      Null hypothesis: no ARCH effect is present
      Test statistic: LM = 7.30776
      with p-value = P(Chi-square(4) > 7.30776) = 0.120491
    '''

    '''
    Variance Inflation Factors
    Minimum possible value = 1.0
    Values > 10.0 may indicate a collinearity problem

       ds_l_realgdp    1.002
          realint_1    1.002

    VIF(j) = 1/(1 - R(j)^2), where R(j) is the multiple correlation
    coefficient between variable j and the other independent variables

    Properties of matrix X'X:

     1-norm = 6862.0664
     Determinant = 1.0296049e+009
     Reciprocal condition number = 0.013819244
    '''

    '''
    Test for ARCH of order 4 -
      Null hypothesis: no ARCH effect is present
      Test statistic: LM = 7.30776
      with p-value = P(Chi-square(4) > 7.30776) = 0.120491

    Test of common factor restriction -
      Null hypothesis: restriction is acceptable
      Test statistic: F(2, 195) = 0.426391
      with p-value = P(F(2, 195) > 0.426391) = 0.653468

    Test for normality of residual -
      Null hypothesis: error is normally distributed
      Test statistic: Chi-square(2) = 20.2792
      with p-value = 3.94837e-005
    '''

    #no idea what this is
    '''
    Augmented regression for common factor test
    OLS, using observations 1959:3-2009:3 (T = 201)
    Dependent variable: ds_l_realinv

                       coefficient   std. error   t-ratio    p-value
      ---------------------------------------------------------------
      const            -10.9481      1.35807      -8.062    7.44e-014 ***
      ds_l_realgdp       4.28893     0.229459     18.69     2.40e-045 ***
      realint_1         -0.662644    0.334872     -1.979    0.0492    **
      ds_l_realinv_1    -0.108892    0.0715042    -1.523    0.1294
      ds_l_realgdp_1     0.660443    0.390372      1.692    0.0923    *
      realint_2          0.0769695   0.341527      0.2254   0.8219

      Sum of squared residuals = 22432.8

    Test of common factor restriction

      Test statistic: F(2, 195) = 0.426391, with p-value = 0.653468
    '''

    ################ with OLS, HAC errors

    #Model 5: OLS, using observations 1959:2-2009:3 (T = 202)
    #Dependent variable: ds_l_realinv
    #HAC standard errors, bandwidth 4 (Bartlett kernel)

    #coefficient   std. error   t-ratio    p-value 95% CONFIDENCE INTERVAL
    #for confidence interval t(199, 0.025) = 1.972

    partable = np.array([
        [-9.48167,  1.17709,  -8.055, 7.17e-014, -11.8029,  -7.16049],   # ***
        [ 4.37422,  0.328787, 13.30,  2.62e-029,   3.72587,  5.02258],   # ***
        [-0.613997, 0.293619, -2.091, 0.0378,     -1.19300, -0.0349939]]) # **

    result_gretl_g1 = dict(
        endog_mean=("Mean dependent var",   3.257395),
        endog_std=("S.D. dependent var",   18.73915),
        ssr=("Sum squared resid",       22799.68),
        mse_resid_sqrt=("S.E. of regression", 10.70380),
        rsquared=("R-squared",           0.676978),
        rsquared_adj=("Adjusted R-squared", 0.673731),
        fvalue=("F(2, 199)",            90.79971),
        f_pvalue=("P-value(F)",          9.53e-29),
        llf=("Log-likelihood",        -763.9752),
        aic=("Akaike criterion",      1533.950),
        bic=("Schwarz criterion",     1543.875),
        hqic=("Hannan-Quinn",         1537.966),
        resid_acf1=("rho",              -0.107341),
        dw=("Durbin-Watson",             2.213805))

    linear_logs = [1.68351, 0.430953, 2, "chi2"]
    #for logs: dropping 70 nan or incomplete observations, T=133
    #(res_ols.model.exog <= 0).any(1).sum() = 69  ?not 70
    linear_squares = [7.52477, 0.0232283, 2, "chi2"]

    #Autocorrelation, Breusch-Godfrey test for autocorrelation up to order 4
    lm_acorr4 = [1.17928, 0.321197, 4, 195, "F"]
    lm2_acorr4 = [4.771043, 0.312, 4, "chi2"]
    acorr_ljungbox4 = [5.23587, 0.264, 4, "chi2"]

    #break
    cusum_Harvey_Collier = [0.494432, 0.621549, 198, "t"]  #stats.t.sf(0.494432, 198)*2
    #see cusum results in files
    break_qlr = [3.01985, 0.1, 3, 196, "maxF"]  #TODO check this, max at 2001:4
    break_chow = [13.1897, 0.00424384, 3, "chi2"]  #break at 1984:1

    arch_4 = [3.43473, 0.487871, 4, "chi2"]

    normality = [23.962, 0.00001, 2, "chi2"]

    het_white = [33.503723, 0.000003, 5, "chi2"]
    het_breusch_pagan = [1.302014, 0.521520, 2, "chi2"]  #TODO: not available
    het_breusch_pagan_konker = [0.709924, 0.701200, 2, "chi2"]

    reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
    reset_2 = [7.268492, 0.00762, 1, 198, "f"]
    reset_3 = [5.248951, 0.023, 1, 198, "f"]

    #not available
    cond_1norm = 5984.0525
    determinant = 7.1087467e+008
    reciprocal_condition_number = 0.013826504
    vif = [1.001, 1.001]

    names = 'date residual leverage influence DFFITS'.split()
    cur_dir = os.path.abspath(os.path.dirname(__file__))
    fpath = os.path.join(cur_dir, 'results/leverage_influence_ols_nostars.txt')
    lev = np.genfromtxt(fpath, skip_header=3, skip_footer=1,
                        converters={0: lambda s: s})
    #either numpy 1.6 or python 3.2 changed behavior
    if np.isnan(lev[-1]['f1']):
        lev = np.genfromtxt(fpath, skip_header=3, skip_footer=2,
                            converters={0: lambda s: s})

    lev.dtype.names = names

    res = res_ols  #for easier copying

    cov_hac = sw.cov_hac_simple(res, nlags=4, use_correction=False)
    bse_hac = sw.se_cov(cov_hac)

    assert_almost_equal(res.params, partable[:, 0], 5)
    assert_almost_equal(bse_hac, partable[:, 1], 5)
    #TODO

    assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
    assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=4)  #not in gretl
    assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=6)  #FAIL
    assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=6)  #FAIL
    assert_almost_equal(np.sqrt(res.mse_resid),
                        result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
    #f-value is based on cov_hac I guess
    #res2 = res.get_robustcov_results(cov_type='HC1')
    # TODO: fvalue differs from Gretl, trying any of the HCx
    #assert_almost_equal(res2.fvalue, result_gretl_g1['fvalue'][1], decimal=0) #FAIL
    #assert_approx_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], significant=1) #FAIL
    #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

    c = oi.reset_ramsey(res, degree=2)
    compare_ftest(c, reset_2, decimal=(6, 5))
    c = oi.reset_ramsey(res, degree=3)
    compare_ftest(c, reset_2_3, decimal=(6, 5))

    linear_sq = smsdia.linear_lm(res.resid, res.model.exog)
    assert_almost_equal(linear_sq[0], linear_squares[0], decimal=6)
    assert_almost_equal(linear_sq[1], linear_squares[1], decimal=7)

    hbpk = smsdia.het_breuschpagan(res.resid, res.model.exog)
    assert_almost_equal(hbpk[0], het_breusch_pagan_konker[0], decimal=6)
    assert_almost_equal(hbpk[1], het_breusch_pagan_konker[1], decimal=6)

    hw = smsdia.het_white(res.resid, res.model.exog)
    assert_almost_equal(hw[:2], het_white[:2], 6)

    #arch
    #sm_arch = smsdia.acorr_lm(res.resid**2, maxlag=4, autolag=None)
    sm_arch = smsdia.het_arch(res.resid, maxlag=4)
    assert_almost_equal(sm_arch[0], arch_4[0], decimal=5)
    assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

    vif2 = [oi.variance_inflation_factor(res.model.exog, k) for k in [1, 2]]

    infl = oi.OLSInfluence(res_ols)
    #print np.max(np.abs(lev['DFFITS'] - infl.dffits[0]))
    #print np.max(np.abs(lev['leverage'] - infl.hat_matrix_diag))
    #print np.max(np.abs(lev['influence'] - infl.influence))  #just added this based on Gretl

    #just rough test, low decimal in Gretl output,
    assert_almost_equal(lev['residual'], res.resid, decimal=3)
    assert_almost_equal(lev['DFFITS'], infl.dffits[0], decimal=3)
    assert_almost_equal(lev['leverage'], infl.hat_matrix_diag, decimal=3)
    assert_almost_equal(lev['influence'], infl.influence, decimal=4)
def test_influence_wrapped():
    from pandas import DataFrame

    d = macrodata.load_pandas().data
    #growth rates
    gs_l_realinv = 400 * np.log(d['realinv']).diff().dropna()
    gs_l_realgdp = 400 * np.log(d['realgdp']).diff().dropna()
    lint = d['realint'][:-1]

    # re-index these because they will not conform to lint
    gs_l_realgdp.index = lint.index
    gs_l_realinv.index = lint.index

    data = dict(const=np.ones_like(lint), lint=lint, lrealgdp=gs_l_realgdp)
    #order is important
    exog = DataFrame(data, columns=['const', 'lrealgdp', 'lint'])

    res = OLS(gs_l_realinv, exog).fit()
    #basic
    # already tested
    #assert_almost_equal(lsdiag['cov.scaled'],
    #                    res.cov_params().values.ravel(), decimal=14)
    #assert_almost_equal(lsdiag['cov.unscaled'],
    #                    res.normalized_cov_params.values.ravel(), decimal=14)

    infl = oi.OLSInfluence(res)
    # smoke test just to make sure it works, results separately tested
    df = infl.summary_frame()
    assert_(isinstance(df, DataFrame))

    #this test is slow
    path = os.path.join(cur_dir, "results", "influence_lsdiag_R.json")
    with open(path, "r") as fp:
        lsdiag = json.load(fp)

    c0, c1 = infl.cooks_distance  #TODO: what's c1, it's pvalues? -ss

    #NOTE: we get a hard-coded 5 decimals with pandas testing
    assert_almost_equal(c0, lsdiag['cooks'], 14)
    assert_almost_equal(infl.hat_matrix_diag, (lsdiag['hat']), 14)
    assert_almost_equal(infl.resid_studentized_internal,
                        lsdiag['std.res'], 14)

    #slow:
    dffits, dffth = infl.dffits
    assert_almost_equal(dffits, lsdiag['dfits'], 14)
    assert_almost_equal(infl.resid_studentized_external,
                        lsdiag['stud.res'], 14)

    import pandas
    fn = os.path.join(cur_dir, "results/influence_measures_R.csv")
    infl_r = pandas.read_csv(fn, index_col=0)
    conv = lambda s: 1 if s == 'TRUE' else 0
    fn = os.path.join(cur_dir, "results/influence_measures_bool_R.csv")
    #not used yet:
    #infl_bool_r = pandas.read_csv(fn, index_col=0,
    #                              converters=dict(zip(lrange(7), [conv]*7)))

    infl_r2 = np.asarray(infl_r)
    #TODO: finish wrapping this stuff
    assert_almost_equal(infl.dfbetas, infl_r2[:, :3], decimal=13)
    assert_almost_equal(infl.cov_ratio, infl_r2[:, 4], decimal=14)
# ### Outliers and Influential cases

# #### references
#
# https://www.statsmodels.org/stable/generated/statsmodels.stats.outliers_influence.OLSInfluence.html#statsmodels.stats.outliers_influence.OLSInfluence
#
# https://www.statsmodels.org/dev/generated/statsmodels.regression.linear_model.OLS.html
#
# https://stackoverflow.com/questions/46304514/access-standardized-residuals-cooks-values-hatvalues-leverage-etc-easily-i
#
# https://www.geeksforgeeks.org/reduce-in-python/

# In[71]:

summary_frame = sms.OLSInfluence(m02).summary_frame()
print(summary_frame.head())

# In[72]:

summary_frame = summary_frame[[
    'cooks_d', 'standard_resid', 'student_resid', 'hat_diag'
]]
print(summary_frame.head())

# In[73]:

resid = pd.DataFrame(df['sales'] - m02.fittedvalues)
resid.columns = ['residual']

# In[74]:
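# An illustrative addition (not part of the original notebook): flag
# influential cases with the common rule-of-thumb cutoff Cook's D > 4/n,
# using the summary frame built in the cells above.

n_obs = len(summary_frame)
influential = summary_frame[summary_frame['cooks_d'] > 4 / n_obs]
print(influential)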
plt.legend(loc='upper left')

ax2 = fig.add_subplot(3, 1, 3)
plt.plot(resid_studentized, 'o', label='studentized_resid')
plt.plot(dffits, 'o', label='DFFITS')
leg = plt.legend(loc='lower left', fancybox=True)
leg.get_frame().set_alpha(0.5)  #, fontsize='small')
ltext = leg.get_texts()  # all the text.Text instances in the legend
plt.setp(ltext, fontsize='small')  # the legend text fontsize

print(oi.reset_ramsey(res, degree=3))

#note, constant in last column
for i in range(1):
    print(oi.variance_inflation_factor(res.model.exog, i))

infl = oi.OLSInfluence(res_ols)
print(infl.resid_studentized_external)
print(infl.resid_studentized_internal)
print(infl.summary_table())
print(oi.summary_table(res, alpha=0.05)[0])

'''
>>> res.resid
array([  4.28571429,   4.        ,   0.57142857,  -3.64285714,
        -4.71428571,   1.92857143,  10.        ,  -6.35714286,
       -11.        ,  -1.42857143,   1.71428571,   4.64285714])
>>> infl.hat_matrix_diag
array([ 0.10084034,  0.11764706,  0.28571429,  0.20168067,  0.10084034,
        0.16806723,  0.11764706,  0.08403361,  0.11764706,  0.28571429,
        0.33613445,  0.08403361])
>>> infl.resid_press
array([ 4.76635514,  4.53333333,  0.8       , -4.56315789,
def chapter_3():
    """
    Notes for Linear Regression

    - coefficients -> give average change in Y with a one-unit increase in X
    - confidence interval -> B1_hat +- 2 * SE(B1_hat)
        - 95% chance the interval contains the true value of B
        - SE(B1_hat)^2 -> var(e) / SSx (sum of squared deviations of x)
    - t-statistic
        - t = (B1_hat - 0) / SE(B1_hat)
    - test for synergy (additive assumption)
        - effect of each predictor on response is independent of other
          predictors
        - include interaction term -> x1 * x2 (see the sketch after this
          function)
        - if interaction term has small p value, then not additive (synergy
          exists)
        - if it results in a substantial increase in r2, then not additive
          (synergy exists)
    - relationship exists
        - p value < 0.0005 or < 0.0001
        - F statistic greater than 1
    - strength of relationship
        - RSE -> estimates standard deviation of response from regression line
        - R squared -> % variability in response explained by predictors
        - percent error -> 100 * residual_standard_error / ys.mean()
    - accuracy of prediction
        - prediction interval (individual response)
        - confidence interval (average response)
    - non-linearity
        - residual plots (fitted values vs. studentized/standardized residuals)
        - if residual plots are not random, transform with log(x), sqrt(x),
          or x^2
    - correlation of error terms
        - will underestimate p value and narrow confidence/prediction intervals
    - heteroscedasticity (funnel shape of residual plot)
        - non-constant variances in the errors
        - if it exists, transform the response with log(y) or sqrt(y)
    - co-linearity of features
        - (VIF) variance inflation factor -> 1 / (1 - r2)
        - correlation matrix
        - reduces t-statistic and increases standard error
    - outliers
        - leverage -> high impact on RSE and/or regression line
        - look at studentized residuals (observations > 3 are outliers)
        - influence (leverage) plots
    """
    # 3.8 -> Simple Linear Regression on the Auto data set
    dat = pd.read_csv("Auto.csv")
    dat = dat.replace("?", np.nan).dropna()

    # add constant to x values to ensure mean of residuals = 0
    xs = sm.add_constant(dat["horsepower"].astype(float))
    ys = dat["mpg"].astype(float)
    model = sm.OLS(ys, xs).fit()
    intercept, slope = model.params
    r2 = model.rsquared

    # variance inflation factor -> test for co-linearity
    # min(VIF) = 1.0; if VIF > 5 or 10, features are most likely correlated
    vif = 1 / (1 - r2)

    f_stat = model.fvalue
    p_value = model.pvalues[1]

    # create new line with the coefficients
    fit = [slope * x + intercept for x in xs["horsepower"]]
    print("Simple OLS: %s" % model.summary())

    prediction = model.predict()
    residuals = ys.astype(float) - prediction
    standardized_residuals = (residuals - residuals.mean()) / \
        (residuals.max() - residuals.min())
    #residual_standard_error = np.sqrt(model.mse_resid)
    #percent_error = 100 * residual_standard_error / ys.mean()

    """ Plot """
    f = plt.figure()
    ax = f.add_subplot(221)
    ax2 = f.add_subplot(223)
    ax3 = f.add_subplot(222)
    ax4 = f.add_subplot(224)

    ax.scatter(xs["horsepower"], ys,
               label="r2=%f; f=%f; p=%f" % (r2, f_stat, p_value))
    ax.plot(xs["horsepower"], fit, color="r",
            label="f(x) = %f * x + %f" % (slope, intercept))

    # plot fitted values vs residuals to check for non-linearity
    ax2.scatter(model.fittedvalues, residuals, color="r")
    ax2.axhline(0, color="k")
    ax2.set_xlabel("fitted values")
    ax2.set_ylabel("residuals")

    # show leverage to identify observations that may have
    # more effect on the regression than other observations
    sm.graphics.influence_plot(model, ax=ax3)

    # show fitted values vs studentized residuals
    outlier_influence = outliers_influence.OLSInfluence(model).summary_frame()
    ax4.scatter(model.fittedvalues, outlier_influence["student_resid"])
    ax4.axhline(0, color="k")
    ax4.set_xlabel("fitted values")
    ax4.set_ylabel("studentized residuals")

    for _ax in [ax, ax2, ax3, ax4]:
        _ax.legend(loc="best")
    plt.show()

    # 3.9 -> Multiple Linear Regression on the Auto data set
    xs = dat[[
        "cylinders", "displacement", "horsepower", "weight", "acceleration",
        "year", "origin"
    ]].astype(float)

    # plot correlation matrix to check co-linearity
    # co-linearity reduces the t-statistic (power) of the test
    # and also increases standard error
    print("Correlations: %s" % xs.corr())
    grid = sns.PairGrid(xs)
    grid = grid.map(plt.scatter)
    plt.show()

    # pandas' pd.ols has been removed; fit the multiple regression with
    # statsmodels instead (constant added to match pd.ols's intercept default)
    model = sm.OLS(ys, sm.add_constant(xs)).fit()
    print("Multiple OLS: %s" % model.summary())

    # compute variance inflation factor (VIF) to check for co-linearity;
    # VIF(j) = 1/(1 - R(j)^2), with R(j)^2 from regressing feature j on the
    # other features -- the original mapped 1/(1 - x) over the *coefficients*,
    # which is not a VIF
    exog = model.model.exog
    vif = [outliers_influence.variance_inflation_factor(exog, j)
           for j in range(1, exog.shape[1])]
    print("VIFs: %s" % vif)

    """
    Looking at the p-values associated with each predictor's t-statistic, we
    see that displacement, weight, year, and origin have a statistically
    significant relationship, while cylinders, horsepower, and acceleration
    do not.
    """
    print("Coefficients: %s" % model.params)

    """
    The regression coefficient for year, 0.7508, suggests that for every one
    year, mpg increases by the coefficient. In other words, cars become more
    fuel efficient every year by almost 1 mpg / year.
    """
    residuals = model.resid
    standardized_residuals = (residuals - residuals.mean()) / \
        (residuals.max() - residuals.min())
    residual_standard_error = np.sqrt(model.mse_resid)
    percent_error = 100 * residual_standard_error / ys.mean()

    """ Plot """
    f = plt.figure()
    ax = f.add_subplot(221)
    ax2 = f.add_subplot(223)
    ax3 = f.add_subplot(222)
    ax4 = f.add_subplot(224)

    ax.scatter(model.fittedvalues, residuals)
    ax.axhline(0, color="k")
    ax.set_xlabel("y fitted values")
    ax.set_ylabel("residuals")

    ax2.scatter(model.fittedvalues, standardized_residuals,
                label='percent error=%f' % percent_error)
    ax2.axhline(0, color="k")
    ax2.set_xlabel("y fitted values")
    ax2.set_ylabel("standardized residuals")

    sm.graphics.influence_plot(model, ax=ax3)

    for _ax in [ax, ax2, ax3, ax4]:
        _ax.legend(loc="best")
    plt.show()
def nagadan(
        target, npaths, duration,
        base, conductivity, porosity, thickness,
        wells, observations,
        xmin=np.nan, xmax=np.nan, ymin=np.nan, ymax=np.nan,
        buffer=100, spacing=10, umbra=10,
        confined=True, tol=1, maxstep=10):
    """
    The entry-point for the NagadanPy project.

    Arguments
    ---------
    target : int
        The index identifying the target well in the wells. That is, the
        well for which we will compute a stochastic capture zone. This
        uses python's 0-based indexing.

    npaths : int
        The number of paths (starting points for the backtraces) to
        generate uniformly around the target well. 0 < npaths.

    duration : float
        The duration of the capture zone [d]. For example, a 10-year
        capture zone would have a duration = 10*365.25. 0 < duration.

    base : float
        The base elevation of the aquifer [m].

    conductivity : float
        The hydraulic conductivity of the aquifer [m/d]. 0 < conductivity.

    porosity : float
        The porosity of the aquifer []. 0 < porosity < 1.

    thickness : float
        The thickness of the aquifer [m]. 0 < thickness.

    wells : list
        The list of well tuples. Each well tuple has four components.
            xw : float
                The x-coordinate of the well [m].
            yw : float
                The y-coordinate of the well [m].
            rw : float
                The radius of the well [m]. 0 < rw.
            qw : float
                The discharge of the well [m^3/d].

    observations : list of observation tuples.
        An observation tuple contains four values: (x, y, z_ev, z_std), where
            x : float
                The x-coordinate of the observation [m].
            y : float
                The y-coordinate of the observation [m].
            z_ev : float
                The expected value of the observed static water level
                elevation [m].
            z_std : float
                The standard deviation of the observed static water level
                elevation [m].

    buffer : float, optional
        The buffer distance [m] around each well. If an obs falls within
        buffer of any well, it is removed. Default is 100 [m].

    spacing : float, optional
        The spacing of the rows and the columns [m] in the square
        ProbabilityField grids. Default is 10 [m].

    umbra : float, optional
        The vector-to-raster range [m] when mapping a particle path onto
        the ProbabilityField grids. If a grid node is within umbra of a
        particle path, it is marked as visited. Default is 10 [m].

    confined : boolean, optional
        True if it is safe to assume that the aquifer is confined
        throughout the domain of interest, False otherwise. This is a
        speed kludge. Default is True.

    tol : float, optional
        The tolerance [m] for the local error when solving the backtrace
        differential equation. This is an inherent parameter for an
        adaptive Runge-Kutta method. Default is 1.

    maxstep : float, optional
        The maximum allowed step in space [m] when solving the backtrace
        differential equation. This is a maximum space step and NOT a
        maximum time step. Default is 10.

    Returns
    -------
    None.

    Notes
    -----
    o   Most of the time-consuming work is orchestrated by the
        create_capturezone function.

    o   A hypothetical usage sketch follows this function.
    """

    # Validate the arguments.
    assert(isinstance(target, int) and 0 <= target < len(wells))
    assert(isinstance(npaths, int) and 0 < npaths)
    assert((isinstance(duration, int) or isinstance(duration, float)) and
           0 < duration)
    assert(isinstance(base, int) or isinstance(base, float))
    assert((isinstance(conductivity, int) or isinstance(conductivity, float))
           and 0 < conductivity)
    assert(isinstance(porosity, float) and 0 < porosity < 1)
    assert((isinstance(thickness, int) or isinstance(thickness, float)) and
           0 < thickness)

    assert(isinstance(wells, list) and len(wells) >= 1)
    for we in wells:
        assert(len(we) == 4 and
               (isinstance(we[0], int) or isinstance(we[0], float)) and
               (isinstance(we[1], int) or isinstance(we[1], float)) and
               (isinstance(we[2], int) or isinstance(we[2], float)) and
               0 < we[2] and
               (isinstance(we[3], int) or isinstance(we[3], float)))

    assert(isinstance(observations, list) and len(observations) > 6)
    for ob in observations:
        assert(len(ob) == 4 and
               (isinstance(ob[0], int) or isinstance(ob[0], float)) and
               (isinstance(ob[1], int) or isinstance(ob[1], float)) and
               (isinstance(ob[2], int) or isinstance(ob[2], float)) and
               (isinstance(ob[3], int) or isinstance(ob[3], float)) and
               0 <= ob[3])

    assert((isinstance(buffer, int) or isinstance(buffer, float)) and
           0 < buffer)
    assert((isinstance(spacing, int) or isinstance(spacing, float)) and
           0 < spacing)
    assert((isinstance(umbra, int) or isinstance(umbra, float)) and
           0 < umbra)
    assert(isinstance(confined, bool))
    assert((isinstance(tol, int) or isinstance(tol, float)) and 0 < tol)
    assert((isinstance(maxstep, int) or isinstance(maxstep, float)) and
           0 < maxstep)

    # Initialize the stopwatch.
    start_time = time.time()

    # Log the run information.
    log_the_run(
        target, npaths, duration,
        base, conductivity, porosity, thickness,
        wells, observations,
        buffer, spacing, umbra,
        confined, tol, maxstep)

    # Filter out all of the observations that are too close to any
    # pumping well, and average the duplicate observations.
    obs = filter_obs(observations, wells, buffer)
    nobs = len(obs)
    assert(nobs > 6)

    # Log summary statistics on the wells and the active observations.
    buf = summary_statistics(
        wells,
        ['Easting', 'Northing', 'Radius', 'Discharge'],
        ['12.2f', '12.2f', '12.3f', '12.2f'],
        'Wells')
    log.info('\n')
    log.info(buf.getvalue())

    buf = summary_statistics(
        obs,
        ['Easting', 'Northing', 'Head', 'Std'],
        ['12.2f', '12.2f', '10.2f', '10.2f'],
        'Active Observations')
    log.info('\n')
    log.info(buf.getvalue())

    # Set the target.
    xtarget, ytarget, rtarget = wells[target][0:3]

    # Create the model.
    mo = Model(base, conductivity, porosity, thickness, wells)

    # General influence statistics.
    WA, Wb = mo.construct_fit(obs, xtarget, ytarget)
    ols_model = sm.OLS(Wb, WA, hasconst=True)
    ols_results = ols_model.fit()
    ols_influence = smso.OLSInfluence(ols_results)

    log.info('\n')
    log.info(ols_results.summary(
        xname=['A', 'B', 'C', 'D', 'E', 'F'],
        yname='scaled potential'))
    log.info('\n')
    log.info(ols_influence.summary_frame())

    # Compute the exhaustive leave-one-out, leave-two-out, and
    # leave-three-out boomerang analyses.
    kldiv_one, kldiv_two, kldiv_three = compute_boomerang(WA, Wb)

    kldiv_one.sort(reverse=True)
    kldiv_two.sort(reverse=True)
    kldiv_three.sort(reverse=True)

    most_influential_singleton = kldiv_one[0][1]
    most_influential_pair = [kldiv_two[0][1], kldiv_two[0][2]]
    most_influential_triple = [kldiv_three[0][1], kldiv_three[0][2],
                               kldiv_three[0][3]]

    log.info('\n')
    log.info('Top 5 of the Leave-one-out analysis:')
    for i in range(min(len(kldiv_one), 5)):
        log.info('    {0}'.format(kldiv_one[i]))

    log.info('\n')
    log.info('Top 5 of the Leave-two-out analysis:')
    for i in range(min(len(kldiv_two), 5)):
        log.info('    {0}'.format(kldiv_two[i]))

    log.info('\n')
    log.info('Top 5 of the Leave-three-out analysis:')
    for i in range(min(len(kldiv_three), 5)):
        log.info('    {0}'.format(kldiv_three[i]))

    # Define the local backtracing velocity function.
    if confined:
        def feval(xy):
            Vx, Vy = mo.compute_velocity_confined(xy[0], xy[1])
            return np.array([-Vx, -Vy])
    else:
        def feval(xy):
            Vx, Vy = mo.compute_velocity(xy[0], xy[1])
            return np.array([-Vx, -Vy])

    # Compute the four capture zones around the target well ---

    # Using all of the obs.
    mo.fit_regional_flow(obs, xtarget, ytarget)
    pf0 = ProbabilityField(spacing, spacing, xtarget, ytarget)
    compute_capturezone(
        xtarget, ytarget, rtarget, npaths, duration,
        pf0, umbra, 1.0, tol, maxstep, feval)

    # Using all of the obs except the most influential singleton.
    obs1 = np.delete(obs, most_influential_singleton, 0)
    mo.fit_regional_flow(obs1, xtarget, ytarget)
    pf1 = ProbabilityField(spacing, spacing, xtarget, ytarget)
    compute_capturezone(
        xtarget, ytarget, rtarget, npaths, duration,
        pf1, umbra, 1.0, tol, maxstep, feval)

    # Using all of the obs except the most influential pair.
    obs2 = np.delete(obs, most_influential_pair, 0)
    mo.fit_regional_flow(obs2, xtarget, ytarget)
    pf2 = ProbabilityField(spacing, spacing, xtarget, ytarget)
    compute_capturezone(
        xtarget, ytarget, rtarget, npaths, duration,
        pf2, umbra, 1.0, tol, maxstep, feval)

    # Using all of the obs except the most influential triple.
    obs3 = np.delete(obs, most_influential_triple, 0)
    mo.fit_regional_flow(obs3, xtarget, ytarget)
    pf3 = ProbabilityField(spacing, spacing, xtarget, ytarget)
    compute_capturezone(
        xtarget, ytarget, rtarget, npaths, duration,
        pf3, umbra, 1.0, tol, maxstep, feval)

    # Compute the capture zone statistics.
    Xmin = min([pf0.xmin, pf1.xmin, pf2.xmin, pf3.xmin])
    Xmax = max([pf0.xmax, pf1.xmax, pf2.xmax, pf3.xmax])
    Ymin = min([pf0.ymin, pf1.ymin, pf2.ymin, pf3.ymin])
    Ymax = max([pf0.ymax, pf1.ymax, pf2.ymax, pf3.ymax])

    pf0.expand(Xmin, Xmax, Ymin, Ymax)
    pf1.expand(Xmin, Xmax, Ymin, Ymax)
    pf2.expand(Xmin, Xmax, Ymin, Ymax)
    pf3.expand(Xmin, Xmax, Ymin, Ymax)

    area0 = sum(sum(pf0.pgrid > 0)) * spacing**2
    area1 = sum(sum(pf1.pgrid > 0)) * spacing**2
    area2 = sum(sum(pf2.pgrid > 0)) * spacing**2
    area3 = sum(sum(pf3.pgrid > 0)) * spacing**2

    area01 = sum(sum((pf0.pgrid > 0) & (pf1.pgrid > 0))) * spacing**2
    area02 = sum(sum((pf0.pgrid > 0) & (pf2.pgrid > 0))) * spacing**2
    area03 = sum(sum((pf0.pgrid > 0) & (pf3.pgrid > 0))) * spacing**2

    area012 = sum(sum((pf0.pgrid > 0) & (pf1.pgrid > 0) &
                      (pf2.pgrid > 0))) * spacing**2
    area0123 = sum(sum((pf0.pgrid > 0) & (pf1.pgrid > 0) &
                       (pf2.pgrid > 0) & (pf3.pgrid > 0))) * spacing**2

    log.info('\n')
    log.info('CAPTURE ZONE STATISTICS:')
    log.info('    0 = capture zone using all observations.')
    log.info('    1 = capture zone without most influential singleton.')
    log.info('    2 = capture zone without most influential pair.')
    log.info('    3 = capture zone without most influential triple.')
    log.info('')
    log.info('    area(0) = {0:.2f}'.format(area0))
    log.info('    area(1) = {0:.2f}'.format(area1))
    log.info('    area(2) = {0:.2f}'.format(area2))
    log.info('    area(3) = {0:.2f}'.format(area3))
    log.info('')
    log.info('    area(0 & 1) = {0:.2f}'.format(area01))
    log.info('    area(0 & 1 & 2) = {0:.2f}'.format(area012))
    log.info('    area(0 & 1 & 2 & 3) = {0:.2f}'.format(area0123))
    log.info('')
    log.info('    area(0 & !1) = {0:.2f} ({1:.2f}%)'.format(
        area0 - area01, (area0 - area01) / area0 * 100))
    log.info('    area(1 & !0) = {0:.2f} ({1:.2f}%)'.format(
        area1 - area01, (area1 - area01) / area1 * 100))
    log.info('')
    log.info('    area(0 & !2) = {0:.2f} ({1:.2f}%)'.format(
        area0 - area02, (area0 - area02) / area0 * 100))
    log.info('    area(2 & !0) = {0:.2f} ({1:.2f}%)'.format(
        area2 - area02, (area2 - area02) / area2 * 100))
    log.info('')
    log.info('    area(0 & !3) = {0:.2f} ({1:.2f}%)'.format(
        area0 - area03, (area0 - area03) / area0 * 100))
    log.info('    area(3 & !0) = {0:.2f} ({1:.2f}%)'.format(
        area3 - area03, (area3 - area03) / area3 * 100))
    log.info('')

    elapsedtime = time.time() - start_time
    log.info('Computational elapsed time = %.4f seconds' % elapsedtime)
    log.info('')

    # -----------------------------------------------------
    # GRAPHICAL OUTPUT STARTS HERE
    # -----------------------------------------------------

    # ---------------------------------
    # PLOT: studentized residuals at the observation locations.
    # ---------------------------------
    plt.figure()
    plt.axis('equal')

    plot_locations(target, wells, obs)

    resid = ols_influence.resid_studentized
    max_resid = max(abs(resid))
    xob = np.array([ob[0] for ob in obs])
    yob = np.array([ob[1] for ob in obs])
    a = 40 + (40 * abs(resid) / max_resid)**2
    plt.scatter(xob[resid > 0], yob[resid > 0], s=a[resid > 0],
                c='b', alpha=0.5)
    plt.scatter(xob[resid < 0], yob[resid < 0], s=a[resid < 0],
                c='r', alpha=0.5)

    plt.xlabel('UTM Easting [m]')
    plt.ylabel('UTM Northing [m]')
    plt.title('Studentized Residuals', fontsize=14)
    plt.grid(True)

    # ---------------------------------
    # PLOT: studentized residuals
    # ---------------------------------
    plt.figure()
    resid = ols_influence.resid_studentized

    # Bar plot for the studentized residuals.
    plt.subplot(1, 2, 1)
    plt.bar(range(nobs), resid)
    threshold = 2
    left, right = plt.xlim()
    plt.plot([left, right], [threshold, threshold], 'r', linewidth=3)
    plt.plot([left, right], [-threshold, -threshold], 'r', linewidth=3)
    plt.xlabel('Observation index')
    plt.ylabel('Studentized Residuals')
    plt.title('Studentized Residuals', fontsize=14)
    plt.grid(True)

    # Normal probability plot for the studentized residuals.
    plt.subplot(1, 2, 2)
    scipy.stats.probplot(resid, fit=True, plot=plt)
    plt.ylabel('Studentized Residuals')
    plt.title('Normal Probability Plot for Studentized Residuals',
              fontsize=14)
    plt.grid(True)

    # ---------------------------------
    # PLOT: locations of observations and wells, overlaying the head contours.
    # ---------------------------------
    plt.figure()
    plt.axis('equal')

    plot_locations(target, wells, obs)

    i = most_influential_singleton
    plt.plot(obs[i][0], obs[i][1], 's', markeredgecolor='k',
             fillstyle='none', markersize=10)

    for i in most_influential_pair:
        plt.plot(obs[i][0], obs[i][1], 'D', markeredgecolor='k',
                 fillstyle='none', markersize=13)

    for i in most_influential_triple:
        plt.plot(obs[i][0], obs[i][1], 'o', markeredgecolor='k',
                 fillstyle='none', markersize=16)

    nrows = 100
    ncols = 100
    xmin, xmax, ymin, ymax = plt.axis()
    contour_head(mo, xmin, xmax, ymin, ymax, nrows, ncols)

    plt.xlabel('UTM Easting [m]')
    plt.ylabel('UTM Northing [m]')
    plt.title('Locations', fontsize=14)
    plt.grid(True)

    # ---------------------------------
    # PLOT: sorted KL divergence
    # ---------------------------------
    plt.figure()

    # Leave-one-out analysis.
    plt.subplot(1, 3, 1)
    plt.scatter(range(len(kldiv_one)), [p[0] for p in kldiv_one])
    plt.xlabel('Sort Order')
    plt.ylabel('KL Divergence [bits]')
    plt.title('Leave-One-Out', fontsize=14)
    plt.grid(True)

    # Leave-two-out analysis.
    plt.subplot(1, 3, 2)
    plt.scatter(range(len(kldiv_two)), [p[0] for p in kldiv_two])
    plt.xlabel('Sort Order')
    plt.ylabel('KL Divergence [bits]')
    plt.title('Leave-Two-Out', fontsize=14)
    plt.grid(True)

    # Leave-three-out analysis.
    plt.subplot(1, 3, 3)
    plt.scatter(range(len(kldiv_three)), [p[0] for p in kldiv_three])
    plt.xlabel('Sort Order')
    plt.ylabel('KL Divergence [bits]')
    plt.title('Leave-Three-Out', fontsize=14)
    plt.grid(True)

    # ---------------------------------
    # PLOT: capture zones
    # ---------------------------------
    plt.figure()

    # With all data.
    plt.subplot(2, 2, 1)
    plt.axis('equal')
    X = np.linspace(pf0.xmin, pf0.xmax, pf0.ncols)
    Y = np.linspace(pf0.ymin, pf0.ymax, pf0.nrows)
    Z = pf0.pgrid
    plt.contourf(X, Y, Z, [0.0, 0.5, 1.0], cmap='tab10')
    plt.contour(X, Y, Z, [0.0, 0.5, 1.0], colors=['black'])
    plot_locations(target, wells, obs)
    plt.xlabel('UTM Easting [m]')
    plt.ylabel('UTM Northing [m]')
    plt.title('With All Data', fontsize=14)
    plt.grid(True)
    plt.axis([Xmin, Xmax, Ymin, Ymax])

    # Used to plot the shadow on capture zones.
    [XX, YY] = np.meshgrid(X, Y)
    XX = np.reshape(XX[Z > 0.0], -1)
    YY = np.reshape(YY[Z > 0.0], -1)

    # Without the most influential singleton.
    plt.subplot(2, 2, 2)
    plt.axis('equal')
    X = np.linspace(pf1.xmin, pf1.xmax, pf1.ncols)
    Y = np.linspace(pf1.ymin, pf1.ymax, pf1.nrows)
    Z = pf1.pgrid
    plt.contourf(X, Y, Z, [0.0, 0.5, 1.0], cmap='tab10')
    plt.contour(X, Y, Z, [0.0, 0.5, 1.0], colors=['black'])
    plt.scatter(XX, YY, marker='.')
    plot_locations(target, wells, obs)
    plt.xlabel('UTM Easting [m]')
    plt.ylabel('UTM Northing [m]')
    plt.title('Without Most Influential Singleton', fontsize=14)
    plt.grid(True)
    plt.axis([Xmin, Xmax, Ymin, Ymax])

    # Without the most influential pair.
    plt.subplot(2, 2, 3)
    plt.axis('equal')
    X = np.linspace(pf2.xmin, pf2.xmax, pf2.ncols)
    Y = np.linspace(pf2.ymin, pf2.ymax, pf2.nrows)
    Z = pf2.pgrid
    plt.contourf(X, Y, Z, [0.0, 0.5, 1.0], cmap='tab10')
    plt.contour(X, Y, Z, [0.0, 0.5, 1.0], colors=['black'])
    plt.scatter(XX, YY, marker='.')
    plot_locations(target, wells, obs)
    plt.xlabel('UTM Easting [m]')
    plt.ylabel('UTM Northing [m]')
    plt.title('Without Most Influential Pair', fontsize=14)
    plt.grid(True)
    plt.axis([Xmin, Xmax, Ymin, Ymax])

    # Without the most influential triple.
    plt.subplot(2, 2, 4)
    plt.axis('equal')
    X = np.linspace(pf3.xmin, pf3.xmax, pf3.ncols)
    Y = np.linspace(pf3.ymin, pf3.ymax, pf3.nrows)
    Z = pf3.pgrid
    plt.contourf(X, Y, Z, [0.0, 0.5, 1.0], cmap='tab10')
    plt.contour(X, Y, Z, [0.0, 0.5, 1.0], colors=['black'])
    plt.scatter(XX, YY, marker='.')
    plot_locations(target, wells, obs)
    plt.xlabel('UTM Easting [m]')
    plt.ylabel('UTM Northing [m]')
    plt.title('Without Most Influential Triple', fontsize=14)
    plt.grid(True)
    plt.axis([Xmin, Xmax, Ymin, Ymax])

    # ---------------------------------
    # PLOT: DFBETAS
    # ---------------------------------
    plt.figure()
    dfbetas = ols_influence.dfbetas

    for i in range(6):
        plt.subplot(3, 2, i + 1)
        plt.bar(range(nobs), dfbetas[:, i])
        threshold = 2 / np.sqrt(nobs)   # rule of thumb: 2/sqrt(n)
        left, right = plt.xlim()
        plt.plot([left, right], [threshold, threshold], 'r', linewidth=3)
        plt.plot([left, right], [-threshold, -threshold], 'r', linewidth=3)
        plt.xlabel('Observation index')
        plt.ylabel('DFBETAS')
        plt.title('DFBETAS {0}'.format(chr(65 + i)), fontsize=10)
        plt.grid(True)
    plt.tight_layout()

    # ---------------------------------
    # PLOT: the influential data diagnostics
    # ---------------------------------
    plt.figure()

    # Leverage (diagonal of the Hat matrix) bar plot.
    plt.subplot(1, 2, 1)
    leverage = ols_influence.hat_matrix_diag
    plt.bar(range(nobs), leverage)
    threshold = 2 * 6 / nobs    # rule of thumb: 2p/n with p = 6 parameters
    left, right = plt.xlim()
    plt.plot([left, right], [threshold, threshold], 'r', linewidth=3)
    plt.xlabel('Observation index')
    plt.ylabel('Leverage')
    plt.title('Leverage', fontsize=14)
    plt.grid(True)

    # DFFITS bar plot.
    plt.subplot(1, 2, 2)
    dffits, *_ = ols_influence.dffits
    plt.bar(range(nobs), dffits)
    threshold = 2 * np.sqrt(6 / nobs)   # rule of thumb: 2*sqrt(p/n)
    left, right = plt.xlim()
    plt.plot([left, right], [threshold, threshold], 'r', linewidth=3)
    plt.plot([left, right], [-threshold, -threshold], 'r', linewidth=3)
    plt.xlabel('Observation index')
    plt.ylabel('DFFITS')
    plt.title('DFFITS', fontsize=14)
    plt.grid(True)

    # ------------------------
    plt.show()
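# A hypothetical usage sketch for nagadan (all values invented for
# illustration; the module-level dependencies used above -- Model,
# compute_boomerang, compute_capturezone, ProbabilityField, filter_obs,
# log, etc. -- must be available as in the original project).
if __name__ == "__main__":
    wells = [
        (2250.0, 2250.0, 0.25, 750.0),   # (xw, yw, rw, qw)
        (1750.0, 2750.0, 0.25, 500.0),
    ]
    observations = [                     # (x, y, z_ev, z_std)
        (1000.0, 1000.0, 100.0, 0.5), (1000.0, 3000.0, 101.0, 0.5),
        (3000.0, 1000.0, 102.0, 0.5), (3000.0, 3000.0, 103.0, 0.5),
        (2000.0, 1500.0, 100.5, 0.5), (1500.0, 2000.0, 101.5, 0.5),
        (2500.0, 2000.0, 102.5, 0.5), (2000.0, 2500.0, 101.8, 0.5),
    ]
    # 10-year capture zone for well 0.
    nagadan(target=0, npaths=100, duration=10 * 365.25, base=0.0,
            conductivity=10.0, porosity=0.25, thickness=20.0,
            wells=wells, observations=observations)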
def OLSinfluence(X, y):
    ols_results = ols(X, y)   # fixed: was misspelled `ols_retults`, which made the next line raise a NameError
    test_class = smo.OLSInfluence(ols_results)
    test = test_class.summary_frame()
    return test.head()
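# The helper ols(X, y) used above is not shown in the source. A minimal
# stand-in, assuming it simply wraps an ordinary least squares fit (an
# assumption, not the original implementation):
def ols(X, y):
    import statsmodels.api as sm
    return sm.OLS(y, sm.add_constant(X)).fit()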
# ####
# making some plots for assumption checking

# In[11]:

prediction = pd.DataFrame(m01.fittedvalues)
prediction.columns = ['predicted']
prediction['standarized_prediction'] = (
    prediction['predicted'] -
    prediction['predicted'].mean()) / prediction['predicted'].std()
prediction.head()

# In[12]:

import statsmodels.stats.outliers_influence as sms

summary_frame = sms.OLSInfluence(m01).summary_frame()
summary_frame = pd.merge(summary_frame, prediction, how='inner',
                         left_index=True, right_index=True)
summary_frame.head()

# In[13]:

_ = sns.scatterplot(y='standard_resid', x='standarized_prediction',
                    data=summary_frame)
_ = plt.axhline(y=0)

# ####
# This graph can be used for testing homogeneity of variance. We encountered
# this kind of plot previously; essentially, if it has a funnel shape then
# we're in trouble. The plot we have shows points that are equally spread for
# the three groups, which implies that variances are similar across groups
# (which was also the conclusion reached by Levene's test).
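# An illustrative companion check (not in the original notebook): Levene's
# test for homogeneity of variance with scipy. The data frame `df` and the
# column names 'libido' and 'dose' are hypothetical stand-ins for the raw
# data behind m01.

import scipy.stats as stats

groups = [grp["libido"].values for _, grp in df.groupby("dose")]
lev_stat, lev_p = stats.levene(*groups, center="median")
print(f"Levene W = {lev_stat:.3f}, p = {lev_p:.3f}")  # p > .05: variances similar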
def linear_regression_analysis(linear_regression):
    """
    Compute and plot a complete analysis of a linear regression computed
    with Stats Models.

    Args:
        linear_regression (Stats Models Results): the result obtained with
            Stats Models.
    """
    # Data
    resid = linear_regression.resid_pearson.copy()
    resid_index = linear_regression.resid.index
    exog = linear_regression.model.exog
    endog = linear_regression.model.endog
    fitted_values = linear_regression.fittedvalues
    influences = outliers_influence.OLSInfluence(linear_regression)
    p = exog.shape[1]  # Number of features
    n = len(resid)     # Number of individuals

    # Parameters
    color1 = "#3498db"
    color2 = "#e74c3c"

    ##########################################################################
    #                          Statistical tests                             #
    ##########################################################################

    # Homoscedasticity - Breusch-Pagan test
    ########################################
    names = ['Lagrange multiplier statistic', 'p-value', 'f-value',
             'f p-value']
    breusch_pagan = sm.stats.diagnostic.het_breuschpagan(resid, exog)
    print(lzip(names, breusch_pagan))

    # Normality test - Shapiro-Wilk
    ################################
    print(f"Shapiro pvalue : {st.shapiro(resid)[1]}")

    ##########################################################################
    #                           Shape analyses                               #
    ##########################################################################

    # Histogram of the residuals
    #############################
    data = resid
    data_filter = data[(data > -5) & (data < 5)]  # fixed: chain both bounds
    len_data = len(data)
    len_data_filter = len(data_filter)
    ratio = len_data_filter / len_data

    fig, ax = plt.subplots()
    plt.hist(data_filter, bins=20, color=color1)
    plt.xlabel("Residual values")
    plt.ylabel("Number of residuals")
    plt.title(f"Histogram of the residuals from -5 to 5 ({ratio:.2%})")

    # Normal distribution vs residuals (QQ plot)
    #############################################
    data = pd.Series(resid).sort_values()
    len_data = len(data)
    normal = pd.Series(np.random.normal(size=len_data)).sort_values()

    fig, ax = plt.subplots()
    plt.scatter(data, normal, c=color1)
    plt.plot((-4, 4), (-4, 4), c=color2)
    plt.xlabel("Residuals")
    plt.ylabel("Normal distribution")
    plt.xlim(-4, 4)
    plt.ylim(-4, 4)
    plt.title("Residuals vs Normal (QQ Plot)")

    # Fitted vs Residuals
    ######################
    data = resid

    fig, ax = plt.subplots()
    plt.scatter(fitted_values, data, alpha=0.5, c=color1)
    plt.xlabel("Fitted values")
    plt.ylabel("Residuals")
    plt.title("Fitted vs Residuals")

    # Actual vs Predicted plot
    fig, ax = plt.subplots()
    plt.scatter(endog, fitted_values, c=color1, alpha=0.5)
    plt.plot(endog, endog, c=color2)
    plt.xlabel("Actual values")
    plt.ylabel("Fitted values")
    plt.title("Actual vs Predicted")

    ##########################################################################
    #                          Outlier analysis                              #
    ##########################################################################

    # Leverage (hii, diagonal of the hat matrix)
    #############################################
    # Atypical individuals (far from the mean of the observations)

    # Compute the proportion
    data = influences.hat_matrix_diag
    seuil = 2 * p / n  # threshold
    len_data = len(data)
    data_filter = data[data <= seuil]
    len_data_filter = len(data_filter)
    ratio = len_data_filter / len_data

    # Plot
    fig, ax = plt.subplots()
    plt.plot(data)
    plt.plot((0, len_data), (seuil, seuil), c="#d35400")
    plt.ylabel("Leverage values (hii)")
    plt.title(f"Leverage with threshold at 2*p/n ({ratio:.2%})")

    # Studentized residuals
    ########################
    # Individuals poorly represented by the model

    # Compute the proportion
    data = influences.resid_studentized_internal
    len_data = len(data)
    data_filter = data[data <= 2]
    data_filter = data_filter[data_filter >= -2]
    len_data_filter = len(data_filter)
    ratio = len_data_filter / len_data

    # Plot
    fig, ax = plt.subplots()
    plt.plot(data)
    plt.plot((0, len_data), (2, 2), c="#d35400")
    plt.plot((0, len_data), (-2, -2), c="#d35400")
    plt.ylabel("Studentized Residuals")
    plt.title(f"Studentized residuals with thresholds at 2 and -2 ({ratio:.2%})")

    # Cook's distances
    ###################
    # Outliers whose removal strongly influences the model

    # Compute the proportion
    data = influences.cooks_distance[0]
    seuil = 4 / (n - p)  # threshold
    len_data = len(data)
    data_filter = data[data <= seuil]
    len_data_filter = len(data_filter)
    ratio = len_data_filter / len_data

    # Plot
    fig, ax = plt.subplots()
    plt.plot(data)
    plt.plot((0, len_data), (seuil, seuil))
    plt.ylabel("Cook Distance")
    plt.title(f"Cook's distances with threshold at 4/(n-p) ({ratio:.2%})")

    # Plot
    plt.show()
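# A minimal usage sketch (illustrative only): fit an OLS model on synthetic
# data with the statsmodels formula API and run the full diagnostic analysis.
# Assumes the module-level imports used above (outliers_influence, sm, st,
# lzip, plt, pd, np) are in place.
if __name__ == "__main__":
    import statsmodels.formula.api as smf

    rng = np.random.default_rng(0)
    demo = pd.DataFrame({"x1": rng.normal(size=200),
                         "x2": rng.normal(size=200)})
    demo["y"] = 2.0 * demo["x1"] - demo["x2"] + rng.normal(size=200)

    fit = smf.ols("y ~ x1 + x2", data=demo).fit()
    linear_regression_analysis(fit)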