def test_omni_normtest():
    # tests against R fBasics
    from scipy import stats

    st_pv_R = np.array(
        [[3.994138321207883, -1.129304302161460, 1.648881473704978],
         [0.1357325110375005, 0.2587694866795507, 0.0991719192710234]])

    nt = omni_normtest(x)
    assert_almost_equal(nt, st_pv_R[:, 0], 14)

    st = stats.skewtest(x)
    assert_almost_equal(st, st_pv_R[:, 1], 14)

    kt = stats.kurtosistest(x)
    assert_almost_equal(kt, st_pv_R[:, 2], 11)

    st_pv_R = np.array(
        [[34.523210399523926, 4.429509162503833, 3.860396220444025],
         [3.186985686465249e-08, 9.444780064482572e-06, 1.132033129378485e-04]])

    x2 = x**2
    # TODO: fix precision in these tests with relative tolerance
    nt = omni_normtest(x2)
    assert_almost_equal(nt, st_pv_R[:, 0], 12)

    st = stats.skewtest(x2)
    assert_almost_equal(st, st_pv_R[:, 1], 12)

    kt = stats.kurtosistest(x2)
    assert_almost_equal(kt, st_pv_R[:, 2], 12)
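The test above checks `omni_normtest` column-by-column against values computed with R's fBasics package, alongside the skewness and kurtosis components. As a side note, the omnibus statistic is the D'Agostino-Pearson $K^2$ test, which combines the skewness and kurtosis z-scores, so it should agree with `scipy.stats.normaltest`. A minimal sketch on illustrative data (not the fixture `x` used above):

import numpy as np
from scipy import stats
from statsmodels.stats.stattools import omni_normtest

# Illustrative sample; any 1-D array works.
rng = np.random.default_rng(12345)
x = rng.standard_normal(500)

stat_sm, pval_sm = omni_normtest(x)
stat_sp, pval_sp = stats.normaltest(x)
# Both implement the D'Agostino-Pearson omnibus test, so the results should match.
np.testing.assert_allclose((stat_sm, pval_sm), (stat_sp, pval_sp))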
def test_omni_normtest_axis(reset_randomstate):
    # test axis of omni_normtest
    x = np.random.randn(25, 3)
    nt1 = omni_normtest(x)
    nt2 = omni_normtest(x, axis=0)
    nt3 = omni_normtest(x.T, axis=1)
    assert_almost_equal(nt2, nt1, decimal=13)
    assert_almost_equal(nt3, nt1, decimal=13)
def homoscedasticity(self, all=False):
    # sns.scatterplot(data=self.df, x=self.indep[0], y=self.dep)
    # plt.show()
    # lev, p_lev = scipy.stats.levene(*self.dfs)  # , p>0.05 good
    self.tests.loc['omnibus'] = [*omni_normtest(self.residuals)]
    self.tests.loc['normaltest'] = [*scipy.stats.normaltest(self.residuals)]
def regression_scores(timeseries, time_window_size, time_lag, reg, cv, scoring,
                      timeseriesZ=None):
    """Compute regression scores for a given set of 3 timeseries
    according to the variable causality structures.
    """
    global causality_structures
    if scoring == 'residual_tests':
        features_regression = np.zeros([len(causality_structures), 7])
    else:
        # added 2 dimensions to compute r2 and mse
        features_regression = np.zeros([len(causality_structures), 2])

    for j, (cs_train, cs_test) in enumerate(causality_structures):
        ts_train = timeseries[:, cs_train]
        if timeseriesZ is not None:
            ts_train = np.hstack([ts_train, timeseriesZ])
        if time_lag is None:
            time_lag = time_window_size
        ts_test = timeseries[:, cs_test]
        tmp_score = np.zeros([time_window_size, 2])  # added 2 dimensions to compute r2 and mse
        residuals = np.zeros(timeseries.shape[0] - time_window_size)
        for i_reg in range(time_window_size):
            idx_example = np.arange(i_reg, timeseries.shape[0] - time_lag,
                                    time_window_size)
            X = np.zeros((idx_example.size, time_window_size, ts_train.shape[1]))
            for k in range(time_window_size):
                X[:, k] = ts_train[idx_example + k]
            X = X.reshape(X.shape[0], X.shape[1] * X.shape[2])
            y = ts_test[idx_example + time_lag]
            if scoring == 'residual_tests':
                y_pred_i_reg = np.zeros(y.size)
                kfold = KFold(n=y.size, n_folds=cv)
                for train, test in kfold:
                    reg.fit(X[train], y[train])
                    y_pred_i_reg[test] = reg.predict(X[test])
                residuals[idx_example] = y - y_pred_i_reg  # residuals
            else:
                tmp_predict = cross_val_predict(reg, X, y, cv=cv)
                tmp_score[i_reg, 0] = r2_score(y, tmp_predict).mean()
                tmp_score[i_reg, 1] = mean_squared_error(y, tmp_predict).mean()
                # tmp_score[i_reg] = cross_val_score(reg, X, y, cv=cv, scoring=scoring).mean()

        if scoring == 'residual_tests':
            features_regression[j, 0] = durbin_watson(residuals)
            features_regression[j, [1, 2]] = omni_normtest(residuals)
            features_regression[j, 3:] = jarque_bera(residuals)
        else:
            features_regression[j] = tmp_score.mean(0)

    return features_regression
def process_linreg(x, y, metrics_dict, suffix):
    x = sm.add_constant(x)
    results = sm.OLS(y, x).fit()

    residuals = results.resid

    jb, jbpv, skew, kurtosis = jarque_bera(results.wresid)
    omni, omnipv = omni_normtest(results.wresid)

    res_mean = np.mean(residuals)
    res_std = np.std(residuals)

    _, normality_p_value_shapiro = shapiro(residuals)
    _, normality_p_value_ks_wo_params = kstest(residuals, 'norm')
    _, normality_p_value_ks_with_params = kstest(residuals, 'norm', (res_mean, res_std))
    _, normality_p_value_dagostino = normaltest(residuals)

    metrics_dict['mean' + suffix].append(np.mean(y))
    metrics_dict['R2' + suffix].append(results.rsquared)
    metrics_dict['R2_adj' + suffix].append(results.rsquared_adj)
    metrics_dict['f_stat' + suffix].append(results.fvalue)
    metrics_dict['prob(f_stat)' + suffix].append(results.f_pvalue)
    metrics_dict['log_likelihood' + suffix].append(results.llf)
    metrics_dict['AIC' + suffix].append(results.aic)
    metrics_dict['BIC' + suffix].append(results.bic)
    metrics_dict['omnibus' + suffix].append(omni)
    metrics_dict['prob(omnibus)' + suffix].append(omnipv)
    metrics_dict['skew' + suffix].append(skew)
    metrics_dict['kurtosis' + suffix].append(kurtosis)
    metrics_dict['durbin_watson' + suffix].append(durbin_watson(results.wresid))
    metrics_dict['jarque_bera' + suffix].append(jb)
    metrics_dict['prob(jarque_bera)' + suffix].append(jbpv)
    metrics_dict['cond_no' + suffix].append(results.condition_number)
    metrics_dict['normality_p_value_shapiro' + suffix].append(normality_p_value_shapiro)
    metrics_dict['normality_p_value_ks_wo_params' + suffix].append(normality_p_value_ks_wo_params)
    metrics_dict['normality_p_value_ks_with_params' + suffix].append(normality_p_value_ks_with_params)
    metrics_dict['normality_p_value_dagostino' + suffix].append(normality_p_value_dagostino)
    metrics_dict['intercept' + suffix].append(results.params[0])
    metrics_dict['slope' + suffix].append(results.params[1])
    metrics_dict['intercept_std' + suffix].append(results.bse[0])
    metrics_dict['slope_std' + suffix].append(results.bse[1])
    metrics_dict['intercept_p_value' + suffix].append(results.pvalues[0])
    metrics_dict['slope_p_value' + suffix].append(results.pvalues[1])
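A hypothetical usage sketch for `process_linreg`; the data, the `defaultdict` container, and the suffix below are illustrative assumptions (not from the source), and the function's own imports (`sm`, `jarque_bera`, `omni_normtest`, `shapiro`, `kstest`, `normaltest`, `durbin_watson`, `np`) are assumed to be in scope:

import numpy as np
from collections import defaultdict

# Illustrative data: one regressor and a noisy linear response.
rng = np.random.default_rng(0)
age = rng.uniform(20, 80, size=200)
outcome = 0.5 * age + rng.normal(0.0, 5.0, size=200)

# Each metric key maps to a list, so repeated calls simply accumulate results.
metrics = defaultdict(list)
process_linreg(age, outcome, metrics, suffix='_example')
print(metrics['R2_example'][-1], metrics['prob(omnibus)_example'][-1])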
def __init__(self, model: x) -> None:
    self.title = model.model.__class__.__name__ + ' ' + "Regression Results"

    # top-left
    self.Dep_Variable = None
    self.Model = None
    self.Method = ['Least Squares']
    self.Date = None
    self.Time = None
    self.No_Observations = None
    self.DfResiduals = None
    self.DfModel = None

    # top-right
    self.R_squared = ["%#8.3f" % model.rsquared]
    self.Adj_R_squared = ["%#8.3f" % model.rsquared_adj]
    self.F_statistic = ["%#8.4g" % model.fvalue]
    self.Prob_F_statistic = ["%#6.3g" % model.f_pvalue]
    self.Log_Likelihood = None
    self.AIC = ["%#8.4g" % model.aic]
    self.BIC = ["%#8.4g" % model.bic]

    from statsmodels.stats.stattools import (jarque_bera, omni_normtest,
                                             durbin_watson)
    jb, jbpv, skew, kurtosis = jarque_bera(model.wresid)
    omni, omnipv = omni_normtest(model.wresid)
    eigvals = model.eigenvals
    condno = model.condition_number

    # diagn_left
    self.Omnibus = ["%#6.3f" % omni]
    self.Prob_Omnibus = ["%#6.3f" % omnipv]
    self.Skew = ["%#6.3f" % skew]
    self.Kurtosis = ["%#6.3f" % kurtosis]

    # diagn_right
    self.Durbin_Watson = ["%#8.3f" % durbin_watson(model.wresid)]
    self.JarqueBera_JB = ["%#8.3f" % jb]
    self.Prob_JB = ["%#8.3g" % jbpv]
    self.Cond_No = ["%#8.3g" % condno]
def linear_new(types, intput):
    np.random.seed(9876789)
    df = pd.read_csv(intput, index_col=False)
    print(df)
    print(df.columns[:-1])
    feature = df.columns[:-1]
    s1 = ' + '.join(feature)
    s2 = df.columns[-1]
    s = s2 + " ~ " + s1

    if types == "ols":
        results = smf.ols(s, data=df).fit(use_t=True)
    elif types == "gls":
        results = smf.gls(s, data=df).fit(use_t=True)
    elif types == "glsar":
        results = smf.glsar(s, data=df).fit(use_t=True)
    elif types == "wls":
        results = smf.wls(s, data=df).fit(use_t=True)
    else:
        print("No such model type!")
        exit(0)

    print("**********************************************************************************\n")
    alpha = 0.05
    print(results.summary())

    data_t = {
        "coef": results.params,
        "std err": results.bse,
        "t": results.tvalues,
        "P>|t|": results.pvalues,
        "[" + str(alpha / 2.0): results.conf_int(alpha)[0],
        str(1 - alpha / 2.0) + "]": results.conf_int(alpha)[1]
    }
    sdata_df = pd.DataFrame(data_t)
    print(sdata_df)
    sdata_df.to_csv("out/data1.csv")

    from statsmodels.stats.stattools import (jarque_bera, omni_normtest,
                                             durbin_watson)
    jb, jbpv, skew, kurtosis = jarque_bera(results.wresid)
    omni, omnipv = omni_normtest(results.wresid)

    title = [
        "Model", "R-squared", "Adj. R-squared", "F-statistic",
        "Prob (F-statistic)", "Log-Likelihood", "AIC", "BIC", "Omnibus",
        "Prob(Omnibus)", "Skew", "Kurtosis", "Durbin-Watson",
        "Jarque-Bera (JB)", "Prob(JB)", "Cond. No."
    ]
    value = [
        results.model.__class__.__name__, results.rsquared,
        results.rsquared_adj, results.fvalue, results.f_pvalue, results.llf,
        results.aic, results.bic, omni, omnipv, skew, kurtosis,
        durbin_watson(results.wresid), jb, jbpv, results.diagn['condno']
    ]
    datadf = {"title": np.array(title), "value": np.array(value)}
    select_df = pd.DataFrame(datadf)
    print(select_df)
    select_df.to_csv("out/data2.csv")

    # plot a 1D or 3D figure
    predicted = results.predict(df)
    import matplotlib.pyplot as plt
    if len(feature) == 1:
        x = np.array(df[feature]).reshape(-1, 1)
        y = np.array(df[s2]).reshape(-1, 1)
        plt.figure(facecolor='white', figsize=(10, 5))
        plt.scatter(x, y, marker='x')
        plt.plot(x, predicted, c='r')
        title = 'The Linear Graph of One Dimension'
        # label the x and y axes
        plt.xlabel(feature[0])
        plt.ylabel(s2)
        plt.title(title)
        plt.grid()
        plt.savefig("out/plot_out.png", format='png')
    elif len(feature) == 2:
        from mpl_toolkits.mplot3d import Axes3D
        ax1 = plt.axes(projection='3d')
        x = np.array(df[feature[0]]).reshape(-1, 1)
        y = np.array(df[feature[1]]).reshape(-1, 1)
        z = np.array(df[s2]).reshape(-1, 1)
        ax1.scatter3D(x, y, z, cmap='Blues')  # scatter plot of the data points
        ax1.plot3D(x, y, predicted, 'gray')  # fitted curve in 3D space
        ax1.set_xlabel(feature[0])
        ax1.set_ylabel(feature[1])
        ax1.set_zlabel(s2)
        plt.savefig("out/plot_out.png", format='png')
    else:
        print("The number of features is greater than 2, no plot!")
    return
def summary(self, yname=None, xname=None, title=None, alpha=.05):
    """Summarize the Regression Results

    Parameters
    ----------
    yname : string, optional
        Default is `y`
    xname : list of strings, optional
        Default is `var_##` for ## in p the number of regressors
    title : string, optional
        Title for the top table. If not None, then this replaces the
        default title
    alpha : float
        significance level for the confidence intervals

    Returns
    -------
    smry : Summary instance
        this holds the summary tables and text, which can be printed or
        converted to various output formats.

    See Also
    --------
    statsmodels.iolib.summary.Summary : class to hold summary results
    """
    # TODO: import where we need it (for now), add as cached attributes
    from statsmodels.stats.stattools import (jarque_bera, omni_normtest,
                                             durbin_watson)
    jb, jbpv, skew, kurtosis = jarque_bera(self.wresid)
    omni, omnipv = omni_normtest(self.wresid)
    eigvals = self.eigenvals
    condno = self.condition_number
    self.diagn = dict(jb=jb, jbpv=jbpv, skew=skew, kurtosis=kurtosis,
                      omni=omni, omnipv=omnipv, condno=condno,
                      mineigval=eigvals[0])

    top_left = [('Dep. Variable:', None),
                ('Model:', None),
                ('Method:', ['Least Squares']),
                ('Date:', None),
                ('Time:', None)]

    top_right = [('Pseudo R-squared:', ["%#8.4g" % self.prsquared]),
                 ('Bandwidth:', ["%#8.4g" % self.bandwidth]),
                 ('Sparsity:', ["%#8.4g" % self.sparsity]),
                 ('No. Observations:', None),
                 ('Df Residuals:', None),  # [self.df_resid]),  # TODO: spelling
                 ('Df Model:', None)  # [self.df_model])
                 ]

    diagn_left = [('Omnibus:', ["%#6.3f" % omni]),
                  ('Prob(Omnibus):', ["%#6.3f" % omnipv]),
                  ('Skew:', ["%#6.3f" % skew]),
                  ('Kurtosis:', ["%#6.3f" % kurtosis])]

    diagn_right = [('Durbin-Watson:', ["%#8.3f" % durbin_watson(self.wresid)]),
                   ('Jarque-Bera (JB):', ["%#8.3f" % jb]),
                   ('Prob(JB):', ["%#8.3g" % jbpv]),
                   ('Cond. No.', ["%#8.3g" % condno])]

    if title is None:
        title = self.model.__class__.__name__ + ' ' + "Regression Results"

    # create summary table instance
    from statsmodels.iolib.summary import Summary
    smry = Summary()
    smry.add_table_2cols(self, gleft=top_left, gright=top_right,
                         yname=yname, xname=xname, title=title)
    smry.add_table_params(self, yname=yname, xname=xname, alpha=.05,
                          use_t=True)
    # smry.add_table_2cols(self, gleft=diagn_left, gright=diagn_right,
    #                      yname=yname, xname=xname, title="")

    # add warnings/notes, added to text format only
    etext = []
    if eigvals[-1] < 1e-10:
        wstr = "The smallest eigenvalue is %6.3g. This might indicate "
        wstr += "that there are\n"
        wstr += "strong multicollinearity problems or that the design "
        wstr += "matrix is singular."
        wstr = wstr % eigvals[-1]
        etext.append(wstr)
    elif condno > 1000:  # TODO: what is recommended
        wstr = "The condition number is large, %6.3g. This might "
        wstr += "indicate that there are\n"
        wstr += "strong multicollinearity or other numerical "
        wstr += "problems."
        wstr = wstr % condno
        etext.append(wstr)

    if etext:
        smry.add_extra_txt(etext)

    return smry
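The `Pseudo R-squared`, `Bandwidth`, and `Sparsity` entries suggest this `summary` method belongs to a quantile-regression results class. Assuming statsmodels' `QuantReg`, a usage sketch with illustrative data and formula:

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(1)
df = pd.DataFrame({'x': rng.uniform(0, 10, 500)})
df['y'] = 2.0 + 0.7 * df['x'] + rng.standard_normal(500)

res = smf.quantreg('y ~ x', data=df).fit(q=0.5)  # median regression
print(res.summary())  # renders the top and parameter tables assembled by the method above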
def single(self, item, config, configs_child):
    if config.experiment.data in [DataType.betas, DataType.betas_adj,
                                  DataType.residuals_common, DataType.residuals_special]:

        if config.experiment.method == Method.linreg:
            targets = self.get_strategy.get_target(config)
            x = sm.add_constant(targets)
            y = self.get_strategy.get_single_base(config, [item])[0]

            results = sm.OLS(y, x).fit()
            y = results.resid

            jb, jbpv, skew, kurtosis = jarque_bera(results.wresid)
            omni, omnipv = omni_normtest(results.wresid)

            res_mean = np.mean(y)
            res_std = np.std(y)

            _, normality_p_value_shapiro = shapiro(y)
            _, normality_p_value_ks_wo_params = kstest(y, 'norm')
            _, normality_p_value_ks_with_params = kstest(y, 'norm', (res_mean, res_std))
            _, normality_p_value_dagostino = normaltest(y)

            config.metrics['item'].append(item)
            aux = self.get_strategy.get_aux(config, item)
            config.metrics['aux'].append(aux)
            config.metrics['R2'].append(results.rsquared)
            config.metrics['R2_adj'].append(results.rsquared_adj)
            config.metrics['f_stat'].append(results.fvalue)
            config.metrics['prob(f_stat)'].append(results.f_pvalue)
            config.metrics['log_likelihood'].append(results.llf)
            config.metrics['AIC'].append(results.aic)
            config.metrics['BIC'].append(results.bic)
            config.metrics['omnibus'].append(omni)
            config.metrics['prob(omnibus)'].append(omnipv)
            config.metrics['skew'].append(skew)
            config.metrics['kurtosis'].append(kurtosis)
            config.metrics['durbin_watson'].append(durbin_watson(results.wresid))
            config.metrics['jarque_bera'].append(jb)
            config.metrics['prob(jarque_bera)'].append(jbpv)
            config.metrics['cond_no'].append(results.condition_number)
            config.metrics['normality_p_value_shapiro'].append(normality_p_value_shapiro)
            config.metrics['normality_p_value_ks_wo_params'].append(normality_p_value_ks_wo_params)
            config.metrics['normality_p_value_ks_with_params'].append(normality_p_value_ks_with_params)
            config.metrics['normality_p_value_dagostino'].append(normality_p_value_dagostino)
            config.metrics['intercept'].append(results.params[0])
            config.metrics['slope'].append(results.params[1])
            config.metrics['intercept_std'].append(results.bse[0])
            config.metrics['slope_std'].append(results.bse[1])
            config.metrics['intercept_p_value'].append(results.pvalues[0])
            config.metrics['slope_p_value'].append(results.pvalues[1])

        elif config.experiment.method == Method.cluster:
            x = self.get_strategy.get_target(config)
            x_normed = normalize_to_0_1(x)
            y = self.get_strategy.get_single_base(config, [item])[0]
            y_normed = normalize_to_0_1(y)

            min_samples = max(1, int(config.experiment.method_params['min_samples_percentage'] * len(x) / 100.0))

            X = np.array([x_normed, y_normed]).T
            db = DBSCAN(eps=config.experiment.method_params['eps'], min_samples=min_samples).fit(X)
            core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
            core_samples_mask[db.core_sample_indices_] = True
            labels = db.labels_
            number_of_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            number_of_noise_points = list(labels).count(-1)

            config.metrics['item'].append(item)
            config.metrics['aux'].append(self.get_strategy.get_aux(config, item))
            config.metrics['number_of_clusters'].append(number_of_clusters)
            config.metrics['number_of_noise_points'].append(number_of_noise_points)

        elif config.experiment.method == Method.polygon:
            metrics_keys = get_method_metrics_keys(config)
            for config_child in configs_child:
                item_id = config_child.advanced_dict[item]
                for key in config_child.advanced_data:
                    if key not in metrics_keys:
                        advanced_data = config_child.advanced_data[key][item_id]
                        suffix = str(config_child.attributes.observables)
                        if suffix != '' and suffix not in key:
                            key += '_' + suffix
                        config.metrics[key].append(advanced_data)
                        metrics_keys.append(key)

            if config.experiment.method_params['method'] == Method.linreg:
                polygons_region = []
                polygons_slope = []
                polygons_region_min = []
                max_abs_slope = 0.0
                is_inside = False

                mins = [min(self.get_strategy.get_target(config_child)) for config_child in configs_child]
                maxs = [max(self.get_strategy.get_target(config_child)) for config_child in configs_child]
                border_l = max(mins)
                border_r = min(maxs)
                if border_l > border_r:
                    raise ValueError('Polygons borders are not consistent')

                for config_child in configs_child:
                    targets = self.get_strategy.get_target(config_child)
                    item_id = config_child.advanced_dict[item]

                    intercept = config_child.advanced_data['intercept'][item_id]
                    slope = config_child.advanced_data['slope'][item_id]
                    intercept_std = config_child.advanced_data['intercept_std'][item_id]
                    slope_std = config_child.advanced_data['slope_std'][item_id]

                    pr = PolygonRoutines(
                        x=targets,
                        y=[],
                        params={
                            'intercept': intercept,
                            'slope': slope,
                            'intercept_std': intercept_std,
                            'slope_std': slope_std
                        },
                        method=config_child.experiment.method
                    )
                    points_region = pr.get_border_points()

                    points_slope = [
                        geometry.Point(slope - 3.0 * slope_std, 0.0),
                        geometry.Point(slope + 3.0 * slope_std, 0.0),
                        geometry.Point(slope + 3.0 * slope_std, 1.0),
                        geometry.Point(slope - 3.0 * slope_std, 1.0),
                    ]

                    max_abs_slope = max(max_abs_slope, abs(slope))

                    pr_min = PolygonRoutines(
                        x=[border_l, border_r],
                        y=[],
                        params={
                            'intercept': intercept,
                            'slope': slope,
                            'intercept_std': intercept_std,
                            'slope_std': slope_std
                        },
                        method=config_child.experiment.method
                    )
                    points_region_min = pr_min.get_border_points()

                    polygon = geometry.Polygon([[point.x, point.y] for point in points_region])
                    polygons_region.append(polygon)
                    polygon = geometry.Polygon([[point.x, point.y] for point in points_slope])
                    polygons_slope.append(polygon)
                    polygon = geometry.Polygon([[point.x, point.y] for point in points_region_min])
                    polygons_region_min.append(polygon)

                intersection = polygons_region[0]
                union = polygons_region[0]
                for polygon in polygons_region[1::]:
                    intersection = intersection.intersection(polygon)
                    union = union.union(polygon)
                area_intersection_rel = intersection.area / union.area

                union = polygons_region_min[0]
                for polygon in polygons_region_min[1::]:
                    union = union.union(polygon)
                for polygon in polygons_region_min:
                    if union.area == polygon.area:
                        is_inside = True

                intersection = polygons_slope[0]
                union = polygons_slope[0]
                for polygon in polygons_slope[1::]:
                    intersection = intersection.intersection(polygon)
                    union = union.union(polygon)
                slope_intersection_rel = intersection.area / union.area

                config.metrics['item'].append(item)
                aux = self.get_strategy.get_aux(config, item)
                config.metrics['aux'].append(aux)
                config.metrics['area_intersection_rel'].append(area_intersection_rel)
                config.metrics['slope_intersection_rel'].append(slope_intersection_rel)
                config.metrics['max_abs_slope'].append(max_abs_slope)
                config.metrics['is_inside'].append(is_inside)

            elif config.experiment.method_params['method'] == Method.variance:
                polygons_region_box = []

                for config_child in configs_child:
                    targets = self.get_strategy.get_target(config_child)
                    data = self.get_strategy.get_single_base(config_child, [item])
                    targets = np.squeeze(np.asarray(targets))
                    data = np.squeeze(np.asarray(data))

                    exog = sm.add_constant(targets)
                    endog = data
                    results = sm.OLS(endog, exog).fit()
                    residuals = results.resid

                    semi_window = config_child.experiment.method_params['semi_window']
                    box_b = config_child.experiment.method_params['box_b']
                    box_t = config_child.experiment.method_params['box_t']

                    box_xs, box_bs, box_ms, box_ts = process_box(targets, residuals, semi_window, box_b, box_t)

                    points_box = []
                    for p_id in range(0, len(box_xs)):
                        points_box.append(geometry.Point(
                            box_xs[p_id],
                            box_ts[p_id]
                        ))
                    for p_id in range(len(box_xs) - 1, -1, -1):
                        points_box.append(geometry.Point(
                            box_xs[p_id],
                            box_bs[p_id]
                        ))
                    polygon = geometry.Polygon([[point.x, point.y] for point in points_box])
                    polygons_region_box.append(polygon)

                intersection_box = polygons_region_box[0]
                union_box = polygons_region_box[0]
                for polygon in polygons_region_box[1::]:
                    intersection_box = intersection_box.intersection(polygon)
                    union_box = union_box.union(polygon)
                area_intersection_rel_box = intersection_box.area / union_box.area

                config.metrics['item'].append(item)
                aux = self.get_strategy.get_aux(config, item)
                config.metrics['aux'].append(aux)
                config.metrics['area_intersection_rel_box'].append(area_intersection_rel_box)

        elif config.experiment.method == Method.z_test_linreg:
            slopes = []
            slopes_std = []
            num_subs = []

            metrics_keys = get_method_metrics_keys(config)

            for config_child in configs_child:
                item_id = config_child.advanced_dict[item]
                for key in config_child.advanced_data:
                    if key not in metrics_keys:
                        advanced_data = config_child.advanced_data[key][item_id]
                        suffix = str(config_child.attributes.observables)
                        if suffix != '' and suffix not in key:
                            key += '_' + suffix
                        config.metrics[key].append(advanced_data)
                        metrics_keys.append(key)

                slopes.append(config_child.advanced_data['slope'][item_id])
                slopes_std.append(config_child.advanced_data['slope_std'][item_id])
                num_subs.append(len(config_child.attributes_dict['age']))

            std_errors = [slopes_std[i] / np.sqrt(num_subs[i]) for i in range(0, len(slopes_std))]
            z_value = (slopes[0] - slopes[1]) / np.sqrt(sum([std_error * std_error for std_error in std_errors]))
            p_value = norm.sf(abs(z_value)) * 2.0

            config.metrics['item'].append(item)
            aux = self.get_strategy.get_aux(config, item)
            config.metrics['aux'].append(aux)
            config.metrics['z_value'].append(z_value)
            config.metrics['p_value'].append(p_value)
            config.metrics['abs_z_value'].append(np.absolute(z_value))

        elif config.experiment.method == Method.aggregator:
            metrics_keys = get_method_metrics_keys(config)

            for config_child in configs_child:
                item_id = config_child.advanced_dict[item]
                for key in config_child.advanced_data:
                    if key not in metrics_keys:
                        advanced_data = config_child.advanced_data[key][item_id]
                        suffix = str(config_child.attributes.observables)
                        if suffix != '' and suffix not in key:
                            key += '_' + suffix
                        config.metrics[key].append(advanced_data)
                        metrics_keys.append(key)

            config.metrics['item'].append(item)
            aux = self.get_strategy.get_aux(config, item)
            config.metrics['aux'].append(aux)

        elif config.experiment.method == Method.variance:
            targets = self.get_strategy.get_target(config)
            data = self.get_strategy.get_single_base(config, [item])
            targets = np.squeeze(np.asarray(targets))
            data = np.squeeze(np.asarray(data))

            semi_window = config.experiment.method_params['semi_window']
            box_b = config.experiment.method_params['box_b']
            box_t = config.experiment.method_params['box_t']

            xs, bs, ms, ts = process_box(targets, data, semi_window, box_b, box_t)
            variance_processing(xs, bs, config.metrics, 'box_b')
            variance_processing(xs, ms, config.metrics, 'box_m')
            variance_processing(xs, ts, config.metrics, 'box_t')

            R2 = np.min([config.metrics['box_b_best_R2'][-1],
                         config.metrics['box_t_best_R2'][-1]])
            config.metrics['best_R2'].append(R2)

            config.metrics['item'].append(item)
            aux = self.get_strategy.get_aux(config, item)
            config.metrics['aux'].append(aux)
def check_error_term_normality(self) -> bool:
    """
    Checks if the distribution of the error term is normal by:
    - Shapiro-Wilk's normality test,
    - Jarque-Bera's normality test,
    - Omnibus' normality test,
    - Kolmogorov-Smirnov's normality test,
    - Q-Q plot.

    If:
    - silent_mode = True, method returns:
        a) True (which means that the assumption is fulfilled) if the percentage of
           statistical tests for which the assumption is fulfilled is higher than or
           equal to set min_fulfill_ratio
        b) False (which means that the assumption is not fulfilled) if the percentage of
           statistical tests for which the assumption is fulfilled is lower than set
           min_fulfill_ratio
    - silent_mode = False, method returns True/False as above and shows additional
      statistics, descriptions which are helpful in assessing the fulfilment of assumption
    """
    sw = stats.shapiro(self.residuals)
    jb = stats.jarque_bera(self.residuals)
    om = omni_normtest(self.residuals)
    ks = stats.kstest(self.residuals, "norm")
    ad = stats.anderson(self.residuals, dist="norm")

    normality_tests_names = ["Shapiro-Wilk", "Jarque-Bera", "Omnibus",
                             "Kolmogorov-Smirnov"]
    normality_tests = [sw, jb, om, ks, ad]
    tests = zip(normality_tests_names, normality_tests)

    if not self.silent_mode:
        print(Color.BOLD
              + "Assumption 7. The error term is normally distributed."
              + Color.END, "\n")
        print("This assumption affects on: \n", "- interpretation \n")
        print("REMARK: For datasets with sufficiently large sample size, the normality of "
              "errors distribution comes from Central Limit Theorem.\n")
        print("OLS does not require that the error term follows a normal distribution to "
              "produce unbiased estimates with the minimum variance. However, satisfying this "
              "assumption allows you to perform statistical hypothesis testing and generate "
              "reliable confidence intervals and prediction intervals. \n")
        print("Statistical tests for checking normality of the error term distribution: \n")

        true_counts = 0
        for test in tests:
            print(Color.BOLD + f"{test[0]}: " + Color.END
                  + f"test statistic: {test[1][0]:.4f}, p-value: {test[1][1]}")
            true_counts = true_counts + test_hypothesis(
                self.alpha, p_value=test[1][1],
                null_hypothesis="the error term is normally distributed")
        true_ratio = true_counts / len(normality_tests)

        check_fulfill_ratio(true_fulfill_ratio=true_ratio,
                            min_fulfill_ratio=self.min_fulfill_ratio)

        print("Q-Q (quantile-quantile) plot: \n")
        print("HINT: If error term's distribution is similar to normal distribution, the points "
              "in the Q-Q plot will approximately lie on the line y = x")
        sm.qqplot(self.residuals, line='s')
        plt.show()
    else:
        true_counts = 0
        for test in tests:
            true_counts = true_counts + test_hypothesis(
                self.alpha, p_value=test[1][1], print_outcome=False)
        true_ratio = true_counts / len(normality_tests)

    return check_fulfill_ratio(true_fulfill_ratio=true_ratio,
                               min_fulfill_ratio=self.min_fulfill_ratio,
                               print_outcome=False)
np.savetxt('phenotypic_age_log10_deci.txt', phenotypic_age, fmt='%.2f')
np.savetxt('mortality_score_1_year_log10_deci.txt', mortality_score_1_year, fmt='%.2f')
np.savetxt('mortality_score_2_year_log10_deci.txt', mortality_score_2_year, fmt='%.2f')

x = sm.add_constant(data_dict['Age'])
results = sm.OLS(phenotypic_age, x).fit()
residuals = results.resid

jb, jbpv, skew, kurtosis = jarque_bera(results.wresid)
omni, omnipv = omni_normtest(results.wresid)

res_mean = np.mean(residuals)
res_std = np.std(residuals)

_, normality_p_value_shapiro = shapiro(residuals)
_, normality_p_value_dagostino = normaltest(residuals)

metrics_dict = {}
metrics_dict['R2'] = results.rsquared
metrics_dict['R2_adj'] = results.rsquared_adj
metrics_dict['f_stat'] = results.fvalue
metrics_dict['prob(f_stat)'] = results.f_pvalue
metrics_dict['log_likelihood'] = results.llf
metrics_dict['AIC'] = results.aic
metrics_dict['BIC'] = results.bic
$\text{H}_0$: the data are normally distributed
$\text{H}_A$: $\text{H}_0$ does not hold

As with the Jarque-Bera test, the assessment of normality is based on skewness and kurtosis.

---

`omni_normtest` is included as part of a `statsmodels` subpackage. Let's try it with `data_norm`.

Return values:
* test statistic
* $p$-value

omni_normtest(data_norm)

The $p$-value is high, so $\text{H}_0$ cannot be rejected even at the 10% significance level.

---

Next, let's try `data_uniform`.

The $p$-value is very small, so $\text{H}_0$ can be rejected even at the 1% significance level.

omni_normtest(data_uniform)

---

Now let's test the results of the two regressions run above.

omni_normtest(res_wage.resid)
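A minimal sketch of how `data_norm` and `data_uniform` could be generated for this exercise (the seed and sample sizes are assumptions, not the values used in the text):

import numpy as np
from statsmodels.stats.stattools import omni_normtest

rng = np.random.default_rng(0)
data_norm = rng.normal(loc=0.0, scale=1.0, size=1000)       # drawn from a normal distribution
data_uniform = rng.uniform(low=-1.0, high=1.0, size=1000)   # clearly non-normal

print(omni_normtest(data_norm))     # large p-value expected: H0 not rejected
print(omni_normtest(data_uniform))  # tiny p-value expected: H0 rejected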