def test_durbin_watson():
    """Compare ``durbin_watson`` with reference values computed in R."""
    # benchmark values from R car::durbinWatsonTest(x)
    # library("car")
    # > durbinWatsonTest(x)
    # [1] 1.95298958377419
    # > durbinWatsonTest(x**2)
    # [1] 1.848802400319998
    # > durbinWatsonTest(x[2:20]+0.5*x[1:19])
    # [1] 1.09897993228779
    # > durbinWatsonTest(x[2:20]+0.8*x[1:19])
    # [1] 0.937241876707273
    # > durbinWatsonTest(x[2:20]+0.9*x[1:19])
    # [1] 0.921488912587806
    cases = (
        (x, 1.95298958377419),
        (x**2, 1.848802400319998),
        (x[1:] + 0.5 * x[:-1], 1.09897993228779),
        (x[1:] + 0.8 * x[:-1], 0.937241876707273),
        (x[1:] + 0.9 * x[:-1], 0.921488912587806),
    )
    for series, st_R in cases:
        assert_almost_equal(durbin_watson(series), st_R, 14)
def test_durbin_watson():
    """Check ``durbin_watson`` against benchmark values from R's ``car`` package."""
    # benchmark values from R car::durbinWatsonTest(x)
    # library("car")
    # > durbinWatsonTest(x)
    # [1] 1.95298958377419
    # > durbinWatsonTest(x**2)
    # [1] 1.848802400319998
    # > durbinWatsonTest(x[2:20]+0.5*x[1:19])
    # [1] 1.09897993228779
    # > durbinWatsonTest(x[2:20]+0.8*x[1:19])
    # [1] 0.937241876707273
    # > durbinWatsonTest(x[2:20]+0.9*x[1:19])
    # [1] 0.921488912587806
    assert_almost_equal(durbin_watson(x), 1.95298958377419, 14)
    assert_almost_equal(durbin_watson(x**2), 1.848802400319998, 14)

    # Series built from x plus a scaled copy of its own first lag.
    lagged_benchmarks = (
        (0.5, 1.09897993228779),
        (0.8, 0.937241876707273),
        (0.9, 0.921488912587806),
    )
    for coefficient, st_R in lagged_benchmarks:
        assert_almost_equal(
            durbin_watson(x[1:] + coefficient * x[:-1]), st_R, 14)
def fit(y, X, reg_names):
    """Fit a GLSAR regression of ``y`` on ``X`` and return the results as a Dataset.

    The model is ``sm.GLSAR(y.values, X, 2, missing='drop')`` estimated with
    ``iterative_fit()``.  The first coefficient (and its confidence interval
    and p-value) is dropped from the output -- presumably the intercept
    column of ``X``; confirm against the caller.

    :param y: object exposing ``.values`` with the dependent variable
    :param X: design matrix passed to ``sm.GLSAR``
    :param reg_names: names of the regressors kept in the output; used as the
        ``reg_name`` coordinate
    :return: ``xr.Dataset`` with ``coef``, ``conf_int``, ``p_value`` (per
        regressor) and the scalars ``DWT`` (Durbin-Watson of the whitened
        residuals) and ``CoD`` (R squared).  If the fit fails, every variable
        is NaN.
    """
    nr = len(reg_names)
    coords = {'reg_name': (['reg_name'], reg_names),
              'limit': (['limit'], ['lower', 'upper'])}
    try:
        mod = sm.GLSAR(y.values, X, 2, missing='drop')  # MLR analysis with AR2 modeling
        res = mod.iterative_fit()
        output = xr.Dataset(
            {'coef': (['reg_name'], res.params[1:]),
             'conf_int': (['reg_name', 'limit'], res.conf_int()[1:, :]),
             'p_value': (['reg_name'], res.pvalues[1:]),
             'DWT': (sms.durbin_watson(res.wresid)),
             'CoD': (res.rsquared)},
            coords=coords)
    except Exception:
        # Best-effort fallback: a failed fit yields an all-NaN result.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate instead of being turned into NaNs.
        nans = np.full([nr], np.nan)
        output = xr.Dataset(
            {'coef': (['reg_name'], nans),
             'conf_int': (['reg_name', 'limit'], np.array([nans, nans]).T),
             'p_value': (['reg_name'], nans),
             'DWT': (np.nan),
             'CoD': (np.nan)},
            coords=coords)
    return output
def regression_analysis(self, y_column: str, *x_column: str) -> dict:
    """Run an ordinary least squares (OLS) regression.

    :param y_column: name of the column holding the y values
    :param x_column: names of the columns holding the x values
    :return: dict with the parameters and test results; ``None`` when the
        F-test p-value is NaN
    """
    def rounded(values):
        return [round(value, 3) for value in values]

    # The first x column is inserted as-is, without consulting the metadata.
    design_parts = [np.array(self.data[x_column[0]])]
    for name in x_column[1:]:
        # First metadata entry whose "<index>. <title>" label matches the column.
        info = next(
            (meta for meta in self.meta
             if "{}. {}".format(meta['index'], meta['title']) == name),
            None,
        )
        if info is None:
            continue
        if info['type'] in ['rate', 'scale', 'numInput']:
            # Continuous variable: insert it into the matrix directly.
            design_parts.append(np.array(self.data[name]))
        elif info['type'] in ["radio", "checkbox", "sort"]:
            # Categorical variable: convert to dummy variables and drop the
            # reference group before inserting it into the matrix.
            dummy = sm.categorical(np.array(self.data[name]))
            design_parts.append(dummy[:, 1:])

    X = sm.add_constant(np.column_stack(design_parts))
    y = np.array(self.data[y_column])
    result = sm.OLS(y, X).fit()

    result_dict = {
        'params': rounded(result.params),  # [constant, x1, x2, ...]
        'tvalues': rounded(result.tvalues),
        'pvalues': rounded(result.pvalues),
        'rsquared': round(result.rsquared, 3),
        'rsquared_adj': round(result.rsquared_adj, 3),
        'fvalue': round(result.fvalue, 3),
        'f_pvalue': round(result.f_pvalue, 3),
        'DW': round(durbin_watson(result.wresid), 3),
        'condition_number': round(result.condition_number),
    }
    return None if np.isnan(result_dict['f_pvalue']) else result_dict
def setup(self):
    """Compute and store every derived statistic of the fitted model.

    The steps run in a fixed order because later ones read attributes that
    earlier ones are expected to set.
    """
    self._calc_coefficients()
    self._setup_return_analysis_of_fund_and_fit(self.fitted_tms)
    self._setup_out_of_sample_fit()

    resid = self.fit_model.resid
    self.durbin_watson_test = durbin_watson(resid)

    regressors = self.input_data.regressors_df
    analysed = self.input_data.analysed_tms

    self.risk_contribution = RiskContributionAnalysis.get_risk_contribution(
        regressors, self.coefficients, analysed)

    # Return attribution: per-factor part and the unexplained remainder.
    (self.factors_performance_attribution_ret,
     self.unexplained_performance_attribution_ret) = (
        ReturnAttributionAnalysis.get_factor_return_attribution(
            analysed, self.fitted_tms, regressors,
            self.coefficients, self.intercept))

    self._setup_correlations(self.fitted_tms)
    self.condition_number = cond(regressors.values)
    self._setup_r_square_of_each_predictor()
    self._setup_autocorrelation(resid)

    # Only the fourth value returned by the Breusch-Pagan test is kept.
    _, _, _, self.heteroskedasticity = het_breuschpagan(
        resid, self.fit_model.model.exog)

    self._setup_cooks_distance(self.fit_model)
def fit_sm(result, coordinates, to_fit, data, design, r, s, sortvar):
    """Fit a weighted least squares model around each requested coordinate.

    For every index ``i`` with ``to_fit[i]`` truthy, the rows of ``data``
    whose squared distance (first three columns) to ``coordinates[i]`` is
    below ``r`` are selected.  When more than 120 rows qualify, a WLS model
    is fitted and written into ``result`` in place:

    * ``result[i, 0]`` -- parameters
    * ``result[i, 1]`` -- standard errors
    * ``result[i, 2]`` -- t-values
    * ``result[i, 3, 0]`` -- residual mean squared error
    * ``result[i, 3, 1]`` -- residual degrees of freedom
    * ``result[i, 3, 2]`` -- Durbin-Watson statistic of the weighted
      residuals after sorting by ``sortvar``

    Column 3 of ``data`` is the response and column 4 is stored as ``time``.
    Nothing is returned.
    """
    for i in range(coordinates.shape[0]):
        if not to_fit[i]:
            continue

        squared_distances = ((data[..., :3] - coordinates[i]) ** 2).sum(axis=1)
        valid = np.where(squared_distances < r)
        if valid[0].size <= 120:
            continue

        neighbours = data[valid]
        # NOTE(review): the exponent is +d^2/s, so the weights only decay
        # with distance if ``s`` is negative -- confirm with the caller.
        weights = np.exp(squared_distances[valid] / s)
        fit = sm.WLS(
            endog=neighbours[..., 3],
            exog=design[valid],
            weights=weights,
            hasconst=True).fit()

        result[i, 0] = fit.params
        result[i, 1] = fit.bse
        result[i, 2] = fit.tvalues
        result[i, 3, 0] = fit.mse_resid
        result[i, 3, 1] = fit.df_resid

        df = DataFrame({
            'x': neighbours[..., 0],
            'y': neighbours[..., 1],
            'z': neighbours[..., 2],
            'time': neighbours[..., 4]})
        df['weighted_residual'] = weights * fit.resid
        df.sort_values(by=sortvar, inplace=True)
        result[i, 3, 2] = durbin_watson(df.weighted_residual)
def metrics(obs, pred, f, q, m):
    """Return a one-row DataFrame of performance metrics for model ``m``.

    obs - log(observed), pred - prediction, f - FIB, q - subset, m - model

    The row is indexed by ``(q, m)`` and holds R squared, Durbin-Watson,
    RMSE, MAPE, AUROC, sensitivity, specificity, sample count and
    exceedance count.
    """
    rsq = round(r2_score(obs, pred), 3)
    dw = round(durbin_watson(obs - pred), 3)  # Durbin-Watson

    error = pred - obs
    rmse = round(np.sqrt((error**2).mean()), 3)  # Root Mean Square Error
    # Mean Absolute Percentage Error (rounded before scaling to percent)
    mape = 100 * round(abs(error / obs).mean(), 3)

    # Sensitivity/Specificity
    threshold = np.log10(wqm.fib_thresh(f))
    sens_spec = wqm.pred_eval(obs, pred, thresh=threshold)
    # Area Under the Receiver Operating Curve
    auroc = round(HF_models.compute_AUROC(obs, pred, f), 3)

    # Performance of model m on subset q as a single-row table
    columns = ['Rsq', 'D-W', 'RMSE', 'MAPE', 'AUROC', 'sens', 'spec', 'N', 'exc']
    row = [
        rsq, dw, rmse, mape, auroc,
        sens_spec['Sensitivity'], sens_spec['Specificity'],
        sens_spec['Samples'], sens_spec['Exceedances'],
    ]
    return pd.DataFrame(data=[row], columns=columns, index=[[q], [m]])
def autocorrelation_assumption():
    '''
    Autocorrelation: Assumes that there is no autocorrelation in the residuals.
    If there is autocorrelation, then there is a pattern that is not explained
    because the current value depends on the previous value.  This may be
    resolved by adding a lag variable of either the dependent variable or
    some of the predictors.

    Reads the module-level ``df_results['Residuals']`` and prints the
    Durbin-Watson statistic together with a verdict.
    '''
    from statsmodels.stats.stattools import durbin_watson

    header = (
        '\n=======================================================================================',
        'Assumption 4: No Autocorrelation',
        '\nPerforming Durbin-Watson Test',
        'Values of 1.5 < d < 2.5 generally show that there is no autocorrelation in the data',
        '0 to 2< is positive autocorrelation',
        '>2 to 4 is negative autocorrelation',
        '-------------------------------------',
    )
    for line in header:
        print(line)

    durbinWatson = durbin_watson(df_results['Residuals'])
    print('Durbin-Watson:', durbinWatson)

    if durbinWatson < 1.5:
        finding = 'Signs of positive autocorrelation'
        verdict = '\nAssumption not satisfied'
    elif durbinWatson > 2.5:
        finding = 'Signs of negative autocorrelation'
        verdict = '\nAssumption not satisfied'
    else:
        finding = 'Little to no autocorrelation'
        verdict = '\nAssumption satisfied'
    print(finding)
    print(verdict)
def test_durbin_watson_3d(self):
    """``durbin_watson`` along axis 1 of a tiled 3-d array matches the 1-d formula."""
    shape = (10, 1, 10)
    series = np.random.standard_normal(100)

    # Statistic computed by hand: sum of squared first differences over
    # the sum of squares.
    squared_diffs = np.diff(series) ** 2.0
    dw = sum(squared_diffs) / np.dot(series, series)

    # Repeat the same series along the first and last axes.
    tiled = np.tile(series[None, :, None], shape)
    expected = np.squeeze(dw * np.ones(shape))
    assert_almost_equal(expected, durbin_watson(tiled, axis=1))
def best_lag_dw(self, df, threshold=0.2):
    """Search lag orders 1-15 of a VAR model for the lowest acceptable AIC.

    A lag order is acceptable when the Durbin-Watson statistics of the first
    two residual series, rounded to two decimals, are each within
    ``threshold`` of 2.0.  The model is built as ``VAR(df, freq="MS")`` and
    assumes stationary data.

    :param df: data passed to ``VAR``; its columns label the printed
        Durbin-Watson values
    :param threshold: maximum allowed distance of each checked
        Durbin-Watson value from 2.0
    :return: ``(best_aic, best_lag, best_dw)``.  When no lag order is
        acceptable this is ``(inf, None, None)``.
    """
    model = VAR(df, freq="MS")  # Assumes stationary data.
    # Infinity rather than a finite sentinel, so any real AIC can win.
    best_aic = float("inf")
    best_lag = None
    best_dw = None

    # Searching for best lag order.
    for lag in range(1, 16):
        result = model.fit(lag)
        # Checking with Durbin-Watson test for autocorrelation as well.
        dw_out = durbin_watson(result.resid)
        # NOTE(review): only the first two residual series are checked,
        # even if df has more columns -- confirm this is intended.
        if ((result.aic < best_aic)
                and (abs(2.0 - round(dw_out[0], 2)) <= threshold)
                and (abs(2.0 - round(dw_out[1], 2)) <= threshold)):
            best_aic = result.aic
            best_lag = lag
            best_dw = dw_out

    if best_dw is None:
        # Previously this case crashed in zip(df.columns, None).
        print("No lag order between 1 and 15 met the Durbin-Watson threshold of",
              threshold)
        print("-------------------------------------------------")
        return best_aic, best_lag, best_dw

    print("Best lag order: ", best_lag, " with an AIC score of: ", best_aic)
    print("Durbin-Watson results:")
    for col, val in zip(df.columns, best_dw):
        print(col, ':', round(val, 2))
    print("-------------------------------------------------")
    return best_aic, best_lag, best_dw
def volatile_models_metrics(input_model):
    """Summarise a fitted model as a dict of AIC, BIC, R-squared and Durbin-Watson.

    The Durbin-Watson statistic is computed on the residuals after dropping
    missing values.
    """
    summary = {}
    summary['AIC'] = input_model.aic
    summary['BIC'] = input_model.bic
    summary['R-squared'] = input_model.rsquared
    clean_residuals = input_model.resid.dropna()
    summary['DW'] = durbin_watson(clean_residuals)
    return summary
def autocorrelation_assumption(model, features, label):
    """
    Autocorrelation: Assumes that there is no autocorrelation in the residuals.
    If there is autocorrelation, then there is a pattern that is not explained
    because the current value depends on the previous value.  This may be
    resolved by adding a lag variable of either the dependent variable or
    some of the predictors.

    Residuals come from ``calculate_residuals(model, features, label)``; the
    Durbin-Watson statistic and a verdict are printed.
    """
    print("Assumption 4: No Autocorrelation", "\n")

    # Calculating residuals for the Durbin-Watson test
    df_results = calculate_residuals(model, features, label)

    for line in (
        "\nPerforming Durbin-Watson Test",
        "Values of 1.5 < d < 2.5 generally show that there is no autocorrelation in the data",
        "0 to 2< is positive autocorrelation",
        ">2 to 4 is negative autocorrelation",
        "-------------------------------------",
    ):
        print(line)

    durbinWatson = durbin_watson(df_results["Residuals"])
    print("Durbin-Watson:", durbinWatson)

    if durbinWatson < 1.5:
        finding, verdict = "Signs of positive autocorrelation", "Assumption not satisfied"
    elif durbinWatson > 2.5:
        finding, verdict = "Signs of negative autocorrelation", "Assumption not satisfied"
    else:
        finding, verdict = "Little to no autocorrelation", "Assumption satisfied"
    print(finding, "\n")
    print(verdict)
def get_durbin_watson(cols, model):
    """Check for serial correlation of the residuals (errors).

    Prints one ``<column> : <statistic>`` line per entry of ``cols``, pairing
    each name with the matching Durbin-Watson value of ``model.resid``
    rounded to two decimals.
    """
    statistics = durbin_watson(model.resid)
    for name, statistic in zip(cols, statistics):
        shown = round(statistic, 2)
        print(name, ':', shown)
def get_model_info(model):
    """Collect fit statistics and residual diagnostics of a fitted model.

    Runs the Breusch-Godfrey, Jarque-Bera and Breusch-Pagan tests and
    returns them, together with the model's own summary attributes, the
    Durbin-Watson statistic and the raw residuals, in a nested dict.
    """
    bg_lm, bg_lm_pval, bg_fval, bg_fpval = acorr_breusch_godfrey(model)
    breusch_godfrey = {
        'lm': bg_lm,
        'lm_pval': bg_lm_pval,
        'fval': bg_fval,
        'f_pval': bg_fpval,
    }

    jb, jb_pval, jb_skew, jb_kurtosis = jarque_bera(model.resid)
    jarque_bera_info = {
        'jb': jb,
        'jb_pval': jb_pval,
        'skew': jb_skew,
        'kurtosis': jb_kurtosis,
    }

    het_bp_lm, het_bp_lmpval, het_bp_fval, het_bp_fpval = het_breuschpagan(
        model.resid, model.model.exog)
    het_bp_info = {
        'lm': het_bp_lm,
        'lm_pval': het_bp_lmpval,
        'fval': het_bp_fval,
        'f_pval': het_bp_fpval,
    }

    return {
        'r_squared': model.rsquared,
        'adj_r_squared': model.rsquared_adj,
        'p_values': model.pvalues,
        'params': model.params,
        'std': model.bse,
        'size': model.nobs,
        't_values': model.tvalues,
        'durbin_watson': durbin_watson(model.resid),
        'breusch_godfrey': breusch_godfrey,
        'jarque_bera': jarque_bera_info,
        'het_breuschpagan': het_bp_info,
        'residuals': model.resid,
    }
def test_autocorrelation(model_path, data_path):
    """Assert that the residuals' Durbin-Watson statistic lies strictly between 1.5 and 2.5."""
    from statsmodels.stats.stattools import durbin_watson

    residuals = calculate_residuals(model_path, data_path)['Residuals']
    statistic = durbin_watson(residuals)

    lower_bound, upper_bound = 1.5, 2.5
    assert statistic > lower_bound
    assert statistic < upper_bound
def dw_test(error):
    """Print the Durbin-Watson statistic of ``error`` computed along axis 0.

    The test statistic is approximately equal to 2*(1-r) where r is the
    sample autocorrelation of the residuals.  Thus, for r == 0, indicating
    no serial correlation, the test statistic equals 2.  This statistic will
    always be between 0 and 4.  The closer to 0 the statistic, the more
    evidence for positive serial correlation.  The closer to 4, the more
    evidence for negative serial correlation.
    """
    statistic = durbin_watson(error, axis=0)
    print('dw test', statistic)
def checkdb(df, col):
    """Tell whether the data in ``df[col]`` is serially correlated.

    Uses the Durbin-Watson statistic ``r``: values near 2 indicate no
    serial correlation, values towards 0 positive and towards 4 negative
    correlation.  The series is reported as not serially correlated when
    ``1.5 < r < 2.5`` (the usual rule of thumb).  The previous check,
    ``r == 0``, flagged perfect positive autocorrelation as "not correlated".

    :param df: table holding the series
    :param col: name of the column to test
    :return: ``(correlated, r)`` -- ``correlated`` is ``False`` when the
        series is not serially correlated, ``True`` otherwise
    """
    r = k.durbin_watson(df[col], axis=0)
    if 1.5 < r < 2.5:
        # Single-argument print call behaves the same on Python 2 and 3.
        print("Not Serially correlated ")
        return False, r
    print("Serially correlated ")
    return True, r
def durbin_watson(self):
    """Run the Durbin-Watson test for autocorrelation of the residuals.

    Returns the statistic of ``self.resid_`` rounded to three decimals, or
    ``None`` (after printing a notice) when the model has not been fitted.
    """
    if self.is_fitted:
        # Imported lazily; inside this method the name refers to the
        # statsmodels function, not to the method itself.
        from statsmodels.stats.stattools import durbin_watson

        statistic = durbin_watson(self.resid_)
        return round(float(statistic), 3)

    print("Model not fitted yet!")
    return None
def check(self, residuals, n):
    """Set ``self.violation`` from a hypothesis test on residual correlation.

    The Durbin-Watson statistic of ``residuals`` is turned into a two-sided
    p-value with a Student-t survival function with ``n - 1`` degrees of
    freedom, and ``violation`` becomes ``1 - result`` of the resulting
    ``HypothesisTest``.
    """
    # NOTE(review): the Durbin-Watson statistic is fed to a t distribution
    # as if it were a t statistic -- confirm this p-value is meaningful.
    dw_statistic = durbin_watson(residuals)
    survival = stats.t.sf(np.abs(dw_statistic), n - 1)
    two_sided_p = survival * 2

    hypothesis = HypothesisTest(
        H0="""the residuals are not correlated""", pValue=two_sided_p)
    self.violation = 1 - hypothesis.result
def dw(data_frame):
    """Return the Durbin-Watson statistic of ``data_frame`` around its mean.

    An OLS model with only a constant regressor is fitted to the data, and
    the statistic is computed on its residuals.  Best value = 2.00.
    """
    constant = np.ones(len(data_frame))
    fitted = OLS(data_frame, constant).fit()
    residuals = fitted.resid
    return durbin_watson(residuals)
def fit_at(formula, **kwargs):
    """Build the model for ``formula``, fit it and annotate data and fit.

    Adds ``expected_signal``, ``residual`` and ``weighted_residual`` columns
    to the data returned by ``model_at`` and stores the Durbin-Watson
    statistic of the weighted residuals on the fit as ``durbin_watson``.

    :return: ``(fit, model, data)``
    """
    model, data = model_at(formula=formula, **kwargs)
    fit = model.fit()

    data['expected_signal'] = fit.predict()
    residual = fit.resid
    data['residual'] = residual
    data['weighted_residual'] = data.weight * residual

    fit.durbin_watson = durbin_watson(data.weighted_residual)
    return fit, model, data
def autocorrelation(self):
    """Check the assumption of no autocorrelation of the error terms.

    The Durbin-Watson value should be between 1.5 and 2.5:
    below 1.5 means positive autocorrelation, above 2.5 negative.

    :return: ``(message, dw)``
    """
    dw = durbin_watson(self.results['Residuals'])
    if dw < 1.5:
        message = "Assumption not met - Positive Autocorrelation"
    elif dw > 2.5:
        message = "Assumption not met - Negative Autocorrelation"
    else:
        message = "Assumption met"
    return message, dw
def generate_regression_dataframe(reg_dict):
    """
    Flatten a dictionary of fitted regressions into one table.

    For every regression the following are extracted:
        - coefficient, standard error and p-value of the ``Constant`` and
          ``Consumption`` parameters,
        - r squared and adjusted r squared,
        - f statistic and its p-value,
        - Durbin-Watson statistic of the residuals.

    Each item above becomes one column and each dictionary key (ticker)
    one row.

    :param reg_dict: dictionary that contains the regressions
    :return: pandas DataFrame
    """
    lst_columns = [
        'coef_constante', 'std_err_constante', 'p_value_constante',
        'coef_consumption', 'std_err_consumption', 'p_value_consumption',
        'r_squared', 'r_squared_adj', 'f_stats', 'p_value_f_stats',
        'durb_watson'
    ]

    tickers = []
    rows = []
    for ticker, reg in reg_dict.items():
        tickers.append(ticker)
        rows.append([
            reg.params['Constant'],       # constant parameter
            reg.bse['Constant'],          # constant standard error
            reg.pvalues['Constant'],      # constant p value
            reg.params['Consumption'],    # consumption parameter
            reg.bse['Consumption'],       # consumption standard error
            reg.pvalues['Consumption'],   # consumption p value
            reg.rsquared,                 # r squared model
            reg.rsquared_adj,             # adjusted r squared model
            reg.fvalue,                   # f statistic model
            reg.f_pvalue,                 # p value f statistic
            smt.durbin_watson(reg.resid),  # durbin watson
        ])

    return pd.DataFrame(rows, columns=lst_columns, index=tickers)
def check_autocorrelation(residual):
    """Classify the Durbin-Watson statistic of ``residual``.

    :return: ``(s, message)`` where ``s`` is the statistic and ``message``
        describes the band it falls in:
        ``s <= 1.5`` positive, ``s >= 2.5`` negative,
        ``2.1 <= s < 2.5`` slightly negative,
        ``1.5 < s <= 1.9`` slightly positive, otherwise none.
    """
    s = durbin_watson(residual)
    if s <= 1.5:
        message = 'there is positive correlation'
    elif s >= 2.5:
        message = 'there is negative correlation'
    elif s >= 2.1:
        # s < 2.5 is already known here
        message = 'there is a slight negative correlation'
    elif s <= 1.9:
        # s > 1.5 is already known here
        message = 'there is a slight positive correlation'
    else:
        message = 'there is no correlation'
    return s, message
def dw_test(self, x, y):
    '''
    Compute the Durbin-Watson statistic and store it in the parameter dict.

    The residuals are ``y - (x * x1_coef + const_coef)`` using the
    coefficients held in ``self.para_dict``.

    :param x: array-like, GDP
    :param y: array-like, per-capita resident income
    :return: float, dw_value
    '''
    slope = self.para_dict["x1_coef"]
    intercept = self.para_dict["const_coef"]
    fitted = x * slope + intercept
    self.para_dict["dw_value"] = durbin_watson(y - fitted)
    return self.para_dict["dw_value"]
def residuals_properties(residuals, model='Model'):
    """
    Print summary statistics of a model's residuals and show diagnostic
    plots, to evaluate how well the model fitted the training dataset.
    The residuals in a time series model are what is left over after
    fitting a model.

    :param model: string to identify the model. default_value='Model'
    :param residuals: residuals of the model.
    :return: None
    """
    # Summary statistics
    mean_value = residuals.mean()
    median = np.median(residuals)
    # skewness = 0: same weight in both tails, as in a normal distribution.
    skew = stats.skew(residuals)
    # Kurtosis measures how peaked the distribution is.
    # NOTE(review): scipy.stats.kurtosis defaults to the excess (Fisher)
    # definition, where a normal distribution gives 0 rather than 3 -- confirm.
    kurtosis = stats.kurtosis(residuals)
    # Values between 0 and 2 indicate positive and values between 2 and 4
    # indicate negative auto-correlation.
    durbin = durbin_watson(residuals)
    # p-value of scipy.stats.normaltest (null hypothesis: the sample follows
    # a normal distribution); the printed label still calls it "Anderson".
    anderson = stats.normaltest(residuals)[1]
    summary = (
        f'{model} residuals information:\n - Mean: {mean_value:.4f} \n - Median: {median:.4f} \n - Skewness: '
        f'{skew:.4f} \n - Kurtosis: {kurtosis:.4f}\n - Durbin: {durbin:.4f}\n - Anderson p-value: {anderson:.4f}'
    )
    print(summary)

    # Plots
    sn.set()
    fig, axes = plt.subplots(1, 5, figsize=(25, 5.3))
    qq_ax, series_ax, density_ax, acf_ax, pacf_ax = axes

    # Standardized residuals, and the same without NaN entries
    standardized = (residuals - np.nanmean(residuals)) / np.nanstd(residuals)
    finite = standardized[~(np.isnan(standardized))]

    # First picture: q-q plot (non-missing residuals only)
    qqplot(finite, line='s', ax=qq_ax)
    qq_ax.set_title('Normal Q-Q')

    # Second picture: simple plot of standardized residuals
    positions = np.arange(0, len(standardized), 1)
    sn.lineplot(x=positions, y=standardized, ax=series_ax)
    series_ax.set_title('Standardized residual')

    # Third picture: residual density against the standard normal density
    kde = stats.gaussian_kde(finite)
    x_lim = (-1.96 * 2, 1.96 * 2)
    grid = np.linspace(x_lim[0], x_lim[1])
    density_ax.plot(grid, stats.norm.pdf(grid), label='Normal (0,1)', lw=2)
    density_ax.plot(grid, kde(grid), label='Residuals', lw=2)
    density_ax.set_xlim(x_lim)
    density_ax.legend()
    density_ax.set_title('Estimated density')

    # Last pictures: residuals auto-correlation plots
    plot_acf(standardized, ax=acf_ax, lags=30)
    plot_pacf(standardized, ax=pacf_ax, lags=30)

    fig.tight_layout()
    plt.show()
def par_boot(func_solve, func_fit, m, p, p_error, t, mRNA, res_old):
    """Parametric bootstrap p-values for the chi-square and Durbin-Watson statistics.

    The fitted model is evaluated once, then 1000 synthetic data sets are
    drawn by adding normal noise (standard deviations ``p_error``) to it.
    Each one is refitted and the sum of squared residuals and the
    Durbin-Watson statistic of the refit are recorded.  The statistics of
    the original residuals ``res_old`` are then ranked against these
    bootstrap distributions.

    ``func_solve`` is either the string ``'stationary'`` (constant model
    ``p[0]``, refitted with ``fit_stationary``) or a callable invoked as
    ``func_solve(np.log10(p), t, mRNA)`` (refitted with ``fit``).

    :return: ``(chi2_p, dw_p)`` -- right-sided p-value for chi-square and
        left-sided p-value for Durbin-Watson
    """
    n = 1000
    chi2_vec = np.zeros(n)
    dw_vec = np.zeros(n)

    # Solution of the fitted model, and the matching refit routine
    if func_solve == 'stationary':
        y_model = [p[0] for _ in res_old]

        def refit(y_boot):
            r, p_out = fit_stationary(t, y_boot, p_error)
            return r
    else:
        y_model = func_solve(np.log10(p), t, mRNA)

        def refit(y_boot):
            r, p_out = fit(func_fit, m, t, y_boot, p_error, mRNA,
                           samples=1, plot=False, p_old=p)
            return r

    # Carry out n bootstraps: resample, refit, record both statistics
    for idx in range(n):
        noise = np.array([np.random.normal(0, ps) for ps in p_error])
        r = refit(y_model + noise)
        chi2_vec[idx] = np.sum(r**2)
        dw_vec[idx] = durbin_watson(r)

    # Debug switch: histogram of the bootstrap chi-square distribution
    plot = False
    if plot:
        plt.hist(chi2_vec, bins=int(np.sqrt(n)))
        plt.show()

    # p-value for the chi2 test (right sided)
    chi2_old = np.sum(res_old**2)
    chi2_p = 1 - ECDF(chi2_vec)(chi2_old)

    # p-value for the Durbin-Watson test (left sided)
    dw_old = durbin_watson(res_old)
    dw_p = ECDF(dw_vec)(dw_old)

    return chi2_p, dw_p
def generate_regression_dataframe(reg_dict):
    """
    Extract, from the dictionary holding the regressions, the parameters,
    standard errors, p-values, r squared, adjusted r squared, f statistic,
    p-value of the f statistic and Durbin-Watson statistic.

    All of this is stored in a DataFrame with one row per ticker.

    :param reg_dict: dictionary containing the regressions.
    :return: DataFrame with the regression figures in the columns and the
        tickers in the rows
    """
    lst_columns = ['coef_constante', 'std_err_constante', 'p_value_constante',
                   'coef_consumo', 'std_err_consumo', 'p_value_consumo',
                   'r_squared', 'r_squared_adj', 'f_stats', 'p_value_f_stats',
                   'durb_watson']

    # For each parameter: coefficient, standard error, p-value (in that order)
    parameter_names = ('constante', 'Consumo')
    per_parameter_attrs = ('params', 'bse', 'pvalues')

    lst_index = []
    lst_df = []
    for ticker in reg_dict:
        reg = reg_dict[ticker]
        lst_index.append(ticker)
        row = [getattr(reg, attr)[name]
               for name in parameter_names
               for attr in per_parameter_attrs]
        row += [
            reg.rsquared,       # r squared model
            reg.rsquared_adj,   # adjusted r squared model
            reg.fvalue,         # f statistic model
            reg.f_pvalue,       # p value f statistic
            smt.durbin_watson(reg.resid),  # durbin watson
        ]
        lst_df.append(row)

    df_reg = pd.DataFrame(lst_df, columns=lst_columns, index=lst_index)
    return df_reg
def get_durbin_watson(errors, axis):  # must feed 1-d array
    '''
    Print the Durbin-Watson statistic of ``errors`` after dropping missing values.

    The statistic indicates whether there is autocorrelation in the
    residuals of a time series regression.  It ranges from 0 to 4, with 0
    indicating positive autocorrelation and 4 indicating negative
    autocorrelation.  A value of 2 indicates no autocorrelation in the
    sample.

    The formula is:
        d = sum_{t=2..T} (e_t - e_{t-1})^2 / sum_{t=1..T} e_t^2
    where the series e_t are the residuals from a regression.

    Read more: http://www.businessdictionary.com/definition/Durbin-Watson-Statistic.html
    '''
    cleaned = errors.dropna()
    db = durbin_watson(cleaned, axis)
    print('Durbin-Watson test statistic:{}'.format(db))
def score_VAR_correlation(self, models, x_train, lag=0, maxlag=None):
    '''
    Score the first ``'VAR'`` or ``'VARMAX'`` entry found in ``models``.

    Durbin-Watson test: the closer the result is to 2 the less correlation
    there is; the closer to 0 or 4, the more correlation is implied.

    * ``'VAR'`` with ``maxlag`` given: fits lag orders ``0 .. maxlag - 1``
      and returns a DataFrame with one row per lag order and the columns
      ``AIC``, ``BIC``, ``FPE`` and ``HQIC``.
    * ``'VAR'`` without ``maxlag``: fits with ``lag`` and returns the
      Durbin-Watson statistics of the residuals (rounded to two decimals),
      one row per column of ``x_train``.
    * ``'VARMAX'``: fits with default arguments and returns the same
      Durbin-Watson table.

    Returns ``None`` when ``models`` has neither entry.
    '''
    def dw_table(result):
        # One-column table of rounded Durbin-Watson values, indexed by the
        # x_train column names.
        out = durbin_watson(result.resid)
        df_results = pd.DataFrame()
        for col, val in zip(x_train.columns, out):
            df_results[col] = [round(val, 2)]
        return df_results.T

    for name_model, model in models.items():
        if name_model == 'VAR':
            if maxlag is not None:
                # Explore the lag hyper-parameter
                vet_aic, vet_bic, vet_fpe, vet_hqic = [], [], [], []
                for candidate_lag in range(maxlag):
                    result = model.fit(candidate_lag)
                    vet_aic.append(result.aic)
                    vet_bic.append(result.bic)
                    vet_fpe.append(result.fpe)
                    vet_hqic.append(result.hqic)
                df_results = pd.DataFrame()
                df_results['AIC'] = vet_aic
                df_results['BIC'] = vet_bic
                df_results['FPE'] = vet_fpe
                df_results['HQIC'] = vet_hqic
                return df_results
            # Direct fit on one specific lag value
            return dw_table(model.fit(lag))
        elif name_model == 'VARMAX':
            return dw_table(model.fit())
def check_residual_autocorrelation(self):
    """Store the Durbin-Watson statistic of the fitted residuals.

    The closer the Durbin-Watson value is to 0, the more evidence for
    positive serial correlation.  The closer to 4, the more evidence for
    negative serial correlation.  The value is saved in
    ``self.durbin_watson_value``.

    Raises ``DataWasNotFitted`` when ``self.fitted_result`` is ``None``.
    """
    fitted = self.fitted_result
    if fitted is None:
        raise DataWasNotFitted()

    residuals = fitted.resid
    self.durbin_watson_value = stattools.durbin_watson(residuals)
def regression_scores(timeseries, time_window_size, time_lag, reg, cv, scoring, timeseriesZ=None):
    """Compute regression scores for a given set of 3 timeseries according to the variable causality structures.

    For every ``(cs_train, cs_test)`` pair in the module-level
    ``causality_structures``, windows of ``time_window_size`` consecutive
    samples of the training columns are used to predict the test column
    ``time_lag`` samples after the window start.

    Returns an array with one row per causality structure:

    * ``scoring == 'residual_tests'``: 7 columns -- Durbin-Watson statistic
      (column 0), the two ``omni_normtest`` outputs (columns 1-2) and the
      ``jarque_bera`` outputs (columns 3-6) of the out-of-fold residuals.
    * otherwise: 2 columns -- mean R2 and mean squared error of the
      cross-validated predictions, averaged over the window offsets.

    ``timeseriesZ``, when given, is appended as extra training columns.
    ``time_lag=None`` falls back to ``time_window_size``.
    """
    global causality_structures
    if scoring == 'residual_tests':
        features_regression = np.zeros([len(causality_structures),7])
    else:
        features_regression = np.zeros([len(causality_structures),2]) #added 2 dimensions to compute r2 and mse
    for j, (cs_train, cs_test) in enumerate(causality_structures):
        ts_train = timeseries[:,cs_train]
        if not(timeseriesZ is None):
            ts_train = np.hstack([ts_train, timeseriesZ])
        if time_lag is None:
            time_lag=time_window_size
        ts_test = timeseries[:,cs_test]
        tmp_score = np.zeros([time_window_size,2]) #added 2 dimensions to compute r2 and mse
        residuals = np.zeros(timeseries.shape[0]-time_window_size)
        # One regression per window offset: offset i_reg uses the windows
        # starting at i_reg, i_reg + time_window_size, ...
        for i_reg in range(time_window_size):
            idx_example = np.arange(i_reg, timeseries.shape[0]-time_lag, time_window_size)
            # X[e, k] holds the training columns at sample idx_example[e] + k
            X = np.zeros((idx_example.size, time_window_size, ts_train.shape[1]))
            for k in range(time_window_size):
                X[:,k] = ts_train[idx_example+k]
            # Flatten each window into a single feature vector
            X = X.reshape(X.shape[0], X.shape[1] * X.shape[2])
            y = ts_test[idx_example + time_lag]
            if scoring == 'residual_tests':
                y_pred_i_reg = np.zeros(y.size)
                # NOTE(review): KFold(n=..., n_folds=...) iterated directly looks
                # like the legacy sklearn.cross_validation API -- confirm the
                # scikit-learn version this file targets.
                kfold = KFold(n=y.size, n_folds=cv)
                for train, test in kfold:
                    reg.fit(X[train], y[train])
                    y_pred_i_reg[test] = reg.predict(X[test])
                residuals[idx_example] = y - y_pred_i_reg #residuals
            else:
                tmp_predict = cross_val_predict(reg, X, y, cv=cv)
                tmp_score[i_reg,0] = r2_score(y,tmp_predict).mean()
                tmp_score[i_reg,1] = mean_squared_error(y,tmp_predict).mean()
                #tmp_score[i_reg] = cross_val_score(reg, X, y, cv=cv, scoring=scoring).mean()
        if scoring == 'residual_tests':
            features_regression[j,0] = durbin_watson(residuals)
            features_regression[j,[1,2]] = omni_normtest(residuals)
            features_regression[j,3:] = jarque_bera(residuals)
        else:
            features_regression[j] = tmp_score.mean(0)
    return features_regression
def fit(y, X, reg_names):
    """Fit a GLSAR regression of ``y`` on ``X`` and return the results as a Dataset.

    The model is ``sm.GLSAR(y.values, X, 2, missing='drop')`` estimated with
    ``iterative_fit()``.  The first coefficient (and its confidence interval
    and p-value) is dropped from the output -- presumably the intercept
    column of ``X``; confirm against the caller.

    :param y: object exposing ``.values`` with the dependent variable
    :param X: design matrix passed to ``sm.GLSAR``
    :param reg_names: names of the regressors kept in the output; used as the
        ``reg_name`` coordinate
    :return: ``xr.Dataset`` with ``coef``, ``conf_int``, ``p_value`` (per
        regressor) and the scalars ``DWT`` (Durbin-Watson of the whitened
        residuals) and ``CoD`` (R squared).  If the fit fails, every variable
        is NaN.
    """
    nr = len(reg_names)
    coords = {'reg_name': (['reg_name'], reg_names),
              'limit': (['limit'], ['lower', 'upper'])}
    try:
        mod = sm.GLSAR(y.values, X, 2, missing='drop')  # MLR analysis with AR2 modeling
        res = mod.iterative_fit()
        output = xr.Dataset(
            {'coef': (['reg_name'], res.params[1:]),
             'conf_int': (['reg_name', 'limit'], res.conf_int()[1:, :]),
             'p_value': (['reg_name'], res.pvalues[1:]),
             'DWT': (sms.durbin_watson(res.wresid)),
             'CoD': (res.rsquared)},
            coords=coords)
    except Exception:
        # Best-effort fallback: a failed fit yields an all-NaN result.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate instead of being turned into NaNs.
        nans = np.full([nr], np.nan)
        output = xr.Dataset(
            {'coef': (['reg_name'], nans),
             'conf_int': (['reg_name', 'limit'], np.array([nans, nans]).T),
             'p_value': (['reg_name'], nans),
             'DWT': (np.nan),
             'CoD': (np.nan)},
            coords=coords)
    return output
def fitdata(f, Xdata,Ydata,Errdata, pguess, ax=False, ax2=False):
    '''
    Least-squares fit of ``f(Xdata, p)`` to ``Ydata`` weighted by ``Errdata``,
    with goodness-of-fit statistics and optional diagnostic plots.

    Parameters:
    f = model function called as f(Xdata, p)
    Xdata, Ydata, Errdata = independent data, observations and their errors
    pguess = initial guess for the parameters
    ax = optional axes for the parity plot (fitted vs. data)
    ax2 = optional axes for the residual plot

    Returns (popt, pcov, perr, p95, p_p, chisquare, resanal):
    popt = vector of length N of the optimized parameters
    pcov = Covariance matrix of the fit
    perr = vector of length N of the std-dev of the optimized parameters
    p95 = half width of the confidence interval for each parameter
          (NOTE(review): uses the 0.95 quantile of the t distribution, which
          corresponds to a two-sided 90% interval -- confirm the intent)
    p_p = vector of length N of the p-value for the parameters being zero
          (if p<0.05, null hypothesis rejected and parameter is non-zero)
    chisquare = (p, chisquared, chisquared_red, degfreedom, R2, R2_adj)
        p = Probability of finding a chisquared value at least as extreme as the one shown
        chisquared = chisquared value for the fit
        chisquared_red = chisquared/degfreedom. value should be approx. 1 for a good fit.
        R2 = proportion of explained variance
        R2_adj = adjusted R2 taking into account number of predictors
    resanal = (p_shapiro, w, mean_res, p_res, F, p_F, dw) Analysis of residuals
        p_shapiro = Probability of finding a w at least as extreme as the one observed
                    (should be high for good fit)
        w = Shapiro-Wilk test criterion
        mean_res = mean of residuals
        p_res = probability that the mean value obtained is different from zero merely by chance
        F = F-statistic for the fit msm/msE. Null hypothesis is that there is NO Difference
            between the two variances.
        p_F = probability that this value of F can arise by chance alone.
              p_F < 0.05 to reject null hypothesis and prove that the fit is good.
        dw = Durbin_Watson statistic (value between 0 and 4). 2 = no-autocorrelation.
             0 = +ve autocorrelation, 4 = -ve autocorrelation.

    NOTE(review): scipy.sqrt / scipy.mean / scipy.diag / scipy.var are NumPy
    aliases that recent SciPy releases deprecate -- confirm the pinned version.
    '''
    def error(p,Xdata,Ydata,Errdata):
        # Error-weighted residuals minimised by leastsq
        Y=f(Xdata,p)
        residuals=(Y-Ydata)/Errdata
        return residuals

    res=scipy.optimize.leastsq(error,pguess,args=(Xdata,Ydata,Errdata),full_output=1)
    (popt,pcov,infodict,errmsg,ier)=res
    perr=scipy.sqrt(scipy.diag(pcov))

    M=len(Ydata)
    N=len(popt)

    #Residuals
    Y=f(Xdata,popt)
    residuals=(Y-Ydata)/Errdata
    meanY=scipy.mean(Ydata)
    squares=(Y-meanY)/Errdata
    squaresT=(Ydata-meanY)/Errdata

    SSM=sum(squares**2) #Corrected Sum of Squares
    SSE=sum(residuals**2) #Sum of Squares of Errors
    SST=sum(squaresT**2) #Total Corrected sum of Squares

    DFM=N-1 #Degree of Freedom for model
    DFE=M-N #Degree of Freedom for error
    DFT=M-1 #Degree of freedom total

    MSM=SSM/DFM #Mean Squares for model (explained Variance)
    MSE=SSE/DFE #Mean Squares for Error (should be small wrt MSM) unexplained Variance
    MST=SST/DFT #Mean squares for total

    R2=SSM/SST #proportion of explained variance
    R2_adj= 1-(1-R2)*(M-1)/(M-N-1) #Adjusted R2

    #t-test to see if parameters are different from zero
    t_stat=popt/perr #t-stat for popt different from zero
    t_stat=t_stat.real
    p_p= 1.0-scipy.stats.t.cdf(t_stat,DFE) #should be low for good fit
    z=scipy.stats.t(M-N).ppf(0.95)
    p95=perr*z

    #Chi-Squared Analysis on Residuals
    chisquared=sum(residuals**2)
    degfreedom=M-N
    chisquared_red=chisquared/degfreedom
    p_chi2=1.0-scipy.stats.chi2.cdf(chisquared, degfreedom)
    stderr_reg=scipy.sqrt(chisquared_red)
    chisquare=(p_chi2,chisquared,chisquared_red,degfreedom,R2,R2_adj)

    #Analysis of Residuals
    w, p_shapiro=scipy.stats.shapiro(residuals)
    mean_res=scipy.mean(residuals)
    stddev_res=scipy.sqrt(scipy.var(residuals))
    t_res=mean_res/stddev_res #t-statistics
    p_res=1.0-scipy.stats.t.cdf(t_res,M-1)

    F=MSM/MSE
    p_F=1.0-scipy.stats.f.cdf(F,DFM,DFE)

    dw=stools.durbin_watson(residuals)
    resanal=(p_shapiro,w,mean_res,p_res,F,p_F,dw)

    if ax:
        # Parity plot: fitted values against the data, with the y=x line
        # and a band of +/- the standard error of the fit.
        formataxis(ax)
        ax.plot(Ydata,Y,'ro')
        ax.errorbar(Ydata,Y,yerr=Errdata, fmt='.')
        Ymin,Ymax=min((min(Y),min(Ydata))),max((max(Y),max(Ydata)))
        ax.plot([Ymin,Ymax],[Ymin,Ymax],'b')
        ax.xaxis.label.set_text('Data')
        ax.yaxis.label.set_text('Fitted')
        sigmay,avg_stddev_data=get_stderr_fit(f,Xdata,popt,pcov)
        Yplus=Y+sigmay
        Yminus=Y-sigmay
        ax.plot(Y,Yplus,'c',alpha=0.6,linestyle='--',linewidth=0.5)
        # '--' here: the previous '==' is not a valid matplotlib linestyle.
        ax.plot(Y,Yminus,'c',alpha=0.6,linestyle='--',linewidth=0.5)
        ax.fill_between(Y,Yminus,Yplus,facecolor='cyan',alpha=0.5)
        # The parity plot is left untitled: the title call was disabled in
        # the original, so the unused title strings were dropped.
        ax.figure.canvas.draw()

    if ax2:
        # Residual plot against the fitted values
        formataxis(ax2)
        ax2.plot(Y,residuals,'ro')
        ax2.xaxis.label.set_text('Fitted Data')
        ax2.yaxis.label.set_text('Residuals')
        titletext='Analysis of Residuals\n'
        titletext+=r'mean=%5.2f,$p_{res}$=%5.2f,$p_{shapiro}$=%5.2f,$Durbin-Watson$=%2.1f'
        titletext+='\n F=%5.2f,$p_F$=%3.2e'
        ax2.title.set_text(titletext%(mean_res,p_res,p_shapiro,dw,F,p_F))

    return popt,pcov,perr, p95, p_p,chisquare, resanal
# Python 2 script fragment: goodness-of-fit report for an already computed fit.
# It relies on R2, R2_adj, residuals, M, N, MSM, MSE, DFM and DFE being
# defined earlier (not visible in this fragment).
print R2
print R2_adj
# Chi-squared analysis of the residuals
chisquared=sum(residuals**2)
Dof=M-N
chisquared_red=chisquared/Dof
p_chi2=1-scipy.stats.chi2.cdf(chisquared,Dof)
stderr_reg=scipy.sqrt(chisquared_red)
chisquare=(p_chi2,chisquared,chisquared_red,Dof,R2,R2_adj)
print chisquare
# Shapiro-Wilk test on the residuals, then a t-statistic for their mean
w,p_shapiro=scipy.stats.shapiro(residuals)
mean_res=scipy.mean(residuals)
stddev_res=scipy.sqrt(scipy.var(residuals))
t_res=mean_res/stddev_res
p_res=1-scipy.stats.t.cdf(t_res,M-1)
print p_res
if p_res<0.05:
    print ('Null Hypothesesis in rejected')
# F statistic: ratio of the two mean squares MSM and MSE
F=MSM/MSE
p_F=1-scipy.stats.f.cdf(F,DFM,DFE)
if p_F <0.05:
    print ('Null hypothesis is rejected')
# Durbin-Watson statistic of the residuals, then the summary tuple
dw=stools.durbin_watson(residuals)
resanal=(p_shapiro,w,mean_res,p_res,F,p_F,dw)
def test_frame_timeseries_durbin_watson(self):
    """Test Durbin Watson: the frame's result must match statsmodels on the same column."""
    column = "logM"
    expected = smst.durbin_watson(self.pandaframe[column]) if False else None
    result = self.frame.timeseries_durbin_watson_test(column)
    db_result = smst.durbin_watson(self.pandaframe[column])
    self.assertAlmostEqual(result, db_result, delta=1e-10)
def fitdata(f,Xdata,Ydata,Errdata,pguess,dict_data,ax=False,ax2=False):
    '''
    Fit f(Xdata,p,dict_data) to Ydata by least squares on the error-weighted
    residuals, compute goodness-of-fit statistics and optionally plot them.

    f         = callable f(Xdata,p,dict_data) returning the model values
    Xdata     = predictor data, passed to f unchanged
    Ydata     = observed values (length M)
    Errdata   = error estimate of Ydata; residuals are divided by it
    pguess    = initial guess of the parameters
    dict_data = extra data passed to f unchanged
    ax        = axes for the parity plot (skipped when falsy)
    ax2       = axes for the residual plot (skipped when falsy)

    Returns popt,pcov,perr,p95,p_p,chisquare,resanal where
      popt,pcov = solution and covariance returned by scipy.optimize.leastsq
      perr      = square root of the diagonal of pcov
      p95       = perr times the 0.95 quantile of Student's t with M-N dof
      p_p       = 1-CDF of Student's t (M-N dof) evaluated at popt/perr
      chisquare = (p_chi2,chisquared,chisquared_red,degfreedom,R2,R2_adj)
      resanal   = (p_shapiro,w,mean_res,p_res,F,p_F,dw)
    '''
    def error(p,Xdata,Ydata,Errdata,dict_data):
        # Residual vector handed to leastsq, weighted by the error estimate.
        Y=f(Xdata,p,dict_data)
        residuals=(Y-Ydata)/Errdata
        return residuals

    res=scipy.optimize.leastsq(error,pguess,args=(Xdata,Ydata,Errdata,dict_data),full_output=1)
    (popt,pcov,infodict,errmsg,ier)=res
    perr=scipy.sqrt(scipy.diag(pcov))

    M=len(Ydata)  # number of observations
    N=len(popt)   # number of fitted parameters

    # Residuals and sums of squares, all weighted by Errdata
    Y=f(Xdata,popt,dict_data)
    residuals=(Y-Ydata)/Errdata
    meanY=scipy.mean(Ydata)
    squares=(Y-meanY)/Errdata
    squaresT=(Ydata-meanY)/Errdata
    SSM=sum(squares**2)    # model sum of squares
    SSE=sum(residuals**2)  # error sum of squares
    SST=sum(squaresT**2)   # total sum of squares

    DFM=N-1  # degrees of freedom for the model
    DFE=M-N  # degrees of freedom for the error
    MSM=SSM/DFM
    MSE=SSE/DFE

    R2=SSM/SST
    R2_adj=1-(1-R2)*(M-1)/(M-N-1)

    # t-statistic of each parameter against zero
    t_stat=popt/perr
    t_stat=t_stat.real
    p_p=1.0-scipy.stats.t.cdf(t_stat,DFE)
    z=scipy.stats.t(M-N).ppf(0.95)
    p95=perr*z

    # Chi-squared analysis on the residuals
    chisquared=sum(residuals**2)
    degfreedom=M-N
    chisquared_red=chisquared/degfreedom
    p_chi2=1.0-scipy.stats.chi2.cdf(chisquared,degfreedom)
    stderr_reg=scipy.sqrt(chisquared_red)  # needed by the parity-plot title
    chisquare=(p_chi2,chisquared,chisquared_red,degfreedom,R2,R2_adj)

    # Analysis of residuals: Shapiro-Wilk, t-statistic of the mean, F and Durbin-Watson
    w,p_shapiro=scipy.stats.shapiro(residuals)
    mean_res=scipy.mean(residuals)
    stddev_res=scipy.sqrt(scipy.var(residuals))
    t_res=mean_res/stddev_res
    p_res=1.0-scipy.stats.t.cdf(t_res,M-1)
    F=MSM/MSE
    p_F=1.0-scipy.stats.f.cdf(F,DFM,DFE)
    dw=stools.durbin_watson(residuals)
    resanal=(p_shapiro,w,mean_res,p_res,F,p_F,dw)

    if ax:
        # Parity plot: fitted against measured values with the y=x line
        formataxis(ax)
        ax.plot(Ydata,Y,'ro')
        ax.errorbar(Ydata,Y,yerr=Errdata,fmt='.')
        Ymin,Ymax=min((min(Y),min(Ydata))),max((max(Y),max(Ydata)))
        ax.plot([Ymin,Ymax],[Ymin,Ymax],'b')
        ax.xaxis.label.set_text('Data')
        ax.yaxis.label.set_text('Fitted')
        # Band of +/- sigmay around the fitted values
        sigmay,avg_stddev_data=get_stderr_fit(f,Xdata,popt,pcov,dict_data)
        Yplus=Y+sigmay
        Yminus=Y-sigmay
        ax.plot(Y,Yplus,'c',alpha=0.6,linestyle='--',linewidth=0.5)
        ax.plot(Y,Yminus,'c',alpha=0.6,linestyle='--',linewidth=0.5)
        ax.fill_between(Y,Yminus,Yplus,facecolor='cyan',alpha=0.5)
        titletext ='parity plot for fit.\n'
        # Raw strings: in a normal string the \n of \nu would become a newline
        titletext +=r'$r^2$=%5.2f,$r^2_{adj}$=%5.2f,'
        titletext +=r'$\sigma_{exp}$=%5.2f,$\chi^2_{\nu}$=%5.2f,$p_{\chi^2}$=%5.2f,'
        titletext +=r'$\sigma_{err}^{reg}$=%5.2f'
        ax.title.set_text(titletext%(R2,R2_adj,avg_stddev_data,chisquared_red,p_chi2,stderr_reg))
        ax.figure.canvas.draw()

    if ax2:
        # Residuals against fitted values
        formataxis(ax2)
        ax2.plot(Y,residuals,'ro')
        ax2.xaxis.label.set_text('Fitted Data')
        ax2.yaxis.label.set_text('Residuals')
        titletext ='Analysis of Residuals\n'
        titletext +=r'mean=%5.2f,$p_{res}$=%5.2f,$p_{shapiro}$=%5.2f, $Durbin-Watson$=%2.1f'
        titletext +='\n F=%5.2f, $p_F$=%3.2e'
        ax2.title.set_text(titletext%(mean_res,p_res,p_shapiro,dw,F,p_F))
        ax2.figure.canvas.draw()

    return popt,pcov,perr,p95,p_p,chisquare,resanal
def fitdata(f,Xdata,Ydata,Errdata,pguess,dict_data,ax=False,ax2=False):
    '''
    Fit f(Xdata,p,dict_data) to Ydata by least squares on the error-weighted
    residuals, print intermediate results, compute goodness-of-fit statistics
    and optionally plot them.

    f         = callable f(Xdata,p,dict_data) returning the model values
    Xdata     = predictor data, passed to f unchanged
    Ydata     = observed values (length M)
    Errdata   = error estimate of Ydata; residuals are divided by it
    pguess    = initial guess of the parameters
    dict_data = extra data passed to f unchanged
    ax        = axes for the parity plot (skipped when falsy)
    ax2       = axes for the residual plot (skipped when falsy)

    Returns popt,pcov,perr,p95,p_p,chisquare,resanal where
      popt,pcov = solution and covariance returned by scipy.optimize.leastsq
      perr      = square root of the diagonal of pcov
      p95       = perr times the 0.95 quantile of Student's t with M-N dof
      p_p       = 1-CDF of Student's t (M-N dof) evaluated at popt/perr
      chisquare = (p_chi2,chisquared,chisquared_red,degfreedom,R2,R2_adj)
      resanal   = (p_shapiro,w,mean_res,F,p_F,dw)   (p_res is not included)
    '''
    def error(p, Xdata, Ydata, Errdata, dict_data):
        # Residual vector handed to leastsq, weighted by the error estimate.
        Y=f(Xdata, p,dict_data)
        residuals= (Y-Ydata)/Errdata
        return residuals

    res = scipy.optimize.leastsq(error, pguess, args=(Xdata, Ydata, Errdata, dict_data), full_output=1)
    (popt, pcov, infodict, errmsg, ier) = res
    perr=scipy.sqrt(scipy.diag(pcov))

    M= len(Ydata)  # number of observations
    N=len(popt)    # number of fitted parameters

    # Residuals:
    Y=f(Xdata,popt,dict_data)
    residuals=(Y-Ydata)/Errdata
    meanY=scipy.mean(Ydata)
    squares=(Y-meanY)/Errdata
    squaresT=(Ydata-meanY)/Errdata
    # Single-argument print(...) gives the same output on Python 2 and 3
    print("Residuals:\n%s" % (residuals,))

    SSM=sum(squares**2)    #Corrected Sum of Squares
    SSE=sum(residuals**2)  #Sum of Squares of Errors
    SST=sum(squaresT**2)   #Total corrected sum of squares

    # Degrees of Freedom:
    DFM=N-1  #Degrees of freedom for model
    DFE=M-N  #Degrees of freedom for error

    MSM=SSM/DFM  #Mean squares for model (explained variance)
    MSE=SSE/DFE  #Mean squares for Error (should be small wrt MSM) Unexplained Variance

    R2=SSM/SST  #proportion of explained variance
    R2_adj=1-(1-R2)*(M-1)/(M-N-1)  #Adjusted R2

    # t-test to see if parameters are different from zero
    t_stat=popt/perr  #t-statistic for popt different from zero
    t_stat=t_stat.real
    p_p=1.0-scipy.stats.t.cdf(t_stat,DFE)  #should be low for good fit.
    z=scipy.stats.t(M-N).ppf(0.95)
    p95=perr*z

    # Chi squared Analysis on Residuals:
    chisquared=sum(residuals**2)
    degfreedom=M-N
    chisquared_red=chisquared/degfreedom
    p_chi2=1.0-scipy.stats.chi2.cdf(chisquared,degfreedom)
    stderr_reg=scipy.sqrt(chisquared_red)
    chisquare=(p_chi2,chisquared, chisquared_red, degfreedom,R2,R2_adj)

    # Residual Analysis:
    w,p_shapiro=scipy.stats.shapiro(residuals)
    mean_res=scipy.mean(residuals)
    stddev_res=scipy.sqrt(scipy.var(residuals))
    t_res=mean_res/stddev_res  #t-statistic to test that mean_res is zero.
    p_res=1.0-scipy.stats.t.cdf(t_res,M-1)  #if p_res <0.05, null hypothesis rejected and mean is non-zero.
    #Should be high for good fit.

    # F-test on Residuals:
    F=MSM/MSE  #explained variance/unexplained . Should be large
    p_F=1.0-scipy.stats.f.cdf(F,DFM,DFE)
    #if p_F <0.05, null-hypothesis is rejected
    #i.e. R^2>0 and at least one of the fitting parameters >0.
    dw=stools.durbin_watson(residuals)
    resanal=(p_shapiro,w,mean_res,F,p_F,dw)

    if ax:
        # Parity plot: fitted against measured values with the y=x line
        formataxis(ax)
        ax.plot(Ydata,Y,'ro')
        ax.errorbar(Ydata,Y,yerr=Errdata,fmt='.')
        Ymin,Ymax=min((min(Y),min(Ydata))),max((max(Y),max(Ydata)))
        ax.plot([Ymin,Ymax],[Ymin,Ymax],'b')
        ax.xaxis.label.set_text('Data')
        ax.yaxis.label.set_text('Fitted')
        # Band of +/- sigmay around the fitted values
        sigmay,avg_stddev_data=get_stderr_fit(f,Xdata,popt,pcov,dict_data)
        Yplus=Y+sigmay
        Yminus=Y-sigmay
        ax.plot(Y,Yplus,'c',alpha=0.6,linestyle='--',linewidth=0.5)
        ax.plot(Y,Yminus,'c',alpha=0.6,linestyle='--',linewidth=0.5)
        ax.fill_between(Y,Yminus,Yplus,facecolor='cyan',alpha=0.5)
        titletext='Parity Plot for Fitted Data \n'
        # Not raw: the \n here is a real line break
        titletext+='R^2=%5.2f, Adjusted Residual square=%5.2f \n '
        # Raw: otherwise the \n of \nu would become a newline inside the math
        titletext +=r'Exp. sigma=%5.2f, $\chi^2_{\nu}$=%5.2f, $p_{\chi^2}$=%5.2f, '
        titletext +=r'$\sigma_{err}^{reg}$=%5.2f'
        print("Standard Deviation of Y:\n%s" % (sigmay,))
        print("Positive Deviation of Y:\n  %s" % (Yplus,))
        print("Negative Deviation of Y:\n  %s" % (Yminus,))
        ax.title.set_text(titletext%(R2,R2_adj, avg_stddev_data, chisquared_red, p_chi2, stderr_reg))
        ax.figure.canvas.draw()

    if ax2:  #Test for homoscedasticity
        formataxis(ax2)
        ax2.plot(Y,residuals,'ro')
        ax2.xaxis.label.set_text('Fitted Data')
        ax2.yaxis.label.set_text('Residuals')
        titletext='Analysis of Residuals\n'
        titletext+=r'mean=%5.2f, $p_{res}$=%5.2f, $p_{shapiro}$=%5.2f, $Durbin-Watson$=%2.1f'
        titletext+='\nF=%5.2f, $p_F$=%3.2e'
        ax2.title.set_text(titletext%(mean_res, p_res, p_shapiro, dw , F, p_F))
        ax2.figure.canvas.draw()

    return popt,pcov,perr,p95,p_p,chisquare,resanal
def fitdata(f,Xdata,Ydata,Errdata,pguess,dict_data,ax=False,ax2=False):
    '''
    fitdata(f,Xdata,Ydata,Errdata,pguess,dict_data)
    f=function f(X,p,dict_data)
    Xdata=array like object (k,M) shaped array for data with k predictors
        e.g. if X = (x1,x2,x3) then X=(X1,X2,X3) where X1 is a vector of x1 etc
    Ydata=array like object of length M
    Errdata=array like object of length M: error estimate of Ydata.
    pguess=array like object of length N (vector of guess of parameters)
    dict_data=dictionary containing other data necessary for f
    ax=axes for the parity plot (skipped when falsy)
    ax2=axes for the residual plot (skipped when falsy)

    Returns:
    popt=vector of length N of the optimized parameters
    pcov=covariance matrix of the fit
    perr=vector of length N of the std-dev of the optimized parameters
    p95=perr times the 0.95 quantile of Student's t with M-N degrees of freedom,
        used as the half width of the confidence interval popt-p95 .. popt+p95
    p_p=vector of length N of the p-value for the parameters being zero
        (if p<0.05, null hypothesis rejected and parameter is non-zero)
    chisquare=(p,chisquared,chisquared_red,degfreedom,R2,R2_adj)
        p=probability of finding a chisquared value at least as extreme as the
          one shown purely by random chance (should be high for good fit)
        chisquared=chisquared value for the fit: sum of squares of weighted residuals
        chisquared_red=chisquared/degfreedom. Value should be approx. 1 for a good fit.
        degfreedom=M-N the degrees of freedom of the fitting
        R2=correlation coefficient or proportion of explained variance
        R2_adj=adjusted R2 taking into account number of predictors
    resanal=(p,w,mean,F,p_F,dw) Analysis of residuals
        p=probability of finding a w at least as extreme as the one observed
          (should be high for good fit)
        w=Shapiro-Wilk test criterion
        mean=mean of residuals
        F=F-statistic for the fit, MSM/MSE. Null hypothesis is that there is
          NO difference between the two variances.
        p_F=probability that this value of F can arise by chance alone.
          p_F<0.05 to reject null hypothesis and prove that the fit is good
        dw=Durbin-Watson statistic (value between 0 and 4). 2=no autocorrelation,
          0=+ve autocorrelation, 4=-ve autocorrelation.
    p_res (shown only in the residual plot title) is the p-value of the
    t-statistic mean/stddev of the residuals; it is not part of resanal.
    '''
    def error(p, Xdata, Ydata, Errdata, dict_data):
        # Residual vector handed to leastsq, weighted by the error estimate.
        Y=f(Xdata, p,dict_data)
        residuals= (Y-Ydata)/Errdata
        return residuals

    res = scipy.optimize.leastsq(error, pguess, args=(Xdata, Ydata, Errdata, dict_data), full_output=1)
    (popt, pcov, infodict, errmsg, ier) = res
    perr=scipy.sqrt(scipy.diag(pcov))

    M=len(Ydata)
    N=len(popt)

    #Residuals
    Y=f(Xdata,popt,dict_data)
    residuals=(Y-Ydata)/Errdata
    meanY=scipy.mean(Ydata)
    squares=(Y-meanY)/Errdata
    squaresT=(Ydata-meanY)/Errdata
    SSM=sum(squares**2)    #Corrected Sum of Squares
    SSE=sum(residuals**2)  #Sum of Squares of Errors
    SST=sum(squaresT**2)   #Total corrected sum of squares

    DFM=N-1  #Degrees of freedom for model
    DFE=M-N  #Degrees of freedom for error

    MSM=SSM/DFM  #Mean squares for model (explained variance)
    MSE=SSE/DFE  #Mean squares for Error (should be small wrt MSM) Unexplained Variance

    R2=SSM/SST  #proportion of explained variance
    R2_adj=1-(1-R2)*(M-1)/(M-N-1)  #Adjusted R2

    #t-test to see if parameters are different from zero
    t_stat=popt/perr  #t-statistic for popt different from zero
    t_stat=t_stat.real
    p_p=1.0-scipy.stats.t.cdf(t_stat,DFE)  #should be low for good fit.
    z=scipy.stats.t(M-N).ppf(0.95)
    p95=perr*z

    #Chisquared Analysis on Residuals
    chisquared=sum(residuals**2)
    degfreedom=M-N
    chisquared_red=chisquared/degfreedom
    p_chi2=1.0-scipy.stats.chi2.cdf(chisquared,degfreedom)
    stderr_reg=scipy.sqrt(chisquared_red)
    chisquare=(p_chi2,chisquared, chisquared_red, degfreedom,R2,R2_adj)

    #Analysis of Residuals
    w,p_shapiro=scipy.stats.shapiro(residuals)
    mean_res=scipy.mean(residuals)
    stddev_res=scipy.sqrt(scipy.var(residuals))
    t_res=mean_res/stddev_res  #t-statistic to test that mean_res is zero.
    p_res=1.0-scipy.stats.t.cdf(t_res,M-1)  #if p_res <0.05, null hypothesis rejected and mean is non-zero.
    #Should be high for good fit.

    #F-test on residuals
    F=MSM/MSE  #explained variance/unexplained . Should be large
    p_F=1.0-scipy.stats.f.cdf(F,DFM,DFE)
    #if p_F <0.05, null-hypothesis is rejected
    #i.e. R^2>0 and at least one of the fitting parameters >0.
    dw=stools.durbin_watson(residuals)
    resanal=(p_shapiro,w,mean_res,F,p_F,dw)

    if ax:
        # Parity plot: fitted against measured values with the y=x line
        formataxis(ax)
        ax.plot(Ydata,Y,'ro')
        ax.errorbar(Ydata,Y,yerr=Errdata,fmt='.')
        Ymin,Ymax=min((min(Y),min(Ydata))),max((max(Y),max(Ydata)))
        ax.plot([Ymin,Ymax],[Ymin,Ymax],'b')
        ax.xaxis.label.set_text('Data')
        ax.yaxis.label.set_text('Fitted')
        # Band of +/- sigmay around the fitted values
        sigmay,avg_stddev_data=get_stderr_fit(f,Xdata,popt,pcov,dict_data)
        Yplus=Y+sigmay
        Yminus=Y-sigmay
        ax.plot(Y,Yplus,'c',alpha=0.6,linestyle='--',linewidth=0.5)
        ax.plot(Y,Yminus,'c',alpha=0.6,linestyle='--',linewidth=0.5)
        ax.fill_between(Y,Yminus,Yplus,facecolor='cyan',alpha=0.5)
        titletext='Parity plot for fit.\n'
        # Raw strings: in a normal string the \n of \nu would become a newline
        titletext+=r'$r^2$=%5.2f, $r^2_{adj}$=%5.2f, '
        titletext +=r'$\sigma_{exp}$=%5.2f,$\chi^2_{\nu}$=%5.2f, $p_{\chi^2}$=%5.2f, '
        titletext +=r'$\sigma_{err}^{reg}$=%5.2f'
        ax.title.set_text(titletext%(R2,R2_adj, avg_stddev_data, chisquared_red, p_chi2, stderr_reg))
        ax.figure.canvas.draw()

    if ax2:  #Test for homoscedasticity
        formataxis(ax2)
        ax2.plot(Y,residuals,'ro')
        ax2.xaxis.label.set_text('Fitted Data')
        ax2.yaxis.label.set_text('Residuals')
        titletext='Analysis of Residuals\n'
        titletext+=r'mean=%5.2f, $p_{res}$=%5.2f, $p_{shapiro}$=%5.2f, $Durbin-Watson$=%2.1f'
        titletext+='\nF=%5.2f, $p_F$=%3.2e'
        ax2.title.set_text(titletext%(mean_res, p_res, p_shapiro, dw , F, p_F))
        ax2.figure.canvas.draw()

    return popt,pcov,perr,p95,p_p,chisquare,resanal
def fitdata(f,XData,YData,ErrData,pguess,ax=False,ax2=False):
    '''
    Fit f(XData,p) to YData by least squares on the error-weighted residuals,
    compute goodness-of-fit statistics and optionally plot them.

    f       = callable f(XData,p) returning the model values
    XData   = predictor data, passed to f unchanged
    YData   = observed values (length M)
    ErrData = error estimate of YData; residuals are divided by it
    pguess  = initial guess of the parameters
    ax      = axes for the parity plot (skipped when falsy)
    ax2     = axes for the residual plot (skipped when falsy)

    Returns popt,pcov,perr,p95,p_p,chisquare,resanal where
      popt,pcov = solution and covariance returned by scipy.optimize.leastsq
      perr      = square root of the diagonal of pcov
      p95       = perr times the 0.95 quantile of Student's t with M-N dof
      p_p       = 1-CDF of Student's t (M-N dof) evaluated at popt/perr
      chisquare = (p_chi2,chisquared_red,degfreedom,R2,R2_adj)
                  (the raw chisquared value is not part of this tuple)
      resanal   = (p_shapiro,w,mean_res,p_res,F,p_F,dw)
    '''
    def error(p,XData,YData,ErrData):
        # Residual vector handed to leastsq, weighted by the error estimate.
        Y=f(XData,p)
        residuals=(Y-YData)/ErrData
        return residuals

    res=scipy.optimize.leastsq(error,pguess,args=(XData,YData,ErrData),full_output=1)
    (popt,pcov,infodict,errmsg,ier)=res
    perr=scipy.sqrt(scipy.diag(pcov))

    M=len(YData)
    N=len(popt)

    #residuals
    Y=f(XData,popt)
    residuals=(Y-YData)/ErrData
    meanY=scipy.mean(YData)
    squares=(Y-meanY)/ErrData
    squaresT=(YData-meanY)/ErrData
    SSM=sum(squares**2)    #Corrected Sum of Squares
    SSE=sum(residuals**2)  #Sum of Squares of Errors
    SST=sum(squaresT**2)   #Total corrected sum of squares

    DFM=N-1  #degrees of freedom for model
    DFE=M-N  #degrees of freedom for error

    MSM=SSM/DFM  #Mean squares for model (explained variance)
    MSE=SSE/DFE  #Mean squares for error

    R2=SSM/SST  #proportion of explained variance
    R2_adj=1-(1-R2)*(M-1)/(M-N-1)  #Adjusted R2

    #t-test to see if parameters are different from zero
    t_stat=popt/perr  #t-statistic for popt different from zero
    t_stat=t_stat.real
    p_p=1.0-scipy.stats.t.cdf(t_stat,DFE)  #should be low for good fit.
    z=scipy.stats.t(M-N).ppf(0.95)
    p95=perr*z

    #chisquared analysis on residuals
    chisquared=sum(residuals**2)
    degfreedom=M-N
    chisquared_red=chisquared/degfreedom
    p_chi2=1.0-scipy.stats.chi2.cdf(chisquared,degfreedom)
    stderr_reg=scipy.sqrt(chisquared_red)
    chisquare=(p_chi2,chisquared_red,degfreedom,R2,R2_adj)

    #Analysis of Residuals
    w,p_shapiro=scipy.stats.shapiro(residuals)
    mean_res=scipy.mean(residuals)  #mean of all residuals
    stddev_res=scipy.sqrt(scipy.var(residuals))  #standard deviation of all residuals
    t_res=mean_res/stddev_res  #t-statistic to test that mean_res is zero.
    p_res=1.0-scipy.stats.t.cdf(t_res,M-1)  #if p_res<0.05, null-hypothesis is rejected and mean is non-zero
    #should be high for good fit.

    # F-test on residuals
    F=MSM/MSE  #explained/un-explained. Should be large
    p_F=1.0-scipy.stats.f.cdf(F,DFM,DFE)
    #if p_F<0.05, null hypothesis is rejected
    #i.e. R^2>0 and at least one of the fitting parameters >0.
    dw=stools.durbin_watson(residuals)
    resanal=(p_shapiro,w,mean_res,p_res,F,p_F,dw)

    if ax:
        # Parity plot: fitted against measured values with the y=x line
        formataxis(ax)
        ax.plot(YData,Y,'ro')
        ax.errorbar(YData,Y,yerr=ErrData,fmt='.')
        Ymin,Ymax=min((min(Y),min(YData))), max((max(Y),max(YData)))
        ax.plot([Ymin,Ymax],[Ymin,Ymax],'b')
        ax.xaxis.label.set_text('Data')
        ax.yaxis.label.set_text('Fitted')
        # Band of +/- sigmaY around the fitted values
        sigmaY,avg_stddev_data=get_stderr_fit(f,XData,popt,pcov)
        Yplus=Y+sigmaY
        Yminus=Y-sigmaY
        # 'c' (cyan) for both band edges; a lone upper-case 'C' is not a valid format
        ax.plot(Y,Yplus,'c',alpha=0.6,linestyle='--',linewidth=0.5)
        ax.plot(Y,Yminus,'c',alpha=0.6,linestyle='--',linewidth=0.5)
        ax.fill_between(Y,Yminus,Yplus,facecolor='cyan',alpha=0.5)
        titletext='Parity plot for fit.\n'
        # Raw strings: in a normal string the \n of \nu would become a newline
        titletext += r'$r^2$ = %5.2f, $r^2_{adj}$= %5.2f'
        titletext +=r'$\sigma_{exp}$ = %5.2f, $\chi^2_{\nu}$=%5.2f, $p_{\chi^2}$=%5.2f,'
        titletext +=r'$\sigma_{err}^{reg}$ = %5.2f'
        ax.title.set_text(titletext%(R2,R2_adj,avg_stddev_data,chisquared_red,p_chi2,stderr_reg))
        ax.figure.canvas.draw()

    if ax2:  #test for homoscedasticity
        formataxis(ax2)
        ax2.plot(Y,residuals,'ro')
        ax2.xaxis.label.set_text('Fitted Data')
        ax2.yaxis.label.set_text('Residuals')
        titletext = 'Analysis of residuals\n'
        titletext +=r'mean= %5.2f, $p_{res}$= %5.2f, $p_{shapiro}$=%5.2f , $Durbin-watson$=%2.1f'
        titletext +='\n F=%5.2f,$p_F$ = %3.2e'
        ax2.title.set_text(titletext%(mean_res,p_res,p_shapiro,dw,F,p_F))
        ax2.figure.canvas.draw()

    return popt,pcov,perr,p95,p_p,chisquare,resanal
# Sum of squared j scaled by the squared sigmai and a constant factor of 81.
# NOTE(review): the constants 81 and 9 are presumably tied to the sample size - confirm.
chisquare=sum((j**2)/((sigmai**2)*81))
# Mean squared deviation between u and ucalc, and its square root
sigmasq=sum(((u-ucalc)**2)/9)
sigmau=scipy.sqrt(sigmasq)
# r2 = 1 - (squared deviations from ucalc) / (squared deviations from umean)
dumean2=(u-umean)**2
ducalc2=(u-ucalc)**2
dsducalc2=sum(ducalc2)
dsdumean2=sum(dumean2)
r2=1-dsducalc2/dsdumean2
residual=(u-ucalc)/e_q #calculating the residuals
w,p_shapiro=scipy.stats.shapiro(residual)#shapiro wilk test
dw=stools.durbin_watson(residual)#durbin watson test
# Hard-coded degrees of freedom for the model (DFM) and the error (DFE)
DFM=8
DFE=1
# Unweighted sums of squares; note 'residuals' here has the opposite sign of
# 'residual' above and is not divided by e_q
squares=(ucalc-umean)
squaresT=(u-umean)
residuals=(ucalc-u)
SSM=sum(squares**2)
SSE=sum(residuals**2)
SST=sum(squaresT**2)
# Mean squares for the model and for the error
MSM=SSM/DFM
MSE=SSE/DFE
# Load the sunspots dataset and index it by yearly dates from 1700 to 2008
sunspot_data = sm.datasets.sunspots.load_pandas().data
sunspot_data.index = pd.Index(sm.tsa.datetools.dates_from_range('1700', '2008'))
del sunspot_data["YEAR"]
sunspot_data.plot(figsize=(12,8));
# Autocorrelation (top) and partial autocorrelation (bottom) up to lag 40
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(sunspot_data.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(sunspot_data, lags=40, ax=ax2)
# Fit ARMA models of order (2,0) and (3,0)
arma_mod20 = sm.tsa.ARMA(sunspot_data, (2,0)).fit()
arma_mod30 = sm.tsa.ARMA(sunspot_data, (3,0)).fit()
# Durbin-Watson statistic of the (3,0) residuals; the value is not stored
stattools.durbin_watson(arma_mod30.resid.values)
# Plot the residuals of the (3,0) model
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
ax = arma_mod30.resid.plot(ax=ax)
resid = arma_mod30.resid
# diag.normal_ad on the residuals; the result is not stored
diag.normal_ad(resid)
# Q-Q plot of the residuals
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
fig = qqplot(resid, line='q', ax=ax, fit=True)
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
def fitdata(f, Xdata,Ydata,Errdata, pguess,dict_data, ax=False, ax2=False):
    '''
    Fit f(Xdata,p,dict_data) to Ydata by least squares on the error-weighted
    residuals, compute goodness-of-fit statistics and optionally plot them.

    f         = callable f(Xdata,p,dict_data) returning the model values
    Xdata     = predictor data, passed to f unchanged
    Ydata     = observed values (length M)
    Errdata   = error estimate of Ydata; residuals are divided by it
    pguess    = initial guess of the parameters
    dict_data = extra data passed to f unchanged
    ax        = axes for the parity plot (skipped when falsy)
    ax2       = axes for the residual plot (skipped when falsy)

    Returns popt,pcov,perr,p95,p_p,chisquare,resanal where
      popt,pcov = solution and covariance returned by scipy.optimize.leastsq
      perr      = square root of the diagonal of pcov
      p95       = perr times the 0.95 quantile of Student's t with M-N dof
      p_p       = 1-CDF of Student's t (M-N dof) evaluated at popt/perr
      chisquare = (p_chi2,chisquared,chisquared_red,degfreedom,R2,R2_adj)
      resanal   = (p_shapiro,w,mean_res,p_res,F,p_F,dw)
    '''
    def error(p,Xdata,Ydata,Errdata,dict_data):
        # Residual vector handed to leastsq, weighted by the error estimate.
        Y=f(Xdata,p,dict_data)
        residuals=(Y-Ydata)/Errdata
        return residuals

    res=scipy.optimize.leastsq(error,pguess,args=(Xdata,Ydata,Errdata,dict_data),full_output=1)
    (popt,pcov,infodict,errmsg,ier)=res
    perr=scipy.sqrt(scipy.diag(pcov))

    M=len(Ydata)
    N=len(popt)

    #Residuals
    Y=f(Xdata,popt,dict_data)
    residuals=(Y-Ydata)/Errdata
    meanY=scipy.mean(Ydata)
    squares=(Y-meanY)/Errdata
    squaresT=(Ydata-meanY)/Errdata
    SSM=sum(squares**2)    #Corrected Sum of Squares
    SSE=sum(residuals**2)  #Sum of Squares of Errors
    SST=sum(squaresT**2)   #Total Corrected sum of Squares

    DFM=N-1  #Degree of Freedom for model
    DFE=M-N  #Degree of Freedom for error

    MSM=SSM/DFM  #Mean Squares for model (explained Variance)
    MSE=SSE/DFE  #Mean Squares for Error (should be small wrt MSM) unexplained Variance

    R2=SSM/SST  #ratio of model to total sum of squares
    R2_adj= 1-(1-R2)*(M-1)/(M-N-1)  #Adjusted R2

    #t-test to see if parameters are different from zero
    t_stat=popt/perr  #t-stat for popt different from zero
    t_stat=t_stat.real
    p_p= 1.0-scipy.stats.t.cdf(t_stat,DFE)  #should be low for good fit
    z=scipy.stats.t(M-N).ppf(0.95)
    p95=perr*z

    #Chi-Squared Analysis on Residuals
    chisquared=sum(residuals**2)
    degfreedom=M-N
    chisquared_red=chisquared/degfreedom
    p_chi2=1.0-scipy.stats.chi2.cdf(chisquared, degfreedom)
    stderr_reg=scipy.sqrt(chisquared_red)
    chisquare=(p_chi2,chisquared,chisquared_red,degfreedom,R2,R2_adj)

    #Analysis of Residuals
    w, p_shapiro=scipy.stats.shapiro(residuals)
    mean_res=scipy.mean(residuals)
    stddev_res=scipy.sqrt(scipy.var(residuals))
    t_res=mean_res/stddev_res  #t-statistics
    p_res=1.0-scipy.stats.t.cdf(t_res,M-1)
    F=MSM/MSE
    p_F=1.0-scipy.stats.f.cdf(F,DFM,DFE)
    dw=stools.durbin_watson(residuals)
    resanal=(p_shapiro,w,mean_res,p_res,F,p_F,dw)

    if ax:
        # Parity plot: fitted against measured values with the y=x line
        formataxis(ax)
        ax.plot(Ydata,Y,'ro')
        ax.errorbar(Ydata,Y,yerr=Errdata, fmt='.')
        Ymin,Ymax=min((min(Y),min(Ydata))),max((max(Y),max(Ydata)))
        ax.plot([Ymin,Ymax],[Ymin,Ymax],'b')
        ax.xaxis.label.set_text('Data')
        ax.yaxis.label.set_text('Fitted')
        # Band of +/- sigmay around the fitted values
        sigmay,avg_stddev_data=get_stderr_fit(f,Xdata,popt,pcov,dict_data)
        Yplus=Y+sigmay
        Yminus=Y-sigmay
        ax.plot(Y,Yplus,'c',alpha=0.6,linestyle='--',linewidth=0.5)
        ax.plot(Y,Yminus,'c',alpha=0.6,linestyle='--',linewidth=0.5)
        ax.fill_between(Y,Yminus,Yplus,facecolor='cyan',alpha=0.5)
        titletext='Parity plot for fit.\n'
        titletext+=r'$r^2$=%5.2f,$r^2_{adj}$=%5.2f,$p_{shapiro}$=%5.2f,$Durbin-Watson$=%2.1f'
        titletext+='\n F=%5.2f,$p_F$=%3.2e'
        titletext+=r'$\sigma_{err}^{reg}$=%5.2f'
        # One value per label, in label order (seven specifiers, seven values)
        ax.title.set_text(titletext%(R2,R2_adj,p_shapiro,dw,F,p_F,stderr_reg))
        ax.figure.canvas.draw()

    if ax2:
        # Residuals against fitted values
        formataxis(ax2)
        ax2.plot(Y,residuals,'ro')
        ax2.xaxis.label.set_text('Fitted Data')
        ax2.yaxis.label.set_text('Residuals')
        titletext='Analysis of Residuals\n'
        titletext+=r'mean=%5.2f,$p_{res}$=%5.2f,$p_{shapiro}$=%5.2f,$Durbin-Watson$=%2.1f'
        titletext+='\n F=%5.2f,$p_F$=%3.2e'
        ax2.title.set_text(titletext%(mean_res,p_res,p_shapiro,dw,F,p_F))
        # Redraw so the new title shows, as is done for the parity plot
        ax2.figure.canvas.draw()

    return popt,pcov,perr, p95, p_p,chisquare, resanal
def fitdata(f,Xdata,Ydata,Errdata,pguess,ax=False,ax2=False):
    '''
    Fit f(Xdata,p) to Ydata by least squares on the error-weighted residuals
    and compute goodness-of-fit statistics.

    f       = callable f(Xdata,p) returning the model values
    Xdata   = predictor data, passed to f unchanged
    Ydata   = observed values (length M)
    Errdata = error estimate of Ydata; residuals are divided by it
    pguess  = initial guess of the parameters
    ax, ax2 = accepted but not used by this function

    Returns popt,pcov,perr,p95,p_p,chisqre,resanal,R2,chisqred,w,dw where
      popt,pcov = solution and covariance returned by scipy.optimize.leastsq
      perr      = square root of the diagonal of pcov
      p95       = perr times the 0.95 quantile of Student's t with M-N dof
      p_p       = 1-CDF of Student's t (M-N dof) evaluated at popt/perr
      chisqre   = (p_chi2,chisqred,chisqred_red,degfrdm,R2,R2_adj)
      resanal   = (p_shapiro,w,mean_res,p_res,F,p_F,dw)
      R2        = SSM/SST
      chisqred  = sum of squared weighted residuals
      w         = Shapiro-Wilk statistic of the residuals
      dw        = Durbin-Watson statistic of the residuals
    '''
    def error(p,Xdata,Ydata,Errdata):
        # Residual vector handed to leastsq, weighted by the error estimate.
        Y=f(Xdata,p)
        residuals=(Y-Ydata)/Errdata
        return residuals

    res=scipy.optimize.leastsq(error,pguess,args=(Xdata,Ydata,Errdata),full_output=1)
    (popt,pcov,infodict,errmsg,ier)=res
    perr=scipy.sqrt(scipy.diag(pcov))

    M=len(Ydata)  # number of observations
    N=len(popt)   # number of fitted parameters

    # Residuals and sums of squares, all weighted by Errdata
    Y=f(Xdata,popt)
    residuals=(Y-Ydata)/Errdata
    meanY=scipy.mean(Ydata)
    squares=(Y-meanY)/Errdata
    squaresT=(Ydata-meanY)/Errdata
    SSM=sum(squares**2)
    SSE=sum(residuals**2)
    SST=sum(squaresT**2)

    DFM=N-1
    DFE=M-N
    DFT=M-1
    MSM=SSM/DFM
    MSE=SSE/DFE
    # Total mean square gets its own name so that MSM (model mean square)
    # is not overwritten before the F statistic is formed below.
    MST=SST/DFT

    # R2
    R2=SSM/SST
    R2_adj=1-(1-R2)*(M-1)/(M-N-1)

    # t-test
    t_stat=popt/perr
    t_stat=t_stat.real
    p_p=1.0-scipy.stats.t.cdf(t_stat,DFE)
    z=scipy.stats.t(M-N).ppf(0.95)
    p95=perr*z

    # chi-square
    chisqred=sum(residuals**2)
    degfrdm=M-N
    chisqred_red=chisqred/degfrdm
    p_chi2=1.0-scipy.stats.chi2.cdf(chisqred,degfrdm)
    stderr_reg=scipy.sqrt(chisqred_red)
    chisqre=(p_chi2,chisqred,chisqred_red,degfrdm,R2,R2_adj)

    # shapiro-wilk test
    w,p_shapiro=scipy.stats.shapiro(residuals)
    mean_res=scipy.mean(residuals)
    stddev_res=scipy.sqrt(scipy.var(residuals))
    t_res=mean_res/stddev_res
    p_res=1.0-scipy.stats.t.cdf(t_res,M-1)

    # F-test: model mean square over error mean square
    F=MSM/MSE
    p_F=1-scipy.stats.f.cdf(F,DFM,DFE)

    # durbin-watson
    dw=stools.durbin_watson(residuals)
    resanal=(p_shapiro,w,mean_res,p_res,F,p_F,dw)
    return popt,pcov,perr,p95,p_p,chisqre,resanal,R2,chisqred,w,dw
def summary(self, yname=None, xname=None, title=None, alpha=.05):
    """Summarize the Regression Results

    Also stores the residual diagnostics (Jarque-Bera, omnibus test,
    condition number, smallest eigenvalue) in ``self.diagn``.

    Parameters
    -----------
    yname : string, optional
        Default is `y`
    xname : list of strings, optional
        Default is `var_##` for ## in p the number of regressors
    title : string, optional
        Title for the top table. If not None, then this replaces the
        default title
    alpha : float
        significance level for the confidence intervals

    Returns
    -------
    smry : Summary instance
        this holds the summary tables and text, which can be printed or
        converted to various output formats.

    See Also
    --------
    statsmodels.iolib.summary.Summary : class to hold summary results
    """
    #TODO: import where we need it (for now), add as cached attributes
    from statsmodels.stats.stattools import (jarque_bera,
                                             omni_normtest, durbin_watson)
    jb, jbpv, skew, kurtosis = jarque_bera(self.wresid)
    omni, omnipv = omni_normtest(self.wresid)

    eigvals = self.eigenvals
    condno = self.condition_number

    # The warning below treats eigvals[-1] as the smallest eigenvalue, so the
    # same element is stored under 'mineigval'.
    self.diagn = dict(jb=jb, jbpv=jbpv, skew=skew, kurtosis=kurtosis,
                      omni=omni, omnipv=omnipv, condno=condno,
                      mineigval=eigvals[-1])

    top_left = [('Dep. Variable:', None),
                ('Model:', None),
                ('Method:', ['Least Squares']),
                ('Date:', None),
                ('Time:', None)
                ]

    top_right = [('Pseudo R-squared:', ["%#8.4g" % self.prsquared]),
                 ('Bandwidth:', ["%#8.4g" % self.bandwidth]),
                 ('Sparsity:', ["%#8.4g" % self.sparsity]),
                 ('No. Observations:', None),
                 ('Df Residuals:', None),  #[self.df_resid]), #TODO: spelling
                 ('Df Model:', None)  #[self.df_model])
                 ]

    # The diagnostic tables are built but currently not added to the summary
    # (see the commented-out add_table_2cols call below).
    diagn_left = [('Omnibus:', ["%#6.3f" % omni]),
                  ('Prob(Omnibus):', ["%#6.3f" % omnipv]),
                  ('Skew:', ["%#6.3f" % skew]),
                  ('Kurtosis:', ["%#6.3f" % kurtosis])
                  ]

    diagn_right = [('Durbin-Watson:', ["%#8.3f" % durbin_watson(self.wresid)]),
                   ('Jarque-Bera (JB):', ["%#8.3f" % jb]),
                   ('Prob(JB):', ["%#8.3g" % jbpv]),
                   ('Cond. No.', ["%#8.3g" % condno])
                   ]

    if title is None:
        title = self.model.__class__.__name__ + ' ' + "Regression Results"

    #create summary table instance
    from statsmodels.iolib.summary import Summary
    smry = Summary()
    smry.add_table_2cols(self, gleft=top_left, gright=top_right,
                         yname=yname, xname=xname, title=title)
    # Pass the caller's alpha through instead of a hard-coded .05
    smry.add_table_params(self, yname=yname, xname=xname, alpha=alpha,
                          use_t=True)

    # smry.add_table_2cols(self, gleft=diagn_left, gright=diagn_right,
    #                      yname=yname, xname=xname,
    #                      title="")

    #add warnings/notes, added to text format only
    etext = []
    if eigvals[-1] < 1e-10:
        wstr = "The smallest eigenvalue is %6.3g. This might indicate "
        wstr += "that there are\n"
        wstr += "strong multicollinearity problems or that the design "
        wstr += "matrix is singular."
        wstr = wstr % eigvals[-1]
        etext.append(wstr)
    elif condno > 1000:  #TODO: what is recommended
        wstr = "The condition number is large, %6.3g. This might "
        wstr += "indicate that there are\n"
        wstr += "strong multicollinearity or other numerical "
        wstr += "problems."
        wstr = wstr % condno
        etext.append(wstr)

    if etext:
        smry.add_extra_txt(etext)

    return smry
'F_Statistic': results.fvalue, 'F_Statistic_P_Value': results.f_pvalue, 'Log_Likelihood': results.llf, 'AIC': results.aic, 'BIC': results.bic, 'Number_Of_Observations': int(results.nobs), 'Degrees_Of_Freedom_Model': int(results.df_model), 'Degrees_Of_Freedom_Residual': int(results.df_resid) }, 'Parameters': params, 'Diagnostics': { 'Omnibus': results.diagn['omni'], 'Omnibus_P_Value': results.diagn['omnipv'], 'Skew': results.diagn['skew'], 'Kurtosis': results.diagn['kurtosis'], 'Durbin_Watson': durbin_watson(results.wresid), 'Jarque_Bera': results.diagn['jb'], 'Jarque_Bera_P_Value': results.diagn['jbpv'], 'Condition_Number': results.diagn['condno'] } } print(json.dumps(resultsObject, sort_keys=True)) # OLS Regression Results # ============================================================================== # Dep. Variable: var1 R-squared: 0.734 # Model: OLS Adj. R-squared: 0.706 # Method: Least Squares F-statistic: 26.21 # Date: Sun, 05 Jul 2015 Prob (F-statistic): 3.45e-06
def test_durbin_watson_2d(self, reset_randomstate):
    """durbin_watson on a 2-d array of identical columns matches the 1-d formula."""
    tile_shape = (1, 10)
    series = np.random.standard_normal(100)
    # Reference value from the definition: squared first differences over
    # the sum of squares.
    expected = sum(np.diff(series)**2.0) / np.dot(series, series)
    # Repeat the series as a column ten times
    columns = np.tile(series[:, None], tile_shape)
    reference = np.squeeze(expected * np.ones(tile_shape))
    assert_almost_equal(reference, durbin_watson(columns))
def fitdata(f,Xdata,Ydata,Errdata,pguess,ax= False,ax2= False):
    '''
    Fit f(Xdata,p) to Ydata by least squares on the error-weighted residuals,
    compute goodness-of-fit statistics and optionally plot them.

    f       = callable f(Xdata,p) returning the model values
    Xdata   = predictor data, passed to f unchanged
    Ydata   = observed values (length M)
    Errdata = error estimate of Ydata; residuals are divided by it
    pguess  = initial guess of the parameters
    ax      = axes for the parity plot (skipped when falsy)
    ax2     = axes for the residual plot (skipped when falsy)

    Returns popt,pcov,perr,p95,p_p,chisquare,resanal where
      popt,pcov = solution and covariance returned by scipy.optimize.leastsq
      perr      = square root of the diagonal of pcov
      p95       = perr times the 0.95 quantile of Student's t with M-N dof
      p_p       = 1-CDF of Student's t (M-N dof) evaluated at popt/perr
      chisquare = (p_chi2,chisquared,chisquared_red,degfreedom,R2,R2_adj)
      resanal   = (p_shapiro,w,mean_res,p_res,F,p_F,dw)
    '''
    #calculating the popt
    def error(p,Xdata,Ydata,Errdata):
        # Residual vector handed to leastsq, weighted by the error estimate.
        Y=f(Xdata,p)
        residuals= (Y-Ydata)/Errdata
        return (residuals)

    res= scipy.optimize.leastsq(error, pguess,args=(Xdata,Ydata,Errdata),full_output=1)
    (popt,pcov,infodict,errmsg,ier)=res
    perr= scipy.sqrt(scipy.diag(pcov))

    # Number of observations comes from Ydata: len(Xdata) would be the number
    # of predictors when Xdata holds one row per predictor.
    M= len(Ydata)
    N= len(popt)

    #residuals
    Y= f(Xdata,popt)
    residuals=(Y-Ydata)/Errdata
    meanY= scipy.mean(Ydata)
    squares= (Y-meanY)/Errdata
    squaresT= (Ydata-meanY)/Errdata
    SSM= sum(squares**2)    #corrected sum of squares
    SSE= sum(residuals**2)  #sum of squares of errors
    SST= sum(squaresT**2)   #total corrected sum of squares

    DFM= N-1
    DFE= M-N
    MSM= SSM/DFM
    MSE= SSE/DFE

    R2= SSM/SST  #proportion of explained variance
    R2_adj= 1-(1-R2)*(M-1)/(M-N-1)  #Adjusted R2

    # t test to see if parameters are different from 0
    t_stat= popt/perr
    t_stat= t_stat.real
    p_p= 1.0-scipy.stats.t.cdf(t_stat,DFE)
    z=scipy.stats.t(M-N).ppf(0.95)
    p95= perr*z

    #chisquared analysis on residuals
    chisquared= sum(residuals**2)
    degfreedom= M-N
    chisquared_red= chisquared/degfreedom
    p_chi2= 1.0-scipy.stats.chi2.cdf(chisquared,degfreedom)
    stderr_reg= scipy.sqrt(chisquared_red)
    chisquare=(p_chi2, chisquared,chisquared_red,degfreedom,R2,R2_adj)

    #analysis of residuals
    w,p_shapiro= scipy.stats.shapiro(residuals)
    mean_res= scipy.mean(residuals)
    stddev_res= scipy.sqrt(scipy.var(residuals))
    t_res= mean_res/stddev_res
    p_res=1.0-scipy.stats.t.cdf(t_res,M-1)
    #if p<0.05, null hypothesis is rejected and mean is non-zero
    #should be high for a good fit

    #F-test on the residuals
    F= MSM/MSE  #explained variance/ unexplained should be large
    p_F= 1.0-scipy.stats.f.cdf(F,DFM,DFE)
    #if p_F<0.05, null hypo is rejected
    dw= stools.durbin_watson(residuals)
    resanal= (p_shapiro,w,mean_res,p_res,F,p_F,dw)

    if ax:
        # Parity plot: fitted against measured values with the y=x line
        formataxis(ax)
        ax.plot(Ydata,Y,'ro')
        ax.errorbar(Ydata,Y,yerr=Errdata,fmt='.')
        Ymin, Ymax= min((min(Y),min(Ydata))),max((max(Y),max(Ydata)))
        ax.plot([Ymin,Ymax],[Ymin,Ymax],'b')
        ax.xaxis.label.set_text('Data')
        ax.yaxis.label.set_text('Fitted')
        # Band of +/- sigmaY around the fitted values
        sigmaY, avg_stddev_data= get_stderr_fit(f,Xdata,popt,pcov)
        Yplus= Y+sigmaY
        Yminus= Y-sigmaY
        ax.plot(Y,Yplus,'c',alpha=0.6,linestyle='--',linewidth= 0.5)
        ax.plot(Y,Yminus,'c',alpha=0.6,linestyle='--',linewidth= 0.5)
        ax.fill_between(Y,Yminus,Yplus,facecolor= 'cyan',alpha=0.5)
        titletext='Parity plot for fit.\n'
        # Raw strings with {} grouping: in a normal string the \n of \nu would
        # become a newline, and <...>/(...) do not group sub/superscripts
        titletext+= r'$r^2$=%5.2f,$r^2_{adj}$=%5.2f,'
        titletext+= r'$\sigma_{exp}$=%5.2f,$\chi^2_{\nu}$= %5.2f,$p_{\chi^2}$=%5.2f,'
        titletext+= r'$\sigma_{err}^{reg}$=%5.2f'
        ax.title.set_text(titletext%(R2,R2_adj,avg_stddev_data,chisquared_red,p_chi2,stderr_reg))
        ax.figure.canvas.draw()

    if ax2 :  #test for homoscedasticity
        formataxis(ax2)
        ax2.plot(Y,residuals,'ro')
        ax2.xaxis.label.set_text('Fitted Data')
        ax2.yaxis.label.set_text('Residuals')
        titletext= 'Analysis of Residuals\n'
        titletext+= r'mean=%5.2f,$p_{res}$=%5.2f,$p_{shapiro}$= %5.2f, $Durbin-Watson$=%2.1f'
        titletext+= '\n F= %5.2f, $p_F$=%3.2e'
        ax2.title.set_text(titletext%(mean_res,p_res,p_shapiro,dw, F, p_F))
        ax2.figure.canvas.draw()

    return popt, pcov,perr,p95,p_p,chisquare,resanal
def fitdata(f,Xdata,Ydata,Errdata,pguess,ax=False,ax2=False):
    '''
    fitdata(f,Xdata,Ydata,Errdata,pguess):
    Fit f(Xdata,p) to Ydata by least squares on the error-weighted residuals,
    compute goodness-of-fit statistics and optionally plot them.

    f       = callable f(Xdata,p) returning the model values
    Xdata   = predictor data, passed to f unchanged
    Ydata   = observed values (length M)
    Errdata = error estimate of Ydata; residuals are divided by it
    pguess  = initial guess of the parameters
    ax      = axes for the parity plot (skipped when falsy)
    ax2     = axes for the residual plot (skipped when falsy)

    Returns popt,pcov,perr,p95,p_p,chisquare,resanal where
      popt,pcov = solution and covariance returned by scipy.optimize.leastsq
      perr      = square root of the diagonal of pcov
      p95       = perr times the 0.95 quantile of Student's t with M-N dof
      p_p       = 1-CDF of Student's t (M-N dof) evaluated at popt/perr
      chisquare = (p_chi2,chisquared,chisquared_red,degfreedom,R2,R2_adj)
      resanal   = (p_shapiro,w,mean_res,p_res,F,p_F,dw)
    '''
    def error(p,Xdata,Ydata,Errdata):
        # Residual vector handed to leastsq, weighted by the error estimate.
        Y=f(Xdata,p)
        residuals=(Y-Ydata)/Errdata
        return residuals

    res=scipy.optimize.leastsq(error,pguess,args=(Xdata,Ydata,Errdata),full_output=1)
    (popt,pcov,infodict,errmsg,ier)=res  #optimize p
    perr=scipy.sqrt(scipy.diag(pcov))    #vector of sd of p

    M=len(Ydata)
    N=len(popt)

    #Residuals
    Y=f(Xdata,popt)
    residuals=(Y-Ydata)/Errdata
    meanY=scipy.mean(Ydata)
    squares=(Y-meanY)/Errdata
    squaresT=(Ydata-meanY)/Errdata
    SSM=sum(squares**2)    #corrected sum of squares
    SSE=sum(residuals**2)  #sum of squares of errors
    SST=sum(squaresT**2)   #total corrected sum of squares

    DFM=N-1  #for model
    DFE=M-N  #for error

    MSM=SSM/DFM  #mean squares for model (explained variance)
    MSE=SSE/DFE  #mean squares for errors (should be small wrt MSM) unexplained variance

    R2=SSM/SST  #proportion of explained variance
    R2_adj=1-(1-R2)*(M-1)/(M-N-1)  #adjusted R2

    #t-test to see if parameters are different from zero
    t_stat=popt/perr  #t-statistic for popt different from zero
    t_stat=t_stat.real
    p_p=1.0-scipy.stats.t.cdf(t_stat,DFE)  #should be low for good fit
    z=scipy.stats.t(M-N).ppf(0.95)
    p95=perr*z

    #Chisquared analysis on residuals
    chisquared=sum(residuals**2)
    degfreedom=M-N
    chisquared_red=chisquared/degfreedom
    p_chi2=1.0-scipy.stats.chi2.cdf(chisquared,degfreedom)
    stderr_reg=scipy.sqrt(chisquared_red)
    chisquare=(p_chi2,chisquared,chisquared_red,degfreedom,R2,R2_adj)

    #Analysis of residuals
    w,p_shapiro=scipy.stats.shapiro(residuals)  #to check if residuals are normally distributed
    mean_res=scipy.mean(residuals)
    stddev_res=scipy.sqrt(scipy.var(residuals))
    t_res=mean_res/stddev_res
    p_res=1.0-scipy.stats.t.cdf(t_res,M-1)  #if p_res<0.05, null hypothesis is rejected.

    #F-test on residuals
    #(if p_F<0.05: R^2>0 and at least one of the fitting parameters>0)
    F=MSM/MSE
    p_F=1.0-scipy.stats.f.cdf(F,DFM,DFE)
    dw=stools.durbin_watson(residuals)  #to check if they are correlated
    resanal=(p_shapiro,w,mean_res,p_res,F,p_F,dw)

    if ax:
        # Parity plot: fitted against measured values with the y=x line
        formataxis(ax)
        ax.plot(Ydata,Y,'ro')
        ax.errorbar(Ydata,Y,yerr=Errdata,fmt='.')
        Ymin,Ymax=min((min(Y),min(Ydata))),max((max(Y),max(Ydata)))
        ax.plot([Ymin,Ymax],[Ymin,Ymax],'b')
        ax.xaxis.label.set_text('Data')
        ax.yaxis.label.set_text('Fitted')
        # Band of +/- sigmay around the fitted values
        sigmay,avg_stddev_data=get_stderr_fit(f, Xdata, popt, pcov)
        Yplus=Y+sigmay
        Yminus=Y-sigmay
        ax.plot(Y,Yplus,'c',alpha=.6,linestyle='--',linewidth=.5)
        ax.plot(Y,Yminus,'c',alpha=.6,linestyle='--',linewidth=.5)
        ax.fill_between(Y,Yminus,Yplus,facecolor='cyan',alpha=.5)
        titletext='Parity plot for fit.\n'
        # Raw strings: in a normal string the \n of \nu would become a newline
        titletext+=r'$r^2$=%5.2f,$r^2_{adj}$=%5.2f, '
        titletext+=r'$\sigma_{exp}$=%5.2f,$\chi^2_{\nu}$=%5.2f,$p_{\chi^2}$=%5.2f, '
        titletext+=r'$\sigma_{err}^{reg}$=%5.2f'
        ax.title.set_text(titletext%(R2,R2_adj,avg_stddev_data,chisquared_red,p_chi2,stderr_reg))
        ax.figure.canvas.draw()

    if ax2:  #test for homoscedasticity
        formataxis(ax2)
        ax2.plot(Y,residuals,'ro')
        ax2.xaxis.label.set_text('Fitted data')
        ax2.yaxis.label.set_text('Residuals')
        titletext='Analysis of residuals\n'
        titletext+=r'mean=%5.2f,$p_{res}$=%5.2f,$p_{shapiro}$=%5.2f,$Durbin-Watson$=%2.1f'
        titletext+='\n F=%5.2f,$p_F$=%3.2e'
        ax2.title.set_text(titletext%(mean_res,p_res,p_shapiro,dw,F,p_F))
        ax2.figure.canvas.draw()

    return popt,pcov,perr,p95,p_p,chisquare,resanal
def test_durbin_watson_pandas():
    """durbin_watson gives the same value for an ndarray and the Series built from it."""
    values = np.random.randn(50)
    as_series = pd.Series(values)
    from_array = durbin_watson(values)
    from_series = durbin_watson(as_series)
    assert_almost_equal(from_array, from_series, decimal=13)
def test_durbin_watson(self):
    """durbin_watson matches the ratio of squared first differences to the sum of squares."""
    sample = np.random.standard_normal(100)
    squared_steps = np.diff(sample)**2.0
    expected = sum(squared_steps) / np.dot(sample, sample)
    assert_almost_equal(expected, durbin_watson(sample))