def backtesting(candidates, base, regs, dates, long_dates, target="", l=9):
    '''
    Plot and compile backtesting results for candidate models.

    inputs:
        * candidates - list of variables to backtest models for
        * base - dependent variable and dummies
        * regs - df with independent variables to be used
        * dates - list of pq1 periods to start 9Q backtest
        * long_dates - list of pq1 periods to start full length backtest
        * target - unused; kept for backward-compatible interface
        * l - unused as an input; kept for backward-compatible interface
          (the full-history forecast length is derived per period below)
    outputs:
        * mape - matrix of variables and MAPE values for the 9Q backtests
        * saves mape matrix to 'Candidate Backtesting MAPE Results.xlsx'
        * saves full-history MAPE matrix to
          'Candidate Long Backtesting MAPE Results.xlsx'
    '''
    full_dep = base['Dependent']
    mape = pd.DataFrame(index=candidates, columns=dates)
    print("MAPE Columns : ", mape.columns)
    long_mape = pd.DataFrame(index=candidates, columns=long_dates)
    for i in candidates:
        X = create_design(base, regs, i)
        # FIX: subset from the full dependent series on every iteration.
        # Previously `dep` was trimmed cumulatively (dep = dep[X.index]),
        # so a later candidate whose design index differed from an earlier
        # one could hit missing labels / NaNs.
        dep = full_dep[X.index]
        model = mc.regress_data(dep, X, intercept=True)
        beta = model.params
        # 9-quarter backtests starting at each requested pq1 period.
        for pq1 in dates:
            figname = i + ' 9Q Backtest ' + str(pq1)
            mape.loc[i, pq1] = create_backtest(X, beta, pq1, figname,
                                               base['Dependent'])
        # Full-history backtests: forecast length runs from pq1 to the
        # end of the design sample.
        for pq1 in long_dates:
            l = len(X.loc[pq1:].index)
            figname = i + ' Full History Backtest ' + str(pq1)
            long_mape.loc[i, pq1] = create_backtest(X, beta, pq1, figname,
                                                    base['Dependent'], l=l)
    mape.to_excel('Candidate Backtesting MAPE Results.xlsx')
    long_mape.to_excel('Candidate Long Backtesting MAPE Results.xlsx')
    return mape
def recursive_reg(dep, X, n, varname):
    '''
    Perform recursive (expanding-window) regression on a model.

    inputs:
        * dep - dependent variable
        * X - design matrix
        * n - number of initial periods in the smallest window
        * varname - name of the x variable in the model
    outputs:
        * params - parameter values for the recursive regression trials
        * ps - pvalues for the recursive regression trials
        * saves one .png per parameter path and per p-value path

    next steps - change function so that a date can be passed in lieu of n
    '''
    pval_paths = pd.DataFrame()
    param_paths = pd.DataFrame()
    # Refit on each expanding window; column label is the window's last period.
    for end in range(n, len(dep)):
        window_dep = dep[0:end]
        window_X = X.iloc[0:end, :]
        fit = mc.regress_data(window_dep, window_X, intercept=True)
        label = window_X.index[-1]
        param_paths[label] = fit.params
        pval_paths[label] = fit.pvalues
    # Plot each coefficient's path over the expanding windows.
    for coef in param_paths.index:
        if 'Lag_Y' in coef:
            # Keep lagged-dependent coefficients on a fixed scale around 1.
            plt.ylim(bottom=0.75, top=1.25)
        axis = param_paths.transpose()[coef].plot()
        axis.set_title(varname + ' model ' + coef + ' parameter')
        axis.figure.savefig(varname + ' model ' + coef + ' param.png')
        plt.clf()
    # Plot each coefficient's p-value path.
    for coef in pval_paths.index:
        axis = pval_paths.transpose()[coef].plot()
        axis.set_title(varname + ' model ' + coef + ' pvalues')
        axis.figure.savefig(varname + ' model ' + coef + ' pval.png')
        plt.clf()
    return param_paths, pval_paths
def oot_backtesting(candidates, base, regs, pq0, dates, full_base, full_regs,
                    target, l=9):
    '''
    Perform out-of-time testing on the list of candidate models.

    inputs:
        * candidates - list of models to perform oot testing on
        * base - dependent variable and dummies for the intended development
          period for the oot test
        * regs - df with independent variables for the intended development
          period for the oot test
        * pq0 - spot period
        * dates - dates to perform backtesting on
        * full_base - full dependent variables and dummies
        * full_regs - full independent variables
        * target - model target actuals
        * l - unused as an input; kept for backward-compatible interface
    outputs:
        * mape - matrix of MAPE values for forecasts indexed by pq1 period
        * saves 'Out of Time MAPE Results.xlsx'
    '''
    full_dep = base['Dependent']
    mape = pd.DataFrame(index=candidates, columns=dates)
    for i in candidates:
        # Design over the full sample is used for forecasting; the model is
        # fit only on the development-period design.
        full_X = create_design(full_base, full_regs, i)
        X = create_design(base, regs, i)
        # FIX: subset from the full dependent series on every iteration.
        # Previously `dep` was trimmed cumulatively across candidates.
        dep = full_dep[X.index]
        model = mc.regress_data(dep, X, intercept=True)
        beta = model.params
        for pq1 in dates:
            figname = i + '_' + str(pq0)[:11] + ' Out of Time ' + str(pq1)
            mape.loc[i, pq1] = create_backtest(full_X, beta, pq1, figname,
                                               target)
    mape.to_excel('Out of Time MAPE Results.xlsx')
    return mape
def stress_test_compare(filename, shtm, shtb, shta, shts, shtc, short_list,
                        pq0, pq1, base, regs, dep, bottom="", top=""):
    '''
    Compare stress test forecasts with those of the current model.

    inputs:
        * filename - name of the .xlsx file where the data is located
        * shtm - name of the tab with the model development data
        * shtb - name of the tab with the base scenario forecast
        * shta - name of the tab with the adverse scenario forecast
        * shts - name of the tab with the severe scenario forecast
        * shtc - name of the tab with the comparison forecasts from the
          current model
        * short_list - list of candidate variables for forecasting
        * pq0 - spot period
        * pq1 - first forecasted period
        * base - dependent variable and dummies
        * regs - df with independent variables to be used
        * dep - dependent variable
        * bottom - bottom of y-axis of desired forecast graphs ("" = unset)
        * top - top of y-axis of desired forecast graphs ("" = unset)
    outputs:
        * forecast_tbl - forecast metrics for the LAST candidate in
          short_list (tables for every candidate are written to Excel)
        * compare_tbl - current-model forecast metrics (last candidate)
        * saves plots of each forecast to .png files in pwd
        * saves per-candidate metric tables to 'Stress Test Forecast.xlsx'
    '''
    writer = pd.ExcelWriter('Stress Test Forecast.xlsx', engine='xlsxwriter')
    # Current-model forecasts and the full development history.
    compare = wrangle_forecast_data(filename, shtc)
    full_df = wrangle_forecast_data(filename, shtm)
    full_dep = full_df.iloc[:, 0]
    full_dep.name = 'Dependent'
    # Spot (last actual) value; metrics and CAGRs are anchored to it.
    target_spot = full_dep[pq0]
    compare = compare  # NOTE(review): no-op assignment — candidate for removal
    for i in short_list:
        # Refit the candidate model on the development design.
        X = create_design(base, regs, i)
        # NOTE(review): `dep` is trimmed cumulatively across candidates —
        # verify all candidate designs share the same index.
        dep = dep[X.index]
        model = mc.regress_data(dep, X, intercept=True)
        beta = model.params
        # Three-scenario forecast for this candidate.
        forecast = build_scenario(filename, shtm, shtb, shta, shts, beta,
                                  pq0, pq1, i)
        figname = i + ' Stress Test Comparison'
        fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(10, 6))
        # Apply whichever axis limits the caller supplied.
        if top != "":
            if bottom != "":
                plt.ylim(top=top, bottom=bottom)
            else:
                plt.ylim(top=top)
        else:
            if bottom != "":
                plt.ylim(bottom=bottom)
        plt.setp(ax.get_xticklabels(), rotation=45)
        j = 0
        colors = ['Black', 'Green', 'Blue', 'Red']
        forecast.columns = [
            'Actual', 'Model Base', 'Model_Adverse', 'Model_Severe'
        ]
        forecast_tbl = pd.DataFrame()
        # Plot every series; build metrics only for the scenario columns
        # (j == 0 is the 'Actual' series).
        for k in forecast.columns:
            if j > 0:
                actual = target_spot
                # Forecast levels at quarters 4, 9 and 20 of the horizon.
                pq4 = forecast[k].iloc[4]
                pq9 = forecast[k].iloc[9]
                pq20 = forecast[k].iloc[20]
                # Annualized (quarterly data -> power 4/horizon) growth rates.
                cagr9 = (pq9 / actual)**(4 / 9) - 1
                cagr20 = (pq20 / actual)**(4 / 20) - 1
                actual = '${0:,.0f}'.format(actual)
                pq4 = '${0:,.0f}'.format(pq4)
                pq9 = '${0:,.0f}'.format(pq9)
                pq20 = '${0:,.0f}'.format(pq20)
                cagr9 = '{:.2%}'.format(cagr9)
                cagr20 = '{:.2%}'.format(cagr20)
                forecast_tbl[k] = pd.Series(
                    [actual, pq4, pq9, pq20, cagr9, cagr20])
            ax.plot(forecast.index, forecast[k], color=colors[j])
            j += 1
        forecast_tbl = forecast_tbl.transpose()
        forecast_tbl.columns = [
            'Actual', 'PQ4', 'PQ9', 'PQ20', '9Q CAGR', '20Q CAGR'
        ]
        # Anchor the current-model forecasts at the spot value so the dashed
        # comparison lines start from the same point as the actuals.
        for col in compare.columns:
            compare.loc[full_dep.index[-1], col] = target_spot
        compare = compare.sort_index(axis=0)
        colors = ['Green', 'Blue', 'Red']
        j = 0
        compare_tbl = pd.DataFrame()
        # Same metrics for the current model's scenario forecasts (dashed).
        for k in compare.columns:
            ax.plot(compare.index, compare[k], color=colors[j],
                    linestyle='--')
            actual = target_spot
            pq4 = compare[k].iloc[4]
            pq9 = compare[k].iloc[9]
            pq20 = compare[k].iloc[20]
            cagr9 = (pq9 / actual)**(4 / 9) - 1
            cagr20 = (pq20 / actual)**(4 / 20) - 1
            actual = '${0:,.0f}'.format(actual)
            pq4 = '${0:,.0f}'.format(pq4)
            pq9 = '${0:,.0f}'.format(pq9)
            pq20 = '${0:,.0f}'.format(pq20)
            cagr9 = '{:.2%}'.format(cagr9)
            cagr20 = '{:.2%}'.format(cagr20)
            compare_tbl[k] = pd.Series([actual, pq4, pq9, pq20, cagr9,
                                        cagr20])
            j += 1
        compare_tbl = compare_tbl.transpose()
        compare_tbl.columns = [
            'Actual', 'PQ4', 'PQ9', 'PQ20', '9Q CAGR', '20Q CAGR'
        ]
        ax.set_title(figname)
        # Re-label y ticks as dollar amounts.
        vals = ax.get_yticks()
        ax.set_yticklabels(['${0:,.0f}'.format(x) for x in vals])
        plt.savefig(figname + '.png')
        plt.close()
        # One "New"/"Old" sheet pair per candidate (sheet names capped at
        # 25 chars + suffix to respect Excel's 31-char limit).
        forecast_tbl.to_excel(writer, sheet_name=i[:25] + " New")
        compare_tbl.to_excel(writer, sheet_name=i[:25] + ' Old')
    writer.save()
    return forecast_tbl, compare_tbl
def stress_test_plot(filename, shtm, shtb, shta, shts, short_list, pq0, pq1,
                     base, regs, bottom="", top=""):
    '''
    Plot stress testing results for candidate models.

    inputs:
        * filename - name of the .xlsx file where the data is located
        * shtm - name of the tab with the model development data
        * shtb - name of the tab with the base scenario forecast
        * shta - name of the tab with the adverse scenario forecast
        * shts - name of the tab with the severe scenario forecast
        * short_list - list of candidate variables for forecasting
        * pq0 - spot period
        * pq1 - first forecasted period
        * base - dependent variable and dummies
        * regs - df with independent variables to be used
        * bottom - bottom of y-axis of desired forecast graphs ("" = unset)
        * top - top of y-axis of desired forecast graphs ("" = unset)
    outputs:
        * saves one stress test forecast .png per candidate
    '''
    series_colors = ['Black', 'Green', 'Blue', 'Red']
    for candidate in short_list:
        # Refit the candidate model on the development sample.
        dep = base['Dependent']
        design = create_design(base, regs, candidate)
        dep = dep[design.index]
        fit = mc.regress_data(dep, design, intercept=True)
        beta = fit.params
        # Build the scenario forecasts for this candidate.
        forecast = build_scenario(filename, shtm, shtb, shta, shts, beta,
                                  pq0, pq1, candidate)
        figname = candidate + ' Stress Test Forecast'
        fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(10, 6))
        # Apply whichever axis limits the caller supplied.
        if top != "" and bottom != "":
            plt.ylim(top=top, bottom=bottom)
        elif top != "":
            plt.ylim(top=top)
        elif bottom != "":
            plt.ylim(bottom=bottom)
        plt.setp(ax.get_xticklabels(), rotation=45)
        # Actuals in black, then one color per scenario.
        for pos, column in enumerate(forecast.columns):
            ax.plot(forecast.index, forecast[column],
                    color=series_colors[pos])
        ax.set_title(figname)
        vals = ax.get_yticks()
        ax.set_yticklabels(['${0:,.0f}'.format(x) for x in vals])
        plt.savefig(figname + '.png')
        plt.close()
def statistical_testing(base, regs, adf_alpha=0.05, param_alpha=0.10,
                        bg_alpha=0.05, white_alpha=0.05, sw_alpha=0.05):
    '''
    Create regressions, run diagnostics, and filter out regressions that do
    not pass the tests. Saves statistical testing results to an .xlsx file.

    tests performed:
        * Breusch-Godfrey (autocorrelation)
        * White's test (heteroskedasticity)
        * AIC - no filter on this
        * Shapiro-Wilk (normality of residuals)
        * Durbin-Watson (autocorrelation) - no filter on this
    inputs:
        * base - dependent variable and dummies
        * regs - df with independent variables to be used
        * adf_alpha - significance level for the residual integration test
        * param_alpha - significance level for parameter p-values
        * bg_alpha / white_alpha / sw_alpha - significance levels for the
          Breusch-Godfrey, White and Shapiro-Wilk filters
    outputs:
        * pass_tests - list of variable names in regs that passed the tests
        * saves file 'Statistical Testing Results.xlsx'
    '''
    candidates = []
    bgs = []
    whits = []
    sws = []
    pass_tests = []
    aics = []
    # Durbin-Watson results at lags 1..4 (see durbin_watson(model, 4) below).
    dw0 = []
    dw1 = []
    dw2 = []
    dw3 = []
    regs = regs.dropna(axis=1)
    # iterating over the possible independent variables
    for i in regs.columns:
        # First fit on the basic design only to test residual stationarity.
        dep = base['Dependent']
        X = create_basic_design(cp.deepcopy(base), regs, i)
        dep = dep[X.index]
        model = mc.regress_data(dep, X, intercept=True)
        dep_order = residual_integration_order(model.resid, alpha=adf_alpha)
        # creating regression results and diagnostics for each variable
        dep = base['Dependent']
        X = create_design(cp.deepcopy(base), regs, i)
        dep = dep[X.index]
        model = mc.regress_data(dep, X, intercept=True)
        model_image_save(model, i)
        # Serial correlation test
        bg = sm.stats.diagnostic.acorr_breusch_godfrey(
            model, nlags=4, store=False)  # Null: no autocorrelation
        # Heteroscedasticity Test
        whit = sm.stats.diagnostic.het_white(model.resid,
                                             model.model.exog,
                                             retres=False)
        # normality test
        sw = shapiro(model.resid)  # Null: residuals are normally distributed
        # AIC goodness of fit
        aic = model.aic
        # Plot the PACF of the residuals.
        plot_pacf(model.resid, lags=20)
        plt.savefig(i + ' PACF.png')
        plt.close()
        # Save Durbin-Watson pvalues up to 4 lags
        dw = durbin_watson(model, 4)
        # Saving regression results and diagnostics for each candidate with
        # significant parameters and I(0) residuals. pvalues[len(base.columns):]
        # skips the intercept/dummy terms so only the economic regressors are
        # required to be significant.
        if (dep_order == 0) & (
                (model.pvalues[len(base.columns):] < param_alpha).all() ==
                True):
            candidates.append(i)
            bgs.append(bg[1])
            whits.append(whit[1])
            sws.append(sw[1])
            aics.append(aic)
            dw0.append(dw[0])
            dw1.append(dw[1])
            dw2.append(dw[2])
            dw3.append(dw[3])
            # filtering for candidates that pass all statistical requirements
            # and plotting 1Q backtest (p above alpha = fail to reject null)
            if bg[1] > bg_alpha:
                if whit[1] > white_alpha:
                    if sw[1] > sw_alpha:
                        plt.plot(X.index, dep, X.index, model.predict())
                        plt.show()
                        plt.close()
                        pass_tests.append(i)
    results = pd.DataFrame({
        'Variable': candidates,
        'Breusch-Godfrey p': bgs,
        'White p': whits,
        'Shapiro-Wilk p': sws,
        'AIC': aics,
        'DW Lag1': dw0,
        'DW Lag2': dw1,
        'DW Lag3': dw2,
        'DW Lag4': dw3
    })
    # outputting results to file
    results.to_excel('Statistical Testing Results.xlsx')
    return pass_tests
def create_sensitivity(filename, shtm, shtb, base, regs, short_list, pq0, pq1,
                       l=50):
    '''
    Create sensitivity testing for a list of candidate models.

    inputs:
        * filename - name of the .xlsx file where the data is located
        * shtm - name of the tab with the model development data
        * shtb - name of the tab with the base scenario forecast
        * base - dependent variable and dummies
        * regs - df with independent variables to be used
        * short_list - list of candidate variables for forecasting
        * pq0 - spot period
        * pq1 - first forecasted period
        * l - number of forecasts to randomly generate
    outputs:
        * saves plot of sensitivity analysis to .png in pwd
        * saves end-point summary stats to 'Sensitivity Metrics.xlsx'
    '''
    resdf = pd.DataFrame(columns=['Mean', 'Min', 'Max'])
    df = wrangle_model_data(filename, shtm)
    full_dep = df.iloc[:, 0]
    full_dep.name = 'Dependent'
    # Model is fit in log-levels.
    full_dep = np.log(full_dep)
    for i in short_list:
        X = create_design(base, regs, i)
        # FIX: subset from the full log series on every iteration.
        # Previously `dep` was trimmed cumulatively across candidates, so a
        # later candidate with a different design index could lose periods.
        dep = full_dep[X.index]
        model = mc.regress_data(dep, X, intercept=True)
        beta = model.params
        figname = i + ' Dynamic Sensitivity Testing'
        fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(10, 6))
        ends = []  # terminal value of each random forecast path
        colors = ['black', 'red']
        for k in range(l):
            forecast = build_random_scenario(filename, shtm, shtb, beta,
                                             pq0, pq1, i)
            ends.append(forecast['Rand'].iloc[-1])
            for j, col in enumerate(forecast.columns):
                ax.plot(forecast.index, forecast[col], color=colors[j])
        # Best-effort axis scaling around the plotted data; keep matplotlib's
        # default limits if the data cannot be scaled numerically.
        try:
            plt.ylim(top=1.25 * np.max(np.max(forecast)),
                     bottom=0.75 * np.min(np.min(forecast)))
        except Exception:
            pass
        plt.setp(ax.get_xticklabels(), rotation=45)
        ax.set_title(figname)
        vals = ax.get_yticks()
        ax.set_yticklabels(['${0:,.0f}'.format(x) for x in vals])
        for x in vals:
            ax.axhline(y=x, color='black', linewidth=0.2)
        plt.savefig(figname + '.png')
        plt.close()
        # Summary of the distribution of terminal forecast values.
        resdf.loc[i] = [
            '${0:,.0f}'.format(np.mean(ends)),
            '${0:,.0f}'.format(np.min(ends)),
            '${0:,.0f}'.format(np.max(ends))
        ]
    writer = pd.ExcelWriter('Sensitivity Metrics.xlsx', engine='xlsxwriter')
    resdf.to_excel(writer)
    writer.save()