def backtesting(candidates, base, regs, dates, long_dates, target="", l=9):
    '''
    Plot and compile backtesting results for candidate models.

    inputs:
        * candidates - list of variables to backtest models for
        * base - dependent variable and dummies
        * regs - df with independent variables to be used
        * dates - list of pq1 periods to start 9Q backtest
        * long_dates - list of pq1 periods to start full length backtest
        * target - unused; kept for backward-compatible interface
        * l - unused as an input; kept for backward-compatible interface
          (the full-history forecast length is derived per period below)
    outputs:
        * mape - matrix of variables and MAPE values for the 9Q backtests
        * saves mape matrix to 'Candidate Backtesting MAPE Results.xlsx'
        * saves full-history MAPE matrix to
          'Candidate Long Backtesting MAPE Results.xlsx'
    '''
    full_dep = base['Dependent']
    mape = pd.DataFrame(index=candidates, columns=dates)
    print("MAPE Columns : ", mape.columns)
    long_mape = pd.DataFrame(index=candidates, columns=long_dates)
    for i in candidates:
        X = create_design(base, regs, i)
        # FIX: subset from the full dependent series on every iteration.
        # Previously `dep` was trimmed cumulatively (dep = dep[X.index]),
        # so a later candidate whose design index differed from an earlier
        # one could hit missing labels / NaNs.
        dep = full_dep[X.index]
        model = mc.regress_data(dep, X, intercept=True)
        beta = model.params
        # 9-quarter backtests starting at each requested pq1 period.
        for pq1 in dates:
            figname = i + ' 9Q Backtest ' + str(pq1)
            mape.loc[i, pq1] = create_backtest(X, beta, pq1, figname,
                                               base['Dependent'])
        # Full-history backtests: forecast length runs from pq1 to the
        # end of the design sample.
        for pq1 in long_dates:
            l = len(X.loc[pq1:].index)
            figname = i + ' Full History Backtest ' + str(pq1)
            long_mape.loc[i, pq1] = create_backtest(X, beta, pq1, figname,
                                                    base['Dependent'], l=l)
    mape.to_excel('Candidate Backtesting MAPE Results.xlsx')
    long_mape.to_excel('Candidate Long Backtesting MAPE Results.xlsx')
    return mape
def recursive_reg(dep, X, n, varname):
    '''
    Perform recursive (expanding-window) regression on a model.

    inputs:
        * dep - dependent variable
        * X - design matrix
        * n - number of initial periods in the smallest window
        * varname - name of the x variable in the model
    outputs:
        * params - parameter values for the recursive regression trials
        * ps - pvalues for the recursive regression trials
        * saves one .png per parameter path and per p-value path

    next steps - change function so that a date can be passed in lieu of n
    '''
    pval_paths = pd.DataFrame()
    param_paths = pd.DataFrame()
    # Refit on each expanding window; column label is the window's last period.
    for end in range(n, len(dep)):
        window_dep = dep[0:end]
        window_X = X.iloc[0:end, :]
        fit = mc.regress_data(window_dep, window_X, intercept=True)
        label = window_X.index[-1]
        param_paths[label] = fit.params
        pval_paths[label] = fit.pvalues
    # Plot each coefficient's path over the expanding windows.
    for coef in param_paths.index:
        if 'Lag_Y' in coef:
            # Keep lagged-dependent coefficients on a fixed scale around 1.
            plt.ylim(bottom=0.75, top=1.25)
        axis = param_paths.transpose()[coef].plot()
        axis.set_title(varname + ' model ' + coef + ' parameter')
        axis.figure.savefig(varname + ' model ' + coef + ' param.png')
        plt.clf()
    # Plot each coefficient's p-value path.
    for coef in pval_paths.index:
        axis = pval_paths.transpose()[coef].plot()
        axis.set_title(varname + ' model ' + coef + ' pvalues')
        axis.figure.savefig(varname + ' model ' + coef + ' pval.png')
        plt.clf()
    return param_paths, pval_paths
def oot_backtesting(candidates, base, regs, pq0, dates, full_base, full_regs,
                    target, l=9):
    '''
    Perform out-of-time testing on the list of candidate models.

    inputs:
        * candidates - list of models to perform oot testing on
        * base - dependent variable and dummies for the intended development
          period for the oot test
        * regs - df with independent variables for the intended development
          period for the oot test
        * pq0 - spot period
        * dates - dates to perform backtesting on
        * full_base - full dependent variables and dummies
        * full_regs - full independent variables
        * target - model target actuals
        * l - unused as an input; kept for backward-compatible interface
    outputs:
        * mape - matrix of MAPE values for forecasts indexed by pq1 period
        * saves 'Out of Time MAPE Results.xlsx'
    '''
    full_dep = base['Dependent']
    mape = pd.DataFrame(index=candidates, columns=dates)
    for i in candidates:
        # Design over the full sample is used for forecasting; the model is
        # fit only on the development-period design.
        full_X = create_design(full_base, full_regs, i)
        X = create_design(base, regs, i)
        # FIX: subset from the full dependent series on every iteration.
        # Previously `dep` was trimmed cumulatively across candidates.
        dep = full_dep[X.index]
        model = mc.regress_data(dep, X, intercept=True)
        beta = model.params
        for pq1 in dates:
            figname = i + '_' + str(pq0)[:11] + ' Out of Time ' + str(pq1)
            mape.loc[i, pq1] = create_backtest(full_X, beta, pq1, figname,
                                               target)
    mape.to_excel('Out of Time MAPE Results.xlsx')
    return mape
def stress_test_compare(filename, shtm, shtb, shta, shts, shtc, short_list,
                        pq0, pq1, base, regs, dep, bottom="", top=""):
    '''
    Compare stress test forecasts with those of the current model.

    inputs:
        * filename - name of the .xlsx file where the data is located
        * shtm - name of the tab with the model development data
        * shtb - name of the tab with the base scenario forecast
        * shta - name of the tab with the adverse scenario forecast
        * shts - name of the tab with the severe scenario forecast
        * shtc - name of the tab with the comparison forecasts from the
          current model
        * short_list - list of candidate variables for forecasting
        * pq0 - spot period
        * pq1 - first forecasted period
        * base - dependent variable and dummies
        * regs - df with independent variables to be used
        * dep - dependent variable
        * bottom - bottom of y-axis of desired forecast graphs ("" = unset)
        * top - top of y-axis of desired forecast graphs ("" = unset)
    outputs:
        * forecast_tbl - forecast metrics for the LAST candidate in
          short_list (tables for every candidate are written to Excel)
        * compare_tbl - current-model forecast metrics (last candidate)
        * saves plots of each forecast to .png files in pwd
        * saves per-candidate metric tables to 'Stress Test Forecast.xlsx'
    '''
    writer = pd.ExcelWriter('Stress Test Forecast.xlsx', engine='xlsxwriter')
    # Current-model forecasts and the full development history.
    compare = wrangle_forecast_data(filename, shtc)
    full_df = wrangle_forecast_data(filename, shtm)
    full_dep = full_df.iloc[:, 0]
    full_dep.name = 'Dependent'
    # Spot (last actual) value; metrics and CAGRs are anchored to it.
    target_spot = full_dep[pq0]
    compare = compare  # NOTE(review): no-op assignment — candidate for removal
    for i in short_list:
        # Refit the candidate model on the development design.
        X = create_design(base, regs, i)
        # NOTE(review): `dep` is trimmed cumulatively across candidates —
        # verify all candidate designs share the same index.
        dep = dep[X.index]
        model = mc.regress_data(dep, X, intercept=True)
        beta = model.params
        # Three-scenario forecast for this candidate.
        forecast = build_scenario(filename, shtm, shtb, shta, shts, beta,
                                  pq0, pq1, i)
        figname = i + ' Stress Test Comparison'
        fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(10, 6))
        # Apply whichever axis limits the caller supplied.
        if top != "":
            if bottom != "":
                plt.ylim(top=top, bottom=bottom)
            else:
                plt.ylim(top=top)
        else:
            if bottom != "":
                plt.ylim(bottom=bottom)
        plt.setp(ax.get_xticklabels(), rotation=45)
        j = 0
        colors = ['Black', 'Green', 'Blue', 'Red']
        forecast.columns = [
            'Actual', 'Model Base', 'Model_Adverse', 'Model_Severe'
        ]
        forecast_tbl = pd.DataFrame()
        # Plot every series; build metrics only for the scenario columns
        # (j == 0 is the 'Actual' series).
        for k in forecast.columns:
            if j > 0:
                actual = target_spot
                # Forecast levels at quarters 4, 9 and 20 of the horizon.
                pq4 = forecast[k].iloc[4]
                pq9 = forecast[k].iloc[9]
                pq20 = forecast[k].iloc[20]
                # Annualized (quarterly data -> power 4/horizon) growth rates.
                cagr9 = (pq9 / actual)**(4 / 9) - 1
                cagr20 = (pq20 / actual)**(4 / 20) - 1
                actual = '${0:,.0f}'.format(actual)
                pq4 = '${0:,.0f}'.format(pq4)
                pq9 = '${0:,.0f}'.format(pq9)
                pq20 = '${0:,.0f}'.format(pq20)
                cagr9 = '{:.2%}'.format(cagr9)
                cagr20 = '{:.2%}'.format(cagr20)
                forecast_tbl[k] = pd.Series(
                    [actual, pq4, pq9, pq20, cagr9, cagr20])
            ax.plot(forecast.index, forecast[k], color=colors[j])
            j += 1
        forecast_tbl = forecast_tbl.transpose()
        forecast_tbl.columns = [
            'Actual', 'PQ4', 'PQ9', 'PQ20', '9Q CAGR', '20Q CAGR'
        ]
        # Anchor the current-model forecasts at the spot value so the dashed
        # comparison lines start from the same point as the actuals.
        for col in compare.columns:
            compare.loc[full_dep.index[-1], col] = target_spot
        compare = compare.sort_index(axis=0)
        colors = ['Green', 'Blue', 'Red']
        j = 0
        compare_tbl = pd.DataFrame()
        # Same metrics for the current model's scenario forecasts (dashed).
        for k in compare.columns:
            ax.plot(compare.index, compare[k], color=colors[j],
                    linestyle='--')
            actual = target_spot
            pq4 = compare[k].iloc[4]
            pq9 = compare[k].iloc[9]
            pq20 = compare[k].iloc[20]
            cagr9 = (pq9 / actual)**(4 / 9) - 1
            cagr20 = (pq20 / actual)**(4 / 20) - 1
            actual = '${0:,.0f}'.format(actual)
            pq4 = '${0:,.0f}'.format(pq4)
            pq9 = '${0:,.0f}'.format(pq9)
            pq20 = '${0:,.0f}'.format(pq20)
            cagr9 = '{:.2%}'.format(cagr9)
            cagr20 = '{:.2%}'.format(cagr20)
            compare_tbl[k] = pd.Series([actual, pq4, pq9, pq20, cagr9,
                                        cagr20])
            j += 1
        compare_tbl = compare_tbl.transpose()
        compare_tbl.columns = [
            'Actual', 'PQ4', 'PQ9', 'PQ20', '9Q CAGR', '20Q CAGR'
        ]
        ax.set_title(figname)
        # Re-label y ticks as dollar amounts.
        vals = ax.get_yticks()
        ax.set_yticklabels(['${0:,.0f}'.format(x) for x in vals])
        plt.savefig(figname + '.png')
        plt.close()
        # One "New"/"Old" sheet pair per candidate (sheet names capped at
        # 25 chars + suffix to respect Excel's 31-char limit).
        forecast_tbl.to_excel(writer, sheet_name=i[:25] + " New")
        compare_tbl.to_excel(writer, sheet_name=i[:25] + ' Old')
    writer.save()
    return forecast_tbl, compare_tbl
def stress_test_plot(filename, shtm, shtb, shta, shts, short_list, pq0, pq1,
                     base, regs, bottom="", top=""):
    '''
    Plot stress testing results for candidate models.

    inputs:
        * filename - name of the .xlsx file where the data is located
        * shtm - name of the tab with the model development data
        * shtb - name of the tab with the base scenario forecast
        * shta - name of the tab with the adverse scenario forecast
        * shts - name of the tab with the severe scenario forecast
        * short_list - list of candidate variables for forecasting
        * pq0 - spot period
        * pq1 - first forecasted period
        * base - dependent variable and dummies
        * regs - df with independent variables to be used
        * bottom - bottom of y-axis of desired forecast graphs ("" = unset)
        * top - top of y-axis of desired forecast graphs ("" = unset)
    outputs:
        * saves one stress test forecast .png per candidate
    '''
    series_colors = ['Black', 'Green', 'Blue', 'Red']
    for candidate in short_list:
        # Refit the candidate model on the development sample.
        dep = base['Dependent']
        design = create_design(base, regs, candidate)
        dep = dep[design.index]
        fit = mc.regress_data(dep, design, intercept=True)
        beta = fit.params
        # Build the scenario forecasts for this candidate.
        forecast = build_scenario(filename, shtm, shtb, shta, shts, beta,
                                  pq0, pq1, candidate)
        figname = candidate + ' Stress Test Forecast'
        fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(10, 6))
        # Apply whichever axis limits the caller supplied.
        if top != "" and bottom != "":
            plt.ylim(top=top, bottom=bottom)
        elif top != "":
            plt.ylim(top=top)
        elif bottom != "":
            plt.ylim(bottom=bottom)
        plt.setp(ax.get_xticklabels(), rotation=45)
        # Actuals in black, then one color per scenario.
        for pos, column in enumerate(forecast.columns):
            ax.plot(forecast.index, forecast[column],
                    color=series_colors[pos])
        ax.set_title(figname)
        vals = ax.get_yticks()
        ax.set_yticklabels(['${0:,.0f}'.format(x) for x in vals])
        plt.savefig(figname + '.png')
        plt.close()
def statistical_testing(base, regs, adf_alpha=0.05, param_alpha=0.10,
                        bg_alpha=0.05, white_alpha=0.05, sw_alpha=0.05):
    '''
    Create regressions, run diagnostics, and filter out regressions that do
    not pass the tests. Saves statistical testing results to an .xlsx file.

    tests performed:
        * Breusch-Godfrey (autocorrelation)
        * White's test (heteroskedasticity)
        * AIC - no filter on this
        * Shapiro-Wilk (normality of residuals)
        * Durbin-Watson (autocorrelation) - no filter on this
    inputs:
        * base - dependent variable and dummies
        * regs - df with independent variables to be used
        * adf_alpha - significance level for the residual integration test
        * param_alpha - significance level for parameter p-values
        * bg_alpha / white_alpha / sw_alpha - significance levels for the
          Breusch-Godfrey, White and Shapiro-Wilk filters
    outputs:
        * pass_tests - list of variable names in regs that passed the tests
        * saves file 'Statistical Testing Results.xlsx'
    '''
    candidates = []
    bgs = []
    whits = []
    sws = []
    pass_tests = []
    aics = []
    # Durbin-Watson results at lags 1..4 (see durbin_watson(model, 4) below).
    dw0 = []
    dw1 = []
    dw2 = []
    dw3 = []
    regs = regs.dropna(axis=1)
    # iterating over the possible independent variables
    for i in regs.columns:
        # First fit on the basic design only to test residual stationarity.
        dep = base['Dependent']
        X = create_basic_design(cp.deepcopy(base), regs, i)
        dep = dep[X.index]
        model = mc.regress_data(dep, X, intercept=True)
        dep_order = residual_integration_order(model.resid, alpha=adf_alpha)
        # creating regression results and diagnostics for each variable
        dep = base['Dependent']
        X = create_design(cp.deepcopy(base), regs, i)
        dep = dep[X.index]
        model = mc.regress_data(dep, X, intercept=True)
        model_image_save(model, i)
        # Serial correlation test
        bg = sm.stats.diagnostic.acorr_breusch_godfrey(
            model, nlags=4, store=False)  # Null: no autocorrelation
        # Heteroscedasticity Test
        whit = sm.stats.diagnostic.het_white(model.resid,
                                             model.model.exog,
                                             retres=False)
        # normality test
        sw = shapiro(model.resid)  # Null: residuals are normally distributed
        # AIC goodness of fit
        aic = model.aic
        # Plot the PACF of the residuals.
        plot_pacf(model.resid, lags=20)
        plt.savefig(i + ' PACF.png')
        plt.close()
        # Save Durbin-Watson pvalues up to 4 lags
        dw = durbin_watson(model, 4)
        # Saving regression results and diagnostics for each candidate with
        # significant parameters and I(0) residuals. pvalues[len(base.columns):]
        # skips the intercept/dummy terms so only the economic regressors are
        # required to be significant.
        if (dep_order == 0) & (
                (model.pvalues[len(base.columns):] < param_alpha).all() ==
                True):
            candidates.append(i)
            bgs.append(bg[1])
            whits.append(whit[1])
            sws.append(sw[1])
            aics.append(aic)
            dw0.append(dw[0])
            dw1.append(dw[1])
            dw2.append(dw[2])
            dw3.append(dw[3])
            # filtering for candidates that pass all statistical requirements
            # and plotting 1Q backtest (p above alpha = fail to reject null)
            if bg[1] > bg_alpha:
                if whit[1] > white_alpha:
                    if sw[1] > sw_alpha:
                        plt.plot(X.index, dep, X.index, model.predict())
                        plt.show()
                        plt.close()
                        pass_tests.append(i)
    results = pd.DataFrame({
        'Variable': candidates,
        'Breusch-Godfrey p': bgs,
        'White p': whits,
        'Shapiro-Wilk p': sws,
        'AIC': aics,
        'DW Lag1': dw0,
        'DW Lag2': dw1,
        'DW Lag3': dw2,
        'DW Lag4': dw3
    })
    # outputting results to file
    results.to_excel('Statistical Testing Results.xlsx')
    return pass_tests
def create_sensitivity(filename, shtm, shtb, base, regs, short_list, pq0, pq1,
                       l=50):
    '''
    Create sensitivity testing for a list of candidate models.

    inputs:
        * filename - name of the .xlsx file where the data is located
        * shtm - name of the tab with the model development data
        * shtb - name of the tab with the base scenario forecast
        * base - dependent variable and dummies
        * regs - df with independent variables to be used
        * short_list - list of candidate variables for forecasting
        * pq0 - spot period
        * pq1 - first forecasted period
        * l - number of forecasts to randomly generate
    outputs:
        * saves plot of sensitivity analysis to .png in pwd
        * saves end-point summary stats to 'Sensitivity Metrics.xlsx'
    '''
    resdf = pd.DataFrame(columns=['Mean', 'Min', 'Max'])
    df = wrangle_model_data(filename, shtm)
    full_dep = df.iloc[:, 0]
    full_dep.name = 'Dependent'
    # Model is fit in log-levels.
    full_dep = np.log(full_dep)
    for i in short_list:
        X = create_design(base, regs, i)
        # FIX: subset from the full log series on every iteration.
        # Previously `dep` was trimmed cumulatively across candidates, so a
        # later candidate with a different design index could lose periods.
        dep = full_dep[X.index]
        model = mc.regress_data(dep, X, intercept=True)
        beta = model.params
        figname = i + ' Dynamic Sensitivity Testing'
        fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(10, 6))
        ends = []  # terminal value of each random forecast path
        colors = ['black', 'red']
        for k in range(l):
            forecast = build_random_scenario(filename, shtm, shtb, beta,
                                             pq0, pq1, i)
            ends.append(forecast['Rand'].iloc[-1])
            for j, col in enumerate(forecast.columns):
                ax.plot(forecast.index, forecast[col], color=colors[j])
        # Best-effort axis scaling around the plotted data; keep matplotlib's
        # default limits if the data cannot be scaled numerically.
        try:
            plt.ylim(top=1.25 * np.max(np.max(forecast)),
                     bottom=0.75 * np.min(np.min(forecast)))
        except Exception:
            pass
        plt.setp(ax.get_xticklabels(), rotation=45)
        ax.set_title(figname)
        vals = ax.get_yticks()
        ax.set_yticklabels(['${0:,.0f}'.format(x) for x in vals])
        for x in vals:
            ax.axhline(y=x, color='black', linewidth=0.2)
        plt.savefig(figname + '.png')
        plt.close()
        # Summary of the distribution of terminal forecast values.
        resdf.loc[i] = [
            '${0:,.0f}'.format(np.mean(ends)),
            '${0:,.0f}'.format(np.min(ends)),
            '${0:,.0f}'.format(np.max(ends))
        ]
    writer = pd.ExcelWriter('Sensitivity Metrics.xlsx', engine='xlsxwriter')
    resdf.to_excel(writer)
    writer.save()