Example #1
    def test_het_breusch_pagan(self):
        res = self.res

        bptest = dict(statistic=0.709924388395087, pvalue=0.701199952134347, parameters=(2,), distr="f")

        bp = smsdia.het_breuschpagan(res.resid, res.model.exog)
        compare_t_est(bp, bptest, decimal=(12, 12))
Example #2
# fsm_results / fsm_df come from the model-fitting step earlier in the notebook
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats
from statsmodels.stats.diagnostic import het_breuschpagan

resid = fsm_results.resid
qq1 = sm.qqplot(resid, line='45', fit=True, dist=stats.t)


y = fsm_df["price"]

y_hat = fsm_results.predict()


fig2, ax2 = plt.subplots()
ax2.set(xlabel="Predicted Price",
        ylabel="Residuals")
ax2.scatter(x=y_hat, y=y - y_hat, color="blue", alpha=0.2);


# exog for the BP test should be the model's regressors, not the target column
lm, lm_p_value, fvalue, f_p_value = het_breuschpagan(y - y_hat, fsm_results.model.exog)
print("Lagrange Multiplier p-value:", lm_p_value)
print("F-statistic p-value:", f_p_value)




# duplicates y_hat above; probably unnecessary

preds = fsm_results.predict()


def plot_predictions(y_true, y_hat):
    fig, axis = plt.subplots()
    axis.scatter(y_true, y_hat, label='Model Output', alpha=.5, edgecolor='black')
    # reference line y = x: points on it would be perfectly predicted
    y_equalsx = np.linspace(0, y_true.max())
    axis.plot(y_equalsx, y_equalsx, color='black', label='y = x')
    axis.set(xlabel='True Values', ylabel='Predicted Values')
    axis.legend()
Example #3
import pandas as pd
import statsmodels.formula.api as smf
import utils  # project-local helper that resolves course file paths
from statsmodels.stats.diagnostic import het_breuschpagan


def quiz4():

    df = pd.read_csv(utils.PATH.COURSE_FILE(4, 'botswana.tsv', 'week3'),
                     sep='\t')
    print(df.head())
    print(df.info())

    # Q1

    religion = df['religion'].value_counts()
    print('---- Q1:', religion)

    # Q2

    df_nonna = df.dropna()
    print('---- Q2:', df_nonna.shape)

    # Q3

    df['nevermarr'] = df['agefm'].apply(lambda x: 0 if x >= 0 else 1)  # NaN comparisons are False, so missing agefm maps to 1
    df.drop(['evermarr'], axis=1, inplace=True)
    df['agefm'].fillna(0, inplace=True)
    df['heduc'] = df.apply(lambda row: row['heduc']
                           if row['nevermarr'] == 0 else -1,
                           axis=1)

    heduc_na = df['heduc'].isnull().value_counts()
    print('---- Q3:', heduc_na)

    # Q4

    df['idlnchld_noans'] = df['idlnchld'].apply(lambda x: 0 if x >= 0 else -1)
    df['idlnchld'].fillna(-1, inplace=True)
    df['heduc_noans'] = df['heduc'].apply(lambda x: 0 if x >= 0 else -1)
    df['heduc'].fillna(-1, inplace=True)
    df['usemeth_noans'] = df['usemeth'].apply(lambda x: 0 if x >= 0 else -1)
    df['usemeth'].fillna(-1, inplace=True)

    df.dropna(inplace=True)
    print(df.shape)
    print('---- Q4:', df.shape[0] * df.shape[1])

    # Q5

    model = smf.ols('ceb ~ age + educ + religion + idlnchld + knowmeth + usemeth + agefm + heduc + urban + electric + radio + tv + bicycle +'\
                          'nevermarr + idlnchld_noans + heduc_noans + usemeth_noans', data=df)
    fitted = model.fit()
    summary = fitted.summary()  # avoid shadowing the built-in sum()
    print('---- Q5:\n', summary)

    # Q6

    print('---- Q6: p=%f' %
          het_breuschpagan(fitted.resid, fitted.model.exog)[1])

    # Q7


    model1 = smf.ols('ceb ~ age + educ + religion + idlnchld + knowmeth + usemeth + agefm + heduc + urban + electric + radio + tv + bicycle +'\
                           'nevermarr + idlnchld_noans + heduc_noans + usemeth_noans', data=df)
    fitted = model1.fit(cov_type='HC1')
    summary = fitted.summary()
    print('---- Q7:\n', summary)

    # Q8

    model2 = smf.ols('ceb ~ age + educ + idlnchld + knowmeth + usemeth + agefm + heduc + urban + electric + bicycle +'\
                           'nevermarr + idlnchld_noans + heduc_noans + usemeth_noans', data=df)
    fitted = model2.fit()

    print('p=%f' % het_breuschpagan(fitted.resid, fitted.model.exog)[1])

    model2 = smf.ols('ceb ~ age + educ + idlnchld + knowmeth + usemeth + agefm + heduc + urban + electric + bicycle +'\
                           'nevermarr + idlnchld_noans + heduc_noans + usemeth_noans', data=df)
    fitted = model2.fit(cov_type='HC1')

    print("---- Q8: F=%f, p=%f, k1=%f" %
          model1.fit().compare_f_test(model2.fit()))

    # Q9

    model3 = smf.ols('ceb ~ age + educ + idlnchld + knowmeth + agefm + heduc + urban + electric + bicycle +'\
                           'nevermarr + idlnchld_noans + heduc_noans', data=df)
    fitted = model3.fit()
    print("---- Q9: F=%f, p=%f, k1=%f" %
          model2.fit().compare_f_test(model3.fit()))

    # Q10

    model = smf.ols('ceb ~ age + educ + idlnchld + knowmeth + agefm + heduc + urban + electric + bicycle +'\
                           'nevermarr + idlnchld_noans + heduc_noans', data=df)
    fitted = model.fit(cov_type='HC1')
    summary = fitted.summary()
    print('---- Q10:\n', summary)

    return
Example #4
from statsmodels.stats.diagnostic import het_breuschpagan


def breusch_pagan():
    # assumes a fitted OLS results object `res` is available in the enclosing scope
    names = [
        "Lagrange multiplier statistic", "p-value", "f-value", "f p-value"
    ]
    test = het_breuschpagan(res.resid, res.model.exog)
    print(dict(zip(names, test)))
Example #5
import pandas as pd
import os
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan

base_path = os.path.dirname(__file__)
base_path = os.path.join(base_path, 'Aula 9')

df = pd.read_stata(os.path.join(base_path, 'HPRICE1-Kayo.dta'))
print(df.head())
# df.corr()

X = df[['lotsize', 'sqrft', 'bdrms']]
y = df['price']

X2 = sm.add_constant(X)
# model = sm.RLM(y, X2,  M=sm.robust.norms.HuberT())
model = sm.OLS(y, X2)
results = model.fit(cov_type='HC0')
# Statsmodels gives R-like statistical output
print(results.summary())

name = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
bp = het_breuschpagan(results.resid, results.model.exog)
bp_series = pd.Series(dict(zip(name, bp)))  # label each returned statistic
print(bp_series)
Example #6
File: regression.py  Project: pjryan356/CoB
import pylab
import statsmodels.api as sm
from statsmodels.stats import diagnostic as diag

# define input (X and Y are assumed to be prepared upstream)
X2 = sm.add_constant(X)

# create OLS model
model = sm.OLS(Y, X2)

# fit data
est = model.fit()

# test for heteroscedasticity (want p-values over 0.05)
_, pval, _, f_pval = diag.het_white(est.resid, est.model.exog)
print(pval, f_pval)
print('_' * 100)

_, pval, _, f_pval = diag.het_breuschpagan(est.resid, est.model.exog)
print(pval, f_pval)
print('_' * 100)
# p-values greater than 0.05 suggest there is no heteroscedasticity

# test for autocorrelation (want p-values over 0.05)
lag = min(10, (len(X) // 5))
print("number of lags is {}".format(lag))
lb_test = diag.acorr_ljungbox(est.resid, lags=lag)  # returns a DataFrame in recent statsmodels
print(lb_test['lb_pvalue'].min())
print('_' * 100)

# check residuals are normal (visual, close to line)

sm.qqplot(est.resid, line='s')
pylab.show()
Example #7
    def test_all(self):

        d = macrodata.load_pandas().data
        #import datasetswsm.greene as g
        #d = g.load('5-1')

        #growth rates
        gs_l_realinv = 400 * np.diff(np.log(d['realinv'].values))
        gs_l_realgdp = 400 * np.diff(np.log(d['realgdp'].values))

        #simple diff, not growthrate, I want heteroscedasticity later for testing
        endogd = np.diff(d['realinv'])
        exogd = add_constant(np.c_[np.diff(d['realgdp'].values),
                                   d['realint'][:-1].values])

        endogg = gs_l_realinv
        exogg = add_constant(np.c_[gs_l_realgdp, d['realint'][:-1].values])

        res_ols = OLS(endogg, exogg).fit()
        #print res_ols.params

        mod_g1 = GLSAR(endogg, exogg, rho=-0.108136)
        res_g1 = mod_g1.fit()
        #print res_g1.params

        mod_g2 = GLSAR(endogg, exogg, rho=-0.108136)  #-0.1335859) from R
        res_g2 = mod_g2.iterative_fit(maxiter=5)
        #print res_g2.params

        rho = -0.108136

        #                 coefficient   std. error   t-ratio    p-value 95% CONFIDENCE INTERVAL
        partable = np.array([
            [-9.50990, 0.990456, -9.602, 3.65e-018, -11.4631, -7.55670],  # ***
            [4.37040, 0.208146, 21.00, 2.93e-052, 3.95993, 4.78086],  # ***
            [-0.579253, 0.268009, -2.161, 0.0319, -1.10777, -0.0507346]
        ])  #    **

        #Statistics based on the rho-differenced data:

        result_gretl_g1 = dict(endog_mean=("Mean dependent var", 3.113973),
                               endog_std=("S.D. dependent var", 18.67447),
                               ssr=("Sum squared resid", 22530.90),
                               mse_resid_sqrt=("S.E. of regression", 10.66735),
                               rsquared=("R-squared", 0.676973),
                               rsquared_adj=("Adjusted R-squared", 0.673710),
                               fvalue=("F(2, 198)", 221.0475),
                               f_pvalue=("P-value(F)", 3.56e-51),
                               resid_acf1=("rho", -0.003481),
                               dw=("Durbin-Watson", 1.993858))

        #fstatistic, p-value, df1, df2
        reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
        reset_2 = [7.268492, 0.00762, 1, 198, "f"]
        reset_3 = [5.248951, 0.023, 1, 198, "f"]
        #LM-statistic, p-value, df
        arch_4 = [7.30776, 0.120491, 4, "chi2"]

        #multicollinearity
        vif = [1.002, 1.002]
        cond_1norm = 6862.0664
        determinant = 1.0296049e+009
        reciprocal_condition_number = 0.013819244

        #Chi-square(2): test-statistic, pvalue, df
        normality = [20.2792, 3.94837e-005, 2]

        #tests
        res = res_g1  #with rho from Gretl

        #basic

        assert_almost_equal(res.params, partable[:, 0], 4)
        assert_almost_equal(res.bse, partable[:, 1], 6)
        assert_almost_equal(res.tvalues, partable[:, 2], 2)

        assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
        #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
        #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
        #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
        assert_almost_equal(np.sqrt(res.mse_resid),
                            result_gretl_g1['mse_resid_sqrt'][1],
                            decimal=5)
        assert_almost_equal(res.fvalue,
                            result_gretl_g1['fvalue'][1],
                            decimal=4)
        assert_allclose(res.f_pvalue,
                        result_gretl_g1['f_pvalue'][1],
                        rtol=1e-2)
        #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

        #arch
        #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
        sm_arch = smsdia.het_arch(res.wresid, nlags=4)
        assert_almost_equal(sm_arch[0], arch_4[0], decimal=4)
        assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

        #tests
        res = res_g2  #with estimated rho

        #estimated lag coefficient
        assert_almost_equal(res.model.rho, rho, decimal=3)

        #basic
        assert_almost_equal(res.params, partable[:, 0], 4)
        assert_almost_equal(res.bse, partable[:, 1], 3)
        assert_almost_equal(res.tvalues, partable[:, 2], 2)

        assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
        #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
        #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
        #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
        assert_almost_equal(np.sqrt(res.mse_resid),
                            result_gretl_g1['mse_resid_sqrt'][1],
                            decimal=5)
        assert_almost_equal(res.fvalue,
                            result_gretl_g1['fvalue'][1],
                            decimal=0)
        assert_almost_equal(res.f_pvalue,
                            result_gretl_g1['f_pvalue'][1],
                            decimal=6)
        #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

        c = oi.reset_ramsey(res, degree=2)
        compare_ftest(c, reset_2, decimal=(2, 4))
        c = oi.reset_ramsey(res, degree=3)
        compare_ftest(c, reset_2_3, decimal=(2, 4))

        #arch
        #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
        sm_arch = smsdia.het_arch(res.wresid, nlags=4)
        assert_almost_equal(sm_arch[0], arch_4[0], decimal=1)
        assert_almost_equal(sm_arch[1], arch_4[1], decimal=2)
        '''
        Performing iterative calculation of rho...

                         ITER       RHO        ESS
                           1     -0.10734   22530.9
                           2     -0.10814   22530.9

        Model 4: Cochrane-Orcutt, using observations 1959:3-2009:3 (T = 201)
        Dependent variable: ds_l_realinv
        rho = -0.108136

                         coefficient   std. error   t-ratio    p-value
          -------------------------------------------------------------
          const           -9.50990      0.990456    -9.602    3.65e-018 ***
          ds_l_realgdp     4.37040      0.208146    21.00     2.93e-052 ***
          realint_1       -0.579253     0.268009    -2.161    0.0319    **

        Statistics based on the rho-differenced data:

        Mean dependent var   3.113973   S.D. dependent var   18.67447
        Sum squared resid    22530.90   S.E. of regression   10.66735
        R-squared            0.676973   Adjusted R-squared   0.673710
        F(2, 198)            221.0475   P-value(F)           3.56e-51
        rho                 -0.003481   Durbin-Watson        1.993858
        '''
        '''
        RESET test for specification (squares and cubes)
        Test statistic: F = 5.219019,
        with p-value = P(F(2,197) > 5.21902) = 0.00619

        RESET test for specification (squares only)
        Test statistic: F = 7.268492,
        with p-value = P(F(1,198) > 7.26849) = 0.00762

        RESET test for specification (cubes only)
        Test statistic: F = 5.248951,
        with p-value = P(F(1,198) > 5.24895) = 0.023:
        '''
        '''
        Test for ARCH of order 4

                     coefficient   std. error   t-ratio   p-value
          --------------------------------------------------------
          alpha(0)   97.0386       20.3234       4.775    3.56e-06 ***
          alpha(1)    0.176114      0.0714698    2.464    0.0146   **
          alpha(2)   -0.0488339     0.0724981   -0.6736   0.5014
          alpha(3)   -0.0705413     0.0737058   -0.9571   0.3397
          alpha(4)    0.0384531     0.0725763    0.5298   0.5968

          Null hypothesis: no ARCH effect is present
          Test statistic: LM = 7.30776
          with p-value = P(Chi-square(4) > 7.30776) = 0.120491:
        '''
        '''
        Variance Inflation Factors

        Minimum possible value = 1.0
        Values > 10.0 may indicate a collinearity problem

           ds_l_realgdp    1.002
              realint_1    1.002

        VIF(j) = 1/(1 - R(j)^2), where R(j) is the multiple correlation coefficient
        between variable j and the other independent variables

        Properties of matrix X'X:

         1-norm = 6862.0664
         Determinant = 1.0296049e+009
         Reciprocal condition number = 0.013819244
        '''
        '''
        Test for ARCH of order 4 -
          Null hypothesis: no ARCH effect is present
          Test statistic: LM = 7.30776
          with p-value = P(Chi-square(4) > 7.30776) = 0.120491

        Test of common factor restriction -
          Null hypothesis: restriction is acceptable
          Test statistic: F(2, 195) = 0.426391
          with p-value = P(F(2, 195) > 0.426391) = 0.653468

        Test for normality of residual -
          Null hypothesis: error is normally distributed
          Test statistic: Chi-square(2) = 20.2792
          with p-value = 3.94837e-005:
        '''

        #no idea what this is
        '''
        Augmented regression for common factor test
        OLS, using observations 1959:3-2009:3 (T = 201)
        Dependent variable: ds_l_realinv

                           coefficient   std. error   t-ratio    p-value
          ---------------------------------------------------------------
          const            -10.9481      1.35807      -8.062    7.44e-014 ***
          ds_l_realgdp       4.28893     0.229459     18.69     2.40e-045 ***
          realint_1         -0.662644    0.334872     -1.979    0.0492    **
          ds_l_realinv_1    -0.108892    0.0715042    -1.523    0.1294
          ds_l_realgdp_1     0.660443    0.390372      1.692    0.0923    *
          realint_2          0.0769695   0.341527      0.2254   0.8219

          Sum of squared residuals = 22432.8

        Test of common factor restriction

          Test statistic: F(2, 195) = 0.426391, with p-value = 0.653468
        '''

        ################ with OLS, HAC errors

        #Model 5: OLS, using observations 1959:2-2009:3 (T = 202)
        #Dependent variable: ds_l_realinv
        #HAC standard errors, bandwidth 4 (Bartlett kernel)

        #coefficient   std. error   t-ratio    p-value 95% CONFIDENCE INTERVAL
        #for confidence interval t(199, 0.025) = 1.972

        partable = np.array([
            [-9.48167, 1.17709, -8.055, 7.17e-014, -11.8029, -7.16049],  # ***
            [4.37422, 0.328787, 13.30, 2.62e-029, 3.72587, 5.02258],  #***
            [-0.613997, 0.293619, -2.091, 0.0378, -1.19300, -0.0349939]
        ])  # **

        result_gretl_g1 = dict(endog_mean=("Mean dependent var", 3.257395),
                               endog_std=("S.D. dependent var", 18.73915),
                               ssr=("Sum squared resid", 22799.68),
                               mse_resid_sqrt=("S.E. of regression", 10.70380),
                               rsquared=("R-squared", 0.676978),
                               rsquared_adj=("Adjusted R-squared", 0.673731),
                               fvalue=("F(2, 199)", 90.79971),
                               f_pvalue=("P-value(F)", 9.53e-29),
                               llf=("Log-likelihood", -763.9752),
                               aic=("Akaike criterion", 1533.950),
                               bic=("Schwarz criterion", 1543.875),
                               hqic=("Hannan-Quinn", 1537.966),
                               resid_acf1=("rho", -0.107341),
                               dw=("Durbin-Watson", 2.213805))

        linear_logs = [1.68351, 0.430953, 2, "chi2"]
        #for logs: dropping 70 nan or incomplete observations, T=133
        #(res_ols.model.exog <=0).any(1).sum() = 69  ?not 70
        linear_squares = [7.52477, 0.0232283, 2, "chi2"]

        #Autocorrelation, Breusch-Godfrey test for autocorrelation up to order 4
        lm_acorr4 = [1.17928, 0.321197, 4, 195, "F"]
        lm2_acorr4 = [4.771043, 0.312, 4, "chi2"]
        acorr_ljungbox4 = [5.23587, 0.264, 4, "chi2"]

        #break
        cusum_Harvey_Collier = [0.494432, 0.621549, 198,
                                "t"]  #stats.t.sf(0.494432, 198)*2
        #see cusum results in files
        break_qlr = [3.01985, 0.1, 3, 196,
                     "maxF"]  #TODO check this, max at 2001:4
        break_chow = [13.1897, 0.00424384, 3, "chi2"]  # break at 1984:1

        arch_4 = [3.43473, 0.487871, 4, "chi2"]

        normality = [23.962, 0.00001, 2, "chi2"]

        het_white = [33.503723, 0.000003, 5, "chi2"]
        het_breusch_pagan = [1.302014, 0.521520, 2,
                             "chi2"]  #TODO: not available
        het_breusch_pagan_konker = [0.709924, 0.701200, 2, "chi2"]

        reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
        reset_2 = [7.268492, 0.00762, 1, 198, "f"]
        reset_3 = [5.248951, 0.023, 1, 198, "f"]  #not available

        cond_1norm = 5984.0525
        determinant = 7.1087467e+008
        reciprocal_condition_number = 0.013826504
        vif = [1.001, 1.001]

        names = 'date   residual        leverage       influence        DFFITS'.split()
        cur_dir = os.path.abspath(os.path.dirname(__file__))
        fpath = os.path.join(cur_dir,
                             'results/leverage_influence_ols_nostars.txt')
        lev = np.genfromtxt(fpath,
                            skip_header=3,
                            skip_footer=1,
                            converters={0: lambda s: s})
        #either numpy 1.6 or python 3.2 changed behavior
        if np.isnan(lev[-1]['f1']):
            lev = np.genfromtxt(fpath,
                                skip_header=3,
                                skip_footer=2,
                                converters={0: lambda s: s})

        lev.dtype.names = names

        res = res_ols  #for easier copying

        cov_hac = sw.cov_hac_simple(res, nlags=4, use_correction=False)
        bse_hac = sw.se_cov(cov_hac)

        assert_almost_equal(res.params, partable[:, 0], 5)
        assert_almost_equal(bse_hac, partable[:, 1], 5)
        #TODO

        assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
        assert_almost_equal(res.llf, result_gretl_g1['llf'][1],
                            decimal=4)  #not in gretl
        assert_almost_equal(res.rsquared,
                            result_gretl_g1['rsquared'][1],
                            decimal=6)  #FAIL
        assert_almost_equal(res.rsquared_adj,
                            result_gretl_g1['rsquared_adj'][1],
                            decimal=6)  #FAIL
        assert_almost_equal(np.sqrt(res.mse_resid),
                            result_gretl_g1['mse_resid_sqrt'][1],
                            decimal=5)
        #f-value is based on cov_hac I guess
        #res2 = res.get_robustcov_results(cov_type='HC1')
        # TODO: fvalue differs from Gretl, trying any of the HCx
        #assert_almost_equal(res2.fvalue, result_gretl_g1['fvalue'][1], decimal=0) #FAIL
        #assert_approx_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], significant=1) #FAIL
        #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

        c = oi.reset_ramsey(res, degree=2)
        compare_ftest(c, reset_2, decimal=(6, 5))
        c = oi.reset_ramsey(res, degree=3)
        compare_ftest(c, reset_2_3, decimal=(6, 5))

        linear_sq = smsdia.linear_lm(res.resid, res.model.exog)
        assert_almost_equal(linear_sq[0], linear_squares[0], decimal=6)
        assert_almost_equal(linear_sq[1], linear_squares[1], decimal=7)

        hbpk = smsdia.het_breuschpagan(res.resid, res.model.exog)
        assert_almost_equal(hbpk[0], het_breusch_pagan_konker[0], decimal=6)
        assert_almost_equal(hbpk[1], het_breusch_pagan_konker[1], decimal=6)

        hw = smsdia.het_white(res.resid, res.model.exog)
        assert_almost_equal(hw[:2], het_white[:2], 6)

        #arch
        #sm_arch = smsdia.acorr_lm(res.resid**2, maxlag=4, autolag=None)
        sm_arch = smsdia.het_arch(res.resid, nlags=4)
        assert_almost_equal(sm_arch[0], arch_4[0], decimal=5)
        assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

        vif2 = [
            oi.variance_inflation_factor(res.model.exog, k) for k in [1, 2]
        ]

        infl = oi.OLSInfluence(res_ols)
        #print np.max(np.abs(lev['DFFITS'] - infl.dffits[0]))
        #print np.max(np.abs(lev['leverage'] - infl.hat_matrix_diag))
        #print np.max(np.abs(lev['influence'] - infl.influence))  #just added this based on Gretl

        #just rough test, low decimal in Gretl output,
        assert_almost_equal(lev['residual'], res.resid, decimal=3)
        assert_almost_equal(lev['DFFITS'], infl.dffits[0], decimal=3)
        assert_almost_equal(lev['leverage'], infl.hat_matrix_diag, decimal=3)
        assert_almost_equal(lev['influence'], infl.influence, decimal=4)
Example #8
diag.het_goldfeldquandt(modelMR2.resid, modelMR2.model.exog)
diag.het_breuschpagan(modelMR4.resid, modelMR4.model.exog)
diag.het_white(modelMR2.resid, modelMR2.model.exog)
diag.het_breuschpagan(modelMR2.resid, modelMR2.model.exog)


diag.acorr_ljungbox(modelMR2.resid)



Example #9
import statsmodels.stats.diagnostic as smd


def breusch_pagan_test(dataset, regression_dict, results_dict):
    for key in regression_dict.keys():
        BP_statistic = smd.het_breuschpagan(results_dict[key].resid,
                                            dataset[regression_dict[key][1:]])
        print("{}\nBP:\t{:.2f}\nP-Val:\t{:.4f}".format(key, BP_statistic[0],
                                                       BP_statistic[1]))
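# Hypothetical usage (names illustrative, not from the source): regression_dict
# maps a label to [endog_col, exog_col1, ...] and results_dict maps the same
# label to a fitted statsmodels results object, e.g.
# breusch_pagan_test(dataset, {'demand': ['demand_Maa', 'own_price']}, results_dict)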
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from statsmodels.stats.diagnostic import het_breuschpagan

# Fit Random Forest Regressor according to best hyperparameters
regressor = RandomForestRegressor(random_state=0, n_estimators=500, max_depth=3, max_features=4,
                                  min_samples_leaf=1, bootstrap=True)
regressor.fit(X, y)

# Obtain R-squared and adjusted R-squared
r2 = regressor.score(X, y)
n = len(dataset['demand_Maa'])
RAdjusted = 1 - (1 - r2) * (n - 1) / (n - 5 - 1)  # adjusted R^2: n observations, 5 predictors

# Apply 10-fold cross validation
maescores = cross_val_score(regressor, X, y, scoring='neg_mean_absolute_error', cv=10, n_jobs=-1)

# Obtain AIC and BIC value
y_hat = regressor.predict(X)
residuals = y - y_hat
res = het_breuschpagan(residuals, X)
sse = sum(residuals**2)
AIC = 2*4 - 2*np.log(sse)
BIC = 54*np.log(sse/54) + 4*np.log(54)

# Print scores
#print("The Mean Absolute Error of the Random Forest Regressor is " + str(abs(np.mean(maescores))))
print("The R-squared of the Random Forest Regressor is " + str(r2))
print("The adjusted R-squared of the Random Forest Regressor is " + str(RAdjusted))
print("The AIC score of the Random Forest Regressor is " + str(AIC))
print("The BIC score of the Random Forest Regressor is " + str(BIC))
print("The LM p-value from the Breusch-Pagan test is " + str(res[1]))
 
# Plot feature importance
feature_import = pd.DataFrame(data=regressor.feature_importances_, index=['own_price','compe_price','inc_per_capita','pro_exp','Annual_Prod_MSeeds'], columns=['values'])
feature_import.sort_values(['values'], ascending=False, inplace=True)
Example #11
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF for each column of the regressor DataFrame X
vif = pd.DataFrame()
vif["VIF Factor"] = [
    variance_inflation_factor(X.values, i) for i in range(X.shape[1])
]
vif["features"] = X.columns
vif

# <h3>3. Homoscedasticity</h3>
#
# When residuals do not have constant variance (they exhibit heteroscedasticity), it is difficult to determine the true standard deviation of the forecast errors, usually resulting in confidence intervals that are too wide or too narrow. For example, if the variance of the residuals is increasing over time, confidence intervals for out-of-sample predictions will be unrealistically narrow.
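
# As a minimal self-contained illustration (synthetic data, not from this
# notebook): when the noise scale grows with the regressor, het_breuschpagan
# returns small p-values, flagging the heteroscedasticity.

# In[ ]:

import numpy as np
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan

rng = np.random.default_rng(0)
x_sim = np.linspace(1, 10, 200)
y_sim = 2 * x_sim + rng.normal(scale=x_sim)  # noise scale grows with x_sim
exog_sim = sm.add_constant(x_sim)
ols_sim = sm.OLS(y_sim, exog_sim).fit()
lm, lm_pval, fval, f_pval = het_breuschpagan(ols_sim.resid, exog_sim)
print(lm_pval, f_pval)  # both p-values are tiny -> reject homoscedasticity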

# In[49]:

from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.diagnostic import het_white

# Breusch-Pagan test
bp_test = het_breuschpagan(res.resid, df[independent_vars].dropna())
labels = ['BP Statistic', 'BP-Test p-value', 'F-Statistic', 'F-Test p-value']
print(pd.Series(dict(zip(labels, bp_test))))

# In[50]:

# visualization of heteroscedasticity
fitted = res.fittedvalues
residuals = res.resid
plt.figure(figsize=(12, 8))
sns.regplot(x=fitted, y=residuals)
plt.xlabel('Fitted Values')
plt.xlim([4, 6])
plt.show()

# <center><h2>WEIGHTED REGRESSION</h2></center>
y_pred1 = result1.predict(X1)
err1 = y1 - y_pred1

plt.scatter(y1, y_pred1)
plt.plot(xx, xx, color='k')
plt.ylabel('Predicted')
plt.xlabel('Real')

##Need to find another model, when price is higher, it seems under-estimate

plt.hist(err1, bins=50)
probplot(err1, plot=plt)

##
diagnostic.het_breuschpagan(err1, X1)  ##result ftest, chi-square
diagnostic.het_breuschpagan(err1, X1[['bedrooms', 'bathrooms']])
##to delete outliers

cond = (house['price'] < 1000000) & (house['price'] >= 20000)
X2 = house[cond][varlist]
y2 = house[cond]['price']
X2 = sm.add_constant(X2)

y2.plot.kde()

model2 = sm.OLS(y2, X2)
result2 = model2.fit()
result2.summary()

y_pred2 = result2.predict(X2)
#plt.title('Stadium Percentage - Reg(p-value) - Normal Q-Q')
plt.savefig(savepath+'QQPlot.jpeg')
plt.show()

_, p = shapiro(y_train_res2)
print(p)
if p > 0.05:
    x = "The residuals seem to come from a Gaussian process"
else:
    x = "The normality assumption may not hold"
pvalue = pd.DataFrame(['Stadium Percentage Regression (p-value)', p, x]).T
pvalue = pvalue.rename(columns={0: 'Model', 1: 'p-value', 2: 'Interpretation'})
savedfile=pvalue.to_csv('C:/Users/bcheasty/OneDrive - Athlone Institute Of Technology/Research Project/Data Set Creation/Data/Model/pvalue7.csv',index=False)

# Breusch-Pagan test
bp_test = het_breuschpagan(y_train_res, X_train2)
labels = ['LM Statistic', 'LM-Test p-value', 'F-Statistic', 'F-Test p-value']
bp=(dict(zip(labels, bp_test)))
bp=pd.DataFrame(bp,index=[0])
bp['Model']='Stadium Percentage - Reg(p-value)'
bp.loc[(bp['F-Test p-value']<0.05),'Interpretation']='The Residuals seem to be Heteroskedastic'
bp.loc[(bp['F-Test p-value']>0.05),'Interpretation']='The Residuals seem to be Homoskedastic'
bp=bp[['Model','F-Test p-value','Interpretation']]
savedfile=bp.to_csv('C:/Users/bcheasty/OneDrive - Athlone Institute Of Technology/Research Project/Data Set Creation/Data/Model/bp7.csv',index=False)

# Scale-Location
plot_lm_3 = plt.figure()
plt.scatter(y_train_pred, y_train_res, alpha=0.5,
            c='steelblue', marker='o', edgecolor='white',
            label='Training data');
sns.regplot(x=y_train_pred, y=y_train_res,
            scatter=False, ci=None, lowess=True)  # lowess trend over the residual scatter
# Feel free to adjust the chart size
plt.figure(figsize = (15,7.5))
plt.plot(result['2016-7':'2016-9'].index,result.loc['2016-7':'2016-9','simple regression'])
plt.plot(result['2016-7':'2016-9'].index,result.loc['2016-7':'2016-9','fama_french'])
plt.plot(result['2016-7':'2016-9'].index,result.loc['2016-7':'2016-9','sample'])
plt.legend()
plt.show()


# In[22]:


plt.figure()
fama_model.resid.plot.density()
plt.show()


# In[23]:


print('Residual mean:', np.mean(fama_model.resid))


# In[ ]:


from statsmodels.stats import diagnostic as dia
het = dia.het_breuschpagan(fama_model.resid, fama_df[['MKT', 'SMB', 'HML', 'RMW', 'CMA']][1:])
print('p-value: ', het[-1])

Example #16
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# RESID, k and x are assumed to be defined by earlier cells
plt.plot(RESID[k])
plt.ylabel('Residual amplitude',fontsize = 13)
plt.xlabel('Week',fontsize = 13)

import statsmodels.api as sm
sm.graphics.tsa.plot_acf(RESID[k],lags=25)
np.array(abs(RESID[k])).sum()/(len(RESID[k]))

from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.diagnostic import het_white

df_resid = pd.DataFrame(RESID[k], columns=['resid'])
X = np.concatenate((np.ones(30).reshape(-1, 1), x[-30:, :]), axis=1)  # prepend intercept column
white = het_white(list(df_resid.resid), X)
breuschpagan = het_breuschpagan(list(df_resid.resid), X)

from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.diagnostic import het_white
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

statecrime_df = sm.datasets.statecrime.load_pandas().data
f ='violent~hs_grad+poverty+single+urban'
statecrime_model = ols(formula=f, data=statecrime_df).fit()
white_test = het_white(np.array(statecrime_model.resid)[-30:], x[-30:, :])
np.array(statecrime_model.model.exog)[-30:,:]


            IndVariable, IndVariable2, IndVariable3, IndVariable4, IndVariable5
        ]], DataExcel['demand_Maa'])))
print("Average demands:" + str(np.mean(DataExcel['demand_Maa'])))
print("Percentage MSE: " +
      str(np.mean(Scores) / np.mean(DataExcel['demand_Maa'])))
print("-----------------------------------------------")
print(SecondRegressor.feature_importances_ * 100)
print("-----------------------------------------------")
r2 = r2_score(y_true=y_actual, y_pred=y_pred)
print("The R Squared is: " + str(r2))
AIC, BIC = AICCalc(y_actual, y_pred, 5)
print("The AIC is: " + str(AIC))
print("The BIC is: " + str(BIC))
print("-----------------------------------------------")
lm_stat, lm_pvalue, fvalue, f_pvalue = het_breuschpagan(
    Residuals, DataExcel[[
        IndVariable, IndVariable2, IndVariable3, IndVariable4, IndVariable5
    ]])
print("F-test p-value of the Breusch-Pagan test: " + str(f_pvalue))
print("-----------------------------------------------")
# Computing the adjusted R squared
RAdjusted = 1 - (((1 - r2) * len(DataExcel['demand_Maa']) /
                  (len(DataExcel['demand_Maa']) - 5 - 1)))
print("The adjusted R squared is: " + str(RAdjusted))


def AICCalc(Y_Actual, Y_Predicted, N_Variables):
    res = Y_Actual - Y_Predicted
    SSE = sum(res**2)
    AIC = 2 * N_Variables - 2 * math.log(SSE)
    BIC = len(Y_Actual) * math.log(
        SSE / len(Y_Actual)) + N_Variables * math.log(len(Y_Actual))
    return AIC, BIC