import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.stats.api as sms


def homoscedasticity_test(model):
    '''
    Function for testing the homoscedasticity of residuals in a linear regression model.
    It plots residuals and standardized residuals vs. fitted values and runs Breusch-Pagan and Goldfeld-Quandt tests.
    
    Args:
    * model - fitted OLS model from statsmodels
    '''
    fitted_vals = model.predict()
    resids = model.resid
    resids_standardized = model.get_influence().resid_studentized_internal

    fig, ax = plt.subplots(1,2)

    sns.regplot(x=fitted_vals, y=resids, lowess=True, ax=ax[0], line_kws={'color': 'red'})
    ax[0].set_title('Residuals vs Fitted', fontsize=16)
    ax[0].set(xlabel='Fitted Values', ylabel='Residuals')

    sns.regplot(x=fitted_vals, y=np.sqrt(np.abs(resids_standardized)), lowess=True, ax=ax[1], line_kws={'color': 'red'})
    ax[1].set_title('Scale-Location', fontsize=16)
    ax[1].set(xlabel='Fitted Values', ylabel='sqrt(abs(Standardized Residuals))')

    bp_test = pd.DataFrame(sms.het_breuschpagan(resids, model.model.exog), 
                           columns=['value'],
                           index=['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value'])

    gq_test = pd.DataFrame(sms.het_goldfeldquandt(resids, model.model.exog)[:-1],
                           columns=['value'],
                           index=['F statistic', 'p-value'])

    print('\n Breusch-Pagan test ----')
    print(bp_test)
    print('\n Goldfeld-Quandt test ----')
    print(gq_test)
    print('\n Residuals plots ----')
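
# Hedged usage sketch (synthetic data; every name below is an assumption, not
# from the original source): fit a statsmodels OLS model and pass the fitted
# results object in.
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(42)
X = sm.add_constant(rng.uniform(0, 10, 200))
y = X @ np.array([1.0, 2.0]) + rng.normal(scale=1 + 0.5 * X[:, 1])
homoscedasticity_test(sm.OLS(y, X).fit())
plt.show()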
def check_residuals_homoskedasticity(ols):
    import statsmodels.stats.api as sms
    resid = ols.resid
    exog = ols.model.exog
    # het_breuschpagan returns (LM stat, LM p-value, F stat, F p-value)
    lm, p, f, fp = sms.het_breuschpagan(resid=resid, exog_het=exog)
    # Fail to reject homoskedasticity at the 5% significance level
    return p >= 0.05
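
# Hedged usage sketch (synthetic, homoskedastic data; names are assumptions):
# True means the Breusch-Pagan test fails to reject homoskedasticity at 5%.
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
X = sm.add_constant(rng.uniform(0, 10, 150))
y = X @ np.array([1.0, 2.0]) + rng.normal(size=150)
print(check_residuals_homoskedasticity(sm.OLS(y, X).fit()))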
 def run_breusch_pagan_test(self):
     """Run the Breusch-Pagan test."""
     if self.fitted_result is None:
         raise DataWasNotFitted()
     # Compute the Breusch-Pagan test from the fitted model's residuals
     breusch_pagan_test = statsmodelsapi.het_breuschpagan(
         self.fitted_result.resid, self.fitted_result.model.exog)
     # labels = ["LM Statistic", "LM-Test p-value", "F-Statistic", "F-Test p-value"]
     self.breusch_pagan_pvalue = float(breusch_pagan_test[1])
Example #4
def breusch_pagan_test(results):
    # Breusch-Pagan test for heteroscedasticity
    test = sms.het_breuschpagan(results.resid, results.model.exog)
    print("")
    names = ['Breusch-Pagan statistic', 'p-value', 'f-value', 'f p-value']
    bp_results = pd.DataFrame(lzip(names, test), columns=['statistic', 'value'])
    print(bp_results)
    return bp_results
def get_bpag(model) -> tuple:
    """Calculate test statistics for heteroscedasticity

    Parameters
    ----------
    model : OLS Model
        Model containing residual values.

    Returns
    -------
    Test results from the Breusch-Pagan Test
    """

    lm_stat, p_value, f_stat, fp_value = het_breuschpagan(
        model.resid, model.model.exog)

    return lm_stat, p_value, f_stat, fp_value
Example #6
def regress_bp(SP5002):
    Y = SP5002["SP500"]
    BetaHAT1 = SP5002["Dividend"]
    BetaHAT2 = SP5002["Earnings"]
    BetaHAT3 = SP5002["Consumer Price Index"]
    BetaHAT4 = SP5002["Long Interest Rate"]
    results = smf.ols(formula="Y ~ BetaHAT1 + BetaHAT2 + BetaHAT3 + BetaHAT4",
                     data=SP5002).fit()
    print(results.summary())
    names = [
        'Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value'
    ]
    test = sms.het_breuschpagan(results.resid, results.model.exog)
    print("")
    bp_results = pd.DataFrame(lzip(names, test), columns=['statistic', 'value'])
    print(bp_results)
    return results
Example #7
def process_heteroscedasticity(x, y, metrics_dict, suffix):
    x_with_const = sm.add_constant(x)

    results = sm.OLS(y, x_with_const).fit()

    bp_lm, bp_lm_pvalue, bp_fvalue, bp_f_pvalue = sms.het_breuschpagan(
        results.resid, results.model.exog)
    w_lm, w_lm_pvalue, w_fvalue, w_f_pvalue = sms.het_white(
        results.resid, results.model.exog)
    gq_fvalue, gq_f_pvalue, gq_type = sms.het_goldfeldquandt(
        results.resid, results.model.exog)

    beg_lim, end_lim = np.percentile(x, [33, 67])
    beg_ids = []
    end_ids = []
    for t_id, t in enumerate(x):
        if t < beg_lim:
            beg_ids.append(t_id)
        elif t > end_lim:
            end_ids.append(t_id)

    beg_std = np.std(np.array(y)[np.array(beg_ids)])
    end_std = np.std(np.array(y)[np.array(end_ids)])

    if end_std > beg_std:
        het_type = 'increasing'
    else:
        het_type = 'decreasing'

    metrics_dict['type' + suffix].append(het_type)

    metrics_dict['bp_lm' + suffix].append(bp_lm)
    metrics_dict['bp_lm_pvalue' + suffix].append(bp_lm_pvalue)
    metrics_dict['bp_fvalue' + suffix].append(bp_fvalue)
    metrics_dict['bp_f_pvalue' + suffix].append(bp_f_pvalue)

    metrics_dict['w_lm' + suffix].append(w_lm)
    metrics_dict['w_lm_pvalue' + suffix].append(w_lm_pvalue)
    metrics_dict['w_fvalue' + suffix].append(w_fvalue)
    metrics_dict['w_f_pvalue' + suffix].append(w_f_pvalue)

    metrics_dict['gq_fvalue' + suffix].append(gq_fvalue)
    metrics_dict['gq_f_pvalue' + suffix].append(gq_f_pvalue)
    metrics_dict['gq_type' + suffix].append(gq_type)
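
# Hedged usage sketch: the function above assumes `sm`, `sms` and `np` are the
# usual statsmodels/numpy imports, and that metrics_dict already holds a list
# for every key it appends to. The data and suffix below are assumptions.
import numpy as np
import statsmodels.api as sm
import statsmodels.stats.api as sms

keys = ['type', 'bp_lm', 'bp_lm_pvalue', 'bp_fvalue', 'bp_f_pvalue',
        'w_lm', 'w_lm_pvalue', 'w_fvalue', 'w_f_pvalue',
        'gq_fvalue', 'gq_f_pvalue', 'gq_type']
suffix = '_run1'
metrics_dict = {key + suffix: [] for key in keys}
x = np.linspace(0, 10, 200)
y = x + np.random.default_rng(0).normal(scale=0.1 + 0.2 * x)
process_heteroscedasticity(x, y, metrics_dict, suffix)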
Example #8
def is_heteroscedastic(orig_ts, exog=None, verbose=False):
    if exog is None:
        ts, exog = get_auto_lags(orig_ts)
    else:
        ts, exog = orig_ts, sm.add_constant(exog)
    # ea = sms.het_arch(orig_ts)[1] <= 0.05

    wh = sms.het_white(ts, exog)[1] <= 0.05
    bp = sms.het_breuschpagan(ts, exog)[1] <= 0.05

    # Requires another independent variable responsible for the changes in variance
    # gf = sms.het_goldfeldquandt(ts, exog)[1] <= 0.05

    # Not to be used on time series
    # le = levene_test(orig_ts) <= 0.05
    # fl = fligner_test(orig_ts) <= 0.05
    # wa = wald_test(orig_ts) <= 0.05

    if verbose:
        print("\nTest for Heteroscedastiscity:")
        print(
            f'>Engle\'s ARCH (null = homosc.): p value = {sms.het_arch(orig_ts)[1]:.4f}'
        )
        # Breusch-Pagan is sensitive to missing normality, White is less sensitive
        print(
            f'>White (null = homosc.): p value = {sms.het_white(ts, exog)[1]:.4f}'
        )
        print(
            f'>Breusch-Pagan (null = homosc.): p value = {sms.het_breuschpagan(ts, exog)[1]:.4f}'
        )
        print(
            f'>Goldfeld-Quandt (null = homosc.): p value = {sms.het_goldfeldquandt(ts, exog)[1]:.4f}'
        )
        # Levene (Brown-Forsythe), Fligner is suitable for violation of normality
        print(
            f'>Levene alias Brown-Forsythe (null = homosc.): p value = {levene_test(orig_ts):.4f}'
        )
        print(
            f'>Fligner-Killeen (null = homosc.): p value = {fligner_test(orig_ts):.4f}'
        )
        print(
            f'>[DEV]Wald-Test on squares (null = homosc.): p value = {wald_test(orig_ts):.4f}'
        )
    return np.sum([wh, bp]) / 2.0
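
# Hedged usage sketch (synthetic series; names are assumptions): passing exog
# explicitly skips the custom get_auto_lags helper referenced above.
import numpy as np

t = np.arange(300, dtype=float)
resid = np.random.default_rng(0).normal(scale=1 + 0.01 * t)  # variance drifts upward
share = is_heteroscedastic(resid, exog=t.reshape(-1, 1))
print(f"{share:.0%} of the White/Breusch-Pagan tests reject homoscedasticity")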
Example #9
sns.set(style="whitegrid")
sns.residplot(x=yhat, y=m1.resid, lowess=True, color='g')

# based on the graph, the model is heteroscedastic

# Breusch-Pagan test for heteroscedasticity
import statsmodels.stats.api as sms

# H0: homoscedasticity
# H1: heteroscedasticity

# return value of breusch pagan test
# lagrange_multiplier, pvalue, fscore, fp-value

# parameters: [residuals, x-array]
pval = sms.het_breuschpagan(m1.resid, m1.model.exog)[1]

if pval < 0.05:
    print("Reject H0. Model is Heteroscedastic")
else:
    print("FTR H0. Model is Homoscedastic")

# iii) Residuals have a normal distribution
stats.probplot(m1.resid, dist='norm', plot=pylab)
pylab.show()

# iv) rows > columns
prot.shape

# k-Fold Cross-Validation
Example #10
def heteroskedasticity_test(test_name,
                            appelpy_model_object,
                            *,
                            regressors_subset=None):
    """Return the results of a heteroskedasticity test given a model.

    Output is a dictionary with these keys:
    - 'distribution' (str): the distribution of the test
    - 'nu' (int): the number of degrees of freedom for the test
    - 'test_stat' (float): the value of the test statistic
    - 'p_value' (float): the p-value for the test

    Supported tests:
    - 'breusch_pagan': equivalent to Stata's `hettest` command.
    - 'breusch_pagan_studentized': equivalent to the default behaviour of the
        `bptest` command in R.
    - 'white': equivalent to Stata's `imtest, white` command.

    Args:
        test_name (str): either 'breusch_pagan', 'breusch_pagan_studentized'
            or 'white'.
        appelpy_model_object: the object that contains the info about a model
            fitted with Appelpy.  e.g. for OLS regression the object would
            be of the type appelpy.linear_model.OLS.
        regressors_subset (list, optional): For breusch_pagan, this can be
            set so that the test runs on a subset of regressors. Defaults to
            None.

    Raises:
        ValueError: Choose one of 'breusch_pagan', 'breusch_pagan_studentized'
            or 'white' as a test name.
        ValueError: Check the regressors_subset items were used in the model.

    Returns:
        dict: info for the test, e.g. test distribution, degrees of freedom,
            test statistic and p-value.
    """

    # Gather test info in a dict:
    test_summary = {'distribution': 'chi2'}

    if test_name == 'breusch_pagan':
        # Get residuals (from model object or run again on a regressors subset)
        if regressors_subset:
            if not set(regressors_subset).issubset(
                    set(appelpy_model_object.X_list)):
                raise ValueError(
                    'Regressor(s) not recognised in dataset.  Check the list given to the function.'
                )
            reduced_model = sm.OLS(
                appelpy_model_object.y,
                sm.add_constant(appelpy_model_object.X[regressors_subset]))
            reduced_model_results = reduced_model.fit()
            sq_resid = (reduced_model_results.resid**2).to_numpy()
        else:
            sq_resid = (appelpy_model_object.resid**2).to_numpy()

        # Scale the residuals
        scaled_sq_resid = sq_resid / sq_resid.mean()
        y_hat = appelpy_model_object.results.fittedvalues.to_numpy()

        # Model of norm resid on y_hat
        aux_model = sm.OLS(scaled_sq_resid, sm.add_constant(y_hat)).fit()

        # Calculate test stat and pval
        test_summary['nu'] = 1
        test_summary['test_stat'] = aux_model.ess / 2
        test_summary['p_value'] = sp.stats.chi2.sf(test_summary['test_stat'],
                                                   test_summary['nu'])
    elif test_name == 'breusch_pagan_studentized':
        if regressors_subset:
            if not set(regressors_subset).issubset(
                    set(appelpy_model_object.X_list)):
                raise ValueError(
                    'Regressor(s) not recognised in dataset.  Check the list given to the function.'
                )
            reduced_model = sm.OLS(
                appelpy_model_object.y,
                sm.add_constant(appelpy_model_object.X[regressors_subset]))
            reduced_model_results = reduced_model.fit()
            test_summary['nu'] = 1
            stat, pval, _, _ = sms.het_breuschpagan(
                reduced_model_results.resid, reduced_model_results.model.exog)
            test_summary['test_stat'], test_summary['p_value'] = stat, pval
        else:
            test_summary['nu'] = 1
            stat, pval, _, _ = sms.het_breuschpagan(
                appelpy_model_object.results.resid,
                appelpy_model_object.results.model.exog)
            test_summary['test_stat'], test_summary['p_value'] = stat, pval
    elif test_name == 'white':
        if regressors_subset:
            print(
                "Ignoring regressors_subset.  White test will use original regressors."
            )
        test_summary['nu'] = (int((len(appelpy_model_object.X_list)**2 +
                                   3 * len(appelpy_model_object.X_list)) / 2))
        white_test = sms.het_white(appelpy_model_object.resid,
                                   sm.add_constant(appelpy_model_object.X))
        test_summary['test_stat'] = white_test[0]
        test_summary['p_value'] = white_test[1]
    else:
        raise ValueError(
            """Choose one of 'breusch_pagan', 'breusch_pagan_studentized' or
            'white' as a test name.""")

    return test_summary
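
# A minimal self-contained sketch (synthetic data, not from the source above)
# contrasting the two Breusch-Pagan variants this function supports: the
# Stata-style `hettest` statistic and the studentized statsmodels version.
import numpy as np
import scipy.stats
import statsmodels.api as sm
import statsmodels.stats.api as sms

rng = np.random.default_rng(0)
x = rng.uniform(0, 10, 200)
y = 1.0 + 2.0 * x + rng.normal(scale=0.5 + 0.3 * x)  # variance grows with x
res = sm.OLS(y, sm.add_constant(x)).fit()

# Stata-style hettest: scaled squared residuals regressed on fitted values.
sq_resid = res.resid ** 2
aux = sm.OLS(sq_resid / sq_resid.mean(), sm.add_constant(res.fittedvalues)).fit()
lm_hettest = aux.ess / 2
p_hettest = scipy.stats.chi2.sf(lm_hettest, 1)

# R-style studentized version, as implemented in statsmodels.
lm_stud, p_stud, _, _ = sms.het_breuschpagan(res.resid, res.model.exog)
print(lm_hettest, p_hettest, lm_stud, p_stud)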
Example #11
 def ols_test_breusch_pagan(self):
     names = [
         'Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value'
     ]
     bp = sms.het_breuschpagan(self.residuals, self.model.model.exog)
     return lzip(names, bp)
Example #12
name1 = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
test1 = sms.jarque_bera(lr.resid)
lzip(name1, test1)
#null hypothesis: the data is normally distributed.

#======================Omni test:
name2 = ['Chi^2', 'Two-tail probability']
test2 = sms.omni_normtest(lr.resid)
lzip(name2, test2)

#================================================
#================================================
#=========================Heteroskedasticity test
#======================Breusch-Pagan test:
name3 = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
test3 = sms.het_breuschpagan(lr.resid, lr.model.exog)
lzip(name3, test3)

#======================Goldfeld-Quandt test:
name5 = ['F statistic', 'p-value']
test5 = sms.het_goldfeldquandt(lr.resid, lr.model.exog)
lzip(name5, test5)


#================================================
#================================================
#==================================Linearity test
#======================Harvey-Collier:
name6 = ['t value', 'p value']
test6 = sms.linear_harvey_collier(lr)
lzip(name6, test6)
Example #13
def ols_diagnostics(formula, model, data, y_string):
    """
    Given the OLS model supplied, calculates statistics and draws graphs that check 4 of the 6 multiple linear
    regression hypotheses. Tests done: Harvey-Collier, Variance Inflation Factor, RESET, Breusch-Pagan, Jarque-Bera.
    References:
        https://www.statsmodels.org/dev/examples/notebooks/generated/regression_diagnostics.html
        https://medium.com/@vince.shields913/regression-diagnostics-fa476b2f64db
    :param formula : patsy formula of the model;
    :param model : fitted model object;
    :param data : DataFrame containing the data;
    :param y_string : string (name) of the dependent variable
    """

    ## Harvey-Collier: linearity (MLR 1)
    try:
        print(f"Harvey-Collier P-value for linearity (MLR 1): {round(sms.linear_harvey_collier(model)[1], 4)}")
        print("H0: Model is linear.")
        print("For more information, see the 'Residuals vs Fitted Values' plot.\n")
    except ValueError:
        print("For information on linearity (MLR 1),  see the 'Residuals vs Fitted Values' plot.\n")

    ## Reset: specification of the functional form of the model
    reset = linear_reset(model, use_f=True, cov_type='HC1')
    print(f"Linear Reset (MLR 1) P-value: {reset.pvalue}")
    print("H0: model is well specified and linear.")
    print("For more information, see the Residuals vs Fitted Values plot.\n")

    ### Condition number: multicollinearity (MLR 3)
    print(f"Condition Number for Multicollinearity (MLR 3): {round(np.linalg.cond(model.model.exog), 2)}")
    print("The larger the number, the bigger the multicollinearity. For more information, see the 'VIF' plot.\n")

    ## Calculating Variance Inflation Factors (VIF)
    # Matrices
    y, X = dmatrices(formula, data, return_type='dataframe')

    ## Calculating VIFs and storing in a DataFrame
    dfVIF = pd.DataFrame()
    dfVIF["Variables"] = X.columns
    dfVIF["VIF_Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

    ## Breusch-Pagan (MLR 5):
    breusch_pagan_pvalue = np.around(sms.het_breuschpagan(model.resid, model.model.exog)[3], 4)
    print(f"Breusch-Pagan P-value for heteroskedasticity (MLR 5): {breusch_pagan_pvalue}")
    print("H0: Variance is homoskedasticity.")
    print("For White's test and use in panel models, call the 'heteroscedascity_test' function.")
    print("For more information, see the 'Scale-Location' plot.\n")

    ## Durbin-Watson: correlation between the residuals
    print(f"Durbin-Watson statistic for residual correlation is: {np.around(durbin_watson(model.resid), 2)}")
    print("If the value is close to 0, there is positive serial correlation.")
    print("If the value is close to 4, there is negative serial correlation.")
    print("Rule of thumb: 1.5 < DW < 2.5 indicates no serial correlation.\n")

    ## Jarque-Bera: normality of the residuals (MLR 6, used for statistic inference)
    print(f"Jarque-Bera P-value (MLR 6): {np.around(sms.jarque_bera(model.resid)[1], 4)}")
    print("H0: Data has a normal distribution.")
    print("For more information, see the 'Normal Q-Q' plot.\n")

    print("To test for exogeneity (MLR 4), an IV2SLS must be constructed.")
    print("Test for random sampling (Heckit) are not yet available in this module.")

    ## Creating graphic object
    fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))
    plt.style.use('seaborn-white')

    ### Plots
    ## Linearity: residuals x predicted values. The less inclined the lowess, the more linear the model.
    ax00 = sns.residplot(x=model.fittedvalues, y=y_string, data=data, lowess=True,
                         scatter_kws={'facecolors': 'none', 'edgecolors': 'black'},
                         line_kws={'color': 'blue', 'lw': 1, 'alpha': 0.8}, ax=ax[0, 0])

    # Titles
    ax00.set_title('Linearity: Residuals vs Fitted', fontsize=12)
    ax00.set_xlabel('Fitted Values', fontsize=10)
    ax00.set_ylabel('Residuals (horizontal lowess: linearity)', fontsize=10)

    ## Multicollinearity: VIF
    ax01 = dfVIF["VIF_Factor"].plot(kind='bar', stacked=False, ax=ax[0, 1])

    # X tick labels
    ax01.set_xticklabels(labels=dfVIF["Variables"], rotation=0, color='k')

    # Annotations
    for p in ax01.patches:
        ax01.annotate(round(p.get_height(), 2), (p.get_x() + p.get_width() / 2., p.get_height()),
                      ha='center', va='center', xytext=(0, 10), textcoords='offset points')

    ## Titles
    ax01.set_title("Multicollinearity Test - VIF", color='k', fontsize=12)
    ax01.set_ylabel("Variance Influence Factor (> 5: multicollinearity)", color='k', fontsize=10)
    ax01.set_xlabel("Variable", color='k', fontsize=10)

    ## Heteroskedasticity: the more dispersed and horizontal the points,
    # the more likely it is that homoskedasticity is present
    ax10 = sns.regplot(x=model.fittedvalues, y=np.sqrt(np.abs(model.get_influence().resid_studentized_internal)),
                       ci=False, lowess=True, line_kws={'color': 'blue', 'lw': 1, 'alpha': 0.8},
                       scatter_kws={'facecolors': 'none', 'edgecolors': 'black'}, ax=ax[1, 0])

    # Titles
    ax10.set_title('Heteroskedasticity: Scale-Location', fontsize=12)
    ax10.set_xlabel('Fitted Values', fontsize=10)
    ax10.set_ylabel(r'$\sqrt{|Standardized Residuals|}$ (dispersed and horizontal: homoskedasticity)', fontsize=10)

    ## Normality of the residuals: Q-Q Plot
    probplot = sm.ProbPlot(model.get_influence().resid_studentized_internal, fit=True)
    ax11 = probplot.qqplot(line='45', marker='o', color='black', ax=ax[1, 1])
Example #14
def heteroskedasticity_test(test_name,
                            appelpy_model_object,
                            regressors_subset=None):
    """Return the results of a heteroskedasticity test given a model.

    Supported tests:
    - 'breusch_pagan': equivalent to Stata's `hettest` command.
    - 'breusch_pagan_studentized': equivalent to the default behaviour of the
        `bptest` command in R.
    - 'white': equivalent to Stata's `imtest, white` command.

    Args:
        test_name (str): either 'breusch_pagan', 'breusch_pagan_studentized'
            or 'white'.
        appelpy_model_object: the object that contains the info about a model
            fitted with Appelpy.  e.g. for OLS regression the object would
            be of the type appelpy.linear_model.OLS.
        regressors_subset (list, optional): For breusch_pagan, this can be
            set so that the test runs on a subset of regressors. Defaults to
            None.

    Raises:
        ValueError: Choose one of 'breusch_pagan', 'breusch_pagan_studentized'
            or 'white' as a test name.
        ValueError: Check the regressors_subset items were used in the model.

    Returns:
        test_statistic, p_value: the test statistic and the corresponding
            p-value.
    """

    if test_name == 'breusch_pagan':
        # Get residuals (from model object or run again on a regressors subset)
        if regressors_subset:
            if not set(regressors_subset).issubset(
                    set(appelpy_model_object.X.columns)):
                raise ValueError(
                    'Regressor(s) not recognised in dataset.  Check the list given to the function.'
                )
            reduced_model = sm.OLS(
                appelpy_model_object.y,
                sm.add_constant(appelpy_model_object.X[regressors_subset]))
            reduced_model_results = reduced_model.fit()
            sq_resid = (reduced_model_results.resid**2).to_numpy()
        else:
            sq_resid = (appelpy_model_object.resid**2).to_numpy()

        # Scale the residuals
        scaled_sq_resid = sq_resid / sq_resid.mean()
        y_hat = appelpy_model_object.results.fittedvalues.to_numpy()

        # Model of norm resid on y_hat
        aux_model = sm.OLS(scaled_sq_resid, sm.add_constant(y_hat)).fit()

        # Calculate test stat and pval
        lm = aux_model.ess / 2
        pval = sp.stats.chi2.sf(lm, 1)  # dof=1
        return lm, pval
    elif test_name == 'breusch_pagan_studentized':
        if regressors_subset:
            if not set(regressors_subset).issubset(
                    set(appelpy_model_object.X.columns)):
                raise ValueError(
                    'Regressor(s) not recognised in dataset.  Check the list given to the function.'
                )
            reduced_model = sm.OLS(
                appelpy_model_object.y,
                sm.add_constant(appelpy_model_object.X[regressors_subset]))
            reduced_model_results = reduced_model.fit()
            lm, pval, _, _ = sms.het_breuschpagan(
                reduced_model_results.resid, reduced_model_results.model.exog)
            return lm, pval
        else:
            lm, pval, _, _ = sms.het_breuschpagan(
                appelpy_model_object.results.resid,
                appelpy_model_object.results.model.exog)
            return lm, pval
    elif test_name == 'white':
        if regressors_subset:
            print(
                "Ignoring regressors_subset.  White test will use original regressors."
            )
        white_test = sms.het_white(appelpy_model_object.resid,
                                   sm.add_constant(appelpy_model_object.X))
        return white_test[0], white_test[1]  # lm, pval
    else:
        raise ValueError(
            """Choose one of 'breusch_pagan', 'breusch_pagan_studentized' or
            'white' as a test name.""")
Example #15
# Fitting model
explan_vars = " + ".join(supervisor.columns[1:])
explan_vars
mylm = smf.ols("Rating ~" + explan_vars, data=supervisor).fit()
mylm.summary()
mylm.mse_resid
mylm.params
mylm.rsquared

# Linearity - added variable plot
smg.plot_partregress_grid(mylm)

# BP test & plot of residuals vs. fitted values
sns.regplot(x=mylm.fittedvalues, y=mylm.resid, color='blue', ci=None)
sms.het_breuschpagan(mylm.resid_pearson, mylm.model.exog)

# Normality
sns.distplot(mylm.resid_pearson, kde=False)
mylm.summary()

# Cross Validation
ncv = 250
bias = np.repeat(np.nan, ncv)
rpmse = np.repeat(np.nan, ncv)
wid = np.repeat(np.nan, ncv)
ntest = round(supervisor.shape[0] / 10)

for cv in range(ncv):
    # Choose which observations to put in the test set
    testobs = np.random.choice(supervisor.shape[0], ntest, replace=False)
Example #16
#%%
frame.head()
#%%
import statsmodels.formula.api as smf
m1 = smf.ols("ceb ~ age + educ + religion + idlnchld + knowmeth + usemeth +"\
                "agefm + heduc + urban + electric + radio + tv + bicycle +"\
                "nevermarr + idlnchld_noans + heduc_noans + usemeth_noans",
             data=frame)
fitted = m1.fit()
print(fitted.summary())

# Note that several binary dummy variables were created automatically for the religion feature. How many are there?
# Use the Breusch-Pagan test to check the homoscedasticity of the errors in the fitted model. Does it hold?
#%%
import statsmodels.stats.api as sms
print "Breusch-Pagan test: p=%f" % sms.het_breuschpagan(
    fitted.resid, fitted.model.exog)[1]

# Remove the insignificant features religion, radio and tv from the model. Check the homoscedasticity of the errors and, if necessary, apply White's correction.
# Did the model get significantly worse after removing this group of features? Check with Fisher's F-test.
# What is its attained significance level? Round to four digits after the decimal point.
# If the attained significance level is small, put all the removed features back; if it is large enough, keep the model without religion, tv and radio.
#%%
m2 = smf.ols("ceb ~ age + educ + idlnchld + knowmeth + usemeth +"\
                "agefm + heduc + urban + electric + bicycle +"\
                "nevermarr + idlnchld_noans + heduc_noans + usemeth_noans",
             data=frame)
fitted2 = m2.fit(cov_type="HC1")
comparison_result_1_2 = fitted.compare_f_test(fitted2)
print "F=%.4f, p=%.4f, k1=%.4f" % (np.round(
    comparison_result_1_2[0], 4), np.round(
        comparison_result_1_2[1], 4), np.round(comparison_result_1_2[2], 4))
Example #17
print(pearsonr(nuclee, rating))

#Simple linear regression

x = nuclee

import statsmodels.api as sm
x = sm.add_constant(x)
model = sm.OLS(newprice, x)
results = model.fit()
print(results.summary())
print('Parameters:', results.params)
print('R2:', results.rsquared)
print('Standard errors:', results.bse)
print('Predicted values:', results.predict())
erori = results.resid
from scipy import stats
print(stats.ttest_1samp(erori, 0))
import statsmodels.stats.api as sms

#Heteroscedasticity test
test_BP = sms.het_breuschpagan(results.resid, results.model.exog)
print(test_BP)
#Test GQ
test_GQ = sms.het_goldfeldquandt(results.resid, results.model.exog)
print(test_GQ)
import matplotlib.pyplot as plt
import numpy as np
import seaborn
seaborn.lmplot(y='new price', x='procesor', data=baza)
plt.show()
Example #18
    def diagnostic_plots(self, linear_model):
        """
        :param linear_model: Linear Model Fit on the Data
        :return: None
        This method validates the assumptions of Linear Model
        """
        diagnostic_result = {}

        summary = linear_model.summary()
        #diagnostic_result['summary'] = str(summary)

        # fitted values
        fitted_y = linear_model.fittedvalues
        # model residuals
        residuals = linear_model.resid

        # normalized residuals
        residuals_normalized = linear_model.get_influence().resid_studentized_internal

        # absolute squared normalized residuals
        model_norm_residuals_abs_sqrt = np.sqrt(np.abs(residuals_normalized))

        # leverage, from statsmodels internals
        leverage = linear_model.get_influence().hat_matrix_diag

        # cook's distance, from statsmodels internals
        cooks = linear_model.get_influence().cooks_distance[0]

        self.check_linearity_assumption(fitted_y, residuals)

        self.check_residual_normality(residuals_normalized)

        self.check_homoscedacticity(fitted_y, model_norm_residuals_abs_sqrt)

        self.check_influcence(leverage, cooks, residuals_normalized)

        # 1. Non-Linearity Test
        try:
            name = ['t value', 'p value']
            test = sms.linear_harvey_collier(linear_model)
            linear_test_result = lzip(name, test)
        except Exception as e:
            linear_test_result = str(e)
        diagnostic_result['Non_Linearity_Test'] = linear_test_result

        # 2. Heteroskedasticity Test
        name = ['Lagrange multiplier statistic', 'p-value',
                'f-value', 'f p-value']
        test = sms.het_breuschpagan(linear_model.resid, linear_model.model.exog)
        test_val = lzip(name, test)
        diagnostic_result['Heteroskedasticity_Test'] = test_val

        # 3. Normality of Residuals
        name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
        test = sms.jarque_bera(linear_model.resid)
        test_val = lzip(name, test)
        diagnostic_result['Residual_Normality_Test'] = test_val

        # 4. Multicollinearity Test
        test = np.linalg.cond(linear_model.model.exog)
        test_val = [('condition no', test)]
        diagnostic_result['Multicollinearity_Test'] = test_val

        # 5. Residuals Auto-Correlation Tests
        test = sms.durbin_watson(linear_model.resid)
        test_val = [('statistic', test)]
        diagnostic_result['Residual_AutoCorrelation_Test'] = test_val

        json_result = json.dumps(diagnostic_result)
        return summary, json_result
# Other plotting options can be found on the [Graphics page.](https://www.statsmodels.org/stable/graphics.html)

# ## Multicollinearity
# 
# Condition number:

np.linalg.cond(results.model.exog)


# ## Heteroskedasticity tests
# 
# Breusch-Pagan test:

name = ['Lagrange multiplier statistic', 'p-value',
        'f-value', 'f p-value']
test = sms.het_breuschpagan(results.resid, results.model.exog)
lzip(name, test)


# Goldfeld-Quandt test

name = ['F statistic', 'p-value']
test = sms.het_goldfeldquandt(results.resid, results.model.exog)
lzip(name, test)


# ## Linearity
# 
# Harvey-Collier multiplier test for Null hypothesis that the linear specification is correct:

name = ['t value', 'p value']
test = sms.linear_harvey_collier(results)
lzip(name, test)
    def check_error_term_constant_variance(self) -> bool:
        """
        Checks if the error term has constant variance (there is no heteroscedasticity) by:
        - Breusch-Pagan's statistical test,
        - Goldfeld-Quandt's statistical test.
        If:
         - silent_mode = True, method returns:
                                              a) True (which means that the assumption is
                                                 fulfilled) if the percentage of statistical tests
                                                 for which the assumption is fulfilled is higher
                                                 than or equal to set min_fulfill_ratio
                                              b) False (which means that the assumption is not
                                                 fulfilled) if the percentage of statistical tests
                                                 for which the assumption is fulfilled is lower
                                                 than set min_fulfill_ratio
         - silent_mode = False, method returns True/False as above and shows additional statistics,
         descriptions which are helpful in assessing the fulfilment of assumption
        """

        bp_test = pd.DataFrame(
            sms.het_breuschpagan(self.residuals, self.results.model.exog)[:2],
            columns=["value"],
            index=["Lagrange multiplier statistic", "p-value"])
        gq_test = pd.DataFrame(sms.het_goldfeldquandt(
            self.residuals, self.results.model.exog)[:-1],
                               columns=["value"],
                               index=["F statistic", "p-value"])
        heteroscedasticity_tests = [bp_test, gq_test]

        true_counts = 0
        for test in heteroscedasticity_tests:
            true_counts = true_counts + test_hypothesis(
                significance_level=self.alpha,
                p_value=test.iloc[1].value,
                print_outcome=False)
        true_ratio = true_counts / 2

        if not self.silent_mode:
            print(
                Color.BOLD +
                "Assumption 5. The error term has a constant variance." +
                Color.END, "\n")

            print("This assumption affects on: \n", "- prediction \n",
                  "- interpretation \n")

            print(
                "Heteroscedasticity does not cause bias in the coefficient estimates, but it "
                "does make them less precise. Heteroscedasticity also tends to produce p-values "
                "that are smaller than they should be. If you notice this problem in your model, "
                "you can try one of these solutions to fix it: redefine the dependent variable to "
                "focus on rates/per capita, try using weighted least squares, or experiment with "
                "data transformations (e.g. Box-Cox/Johnson transformations).\n"
            )

            print(Color.BOLD + "Breusch-Pagan " + Color.END +
                  "Lagrange Multiplier "
                  "statistical test: \n")
            print(bp_test, "\n")

            test_hypothesis(
                significance_level=self.alpha,
                p_value=bp_test.iloc[1].value,
                null_hypothesis="error term's variance is constant.")

            print(Color.BOLD + "Goldfeld-Quandt " + Color.END +
                  "test that examines whether the "
                  "residual variance is the same in "
                  "two subsamples: \n")

            print(gq_test, "\n")

            test_hypothesis(
                significance_level=self.alpha,
                p_value=gq_test.iloc[1].value,
                null_hypothesis="error term's variance is constant.")

            check_fulfill_ratio(true_fulfill_ratio=true_ratio,
                                min_fulfill_ratio=self.min_fulfill_ratio)

            print(
                "HINT: If you see randomly scattered points => there is no heteroscedasticity. \n",
                "If you see a fan or cone pattern => heteroscedasticity is probably present. \n"
            )

            plot_standarized_residuals_vs_fitted(fitted_model=self.results)
            plt.show()

        return check_fulfill_ratio(true_fulfill_ratio=true_ratio,
                                   min_fulfill_ratio=self.min_fulfill_ratio,
                                   print_outcome=False)
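
# A minimal sketch (synthetic data; names are assumptions) of one remedy the
# text above mentions: weighted least squares, with weights taken from an
# estimated variance function (feasible GLS).
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(1)
x = rng.uniform(1, 10, 300)
y = 2 + 3 * x + rng.normal(scale=x)  # error variance grows with x
X = sm.add_constant(x)

ols_res = sm.OLS(y, X).fit()
# Estimate the variance function by regressing log squared residuals on X,
# then weight each observation by the inverse of its estimated variance.
var_fit = sm.OLS(np.log(ols_res.resid ** 2), X).fit()
weights = 1.0 / np.exp(var_fit.fittedvalues)
wls_res = sm.WLS(y, X, weights=weights).fit()
print('OLS std errors:', ols_res.bse, '\nWLS std errors:', wls_res.bse)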
Example #21
print(lzip(name, test))


# Assumption of Normality of the Residuals

sold_prediction['sale_price_raw_m'].plot(kind='hist', 
                       title= 'Sale Price Distribution')



# Assumption of Normality of the Residuals

sold_prediction['sale_price_raw_m_log'] = np.log(sold_prediction['sale_price_raw_m'])
sold_prediction['sale_price_raw_m_log'].plot(kind='hist', 
                       title= 'Log of Sale Price Distribution')


# Assumption of Normality of the Residuals

stats.probplot(lm2.resid, dist="norm", plot= plt)
plt.title("Model1 Residuals Q-Q Plot")


# Assumption of Homoscedasticity

name = ['Lagrange multiplier statistic', 'p-value', 
        'f-value', 'f p-value']
test = sms.het_breuschpagan(lm2.resid, lm2.model.exog)
lzip(name, test)

Example #22
def breusch_pagan(model, **kwargs):
    results = sms.het_breuschpagan(model.resid, model.model.exog, **kwargs)
    return pd.Series(results, index=["lm", "lm_pval", "f_val", "f_pval"])
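
# Hedged usage sketch: the Series return type makes the helper easy to apply
# across several fitted models (the models built here are assumptions).
import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(0)
X = sm.add_constant(rng.uniform(0, 10, 100))
models = {'m1': sm.OLS(X @ [1, 2] + rng.normal(size=100), X).fit(),
          'm2': sm.OLS(X @ [1, 2] + rng.normal(scale=X[:, 1]), X).fit()}
print(pd.DataFrame({name: breusch_pagan(m) for name, m in models.items()}).T)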
Example #23
# In[21]:


mod.summary()


# In[23]:


plt.hist(mod.resid_pearson)


# In[25]:


sms.het_breuschpagan(mod.resid_pearson, mod.model.exog)[1]


# In[33]:


plt.scatter(mod.fittedvalues,mod.resid_pearson)
plt.plot([mod.fittedvalues.min(),mod.fittedvalues.max()],[0,0],color='red')
plt.ylabel('Std. Residuals')
plt.xlabel('Fitted Values')
plt.show()


# In[26]:

Example #24
def breusch_pagan_test(data):
    bp_test = statm.ols('y~X', data=data).fit()
    test = sms.het_breuschpagan(bp_test.resid, bp_test.model.exog)
    print(test)
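
# Hedged usage sketch: the helper above assumes the frame has columns named
# literally 'y' and 'X', and that `statm` is statsmodels.formula.api.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({'X': np.linspace(0, 10, 100)})
df['y'] = 2 * df['X'] + rng.normal(scale=1 + df['X'])
breusch_pagan_test(df)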
Clv.head(5)
model = smf.ols("futureMargin ~ daysSinceLastOrder + margin + returnRatio +  shareOwnBrand + shareVoucher + shareSale + itemsPerOrder", data = Clv).fit()
model.summary()


stats.probplot(model.resid, plot= plt)
plt.title("Model1 Residuals Probability Plot")

# Residuals are normally distributed! Woot! Hence inference tests can be used


# Homoscedasticity or constant variance of residuals

TestNames = ['Lagrange multiplier statistic', 'p-value',
        'f-value', 'f p-value']
test = sms.het_breuschpagan(model.resid, model.model.exog)
lzip(TestNames, test)


# Split the data into training/testing sets
clv_X_train = Clv6[:-20]
clv_X_test = Clv6[-20:]


# Split the targets into training/testing sets
clv_y_train = Clv.futureMargin[:-20]
clv_y_test = Clv.futureMargin[-20:]


# Create linear regression object
regr = sk.linear_model.LinearRegression()
Example #26
sns.regplot(x=lin_reg.fittedvalues, y=lin_reg.resid, lowess=True,
            line_kws={'color': 'red'})
plt.title('Fitted vs Residual')

# In[148]:

fig, ax = plt.subplots(figsize=(12, 8))
sm.graphics.influence_plot(lin_reg, alpha=0.05, ax=ax, criterion="cooks")

# In[161]:

X.loc[365:372, :]

# In[ ]:

#Breusch-Pagan test
bp_test = sms.het_breuschpagan(resid_val, lin_reg.model.exog)
print(bp_test)
print("Breush-Pagan Test:  pvalue = ", bp_test[1])

# In[ ]:

## Check Durbin-Watson in the summary for autocorrelation: is it close to 2? Values < 2 indicate positive autocorrelation
# use GLS
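
# Hedged sketch of the check suggested above: compute the Durbin-Watson
# statistic directly instead of reading it off the summary (assumes lin_reg
# is the fitted results object used in this snippet).
from statsmodels.stats.stattools import durbin_watson
print('Durbin-Watson:', durbin_watson(lin_reg.resid))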
#Correlation between explanatory variables?
from scipy.stats import pearsonr
X.columns
pearsonr(X['TAX'], lin_reg.resid)
#corr and p-value
# p-value < 0.05? then reject H0; if not, no evidence of correlation

# In[84]:
Example #27
botswana.loc[botswana.usemeth.isnull(), 'usemeth'] = -1
botswana = botswana.dropna()

elem_num = botswana.shape[0] * botswana.shape[1]
print('Array size: %d' % elem_num)

botswana.info()
formula = 'ceb ~ ' + ' + '.join(botswana.columns[1:])

reg_m = smf.ols(formula, data=botswana)
fitted_m = reg_m.fit()
print(fitted_m.summary())

print(botswana.religion.value_counts())
print('Breusch-Pagan test: p=%f' %
      sms.het_breuschpagan(fitted_m.resid, fitted_m.model.exog)[1])

reg_m2 = smf.ols(formula, data=botswana)
fitted_m2 = reg_m2.fit(cov_type='HC1')
print(fitted_m2.summary())

formula2 = 'ceb ~ age + educ + idlnchld + knowmeth + usemeth + agefm + heduc + urban + electric + bicycle \
+ nevermarr + idlnchld_noans + heduc_noans + usemeth_noans'

reg_m3 = smf.ols(formula2, data=botswana)
fitted_m3 = reg_m3.fit()
print(fitted_m3.summary())

print('Breusch-Pagan test: p=%f' %
      sms.het_breuschpagan(fitted_m3.resid, fitted_m3.model.exog)[1])
reg_m4 = smf.ols(formula2, data=botswana)
Example #28
def main(processed_path = "data/processed",
         models_path = "models"):
    
    """Nested 10-fold cross-validation for linear regression of
    ranking_log and score with with lasso regularization
    (inner CV for alpha tuning, outer for R^2 robustness)."""
    
    # logging
    logger = logging.getLogger(__name__)
    
    # normalize paths
    processed_path = os.path.normpath(processed_path)
    logger.debug("Path to processed data normalized: {}"
                 .format(processed_path))
    models_path = os.path.normpath(models_path)
    logger.debug("Path to models normalized: {}"
                 .format(models_path))
    
    # load selected_df
    selected_df = pd.read_pickle(os.path.join(processed_path,
                                              'selected_df.pkl'))
    logger.info("Loaded selected_df. Shape of df: {}"
                .format(selected_df.shape))
    
    #%% split df into dependent and independent variables
    teams_df = selected_df.iloc[:, :9]
    y = selected_df.iloc[:, 9:10]
    X = selected_df.iloc[:, 10:]
    X_columns = X.columns
    X_index = X.index
    
    #%% standardize
    
    scaler = StandardScaler()
    not_standardize = ['core',
                       'visualization',
                       'machine_learning',
                       'deep_learning']
    X_standardized = scaler.fit_transform(X
                                          .drop(columns=not_standardize)
                                          .values)
    X_standardized = pd.DataFrame(X_standardized,
                                  index = X_index,
                                  columns = X_columns.drop(not_standardize))
    X_not_standardized = X[not_standardize]
    X = pd.concat([X_standardized, X_not_standardized], axis=1)
    logger.debug("After Standardization:\n{}".format(X.describe().to_string))
    
    #%% define hyperparameter
    
    start = time()

    L1_RATIOS = [1.0, .95, .7, .5, .3, .1]
    EPS = 0.001
    N_ALPHAS = 100
    ALPHAS = None
    # normalize data
    # If True, the regressors X will be normalized before regression by
    # subtracting the mean (column-wise) and dividing by the l2-norm in
    # order for each feature to have norm = 1.
    NORMALIZE = False
    MAX_ITER = 10000
    TOL = 0.0001
    CV = 20
    N_JOBS = 1
    RS = 1
    SELECTION = 'cyclic'
    
    logger.info("l1_ratio={}, eps={}, n_alphas={}, alphas={}, normalize={}"
                 .format(L1_RATIOS, EPS, N_ALPHAS, ALPHAS, NORMALIZE))
    logger.info("max_iter={}, tol={}, cv={}, n_jobs={}, rs={}, selection={}"
                 .format(MAX_ITER, TOL, CV, N_JOBS, RS, SELECTION))
    logger.debug("Try following L1-ratios: {}".format(L1_RATIOS))
    
    # print R^2 values for bounding alphas 0 and 1 to make sense of alphas
    logger.info("Bounding score: R^2 for alpha=0 and l1_ratio=0.5: {}"
                .format(ElasticNet(alpha=0, l1_ratio=.5,
                                   normalize=NORMALIZE, random_state=RS)
                        .fit(X.values, y.values)
                        .score(X.values, y.values)))
    logger.info("Bounding score: R^2 for alpha=1 and l1_ratio=0.5: {}"
                .format(ElasticNet(alpha=1, l1_ratio=.5,
                                   normalize=NORMALIZE, random_state=RS)
                        .fit(X.values, y.values)
                        .score(X.values, y.values)))
    
    #%% train model
    
    mod = ElasticNetCV(l1_ratio = L1_RATIOS,
                       eps = EPS,
                       n_alphas = N_ALPHAS,
                       alphas = ALPHAS,
                       normalize = NORMALIZE,
                       max_iter = MAX_ITER,
                       tol = TOL,
                       cv = CV,
                       n_jobs = N_JOBS,
                       random_state = RS,
                       selection = SELECTION)\
          .fit(X.values, y.values)
    
    # log some statistics
    best_r2 = mod.score(X.values, y.values)
    logger.info("best R^2 score: {:.2f}%".format(best_r2*100))
    best_l1_ratio = mod.l1_ratio_
    logger.info("best l1_ratio: {}".format(best_l1_ratio))
    best_alpha = mod.alpha_
    logger.info("best alpha: {:.3f}".format(best_alpha))
    alphas = mod.alphas_
    logger.debug("tested alphas:\n{}".format(alphas))
    coef = pd.Series(data=mod.coef_, index=X_columns)
    logger.debug("best coefficients:\n{}".format(coef))
#    mse_path = mod.mse_path_
    
    #%% Nested Cross-Validation to test robustness of R^2
    
    cv_results = cross_validate(ElasticNetCV(l1_ratio = L1_RATIOS,
                                             eps = EPS,
                                             n_alphas = N_ALPHAS,
                                             alphas = ALPHAS,
                                             normalize = NORMALIZE,
                                             max_iter = MAX_ITER,
                                             tol = TOL,
                                             cv = CV,
                                             n_jobs = N_JOBS,
                                             random_state = RS,
                                             selection = SELECTION),
                                X.values, y.values, cv=CV,
                                return_train_score=True, n_jobs=N_JOBS)
    logger.info("95% confidence intervall: {:.2f} +/- {:.2f} (mean +/- 2*std)"
                .format(cv_results['test_score'].mean(),
                        cv_results['test_score'].std()*2))
    logger.debug("Nested cross-validation results:\n{}"
                .format(pd.DataFrame(data=cv_results)))
    
    #%% Elastic Net regression with statsmodels for summary
    
    mod_sm = sm.OLS(y.values, sm.add_constant(pd.DataFrame(data=X.values,
                                                    columns=X_columns,
                                                    index=X_index)))\
          .fit_regularized(method='elastic_net',
                           alpha=best_alpha,
                           L1_wt=best_l1_ratio,
                           refit=True)
    res = mod_sm.summary().as_text()
    logger.info("ElasticNet regression of selected_df regarding ranking_log")
    logger.info("with alpha={:.5f} and L1_wt={}:\n{}"
                .format(best_alpha, best_l1_ratio, res))
    
    # Normality of residuals
    # Jarque-Bera test:
    name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
    test = sms.jarque_bera(mod_sm.resid)
    logger.info("Jarque-Bera test: {}".format(lzip(name, test)))
    # Omni test:
    name = ['Chi^2', 'Two-tail probability']
    test = sms.omni_normtest(mod_sm.resid)
    logger.info("Omnibus test: {}".format(lzip(name, test)))
    
    # Multicollinearity
    # Condition number:
    logger.info("Condition Number: {}"
                .format(np.linalg.cond(mod_sm.model.exog)))
    
    # Heteroskedasticity tests
    # Breusch-Pagan test:
    name = ['Lagrange multiplier statistic', 'p-value',
            'f-value', 'f p-value']
    test = sms.het_breuschpagan(mod_sm.resid, mod_sm.model.exog)
    logger.info("Breusch-Pagan test: {}".format(lzip(name, test)))
    # Goldfeld-Quandt test
    name = ['F statistic', 'p-value']
    test = sms.het_goldfeldquandt(mod_sm.resid, mod_sm.model.exog)
    logger.info("Goldfeld-Quandt test: {}".format(lzip(name, test)))
    
    #%% export results as pickle file to models folder
    
    # pickle mod
    with open(os.path.join(models_path, 'sklearn_ElasticNetCV.pkl'),
              'wb') as handle:
        pickle.dump(mod, handle, protocol=pickle.HIGHEST_PROTOCOL)
    logger.info("Saved elastic net model of sklearn to {}."
                .format(os.path.join(models_path,
                                     'sklearn_ElasticNetCV.pkl')))
    
    # pickle mod_sm
    with open(os.path.join(models_path, 'sm_OLS_fit_regularized.pkl'),
              'wb') as handle:
        pickle.dump(mod_sm, handle, protocol=pickle.HIGHEST_PROTOCOL)
    logger.info("Saved elastic net model of statsmodels to {}."
                .format(os.path.join(models_path,
                                     'sm_OLS_fit_regularized.pkl')))
    
    # save res as .txt
    with open(os.path.join(models_path,
                           'sm_OLS_fit_regularized_summary.txt'), "w+") as f:
        f.write(res)
    
    
    #%% logging time passed
    end = time()
    time_passed = pd.Timedelta(seconds=end-start).round(freq='s')
    logger.info("Time needed to train Elastic Net Model: {}"
                .format(time_passed))