Example #1
def whites_test(results):
    # White's Test for Heteroscedasticity
    test = sms.het_white(results.resid, results.model.exog)
    names = ['White Statistic', 'p-value', 'f-value', 'f p-value']
    white_results = pd.DataFrame(lzip(names, test), columns=['name', 'value'])
    print(white_results)
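A minimal usage sketch for `whites_test` above (the data are hypothetical; the aliases `sm`, `sms`, `pd`, and `lzip` are the ones these examples assume):
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.stats.api as sms
from statsmodels.compat import lzip

# Hypothetical heteroscedastic data: noise scale grows with x
rng = np.random.default_rng(0)
x = rng.uniform(0, 10, 200)
y = 2 * x + rng.normal(scale=x)
results = sm.OLS(y, sm.add_constant(x)).fit()
whites_test(results)  # small p-values point to heteroscedasticity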
Example #2
def breusch_pagan_test(results):
    # Breusch-Pagan test for Heteroscedasticity
    test = sms.het_breuschpagan(results.resid, results.model.exog)
    print("")
    names = ['Breusch Pagan Statistics', 'p-value', 'f-value', 'f p-value']
    lzip(names, test)
    bp_results = pd.DataFrame([names, test])
    print(bp_results)
    return bp_results
Example #3
def ARCH(x, y):
    ols_results = ols(x, y)
    name = [
        'LM statistic', 'p-value of LM test', 'f-statistic of the hypothesis',
        'f p-value'
    ]
    test = sms.het_arch(ols_results.resid, maxlag=1)
    return lzip(name, test)
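`ARCH` — like `Breusch_Godfrey`, `Goldfeld_Quandt`, and `Breusch_Pagan` in Examples #4, #18, and #19 below — calls an `ols(x, y)` helper that is not shown on this page. A plausible sketch, assuming it just fits and returns a statsmodels OLS results object:
def ols(x, y):
    # Assumed helper: OLS of y on x with an added intercept, returning the fitted results
    import statsmodels.api as sm
    return sm.OLS(y, sm.add_constant(x)).fit()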
Example #4
def Breusch_Godfrey(x, y):
    ols_results = ols(x, y)
    name = [
        'LM statistic', 'p-value of LM test', 'f-statistic of the hypothesis',
        'f p-value'
    ]
    test = sms.acorr_breusch_godfrey(ols_results)
    return lzip(name, test)
Example #5
def regress_bp(SP5002):
    Y = SP5002["SP500"]
    BetaHAT1 = SP5002["Dividend"]
    BetaHAT2 = SP5002["Earnings"]
    BetaHAT3 = SP5002["Consumer Price Index"]
    BetaHAT4 = SP5002["Long Interest Rate"]
    results = sm.ols(formula="Y ~ BetaHAT1 + BetaHAT2 + BetaHAT3 + BetaHAT4",
                     data=SP5002).fit()
    print(results.summary())
    names = [
        'Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value'
    ]
    test = sms.het_breuschpagan(results.resid, results.model.exog)
    print("")
    lzip(names, test)
    bp_results = pd.DataFrame([names, test])
    print(bp_results)
    return results
Example #6
def load():
    """
    Load the data and return a Dataset class instance.

    Returns
    -------
    Dataset instance:
        See DATASET_PROPOSAL.txt for more information.
    """
    data = _get_data()
    names = data.columns.tolist()
    dtype = lzip(names, ['a45', 'a3', 'a40', 'a14'] + ['<f8'] * 54)
    data = lmap(tuple, data.values.tolist())
    dataset = du.Dataset(data=np.array(data, dtype=dtype).view(np.recarray), names=names)
    return dataset
Example #7
def residuals_vs_fitted(predictions, residuals, out_path=None):
    """Create and return a scatter plot of a model's fitted values (predictions) versus the residuals

    Args:
        predictions: The predictions from a regression
        residuals: The residuals from a regression
        out_path: An optional path to save the graph to

    Returns:
        The residuals vs. fitted graph
    """
    # Jarque-Bera test of normality on the residuals
    name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
    test = sm.stats.jarque_bera(residuals)
    jarque_bera = lzip(name, test)
    p_value = jarque_bera[1][1]

    mu = 0
    variance = stats.variance(residuals)
    sigma = math.sqrt(variance)
    x = np.linspace(mu - 4 * sigma, mu + 4 * sigma, 100)

    # Build Scatterplot
    fig, ax = plt.subplots(nrows=1,
                           ncols=2,
                           gridspec_kw={'width_ratios': [3, 1]})
    ax[0].scatter(predictions, residuals)
    ax[0].set_title("Residuals vs. Fitted Values")
    ax[0].set_xlabel("Fitted Values")
    ax[0].set_ylabel("Residuals")
    ax[0].axhline(0, c="k", linewidth=0.5)
    ax[1].hist(residuals, bins=30, orientation="horizontal")
    # ax[1].set_xticks(np.linspace(0, round(ax[1].get_xbound()[1]), 3))
    ax2 = ax[1].twiny()
    # ax2.set_xticks(np.linspace(0, round(ax2.get_xbound()[1], 2), 3))
    ax2.plot(sci_stats.norm.pdf(x, mu, sigma), x, color="red")
    ax[1].set_xlabel("Frequency")
    ax[1].set_title("Residual Distribution")
    fig.tight_layout()
    align_xaxis(ax[1], 0, ax2, 0)
    if out_path:
        fig.savefig(out_path)
    return fig
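`residuals_vs_fitted` also calls an `align_xaxis` helper that is not part of matplotlib and is not shown on this page. A minimal sketch of what it plausibly does (the classic twin-axis alignment recipe; the name and behavior are inferred from the call site):
def align_xaxis(ax1, v1, ax2, v2):
    # Assumed helper: shift ax2's x-limits so that x=v2 on ax2 lines up
    # with x=v1 on ax1 (here both are 0, anchoring the two x-axes)
    x1, _ = ax1.transData.transform((v1, 0))
    x2, _ = ax2.transData.transform((v2, 0))
    inv = ax2.transData.inverted()
    dx, _ = inv.transform((0, 0)) - inv.transform((x1 - x2, 0))
    minx, maxx = ax2.get_xlim()
    ax2.set_xlim(minx + dx, maxx + dx)
Example #8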
def actual_test(dict_split, params, definition, reg_type):
    data_reg = {'x_data': dict_split[0], 'y_data': dict_split[2]["co"]}
    reg = ols_reg(p_data=data_reg,
                  p_params=params,
                  p_model=reg_type,
                  p_iter=100000)
    pred_test = reg["model"].predict(dict_split[1])
    residuales = reg['results']['y_data'] - reg['results']['y_data_p']
    # Residual diagnostics
    vis.residual(residuales=residuales)
    vis.histograma(residuales)
    # heteroscedasticity
    hetero = check_hetero(residuales)
    # Ljung-Box autocorrelation test
    ljung = acorr_ljungbox(residuales, lags=7, return_df=True)
    #normality
    name = ["Jarque-Bera", "Chi2 two tail prob", "Skew", "Kurtosis"]
    test = sms.jarque_bera(residuales)
    jarquebera = lzip(name, test)
    rss = sum((dict_split[3]["co"] - pred_test)**2)
    return rss, data_reg, reg, definition, hetero, ljung, jarquebera
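Example #8 depends on project-local helpers (`ols_reg`, `vis.residual`, `vis.histograma`, `check_hetero`) that are not reproduced here. A plausible sketch of `check_hetero`, assuming it simply wraps one of the statsmodels heteroscedasticity tests seen throughout these examples:
def check_hetero(residuals):
    # Assumed helper: ARCH LM test on the residuals; returns the tuple
    # (lm statistic, lm p-value, f statistic, f p-value)
    import statsmodels.stats.api as sms
    return sms.het_arch(residuals)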
Example #9
plt.plot(df['GDP'], predictions, color='teal')
plt.title("Vehicles vs GDP")
plt.xlabel("GDP")
plt.plot(df['GDP'], y, marker='o', linewidth=0, markersize=1.6, color='black')
# plt.savefig("stats_gdp.png")
plt.show()

plt.plot(df['Population'], predictions, color='teal')
plt.title("Vehicles vs Population")
plt.xlabel("Population")
plt.plot(df['Population'],
         y,
         marker='o',
         linewidth=0,
         markersize=1.6,
         color='black')
# plt.savefig("stats_pop.png")
plt.show()

vif_df = pd.DataFrame()
vif_df["VIF Factor"] = [
    variance_inflation_factor(X.values, i) for i in range(X.shape[1])
]
vif_df["features"] = X.columns
print(vif_df)

name = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
test = sm.stats.het_breuschpagan(model.resid, model.model.exog)
test = lzip(name, test)

pprint(test)
Example #10
# In[20]:


##heteroskedasticity

x = reg01.model.data.orig_exog
print(x.head())
print('\n')
print( reg01.resid.head())
white = sm.stats.diagnostic.het_white( reg01.resid, x)

ret = ['Test Statistic', 'p-Value', 'F Statistic', 'F p-Value']
xzip01 = lzip(ret, white)

print("\nWhite's Test for Heteroskedasticity")
print(xzip01)


# In[29]:


##vif

indepvar = ['drugabuse', 'alcabuse', 'wage', 'mentalhealth', 'housing']

x = np.diag(np.linalg.inv(corr_m))  # VIFs: diagonal of the inverse correlation matrix

xzip = lzip(indepvar, x)
print(xzip)
Example #11
weights = rob_crime_model.weights
idx = weights > 0
X = rob_crime_model.model.exog[idx.values]
ww = weights[idx] / weights[idx].mean()
hat_matrix_diag = ww * (X * np.linalg.pinv(X).T).sum(1)
resid = rob_crime_model.resid
resid2 = resid**2
resid2 /= resid2.sum()
nobs = int(idx.sum())
hm = hat_matrix_diag.mean()
rm = resid2.mean()

from statsmodels.graphics import utils
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(resid2[idx], hat_matrix_diag, 'o')
ax = utils.annotate_axes(
    range(nobs),
    labels=rob_crime_model.model.data.row_labels[idx],
    points=lzip(resid2[idx], hat_matrix_diag),
    offset_points=[(-5, 5)] * nobs,
    size="large",
    ax=ax)
ax.set_xlabel("resid2")
ax.set_ylabel("leverage")
ylim = ax.get_ylim()
ax.vlines(rm, *ylim)
xlim = ax.get_xlim()
ax.hlines(hm, *xlim)
ax.margins(0, 0)
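Example #12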
model = smf.ols("futureMargin ~ daysSinceLastOrder + margin + returnRatio +  shareOwnBrand + shareVoucher + shareSale + itemsPerOrder", data = Clv).fit()
model.summary()


stats.probplot(model.resid, plot= plt)
plt.title("Model1 Residuals Probability Plot")

# Residuals are normally distributed! Woot! Hence inference tests can be used


# Homoscedasticity or constant variance of residuals

TestNames = ['Lagrange multiplier statistic', 'p-value',
        'f-value', 'f p-value']
test = sms.het_breuschpagan(model.resid, model.model.exog)
lzip(TestNames, test)


# Split the data into training/testing sets
clv_X_train = Clv6[:-20]
clv_X_test = Clv6[-20:]


# Split the targets into training/testing sets
clv_y_train = Clv.futureMargin[:-20]
clv_y_test = Clv.futureMargin[-20:]


# Create linear regression object
regr = sk.linear_model.LinearRegression()
np.sqrt(np.mean(model.resid**2))  # RMSE ~ 244

# checking normality of residuals
stats.anderson(model.resid) # residuals are normal

# checking auto-correlation of residuals
from statsmodels.stats import diagnostic as diag
diag.acorr_ljungbox(model.resid, lags=1)
# pvalue is <0.05, so autocorrelation is present

# checking heteroscedasticity
import statsmodels.stats.api as sms
from statsmodels.compat import lzip
name = ['F-statistic','p-value']
gold_test = sms.het_goldfeldquandt(model.resid,model.model.exog)
lzip(name,gold_test)
# [('F-stat', 0.6722696289421596), ('p-value', 0.9999999999999999)]
# fail to reject the null: residuals are homoscedastic (constant variance)

pred_price = model.predict(computer2.drop(['price'],axis=1))
pred_price[0:4]
computer2.price[0:4]
model.resid[0:4]
1499 - 1787  # spot-check: actual minus predicted for one observation

# except for autocorrelation all assumptions are satisfied

# splitting data
from sklearn.model_selection import train_test_split
train,test = train_test_split(computer2,test_size=0.30, random_state=100)
1837 / 6122  # test set share: 1837 of 6122 rows, about 0.30 (test_size)
Example #14
File: olsrr.py  Project: evanwire/OLSRR
    def ols_test_breusch_pagan(self):
        names = [
            'Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value'
        ]
        bp = sms.het_breuschpagan(self.residuals, self.model.model.exog)
        return lzip(names, bp)
Example #15
def harveyCollier(results):
    name = ['t value', 'p value']
    test = sms.linear_harvey_collier(results)
    return lzip(name, test)
Example #16

# There are not yet influence diagnostics as part of RLM, but we can recreate them. (This depends on the status of [issue #888](https://github.com/statsmodels/statsmodels/issues/808))

weights = rob_crime_model.weights
idx = weights > 0
X = rob_crime_model.model.exog[idx]
ww = weights[idx] / weights[idx].mean()
hat_matrix_diag = ww*(X*np.linalg.pinv(X).T).sum(1)
resid = rob_crime_model.resid
resid2 = resid**2
resid2 /= resid2.sum()
nobs = int(idx.sum())
hm = hat_matrix_diag.mean()
rm = resid2.mean()


from statsmodels.graphics import utils
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(resid2[idx], hat_matrix_diag, 'o')
ax = utils.annotate_axes(range(nobs), labels=rob_crime_model.model.data.row_labels[idx],
                    points=lzip(resid2[idx], hat_matrix_diag), offset_points=[(-5,5)]*nobs,
                    size="large", ax=ax)
ax.set_xlabel("resid2")
ax.set_ylabel("leverage")
ylim = ax.get_ylim()
ax.vlines(rm, *ylim)
xlim = ax.get_xlim()
ax.hlines(hm, *xlim)
ax.margins(0,0)
Example #17
def omniTest(residuals):
    name = ['Chi^2', 'Two-tail probability']
    test = sms.omni_normtest(residuals)
    return lzip(name, test)
Example #18
def Goldfeld_Quandt(X, y):
    ols_results = ols(X, y)
    name = ['F statistic', 'p-value']
    test = sms.HetGoldfeldQuandt().run(ols_results.model.endog, ols_results.model.exog,
                                       idx=None, split=0.25, drop=0.5,
                                       alternative='two-sided', attach=True)
    return lzip(name, test)
Example #19
def Breusch_Pagan(X, y):
    ols_results = ols(X, y)
    name = ['LM statistic', 'p-value of LM test',
            'f-statistic of the hypothesis', 'f p-value']
    test = sms.het_breuschpagan(ols_results.resid, ols_results.model.exog)
    return lzip(name, test)
plt.figure(figsize=(12, 5))

# Plot a simple histogram with binsize determined automatically
sns.distplot(res_2.resid, 20)
plt.title('Histogram of residuals')
plt.xlabel('Residuals')
plt.ylabel('Density')
plt.grid(True)
plt.show()

from statsmodels.compat import lzip
import statsmodels.stats.api as sms

name = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
results1 = sms.acorr_breusch_godfrey(res_2, 10)
print(lzip(name, results1))

name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
JB, JBpv, skw, kurt = sm.stats.stattools.jarque_bera(res_2.resid)
print(lzip(name, (JB, JBpv, skw, kurt)))

print(res_2.expected_durations)

print(res_2.conf_int())

predict = res_2.predict()
predict = pd.DataFrame(predict.tail(20))
predict.rename(columns={0: 'Predicted'}, inplace=True)
combine = pd.concat([predict, data['forecast_variable'].tail(20)], axis=1)
combine = combine.reset_index()
Example #21
#%% DATA TRANSFORMATION - LOGARITHMIC
''' Some predictors and the target variable have very skewed distributions,
so it is worth applying a logarithmic transformation, which pulls the
distribution toward something more Gaussian. Let's apply a log-log
transformation. '''

import numpy as np
from statsmodels.stats.stattools import jarque_bera as jb
from statsmodels.stats.stattools import omni_normtest as omb
from statsmodels.compat import lzip

# Jarque-Bera normality test
name = ['Jarque-Bera', 'Chi^2 two-tail probability', 'Skewness', 'Kurtosis']
test_results = jb(data.duration)
lzip(name, test_results)

# vote_count
data.vote_count = np.log(data.vote_count + 1)
# comment_count
data.comment_count = np.log(data.comment_count + 1)
# description_length
data.description_length = np.log(data.description_length + 1)
# watch-count
data.watch_count = np.log(data.watch_count + 1)
# duration
data.duration = np.log(data.duration)

# run test again
test_results = jb(data.duration)
lzip(name, test_results)  # very improved! :)
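Example #22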
dat = pd.read_csv(url)

# Fit regression model (using the natural log of one of the regressors)
results = smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=dat).fit()

# Inspect the results
print(results.summary())


# ## Normality of the residuals

# Jarque-Bera test:

name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
test = sms.jarque_bera(results.resid)
lzip(name, test)


# Omni test:

name = ['Chi^2', 'Two-tail probability']
test = sms.omni_normtest(results.resid)
lzip(name, test)


# ## Influence tests
# 
# Once created, an object of class ``OLSInfluence`` holds attributes and methods that allow users to assess the influence of each observation. For example, we can compute and extract the first few rows of DFbetas by:

from statsmodels.stats.outliers_influence import OLSInfluence
test_class = OLSInfluence(results)
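The snippet stops just before the extraction it announces; the first few rows of DFbetas can then be read off the influence object:
print(test_class.dfbetas[:5, :])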
Example #23
def goldfeldQuandtTest(residuals, exogVars):
    name = ['F statistic', 'p-value']
    test = sms.het_goldfeldquandt(residuals, exogVars)
    return lzip(name, test)
Example #24
print(session.s12.describe())
print(session.s2.describe())
print(session.on.describe())

# Compute and compare summary statistics
import statsmodels.api as sm
import numpy as np
from statsmodels.compat import lzip
import statsmodels.stats.api as sms
print(sm.tsa.adfuller(session.s1, regression='nc')[1])  # [1] is the ADF test p-value
print(sm.tsa.adfuller(session.s1, regression='c')[1])  # [1] is the ADF test p-value
print(sm.tsa.adfuller(session.s1, regression='ct')[1])  # [1] is the ADF test p-value
print(session.s1.mean() / session.s1.std() * np.sqrt(session.s1.count()))
estimator = ['JB', 'Chi-squared p-value', 'Skew', 'Kurtosis']
test = sms.jarque_bera(session.s1)
print('s1: ', lzip(estimator, test))

print(sm.tsa.adfuller(session.s12, regression='nc')[1])  # [1] is the ADF test p-value
print(sm.tsa.adfuller(session.s12, regression='c')[1])  # [1] is the ADF test p-value
print(sm.tsa.adfuller(session.s12, regression='ct')[1])  # [1] is the ADF test p-value
print(session.s12.mean() / session.s12.std() * np.sqrt(session.s12.count()))
estimator = ['JB', 'Chi-squared p-value', 'Skew', 'Kurtosis']
test = sms.jarque_bera(session.s12)
print('s12: ', lzip(estimator, test))

print(sm.tsa.adfuller(session.s2, regression='nc')[1])  # [1] is the ADF test p-value
print(sm.tsa.adfuller(session.s2, regression='c')[1])  # [1] is the ADF test p-value
print(sm.tsa.adfuller(session.s2, regression='ct')[1])  # [1] is the ADF test p-value
print(session.s2.mean() / session.s2.std() * np.sqrt(session.s2.count()))
estimator = ['JB', 'Chi-squared p-value', 'Skew', 'Kurtosis']
test = sms.jarque_bera(session.s2)
Example #25
def jarqueBeraTest(residuals):
    name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
    test = sms.jarque_bera(residuals)
    return lzip(name, test)
Example #26
def linear_regression_analysis(linear_regression):
    """ Compute and plot a complete analysis of a linear regression computed with Stats Models.
    Args:
         linear_regression (Stats Models Results): the result obtained with Stats Models.

    """

    # Data
    resid = linear_regression.resid_pearson.copy()
    resid_index = linear_regression.resid.index
    exog = linear_regression.model.exog
    endog = linear_regression.model.endog
    fitted_values = linear_regression.fittedvalues
    influences = outliers_influence.OLSInfluence(linear_regression)

    p = exog.shape[1] # Number of features
    n = len(resid) # Number of individuals

    # Parameters
    color1 = "#3498db"
    color2 = "#e74c3c"

    ##############################################################################
    # Statistical tests                                                          #
    ##############################################################################

    # Homoscedasticity - Breusch-Pagan test
    ##########################################

    names = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
    breusch_pagan = sm.stats.diagnostic.het_breuschpagan(resid, exog)
    print(lzip(names, breusch_pagan))

    # Normality test - Shapiro-Wilk
    ###################################

    print(f"Shapiro pvalue : {st.shapiro(resid)[1]}")

    ##############################################################################
    # Shape analyses                                                             #
    ##############################################################################

    # Histogram of residuals
    ##########################
    data = resid
    data_filter = data[(data > -5) & (data < 5)]
    len_data = len(data)
    len_data_filter = len(data_filter)
    ratio = len_data_filter / len_data

    fig, ax = plt.subplots()
    plt.hist(data_filter, bins=20, color=color1)
    plt.xlabel("Residual values")
    plt.ylabel("Number of residuals")
    plt.title(f"Histogramme des résidus de -5 à 5 ({ratio:.2%})")

    # Normal distribution vs residuals (QQ plot / Henry line)
    #############################################################
    data = pd.Series(resid).sort_values()
    len_data = len(data)

    normal = pd.Series(np.random.normal(size=len_data)).sort_values()
    fig, ax = plt.subplots()
    plt.scatter(data, normal, c=color1)
    plt.plot((-4,4), (-4, 4), c=color2)
    plt.xlabel("Residuals")
    plt.ylabel("Normal distribution")
    plt.xlim(-4, 4)
    plt.ylim(-4, 4)
    plt.title("Residuals vs Normal (QQ Plot)")

    #  Fitted vs Residuals
    ######################
    data = resid
    fig, ax = plt.subplots()
    plt.scatter(fitted_values, data, alpha=0.5, c=color1)
    plt.xlabel("Fitted values")
    plt.ylabel("Residuals")
    plt.title("Fitted vs Residuals")

    # Actual vs Predicted plot
    fig, ax = plt.subplots()
    plt.scatter(endog, fitted_values, c=color1, alpha=0.5)
    plt.plot(endog, endog, c=color2)
    plt.xlabel("Actual values")
    plt.ylabel("Fitted values")
    plt.title("Acutal vs Predict")

    ##############################################################################
    # Outlier analysis                                                           #
    ##############################################################################

    # Leverages (hii, the diagonal of the hat matrix)
    ################################################

    # Atypical individuals (far from the mean of the observations)

    # Compute the proportion
    data = influences.hat_matrix_diag
    seuil = 2*p/n
    len_data = len(data)
    data_filter = data[data <= seuil]
    len_data_filter = len(data_filter)
    ratio = len_data_filter / len_data

    # Plot
    fig, ax = plt.subplots()
    plt.plot(data)
    plt.plot((0, len_data), (seuil, seuil), c="#d35400")
    plt.ylabel("Leverage values (hii)")
    plt.title(f"Leviers avec seuil à 2*p/n ({ratio:.2%})")

    # Studentized residuals
    #####################

    # Individuals poorly captured by the model

    # Compute the proportion
    data = influences.resid_studentized_internal
    len_data = len(data)
    data_filter = data[data <= 2]
    data_filter = data_filter[data_filter >= -2]
    len_data_filter = len(data_filter)
    ratio = len_data_filter / len_data

    # Plot
    fig, ax = plt.subplots()
    plt.plot(data)
    plt.plot((0, len_data), (2, 2), c="#d35400")
    plt.plot((0, len_data), (-2, -2), c="#d35400")
    plt.ylabel("Studentized Residuals")
    plt.title(f"Résidus studentisés avec seuil à 2 et -2 ({ratio:.2%})")

    # Cook's distances
    ###################

    # Outliers whose removal strongly influences the model

    # Compute the proportion
    data = influences.cooks_distance[0]
    seuil = 4/(n-p)
    len_data = len(data)
    data_filter = data[data <= seuil]
    len_data_filter = len(data_filter)
    ratio = len_data_filter / len_data

    # Plot
    fig, ax = plt.subplots()
    plt.plot(data)
    plt.plot((0, len_data), (seuil, seuil))
    plt.ylabel("Cook Distance")
    plt.title(f"Distances de Cook avec seuil à 4/(n-p) ({ratio:.2%})")

    # Plot
    plt.show()
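A minimal usage sketch for `linear_regression_analysis` (the data frame here is hypothetical; the function assumes `sm`, `st` for scipy.stats, `plt`, `pd`, `np`, and `outliers_influence` are already imported under those names):
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# Hypothetical data frame with a target y and two predictors
rng = np.random.default_rng(1)
df = pd.DataFrame({"x1": rng.normal(size=100), "x2": rng.normal(size=100)})
df["y"] = 1.5 * df["x1"] - 0.5 * df["x2"] + rng.normal(size=100)

fit = smf.ols("y ~ x1 + x2", data=df).fit()
linear_regression_analysis(fit)  # prints the tests and draws the diagnostic plots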
Example #27
#================================================
#================================================
#======================Normality of the residuals 
sns.distplot(np.array(residual));
plt.show()

sns.distplot(residual_z);
plt.show()

#======================Jarque-Bera test:
import statsmodels.stats.api as sms
from statsmodels.compat import lzip
name1 = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
test1 = sms.jarque_bera(lr.resid)
lzip(name1, test1)
#null hypothesis: the data is normally distributed.

#======================Omni test:
name2 = ['Chi^2', 'Two-tail probability']
test2 = sms.omni_normtest(lr.resid)
lzip(name2, test2)

#================================================
#================================================
#=========================Heteroskedasticity test
#======================Breusch-Pagan test:
name3 = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
test3 = sms.het_breuschpagan(lr.resid, lr.model.exog)
lzip(name3, test3)
Example #28
    def diagnostic_plots(self, linear_model):
        """
        :param linear_model: Linear Model Fit on the Data
        :return: None
        This method validates the assumptions of Linear Model
        """
        diagnostic_result = {}

        summary = linear_model.summary()
        #diagnostic_result['summary'] = str(summary)

        # fitted values
        fitted_y = linear_model.fittedvalues
        # model residuals
        residuals = linear_model.resid

        # normalized residuals
        residuals_normalized = linear_model.get_influence().resid_studentized_internal

        # absolute squared normalized residuals
        model_norm_residuals_abs_sqrt = np.sqrt(np.abs(residuals_normalized))

        # leverage, from statsmodels internals
        leverage = linear_model.get_influence().hat_matrix_diag

        # cook's distance, from statsmodels internals
        cooks = linear_model.get_influence().cooks_distance[0]

        self.check_linearity_assumption(fitted_y, residuals)

        self.check_residual_normality(residuals_normalized)

        self.check_homoscedacticity(fitted_y, model_norm_residuals_abs_sqrt)

        self.check_influcence(leverage, cooks, residuals_normalized)

        # 1. Non-Linearity Test
        try:
            name = ['t value', 'p value']
            test = sms.linear_harvey_collier(linear_model)
            linear_test_result = lzip(name, test)
        except Exception as e:
            linear_test_result = str(e)
        diagnostic_result['Non_Linearity_Test'] = linear_test_result

        # 2. Heteroskedasticity Test
        name = ['Lagrange multiplier statistic', 'p-value',
                'f-value', 'f p-value']
        test = sms.het_breuschpagan(linear_model.resid, linear_model.model.exog)
        test_val = lzip(name, test)
        diagnostic_result['Heteroskedasticity_Test'] = test_val

        # 3. Normality of Residuals
        name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
        test = sms.jarque_bera(linear_model.resid)
        test_val = lzip(name, test)
        diagnostic_result['Residual_Normality_Test'] = test_val

        # 4. Multicollinearity Test (condition number of the design matrix)
        test = np.linalg.cond(linear_model.model.exog)
        test_val = [('condition no',test)]
        diagnostic_result['Multicollinearity_Test'] = test_val

        # 5. Residuals Auto-Correlation Tests
        test = sms.durbin_watson(linear_model.resid)
        test_val = [('statistic', test)]  # Durbin-Watson yields a statistic (~2 means no autocorrelation), not a p-value
        diagnostic_result['Residual_AutoCorrelation_Test'] = test_val

        json_result = json.dumps(diagnostic_result)
        return summary, json_result
# correlation 
al_cor=sold_prediction.corr()
al_cor=al_cor.unstack()
al_cor["sale_price_raw_m"].sort_values(ascending=False)



# Assumption of Independent Errors

statsmodels.stats.stattools.durbin_watson(lm2.resid)



name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
test = sms.jarque_bera(lm2.resid)
print(lzip(name, test))


# Assumption of Normality of the Residuals

sold_prediction['sale_price_raw_m'].plot(kind='hist',
                       title='Sale Price Distribution')



# Assumption of Normality of the Residuals

sold_prediction['sale_price_raw_m_log'] = np.log(sold_prediction['sale_price_raw_m'])
sold_prediction['sale_price_raw_m_log'].plot(kind='hist', 
                       title= 'Log of Sale Price Distribution')
Example #30
# In[115]:


# Compute and compare summary statistics
import statsmodels.api as sm
import numpy as np
from statsmodels.compat import lzip
import statsmodels.stats.api as sms
print('adf nc', sm.tsa.adfuller(session.s1, regression='nc')[1])  # [1] is the ADF test p-value
print('adf  c', sm.tsa.adfuller(session.s1, regression='c')[1])  # [1] is the ADF test p-value
print('adf ct', sm.tsa.adfuller(session.s1, regression='ct')[1])  # [1] is the ADF test p-value
print(session.s1.mean()/session.s1.std()*np.sqrt(session.s1.count()))
estimator = ['JB', 'Chi-squared p-value', 'Skew', 'Kurtosis']
test = sms.jarque_bera(session.s1)
print('s1: ',lzip(estimator, test))


# In[116]:


print(sm.tsa.adfuller(session.s12, regression='nc')[1])  # [1] is the ADF test p-value
print(sm.tsa.adfuller(session.s12, regression='c')[1])  # [1] is the ADF test p-value
print(sm.tsa.adfuller(session.s12, regression='ct')[1])  # [1] is the ADF test p-value
print(session.s12.mean() / session.s12.std() * np.sqrt(session.s12.count()))
estimator = ['JB', 'Chi-squared p-value', 'Skew', 'Kurtosis']
test = sms.jarque_bera(session.s12)
print('s12: ',lzip(estimator, test))


# In[117]:
# There are not yet influence diagnostics as part of RLM, but we can recreate them. (This depends on the status of [issue #888](https://github.com/statsmodels/statsmodels/issues/808))

weights = rob_crime_model.weights
idx = weights > 0
X = rob_crime_model.model.exog[idx.values]
ww = weights[idx] / weights[idx].mean()
hat_matrix_diag = ww * (X * np.linalg.pinv(X).T).sum(1)
resid = rob_crime_model.resid
resid2 = resid**2
resid2 /= resid2.sum()
nobs = int(idx.sum())
hm = hat_matrix_diag.mean()
rm = resid2.mean()

from statsmodels.graphics import utils
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(resid2[idx], hat_matrix_diag, 'o')
ax = utils.annotate_axes(range(nobs),
                         labels=rob_crime_model.model.data.row_labels[idx],
                         points=lzip(resid2[idx], hat_matrix_diag),
                         offset_points=[(-5, 5)] * nobs,
                         size="large",
                         ax=ax)
ax.set_xlabel("resid2")
ax.set_ylabel("leverage")
ylim = ax.get_ylim()
ax.vlines(rm, *ylim)
xlim = ax.get_xlim()
ax.hlines(hm, *xlim)
ax.margins(0, 0)
Example #32
    for d in range(len(Direction)):
        print(files[i][:-4] + Direction[d])
        sheet = files[i][:-4] + Direction[d]
        dat = pd.read_excel('TempVDistance 2.xlsx', sheet_name=sheet)
        Dis = dat['Distance'].tolist()
        T = dat['Temp'].tolist()
        # Fit regression model (using the natural log of one of the regressors)
        results = smf.ols('Temp ~ Distance', data=dat).fit()

        name = [
            'Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value'
        ]

        # Breusch-Pagan test for heteroscedasticity
        test = sms.het_breuschpagan(results.resid, results.model.exog)
        p_value = test[1]  # LM test p-value

        #print(p_value)
        if p_value < .05:
            Dis = sm.add_constant(Dis)
            ols_resid = sm.OLS(T, Dis).fit().resid
            res_fit = sm.OLS(ols_resid[1:], ols_resid[:-1]).fit()
            rho = res_fit.params
            order = toeplitz(range(len(ols_resid)))
            sigma = rho**order
            gls_model = sm.GLS(T, Dis, sigma=sigma)
            gls_results = gls_model.fit()
            ws.cell(row=3 * i + 1, column=d + 2).value = gls_results.params[1]
            ws.cell(row=3 * i + 2, column=d + 2).value = gls_results.rsquared
            ws.cell(row=3 * i + 3, column=d + 2).value = gls_results.pvalues[1]
        else:
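Example #33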
# %% [code]
p = sns.scatterplot(y_pred, residuals)
plt.xlabel('y_pred/predicted values')
plt.ylabel('Residuals')
plt.ylim(-10, 10)
plt.xlim(0, 26)
p = sns.lineplot([0, 26], [0, 0], color='blue')
p = plt.title('Residuals vs fitted values plot for homoscedasticity check')

# %% [code]
import statsmodels.stats.api as sms
from statsmodels.compat import lzip
name = ['F statistic', 'p-value']
test = sms.het_goldfeldquandt(residuals, X_train)
lzip(name, test)

# %% [code]
from scipy.stats import bartlett
test = bartlett(X_train, residuals)
print(test)

# %% [markdown]
# ## <a id="normal">4. Check for Normality of error terms/residuals</a>

# %% [code]
p = sns.distplot(residuals, kde=True)
p = plt.title('Normality of error terms/residuals')

# %% [markdown]
# ## <a id="auto">5. No autocorrelation of residuals</a>
Example #34
def linear_regression_analysis(linear_regression):
    """ Compute and plot a complete analysis of a linear regression computed with Stats Models.
    Args:
         linear_regression (Stats Models Results): the result obtained with Stats Models.

    """

    # Data
    resid = linear_regression.resid_pearson.copy()
    resid_index = linear_regression.resid.index
    exog = linear_regression.model.exog
    endog = linear_regression.model.endog
    fitted_values = linear_regression.fittedvalues
    influences = outliers_influence.OLSInfluence(linear_regression)

    p = exog.shape[1]  # Number of features
    n = len(resid)  # Number of individuals

    # Parameters
    color1 = "#3498db"
    color2 = "#e74c3c"

    ##############################################################################
    # Statistical tests                                                          #
    ##############################################################################

    # Homoscedasticity - Breusch-Pagan test
    ##########################################

    names = [
        'Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value'
    ]
    breusch_pagan = sm.stats.diagnostic.het_breuschpagan(resid, exog)
    print(lzip(names, breusch_pagan))

    # Normality test - Shapiro-Wilk
    ###################################

    print(f"Shapiro pvalue : {st.shapiro(resid)[1]}")

    ##############################################################################
    # Shape analyses                                                             #
    ##############################################################################

    # Histogram of residuals
    ##########################
    data = resid
    data_filter = data[(data > -5) & (data < 5)]
    len_data = len(data)
    len_data_filter = len(data_filter)
    ratio = len_data_filter / len_data

    fig, ax = plt.subplots()
    plt.hist(data_filter, bins=20, color=color1)
    plt.xlabel("Residual values")
    plt.ylabel("Number of residuals")
    plt.title(f"Histogramme des résidus de -5 à 5 ({ratio:.2%})")

    # Normal distribution vs residuals (QQ plot / Henry line)
    #############################################################
    data = pd.Series(resid).sort_values()
    len_data = len(data)

    normal = pd.Series(np.random.normal(size=len_data)).sort_values()
    fig, ax = plt.subplots()
    plt.scatter(data, normal, c=color1)
    plt.plot((-4, 4), (-4, 4), c=color2)
    plt.xlabel("Residuals")
    plt.ylabel("Normal distribution")
    plt.xlim(-4, 4)
    plt.ylim(-4, 4)
    plt.title("Residuals vs Normal (QQ Plot)")

    # Plot
    plt.show()