Example #1
    def _detect_outliers(self, max_prob):
        """Detects outlier time points.

        Find the dates in the data set that are recommended to be removed as
        outliers.

        Args:
          max_prob: (float between 0 and 1) Maximum acceptable probability of
            having observed percentile of maximum studentized residual be
            greater than the reference distribution.

        Returns:
          A list of dates (in the data set) that were detected to be outliers.
        """
        excluded_dates = []
        while True:
            data_subset = self._analysis_data.drop(excluded_dates)
            if data_subset.shape[0] == 0:
                break
            reg_fit = smf.ols('y ~ x', data=data_subset).fit()
            absresid = abs(
                OLSInfluence(reg_fit).get_resid_studentized_external())
            pretest_len = data_subset.shape[0] - len(excluded_dates)
            beta_quantile = stats.beta.ppf(1 - max_prob, pretest_len, 1)
            threshold = stats.t.ppf((1 + beta_quantile) / 2,
                                    df=pretest_len - 3)
            max_resid = max(absresid)
            if max_resid < threshold:
                break
            exclude_date = list(data_subset.index[absresid == max_resid])
            excluded_dates.extend(exclude_date)

        return excluded_dates
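
# Hypothetical usage sketch; the class that owns self._analysis_data is not
# shown in this excerpt, so the wrapper name below is illustrative only:
# diagnostics = PreAnalysis(analysis_data)   # hypothetical owner class
# outlier_dates = diagnostics._detect_outliers(max_prob=0.03)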
Example #2
def _setup_cooks_distance(self, ols_results):
    ols_influence = OLSInfluence(ols_results)
    cooks_distance, _ = ols_influence.cooks_distance
    dates_index = self.input_data.regressors_df.index
    self.cooks_distance_tms = QFSeries(data=cooks_distance,
                                       index=dates_index.copy())
    self.ols_influence = ols_influence
Example #3
# Imports for this snippet (added so the example runs standalone)
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import OLSInfluence


def sm_lin_fit_diagnostics(model,
                           max_leverage=0.3,
                           cook_th=0.25,
                           cook_labels=None):
    """
    Make diagnostic plots of linear regression.

    Parameters
    ----------
    model: statsmodels.RegressionResult
        E.g. returned by `sm_lin_fit`.

    Returns
    -------
    pyplot.Axes:
        On which diagnostics are drawn.

    """
    def cook_distance(residuals, leverage, k_vars):
        return residuals**2 * leverage / (1 - leverage) / k_vars

    influence = OLSInfluence(model)
    k_vars = influence.k_vars
    standardized_resid = influence.resid_studentized_internal

    # Init plot
    gs_kw = {'hspace': 0.35, 'wspace': 0.35, 'right': 0.95, 'left': 0.1}
    _, ax = plt.subplots(2, 2, figsize=(8, 8), gridspec_kw=gs_kw)

    # Residuals vs. predicted panel
    ax[0, 0].plot(model.fittedvalues, model.resid, 'ko')
    ax[0, 0].set(xlabel='Fitted values', ylabel='Residuals')

    # Q-Q plot panel
    sm.qqplot(model.resid, ax=ax[0, 1], line='r')

    # Scale-location panel
    ax[1, 0].plot(model.fittedvalues, np.sqrt(np.abs(standardized_resid)), 'ko')
    ax[1, 0].set(xlabel='Fitted values',
                 ylabel=r'$|$Standardized residuals$|^{\frac{1}{2}}$')

    # Leverage panel
    leverage = model.get_influence().hat_matrix_diag
    ax[1, 1].plot(leverage, standardized_resid, 'ko')
    ax[1, 1].set(xlabel='Leverage', ylabel='Standardized residuals')
    if cook_labels is not None:
        for n_, l_, r_ in zip(cook_labels, leverage, standardized_resid):
            if cook_distance(r_, l_, k_vars) > cook_th:
                ax[1, 1].text(l_, r_, n_)

    # --- Cook distance lines
    x = np.linspace(0, max_leverage, 100)
    y = np.linspace(-6, 6, 100)
    X, Y = np.meshgrid(x, y)
    C = cook_distance(Y, X, k_vars)
    ctr = ax[1, 1].contour(X, Y, C, [0.5, 1], colors='r', linestyles='--')
    plt.clabel(ctr, fmt='%.1f')

    return ax, cook_distance(standardized_resid, leverage, k_vars)
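
# A minimal usage sketch with synthetic data; the assumption here is that any
# fitted statsmodels OLS result works, not only the output of `sm_lin_fit`:
import pandas as pd
import statsmodels.formula.api as smf

demo = pd.DataFrame({'x': np.random.uniform(size=50)})
demo['y'] = 2 * demo['x'] + np.random.normal(scale=0.1, size=50)
demo_fit = smf.ols('y ~ x', data=demo).fit()
demo_ax, demo_cooks = sm_lin_fit_diagnostics(demo_fit, cook_labels=demo.index)
plt.show()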
Example #4
def plot_influence(model):
    residuals = pd.Series(model.resid, name="Residuals")
    # hat_matrix_diag holds the leverage values; OLSInfluence(model).influence
    # is a different measure and would be mislabelled as "Leverage" here.
    leverage = pd.Series(OLSInfluence(model).hat_matrix_diag, name="Leverage")
    _ = sns.regplot(x=residuals, y=leverage, fit_reg=False)
    plt.show()
    sm.graphics.influence_plot(model, alpha=0.05, criterion="cooks")
    plt.show()
    return leverage
Example #5

def conf_pred_band_ex(regress_ex, poly, model, alpha=0.05):

    # Take over the regression function and build the matrix of new support
    # points; each term corresponds to one column
    poly_ex = ols(poly.formula, regress_ex)
    x0 = poly_ex.exog

    # Compute the confidence and prediction bands. Use the statistics from
    # `model`, because that fit was used for the original regression
    infl = OLSInfluence(model)
    d = np.dot(x0, np.dot(infl.results.model.normalized_cov_params, x0.T))
    tppf = stats.t.isf(alpha / 2, model.df_resid)
    lconf_ex = tppf * np.sqrt(np.diag(d) * model.mse_resid)
    lprog_ex = tppf * np.sqrt((1 + np.diag(d)) * model.mse_resid)

    return lconf_ex, lprog_ex
Example #6
def get_cooks_d(lm):
    vals = OLSInfluence(lm).summary_frame()
    cooks_d = vals['cooks_d'].values
    return cooks_d
Example #7
def get_standard_residuals(lm):
    vals = OLSInfluence(lm).summary_frame()
    std_resid = vals['standard_resid'].values
    return std_resid
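
# Hypothetical usage of the two helpers above, given any fitted OLS result
# `lm` (the fit itself is assumed, not part of these snippets):
# lm = smf.ols('y ~ x', data=df).fit()
# cooks_d = get_cooks_d(lm)
# std_resid = get_standard_residuals(lm)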
Example #8
    if not coverage:
        if varflag:
            if 'VARTRUE' not in d['varflag']:  # code here in case we decided to downweight differently later
                coverage=[0]
        else:
            coverage=[0]
    ys.append(sum(coverage))

X['intercept'] = np.ones(len(ys))
if removesyn:
    X.pop('syn', None)
X = pd.DataFrame(X)


results = sm.OLS(ys, X, hasconst=True).fit()
resid = OLSInfluence(results).get_resid_studentized_external()
#variables={}
#variables['cpg']=X['CpG']
#variables['cov']=ys
#variables['resid']=resid
#variables['rawresid']=results.resid
#variables['genes']=genes
#variables['gerp']=gerp
#variables['intercept']=results.params['intercept']
#variables['cpgcoef']=results.params['CpG']
#pickle.dump(variables, open("var.pickle", "wb"))

lowestresidual = np.min(resid) - .001
#for i, row in enumerate(genes):
#    if "VARTRUE" in row[7] and varflag: #row[7] is varflag
#        resid[i]=lowestresidual
Example #9
ax.plot(x, predictions.obs_ci_lower, color='0.75', label="Prediction Interval")
ax.plot(x, predictions.obs_ci_upper, color='0.75', label="")

# plot the high and low mean confidence intervals
ax.plot(x, predictions.mean_ci_lower, color='r', label="Predicted Mean CI")
ax.plot(x, predictions.mean_ci_upper, color='r', label="")

ax.legend(loc='best')
plt.xlabel('LSTAT')
plt.ylabel('MEDV')
plt.savefig(PATH + 'medv.png', dpi=300)
plt.close()

# We need this for leverage and studentized residuals calculations.
from statsmodels.stats.outliers_influence import OLSInfluence
influence = OLSInfluence(lm_fit)
leverage = influence.hat_matrix_diag
stud_res = influence.resid_studentized_external

# Create plots of residuals
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

# Plot the residual for each fitted value
ax1.scatter(lm_fit.fittedvalues,
            lm_fit.resid,
            facecolors='none',
            edgecolors='b')
ax1.set_xlabel('fitted values')
ax1.set_ylabel('residuals')
# The residual plot indicates significant nonlinearity (a u-shape pattern is clear)
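
# A common remedy for such curvature (a sketch, not part of the original
# snippet; assumes the Boston data frame and smf.ols used earlier for lm_fit):
# lm_fit2 = smf.ols('medv ~ lstat + I(lstat ** 2)', data=boston).fit()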
Example #10
#You can grab individual statistics from the model by accessing the result attributes such as .resid and .fvalue


# Creating a dataframe of values for prediction, we will predict the medv
# with lstats of 5, 10 and 15, as well as computing our confidence interval
new = pd.DataFrame([[1,5], [1,10],[1,15]], columns = ['Intercept', 'lstat'])
print(predict(result, new))  # predict() is a helper defined elsewhere in the source project

# Plotting the lstat and medv data in boston. regplot automatically produces
# an OLS fit; fit_reg=True draws seaborn's own estimate of the regression
# line, computed independently of our model.

#sns.regplot('lstat', 'medv', boston, line_kws = {"color": 'r'}, ci=100, fit_reg = True)		#Data plot with estimated regression line


# Pulling the fitted values and residuals
fitted_values = pd.Series(result.fittedvalues, name = "Fitted Values")
residuals = pd.Series(result.resid, name = "Residuals")					

#sns.regplot(fitted_values, residuals, fit_reg=False)					#Residuals Plot

# Looking for high leverage points
from statsmodels.stats.outliers_influence import OLSInfluence
s_residuals = pd.Series(result.resid_pearson, name = "S. Residuals")			#Normalized residuals can be retrieved with result.resid_pearson
leverage = pd.Series(OLSInfluence(result).hat_matrix_diag, name = "Leverage")	# hat_matrix_diag is the leverage measure
sns.regplot(x=leverage, y=s_residuals, fit_reg = False)


plt.show()


Example #11

# plot the high and low prediction intervals
ax.plot(x, predictions.obs_ci_lower, color='0.75', label="Prediction Interval")
ax.plot(x, predictions.obs_ci_upper, color='0.75', label="")

# plot the high and low mean confidence intervals
ax.plot(x, predictions.mean_ci_lower, color='r', label="Predicted Mean CI")
ax.plot(x, predictions.mean_ci_upper, color='r', label="")

ax.legend(loc='best')

plt.xlabel('LSTAT')
plt.ylabel('MEDV')

# For checking the Linearity, homoskedasticity, Outliers
influence = OLSInfluence(fit_model)
leverage = influence.hat_matrix_diag
studentized_res = influence.resid_studentized_external

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
# Plotting the residual for each fitted value
ax1.scatter(fit_model.fittedvalues,
            fit_model.resid,
            facecolors='none',
            edgecolors='b')
ax1.set_xlabel('fitted values')
ax1.set_ylabel('residuals')

# Plotting the studentized residuals
ax2.scatter(fit_model.fittedvalues,
            studentized_res,
            facecolors='none',
            edgecolors='b')
Example #12
File: plot.py  Project: t-hdd/lmdiag
def get_cooks_d(self):
    vals = OLSInfluence(self.lm).summary_frame()
    cooks_d = vals["cooks_d"].values
    return cooks_d
Example #13
File: plot.py  Project: t-hdd/lmdiag
def get_standard_residuals(self):
    vals = OLSInfluence(self.lm).summary_frame()
    std_resid = vals["standard_resid"].values
    return std_resid
Example #14
residuals = df_flow.obs - obs_predicted

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import OLSInfluence

# Cook's distance
threshold = 4 / len(df_flow)
print('Cook\'s distance: ', round(threshold, 2))

# fit the regression model using statsmodels library
f = 'obs ~ sim'
model = ols(formula=f, data=df_flow).fit()

# calculate the cooks_distance - the OLSInfluence object contains multiple influence measurements
cook_distance = OLSInfluence(model).cooks_distance
(distance, p_value) = cook_distance
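
# Flag the observations whose Cook's distance exceeds the 4/n threshold
# computed above (a small addition, not in the original snippet):
influential = df_flow.index[distance > threshold]
print('Influential points:', list(influential))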

# Drawing graph
plt.figure(figsize=(7, 7), edgecolor='black')

# scatter plot - x axis (independent variable sim), y-axis (dependent variable obs), size and color of the marks according to its cook's distance
sns.scatterplot(x=df_flow.sim,
                y=df_flow.obs,
                hue=distance,
                size=distance,
                sizes=(50, 200),
                edgecolor='black',
                linewidth=1)

# labels and title
Example #15
# Imports for this snippet (numpy, pandas and itertools added so it runs standalone)
from itertools import combinations

import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import OLSInfluence

n = 200
x1 = np.random.uniform(-10, 10, n)
x2 = np.random.uniform(-4, 4, n)
x3 = np.random.uniform(-2, 8, n)
y = 2.89 * x1**2 + 4.33 * x2**2 + 6.1 * x1 * x2 + 5.9 * x2 * x3 + np.random.normal(
    size=n)

# Name the columns so the formula below can reference them; x3 is needed too.
data = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'y': y})


def build_formula(label: str, features: str) -> str:
    featlist = features.split(',')
    quads = ' + '.join(map(lambda feat: 'I(' + feat + ' ** 2)', featlist))
    ints = ' + '.join(
        map(lambda feat_pair: 'I(%s * %s)' % (feat_pair[0], feat_pair[1]),
            combinations(featlist, 2)))
    return "%s ~ %s + %s" % (label, quads, ints)


label = 'y'
features = 'x1,x2,x3'

formula = build_formula(label, features)
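
# For label='y' and features='x1,x2,x3' this yields:
# y ~ I(x1 ** 2) + I(x2 ** 2) + I(x3 ** 2) + I(x1 * x2) + I(x1 * x3) + I(x2 * x3)
print(formula)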

res = sm.OLS.from_formula(formula, data=data).fit()
print(res.params)
rst = OLSInfluence(res).summary_frame().student_resid
Example #16
temp_index = ft.studentized_residual(exp['Income_ln'],
                                     exp['avg_exp_ln'], ['Income_ln'],
                                     'avg_exp_ln',
                                     num=2)
print(temp_index)
exp.loc[temp_index]
# In[]
# Outlier detection on features
ft.outlier_detection(exp, 'Income_ln', exp[['avg_exp_ln']], 'avg_exp_ln')

# In[23]:
# 7.3.2.2 The statsmodels package provides more influential-point diagnostics
from statsmodels.stats.outliers_influence import OLSInfluence

# Compute influential points for the ln(exp) ~ ln(Income) model
OLSInfluence(ana3).summary_frame().head()

# 7.3.3 Multicollinearity analysis: variance inflation factor
# Adding variables
# After the single-variable regression we have a basic sense of the model, so
# we can now add more continuous explanatory variables. Before adding them,
# mind the functional form: local average house price and local average income
# behave like personal income and should also be log-transformed.
# In[24]:
# exp2 is the data set with the two influential points already removed
exp2['dist_home_val_ln'] = np.log(exp2['dist_home_val'])  # average house price in the neighborhood (10k CNY)
exp2['dist_avg_income_ln'] = np.log(exp2['dist_avg_income'])  # local per-capita income

# Fit the linear regression with the ols class
# First pass: Income_ln and dist_avg_income_ln are strongly correlated, so one
# must be dropped (based on the variance inflation factor). R-squared=0.553
ana5 = ols(
    '''avg_exp_ln ~ Income_ln + dist_home_val_ln + dist_avg_income_ln''',
    exp2).fit()
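
# A minimal sketch of the VIF check mentioned above (an addition; assumes
# statsmodels' variance_inflation_factor and the exp2 columns used in ana5):
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

exog = sm.add_constant(exp2[['Income_ln', 'dist_home_val_ln', 'dist_avg_income_ln']])
for i, col in enumerate(exog.columns):
    print(col, variance_inflation_factor(exog.values, i))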
# Second pass
Example #17
File: MLR.py  Project: iris3333/pre2
print("Normality (Jarque-Bera P-value)", round(jarque_bera(reg.resid)[1], 3))
print("Homoscedasticity (Breusch-Pagan P-value)",
      round(het_breuschpagan(reg.resid, reg.model.exog)[3], 3))
print()

outlier = pd.DataFrame(reg.outlier_test(method="bonf", alpha=0.05))
outlier = outlier.rename(columns={
    "student_resid": "resid",
    "unadj_p": "unadj_p",
    "bonf(p)": "bonf_p"
})
print(outlier[outlier.bonf_p < 0.05])
print()

leverage = OLSInfluence(reg).summary_frame().loc[:, ["hat_diag"]]
print(leverage[leverage.hat_diag > 0.2])
print()

influence = OLSInfluence(reg).summary_frame().loc[:, ["cooks_d"]]
print(influence[influence.cooks_d > (4 / (len(df) - len(df.columns) - 1))])
print()

fig, ax = plt.subplots()
ax.scatter(x=df["Price"],
           y=reg.predict(sm.add_constant(df.ix[:, :-1])),
           c="Black",
           s=9)
ax.set(xlim=(-4.5, -0.5), ylim=(-4.5, -0.5))
ax.plot(ax.get_xlim(), ax.get_ylim(), linewidth=1, color="Red", linestyle="--")
plt.xlabel("Actual Price", fontsize=10)
Example #18
def residual_analysis(self):
    # Note: this assignment rebinds the attribute name, replacing the bound
    # method with the summary DataFrame after the first call.
    self.residual_analysis = OLSInfluence(self.results).summary_frame()
Example #19

name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
test = sms.jarque_bera(results.resid)
lzip(name, test)

# Omni test:

name = ['Chi^2', 'Two-tail probability']
test = sms.omni_normtest(results.resid)
lzip(name, test)

# ## Influence tests
#
# Once created, an object of class ``OLSInfluence`` holds attributes and methods that allow users to assess the influence of each observation. For example, we can compute and extract the first few rows of DFbetas by:

from statsmodels.stats.outliers_influence import OLSInfluence
test_class = OLSInfluence(results)
test_class.dfbetas[:5, :]

# Explore other options by typing ``dir(test_class)``
#
# Useful information on leverage can also be plotted:

from statsmodels.graphics.regressionplots import plot_leverage_resid2
print(plot_leverage_resid2(results))

# Other plotting options can be found on the [Graphics page.](http://www.statsmodels.org/stable/graphics.html)

# ## Multicollinearity
#
# Condition number:
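
# A sketch of the step this comment introduces: the condition number of the
# design matrix (large values indicate multicollinearity):
import numpy as np
np.linalg.cond(results.model.exog)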
Example #20
    def save_residuals(self,
                       unstandardized=True,
                       standardized=False,
                       studentized=False,
                       deleted=False,
                       studentized_deleted=False,
                       add_to_data=False):
        """
        Produce values of various residuals. 
        Residuals are returned only for data used to fit a model.
        
        Parameters
        ----------
        unstandardized : bool 
            Whether to save unstandardized (raw) residuals
        standardized : bool 
            Whether to save standardized (z-scores) residuals
        studentized : bool 
            Whether to save studentized residuals
        deleted : bool 
            Whether to save deleted residuals
        studentized_deleted : bool
            Whether to save studentized deleted residuals
        add_to_data : bool
            Whether to merge new values with data.
            Currently, this option returns data with a sorted index

        Returns
        -------
        pd.DataFrame
            Requested residuals
        """

        # Map the selected flag names to column labels, e.g. 'standardized' ->
        # 'Standard. res.' and 'studentized_deleted' -> 'Student. del. res.'
        columns_to_show = [f'{k.capitalize().replace("ized", ".").replace("eted", ".").replace("_", " ")} res.' \
                           for k, v in vars().items() if v is True and k != 'add_to_data']

        infl = OLSInfluence(self._model)

        result = []

        res_unstand = infl.resid
        res_unstand.name = 'Unstandard. res.'

        res_stand = (res_unstand - res_unstand.mean()) / res_unstand.std()
        res_stand.name = 'Standard. res.'

        res_stud = infl.resid_studentized_internal
        res_stud.name = 'Student. res.'

        result.extend([res_unstand, res_stand, res_stud])

        if deleted:
            res_del = infl.resid_press
            res_del.name = 'Del. res.'
            result.append(res_del)

        if studentized_deleted:
            res_stud_del = infl.resid_studentized_external
            res_stud_del.name = 'Student. del. res.'
            result.append(res_stud_del)

        result = pd.concat(result, axis=1)
        result = result[columns_to_show].copy()

        if add_to_data:
            result = pd.concat([self._data, result], axis=1)

        return result
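
# Hypothetical usage sketch (assumes a wrapper object exposing save_residuals,
# with a fitted OLS result in self._model and the original data in self._data):
# res_df = analysis.save_residuals(standardized=True, studentized_deleted=True)
# res_df.head()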
Example #21

# The _statsmodels_ package has the most developed support for outlier analysis.

house_98105 = house.loc[house['ZipCode'] == 98105, ]

predictors = ['SqFtTotLiving', 'SqFtLot', 'Bathrooms', 'Bedrooms', 'BldgGrade']
outcome = 'AdjSalePrice'

house_outlier = sm.OLS(house_98105[outcome],
                       house_98105[predictors].assign(const=1))
result_98105 = house_outlier.fit()
print(result_98105.summary())

# The `OLSInfluence` class is initialized with the OLS regression results and gives access to a number of useful properties. Here we use the studentized residuals.

influence = OLSInfluence(result_98105)
sresiduals = influence.resid_studentized_internal

print(sresiduals.idxmin(), sresiduals.min())

print(result_98105.resid.loc[sresiduals.idxmin()])

outlier = house_98105.loc[sresiduals.idxmin(), :]
print('AdjSalePrice', outlier[outcome])
print(outlier[predictors])

### Influential values

from scipy.stats import linregress

np.random.seed(5)
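
# A minimal sketch of the influential-values check this heading introduces
# (an addition; it reuses result_98105 and house_98105 from above):
influence = OLSInfluence(result_98105)
cooks_d, _ = influence.cooks_distance
# A common rule of thumb flags points with Cook's distance above 4/n.
print(house_98105.index[cooks_d > 4 / len(house_98105)])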
Example #22

    def fit(self, data_frame: DataFrame):
        self.data_frame = data_frame

        t_ref = data_frame.reference_temperature
        t_ref_vector = t_ref * np.ones(len(data_frame.temp))

        c_0 = data_frame.reference_value
        c_1 = data_frame.reference_cvalue

        (self.aux_values, self.aux_weights) = auxiliary_function(data_frame)

        updated_experiment = data_frame.experiment - c_0 * np.ones(len(data_frame.temp)) - \
                             c_1 * (data_frame.temp - t_ref_vector)

        self.updated_matrix = np.column_stack(
            [data_frame.temp ** i + (i - 1) * t_ref_vector ** i - i * data_frame.temp * t_ref_vector ** (i - 1) \
             for i in range(self.min_power, self.max_power + 1) if i not in [0, 1]])

        ols_result = sm.OLS(updated_experiment, self.updated_matrix).fit()
        # cooks_distance_influential = 4/(len(self.data_frame.temp - (self.max_power - self.min_power) - 1))
        # ols_cooks_distance = OLSInfluence(ols_result).cooks_distance[1]
        # dfbetas is a 2-D (nobs x k_vars) array and cannot be compared to a
        # scalar row by row; the variable name suggests studentized residuals
        # were intended, so use those instead:
        ols_stud_residuals = OLSInfluence(ols_result).resid_studentized_external
        # ols_influence = OLSInfluence(ols_result).influence

        # Downweight observations with large residuals; the original loop
        # rebound `w` itself to a scalar instead of updating the weight vector.
        w = np.ones(len(data_frame.temp))
        for i, residual in enumerate(ols_stud_residuals):
            w[i] = 0.1 if residual > 2 else 1.0

        self.aux_fit = sm.WLS(updated_experiment,
                              self.updated_matrix,
                              weights=w).fit()

        self.aux_coefficients = self.aux_fit.params

        a_1 = c_1 - \
              sum([i * self.aux_coefficients[i - self.min_power] * t_ref ** (i - 1) \
                   for i in range(self.min_power, 0)]) - \
              sum([i * self.aux_coefficients[i - 2 - self.min_power] * t_ref ** (i - 1) \
                   for i in range(2, self.max_power + 1)])
        a_0 = c_0 - a_1 * t_ref - \
              sum([self.aux_coefficients[i - self.min_power] * t_ref ** i \
                   for i in range(self.min_power, 0)]) - \
              sum([self.aux_coefficients[i - 2 - self.min_power] * t_ref ** i \
                   for i in range(2, self.max_power + 1)])

        self.fit_coefficients = []
        self.fit_coefficients.extend(self.aux_coefficients[:-self.min_power])
        self.fit_coefficients.extend([a_0, a_1])
        self.fit_coefficients.extend(self.aux_coefficients[-self.min_power:])

        self.source_matrix = np.vstack([
            data_frame.temp**i if i != 0 else np.ones(len(data_frame.temp))
            for i in range(self.min_power, self.max_power + 1)
        ]).T

        self.fit = np.dot(self.source_matrix, self.fit_coefficients)

        self.heat_capacity_matrix = np.vstack([
            i * data_frame.temp**(i - 1)
            for i in range(self.min_power, self.max_power + 1)
        ]).T
        self.fit_heat_capacity = np.dot(self.heat_capacity_matrix,
                                        self.fit_coefficients)