def qq_plot(self, ax=None):
    """
    Standardized Residual vs Theoretical Quantile plot.

    Used to visually check if residuals are normally distributed.
    Points spread along the diagonal line will suggest so.

    The three observations with the largest absolute value in
    ``self.residual_norm`` are labelled with their positional index.

    Parameters
    ----------
    ax : matplotlib Axes, optional
        Axes to draw on. A new figure/axes pair is created when omitted.

    Returns
    -------
    The axes the plot was drawn on.
    """
    if ax is None:
        _, ax = plt.subplots()

    QQ = ProbPlot(self.residual_norm)
    QQ.qqplot(line='45', alpha=0.5, lw=1, ax=ax)

    # annotations
    resid = np.asarray(self.residual_norm)
    # The Q-Q plot pairs the k-th smallest residual with the k-th theoretical
    # quantile, so an observation's x-coordinate is the quantile at its rank
    # among the sorted residuals (not at its rank by absolute value).
    order = np.argsort(resid)
    rank = np.empty_like(order)
    rank[order] = np.arange(order.size)
    abs_norm_resid_top_3 = np.flip(np.argsort(np.abs(resid)), 0)[:3]
    for i in abs_norm_resid_top_3:
        ax.annotate(i,
                    xy=(QQ.theoretical_quantiles[rank[i]], resid[i]),
                    ha='right', color='C3')

    ax.set_title('Normal Q-Q', fontweight="bold")
    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('Standardized Residuals')
    return ax
def QQ_plot(model_fit, data):
    """Draw a 5x5 normal Q-Q plot of the internally studentized residuals.

    The three residuals with the largest absolute value are labelled with
    their positional index. ``data`` is accepted but not used by this
    function.

    Returns the axes holding the plot.
    """
    studentized = model_fit.get_influence().resid_studentized_internal

    prob_plot = ProbPlot(studentized)
    figure = prob_plot.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
    figure.set_figheight(5)
    figure.set_figwidth(5)

    axes = figure.axes[0]
    axes.set_title('Normal Q-Q')
    axes.set_xlabel('Theoretical Quantiles')
    axes.set_ylabel('Standardized Residuals')

    # annotations: indices of the 3 most extreme observations
    most_extreme = np.argsort(np.abs(studentized))[-3:]
    for obs in most_extreme:
        value = studentized[obs]
        # locate this residual among the sorted sample quantiles to find
        # the theoretical quantile it was plotted against
        matches = np.where(prob_plot.sample_quantiles == value)
        position = matches[0][0]
        axes.annotate(obs,
                      xy=(prob_plot.theoretical_quantiles[position], value))

    figure.tight_layout()
    return axes
def test_param_unpacking():
    """Check ``ProbPlot.fit_params`` for frozen beta distributions.

    The frozen distribution's shape, loc and scale may be given
    positionally or by keyword; ``fit_params`` is expected to be
    ``[a, b, loc, scale]`` in every spelling.
    """
    # the only case that passes the distribution through the ``dist`` keyword
    keyword_plot = ProbPlot(np.empty(100), dist=stats.beta(2, 3))
    assert_equal(keyword_plot.fit_params, np.array([2.0, 3, 0, 1]))

    cases = (
        ([2.0, 3, 0, 1], (
            stats.beta(2, b=3),
            stats.beta(a=2, b=3),
        )),
        ([2.0, 3, 4, 1], (
            stats.beta(2, 3, 4),
            stats.beta(a=2, b=3, loc=4),
        )),
        ([2.0, 3, 4, 5], (
            stats.beta(2, 3, 4, 5),
            stats.beta(2, 3, 4, scale=5),
            stats.beta(2, 3, loc=4, scale=5),
            stats.beta(2, b=3, loc=4, scale=5),
            stats.beta(a=2, b=3, loc=4, scale=5),
        )),
    )
    for params, frozen_dists in cases:
        expected = np.array(params)
        for frozen in frozen_dists:
            assert_equal(ProbPlot(np.empty(100), frozen).fit_params, expected)
def diagnostic_plots(X, y, model_fit=None):
    """
    Function to reproduce base diagnostic plots of an OLS model from R.

    The code in this block draws two panels, each on its own figure:
    Residuals vs Fitted and Normal Q-Q.

    ---
    Inputs:

    X: The features to use in building the linear regression model.
       X and y are combined with ``pd.concat``, so pandas objects are
       expected here.

    y: The target variable of the linear regression model (pandas
       series/dataframe).

    model_fit [optional]: a statsmodel.api.OLS model after regressing y on X.
                          If not provided, will be generated from X, y
    """
    if not model_fit:
        model_fit = sm.OLS(y, sm.add_constant(X)).fit()

    # create dataframe from X, y for easier plot handling
    dataframe = pd.concat([X, y], axis=1)

    # model values
    model_fitted_y = model_fit.fittedvalues
    # normalized residuals
    model_norm_residuals = model_fit.get_influence().resid_studentized_internal

    # Residuals vs Fitted Plot
    plt.figure(figsize=(8, 5))
    # x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
    # positionally. residplot draws on the current (just created) figure.
    resid_ax = sns.residplot(x=model_fitted_y,
                             y=dataframe.columns[-1],
                             data=dataframe,
                             lowess=True,
                             scatter_kws={'alpha': 0.5},
                             line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
    resid_ax.set_title('Residuals vs Fitted')
    resid_ax.set_xlabel('Fitted values')
    resid_ax.set_ylabel('Residuals')

    # Normal Q-Q Plot
    QQ = ProbPlot(model_norm_residuals)
    plot_lm_2 = QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
    qq_ax = plot_lm_2.axes[0]
    qq_ax.set_title('Normal Q-Q')
    qq_ax.set_xlabel('Theoretical Quantiles')
    qq_ax.set_ylabel('Standardized Residuals')
def setup(self):
    """Fit OLS on the Longley data and build the probability plots under test."""
    longley = sm.datasets.longley.load()
    # constant column is appended last (prepend=False)
    longley.exog = sm.add_constant(longley.exog, prepend=False)
    self.data = longley

    fitted = sm.OLS(longley.endog, longley.exog).fit()
    self.mod_fit = fitted
    self.res = fitted.resid

    # residuals compared against a t distribution with 4 degrees of freedom
    self.prbplt = ProbPlot(fitted.resid, dist=stats.t, distargs=(4,))

    # a second, random sample with the same shape as the residual data
    self.other_array = np.random.normal(size=self.prbplt.data.shape)
    self.other_prbplot = ProbPlot(self.other_array)
def plot_normality(self, ax, color, label):
    """Draw a Q-Q plot of ``self.residuals`` on *ax*, then annotate it."""
    qq_options = {
        'line': '45',
        'alpha': 0.5,
        'color': color,
        'lw': 0.5,
        'ax': ax,
        'label': label,
    }
    ProbPlot(self.residuals).qqplot(**qq_options)
    self.annotate_residuals(ax)
def test_exceptions(self):
    """A frozen ``dist`` combined with any of these options must raise ValueError."""
    conflicting_options = (
        {"fit": True},
        {"distargs": (8.5, 3.0)},
        {"loc": 8.5},
        {"scale": 3.0},
    )
    for extra in conflicting_options:
        with pytest.raises(ValueError):
            ProbPlot(self.data, dist=stats.norm(loc=8.5, scale=3.0), **extra)
def qq_plot(self, ax):
    """Draw a normal Q-Q plot of the normalised residuals on *ax*.

    Reads ``self.residuals.normalised_residuals`` and labels the three
    observations with the largest absolute value using their positional
    index.
    """
    QQ = ProbPlot(self.residuals.normalised_residuals)
    QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1, ax=ax)
    ax.set_title('Normal Q-Q')
    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('Standardised Residuals')

    # annotations
    resid = np.asarray(self.residuals.normalised_residuals)
    # The k-th smallest residual is plotted against the k-th theoretical
    # quantile, so each label's x-coordinate comes from the observation's
    # rank among the sorted residuals.
    order = np.argsort(resid)
    rank = np.empty_like(order)
    rank[order] = np.arange(order.size)
    abs_norm_resid_top_3 = np.flip(np.argsort(np.abs(resid)), 0)[:3]
    for i in abs_norm_resid_top_3:
        ax.annotate(i, xy=(QQ.theoretical_quantiles[rank[i]], resid[i]))
def check_residual_normality(residuals_normalized):
    """Draw a normal Q-Q plot of *residuals_normalized* on a new figure.

    The three residuals with the largest absolute value are labelled with
    their positional index.
    """
    qq = ProbPlot(residuals_normalized)
    plot_2 = qq.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
    ax = plot_2.axes[0]
    ax.set_title('Normal Q-Q')
    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('Standardized Residuals')

    # annotations
    resid = np.asarray(residuals_normalized)
    # The k-th smallest residual is plotted against the k-th theoretical
    # quantile, so the label position is looked up through the rank of the
    # observation among the sorted residuals.
    order = np.argsort(resid)
    rank = np.empty_like(order)
    rank[order] = np.arange(order.size)
    abs_norm_resid_top_3 = np.flip(np.argsort(np.abs(resid)), 0)[:3]
    for i in abs_norm_resid_top_3:
        ax.annotate(i, xy=(qq.theoretical_quantiles[rank[i]], resid[i]))
def check_linearity_assumption(fitted_y, residuals):
    """Draw the Residuals-vs-Fitted plot and save it as ``ResVsFitted.png``.

    A lowess-smoothed red trend line is overlaid on the residual scatter.
    No points are annotated.
    """
    plt.figure()
    # x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
    # positionally. residplot draws on the current (just created) figure.
    ax = sns.residplot(x=fitted_y,
                       y=residuals,
                       lowess=True,
                       scatter_kws={'alpha': 0.5},
                       line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
    ax.set_title('Residuals vs Fitted')
    ax.set_xlabel('Fitted values')
    ax.set_ylabel('Residuals')

    plt.savefig("ResVsFitted.png")
def slm_plot_qq(fitted_model, ax=None, scolor='C0', lcolor='C1', lw=2, line=True, annotations=3):
    """Produce standard Q-Q plot.

    Plots the sorted internally studentized residuals of *fitted_model*
    against the theoretical quantiles.

    Parameters
    ----------
    fitted_model : fitted model exposing ``get_influence()`` and ``resid``
    ax : axes to draw on, forwarded to ``sns.scatterplot``
    scolor, lcolor : colours of the scatter points and the reference line
    lw : width of the reference line
    line : draw the y = x reference line when truthy
    annotations : number of largest-magnitude residuals to label with their
        entry in ``fitted_model.resid.index``; falsy disables labelling

    Returns
    -------
    The axes returned by ``sns.scatterplot``.
    """
    resids = fitted_model.get_influence().resid_studentized_internal
    pp = ProbPlot(resids)
    # x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
    # positionally.
    ax = sns.scatterplot(x=pp.theoretical_quantiles,
                         y=pp.sorted_data,
                         ax=ax,
                         color=scolor,
                         linewidth=0,
                         alpha=0.7)
    if line:
        ends = pp.theoretical_quantiles[[0, -1]]
        ax.plot(ends, ends, color=lcolor, lw=lw)
    if annotations:
        # idxs: positions in the original order (used for the label);
        # jdxs: positions in the sorted order (used for the coordinates).
        # Both are ordered by decreasing magnitude, so they pair up.
        idxs = pd.Series(resids).abs().nlargest(annotations).index
        jdxs = pd.Series(pp.sorted_data).abs().nlargest(annotations).index
        for idx, jdx in zip(idxs, jdxs):
            point = (pp.theoretical_quantiles[jdx], pp.sorted_data[jdx])
            ax.annotate(fitted_model.resid.index[idx], point)
    ax.set_title('Normal Q-Q')
    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('Standardised Residuals')
    return ax
def annotate_residuals(self, ax):
    """Label the three largest-magnitude residuals of a Q-Q plot on *ax*.

    Each label is the ``self.temperature`` entry of the observation and is
    placed at the point where that observation appears in a Q-Q plot of
    ``self.residuals``.
    """
    qq = ProbPlot(self.residuals)
    resid = np.asarray(self.residuals)
    # The k-th smallest residual is plotted against the k-th theoretical
    # quantile, so the x-coordinate of an observation is the quantile at its
    # rank among the sorted residuals. This holds for either sign, which the
    # previous sign-flip of the largest quantiles did not guarantee.
    order = np.argsort(resid)
    rank = np.empty_like(order)
    rank[order] = np.arange(order.size)
    top_3_residuals = np.flip(np.argsort(np.abs(resid)), 0)[:3]
    for i in top_3_residuals:
        ax.annotate(self.temperature[i],
                    xy=(qq.theoretical_quantiles[rank[i]], resid[i]))
def test_invalid_dist_config(close_figures):
    """Too many ``distargs`` for ``stats.t`` must raise a descriptive TypeError."""
    # GH 4226
    np.random.seed(5)
    longley = sm.datasets.longley.load(as_pandas=False)
    longley.exog = sm.add_constant(longley.exog, prepend=False)
    residuals = sm.OLS(longley.endog, longley.exog).fit().resid

    # the error message is expected to spell out the attempted call
    expected_message = r"dist\(0, 1, 4, loc=0, scale=1\)"
    with pytest.raises(TypeError, match=expected_message):
        ProbPlot(residuals, stats.t, distargs=(0, 1, 4))
def setup(self):
    """Seed the RNG, fit OLS on Longley and build a fitted t-based ProbPlot."""
    np.random.seed(5)

    longley = sm.datasets.longley.load()
    # constant column is appended last (prepend=False)
    longley.exog = sm.add_constant(longley.exog, prepend=False)
    self.data = longley

    self.mod_fit = sm.OLS(longley.endog, longley.exog).fit()
    self.prbplt = ProbPlot(
        self.mod_fit.resid,
        dist=stats.t,
        distargs=(4,),
        fit=True,
    )
    self.line = "r"

    # remaining fixtures come from the parent class
    super().setup()
def qqplot(X, y, model_fit=None):
    """Draw a normal Q-Q plot of a model's internally studentized residuals.

    When *model_fit* is not supplied, an OLS model of *y* on *X* (with an
    added constant) is fitted first. The five residuals with the largest
    absolute value are labelled with their positional index.
    """
    if not model_fit:
        model_fit = smf.OLS(y, smf.add_constant(X)).fit()
    model_norm_residuals = model_fit.get_influence().resid_studentized_internal

    QQ = ProbPlot(model_norm_residuals)
    plot_lm_2 = QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
    ax = plot_lm_2.axes[0]
    ax.set_title('Normal Q-Q')
    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('Standardized Residuals')

    # annotations
    resid = np.asarray(model_norm_residuals)
    # The k-th smallest residual is plotted against the k-th theoretical
    # quantile, so the label position is looked up through the rank of the
    # observation among the sorted residuals.
    order = np.argsort(resid)
    rank = np.empty_like(order)
    rank[order] = np.arange(order.size)
    abs_norm_resid_top_5 = np.flip(np.argsort(np.abs(resid)), 0)[:5]
    for i in abs_norm_resid_top_5:
        ax.annotate(i, xy=(QQ.theoretical_quantiles[rank[i]], resid[i]))
def plot_qq(resid, title='', ax=None, z=2.807, strftime='%Y-%m-%d'):
    """Q-Q plot of *resid* with outliers labelled and returned.

    A point is an outlier when the absolute value of its sample quantile
    (from ``ProbPlot(resid, fit=True)``) exceeds *z*. Outliers are labelled
    with their index entry, formatted with *strftime* when it is truthy and
    with ``str`` otherwise.

    *resid* must provide ``sort_values()`` and ``index`` (presumably a
    pandas Series -- confirm against callers).

    Returns a DataFrame of the outliers' sorted residual values and sample
    quantiles, indexed by their labels.
    """
    pp = ProbPlot(resid, fit=True)
    is_outlier = abs(pp.sample_quantiles) > z

    ax = ax or plt.gca()
    pp.qqplot(ax=ax, color='C0', alpha=.5)
    sm.qqline(ax=ax, line='45', fmt='r--', lw=1)

    # ProbPlot works on sorted data, so sort the index the same way before
    # applying the outlier mask
    outlier_labels = resid.sort_values().index[is_outlier]
    outlier_points = zip(pp.theoretical_quantiles[is_outlier],
                         pp.sample_quantiles[is_outlier],
                         outlier_labels)
    for x_pos, y_pos, label in outlier_points:
        if strftime:
            text = label.strftime(strftime)
        else:
            text = str(label)
        ax.annotate(text, xy=(x_pos, y_pos), c='m')

    ax.set_title(title or 'Normal Q-Q')
    ax.set_ylabel('Standardized residuals')

    columns = {
        'residuals': pp.sorted_data[is_outlier],
        'standardized': pp.sample_quantiles[is_outlier],
    }
    return DataFrame(columns, index=outlier_labels)
def plotQQ(self, annot=0, ax=None):
    """Draw a normal Q-Q plot of ``self.df['residual_stud']``.

    Parameters
    ----------
    annot : int
        Number of largest-magnitude residuals to label with their index
        entry in ``self.df``; 0 disables labelling.
    ax : matplotlib Axes, optional
        Axes to draw on; ``plt.subplot()`` is used when omitted.

    Returns
    -------
    The axes the plot was drawn on.
    """
    if not ax:
        ax = plt.subplot()

    studentized = self.df['residual_stud']
    QQ = ProbPlot(studentized)
    QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1, ax=ax)
    ax.set_title('Normal Q-Q')
    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('Standardized Residuals')

    # annotations
    if annot:
        resid = np.asarray(studentized)
        # The k-th smallest residual is plotted against the k-th theoretical
        # quantile, so the label position is looked up through the rank of
        # the observation among the sorted residuals.
        order = np.argsort(resid)
        rank = np.empty_like(order)
        rank[order] = np.arange(order.size)
        abs_norm_resid_top = np.flip(np.argsort(np.abs(resid)), 0)[:annot]
        for pos in abs_norm_resid_top:
            ax.annotate(studentized.index[pos],
                        xy=(QQ.theoretical_quantiles[rank[pos]], resid[pos]))
    return ax
def QQ(model_norm_residuals):
    """Draw a 12x8 normal Q-Q plot of *model_norm_residuals*.

    The three residuals with the largest absolute value are labelled with
    their positional index. Returns None.
    """
    prob_plot = ProbPlot(model_norm_residuals)
    plot_lm_2 = prob_plot.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
    plot_lm_2.set_figheight(8)
    plot_lm_2.set_figwidth(12)

    ax = plot_lm_2.axes[0]
    ax.set_title('Normal Q-Q')
    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('Standardized Residuals')

    # annotations
    resid = np.asarray(model_norm_residuals)
    # The k-th smallest residual is plotted against the k-th theoretical
    # quantile, so the label position is looked up through the rank of the
    # observation among the sorted residuals.
    order = np.argsort(resid)
    rank = np.empty_like(order)
    rank[order] = np.arange(order.size)
    abs_norm_resid_top_3 = np.flip(np.argsort(np.abs(resid)), 0)[:3]
    for i in abs_norm_resid_top_3:
        ax.annotate(i, xy=(prob_plot.theoretical_quantiles[rank[i]], resid[i]))
    return
def qqPlot(fitted):
    """Draw a 12x8 normal Q-Q plot of *fitted*'s internally studentized residuals.

    The three residuals with the largest absolute value are labelled with
    their positional index.

    Returns the figure produced by ``ProbPlot.qqplot``.
    """
    # compute the influence measures once instead of on every use
    resid = np.asarray(fitted.get_influence().resid_studentized_internal)

    QQ = ProbPlot(resid)
    plot_lm_2 = QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
    plot_lm_2.set_figheight(8)
    plot_lm_2.set_figwidth(12)

    ax = plot_lm_2.axes[0]
    ax.set_title('Normal Q-Q')
    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('Standardized Residuals')

    # annotations
    # The k-th smallest residual is plotted against the k-th theoretical
    # quantile, so the label position is looked up through the rank of the
    # observation among the sorted residuals.
    order = np.argsort(resid)
    rank = np.empty_like(order)
    rank[order] = np.arange(order.size)
    abs_norm_resid_top_3 = np.flip(np.argsort(np.abs(resid)), 0)[:3]
    for i in abs_norm_resid_top_3:
        ax.annotate(i, xy=(QQ.theoretical_quantiles[rank[i]], resid[i]))
    return plot_lm_2
class TestCompareSamplesDifferentSize:
    """Compare two ProbPlot samples of different sizes (37 and 55 points)."""

    def setup(self):
        np.random.seed(5)
        # drawn in this order so the seeded random stream is consumed
        # by the smaller sample first
        smaller, larger = (
            ProbPlot(np.random.normal(loc=8.25, scale=3.25, size=n_obs))
            for n_obs in (37, 55)
        )
        self.data1 = smaller
        self.data2 = larger

    @pytest.mark.matplotlib
    def test_qqplot(self, close_figures):
        smaller, larger = self.data1, self.data2
        smaller.qqplot(other=larger)
        # the reverse comparison is expected to be rejected
        with pytest.raises(ValueError):
            larger.qqplot(other=smaller)

    @pytest.mark.matplotlib
    def test_ppplot(self, close_figures):
        # both directions are expected to work for ppplot
        smaller, larger = self.data1, self.data2
        smaller.ppplot(other=larger)
        larger.ppplot(other=smaller)
def setup(self):
    """Create the figure/axes (when matplotlib is importable) and shared fixtures.

    Reads ``self.prbplt``, which must already be set before this runs.
    """
    try:
        import matplotlib.pyplot as plt

        figure_and_axes = plt.subplots()
        self.fig, self.ax = figure_and_axes
    except ImportError:
        # matplotlib is optional here; fig/ax are simply left unset
        pass

    # random sample with the same shape as the data of the plot under test
    reference_shape = self.prbplt.data.shape
    self.other_array = np.random.normal(size=reference_shape)
    self.other_prbplot = ProbPlot(self.other_array)

    self.plot_options = {
        "marker": "d",
        "markerfacecolor": "cornflowerblue",
        "markeredgecolor": "white",
        "alpha": 0.5,
    }
def check_residual_normality(fitted_y, t_residuals_normalized):
    """Draw a normal Q-Q plot of the residuals and save it as ``Normality.png``.

    Residuals whose absolute value exceeds the largest theoretical quantile
    are labelled with the matching entry of ``fitted_y.index``.
    """
    qq = ProbPlot(t_residuals_normalized)
    plot_2 = qq.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
    ax = plot_2.axes[0]
    ax.set_title('Normal Q-Q')
    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('T Student Standardized Residuals')

    # annotations
    resid = np.asarray(t_residuals_normalized)
    # Position of every observation among the sorted residuals. An
    # argsort-based rank gives each observation a distinct position even
    # when values tie, unlike a dense rank, which would shift every rank
    # after a tie away from its plot position.
    order = np.argsort(resid)
    ranks = np.empty_like(order)
    ranks[order] = np.arange(order.size)

    s_thresh = max(qq.theoretical_quantiles)
    abs_norm_resid_top = np.flatnonzero(np.abs(resid) > s_thresh)
    for i in abs_norm_resid_top:
        ax.annotate(fitted_y.index[i],
                    xy=(qq.theoretical_quantiles[ranks[i]], resid[i]))

    plt.savefig("Normality.png")
def qq_plot(results, n_annotate=3):
    """Build a 12x8 normal Q-Q figure of the internally studentized residuals.

    Parameters
    ----------
    results : fitted model exposing ``get_influence()``
    n_annotate : int
        Number of largest-magnitude residuals to label with their
        positional index (capped at the number of residuals).

    Returns
    -------
    The figure. The current pyplot figure is closed before returning.
    """
    # normalized residuals
    model_norm_residuals = np.asarray(
        results.get_influence().resid_studentized_internal)

    QQ = ProbPlot(model_norm_residuals)
    fig = QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
    fig.set_figheight(8)
    fig.set_figwidth(12)

    ax = fig.axes[0]
    ax.set_title('Normal Q-Q')
    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('Standardized Residuals')

    # annotations
    # The k-th smallest residual is plotted against the k-th theoretical
    # quantile, so the label position is looked up through the rank of the
    # observation among the sorted residuals.
    order = np.argsort(model_norm_residuals)
    rank = np.empty_like(order)
    rank[order] = np.arange(order.size)

    abs_norm_resid = np.flip(np.argsort(np.abs(model_norm_residuals)), 0)
    n_annotate = min(n_annotate, len(abs_norm_resid))
    for i in abs_norm_resid[:n_annotate]:
        ax.annotate(i,
                    xy=(QQ.theoretical_quantiles[rank[i]],
                        model_norm_residuals[i]))

    plt.close()
    return fig
def normal_resid_distr(norm_resid, sample_n=None, height=2, width=4):
    '''
    This plot shows if residuals are normally distributed.
    Do residuals follow a straight line well or do they deviate severely?
    It's good if residuals are lined well on the straight dashed line.

    Parameters
    ----------
    norm_resid : residuals to plot
    sample_n : int, optional
        When given and smaller than ``len(norm_resid)``, a random sample of
        that many residuals (without replacement) is plotted instead.
    height, width : figure size passed to ``set_figheight``/``set_figwidth``

    The figure is displayed with ``plt.show()``; nothing is returned.
    '''
    # matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*'
    # and later dropped the old names; fall back to 'seaborn' on older
    # versions.
    style_name = 'seaborn'
    if 'seaborn-v0_8' in plt.style.available:
        style_name = 'seaborn-v0_8'
    plt.style.use(style_name)

    # if sampling value turned on then limit data in the output
    if sample_n is not None and len(norm_resid) > sample_n:
        i = np.random.choice(len(norm_resid), sample_n, replace=False)
        norm_resid = norm_resid[i]

    ## residual normal quantile plot
    norm_q_plot = ProbPlot(norm_resid)
    plot_lm_q = norm_q_plot.qqplot(line='45', alpha=0.2, color='#4C72B0', lw=1)
    plot_lm_q.set_figheight(height)
    plot_lm_q.set_figwidth(width)

    # labels
    ax = plot_lm_q.axes[0]
    ax.set_title('Normal Quantile')
    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('Normalized Residuals')
    plt.show()
def update_qq_plot(predictors, response):
    """ Normal QQ Plot

    Fits the regression through ``MYCLASS.reg`` and returns a dict with
    'data' (two plotly scatter traces: the Q-Q points and a y = x line)
    and 'layout' entries.
    """
    MYCLASS.reg(predictors, response)
    studentized = MYCLASS.get_model().get_influence().resid_studentized_internal

    prob_plot = ProbPlot(studentized)
    theoretical = prob_plot.theoretical_quantiles
    observed = prob_plot.sample_quantiles

    points_trace = go.Scatter(
        x=theoretical,
        y=observed,
        mode='markers',
        name='data',
        marker={'size': 8, 'opacity': 0.5},
    )
    # reference line y = x
    line_trace = go.Scatter(
        x=theoretical,
        y=theoretical,
        type='scatter',
        mode='lines',
        name='line',
        line={'width': 1, 'color': 'red'},
    )
    layout = {
        'title': 'Normal Q-Q',
        'xaxis': {'title': 'Theoretical Quantiles'},
        'yaxis': {'title': 'Standardized Residuals'},
        'plot_bgcolor': '#e6e6e6',
        'showlegend': False,
        'hovermode': 'closest',
    }
    return {'data': [points_trace, line_trace], 'layout': layout}
def check_homoscedacticity(fitted_y, t_residuals_normalized):
    """Draw the Scale-Location plot and save it as ``Homoscadasticity.png``.

    Plots sqrt(|residual|) against the fitted values with a lowess trend
    line and a dashed horizontal threshold at the square root of the largest
    theoretical quantile. Points above the threshold are labelled with the
    matching entry of ``fitted_y.index``.
    """
    qq = ProbPlot(t_residuals_normalized)
    s_thresh = np.sqrt(max(qq.theoretical_quantiles))
    # square root of the absolute normalized residuals
    residuals_norm_abs_sqrt = np.sqrt(np.abs(t_residuals_normalized))

    plot_3 = plt.figure()
    plt.scatter(fitted_y, residuals_norm_abs_sqrt, alpha=0.5)
    # x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
    # positionally.
    sns.regplot(x=fitted_y,
                y=residuals_norm_abs_sqrt,
                scatter=False,
                ci=False,
                lowess=True,
                line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
    ax = plot_3.axes[0]
    ax.set_title('Scale-Location')
    ax.set_xlabel('Fitted values')
    ax.set_ylabel("$\\sqrt{|T Student Standardized Residuals|}$")
    ax.axhline(s_thresh, ls='--', color='black')

    # annotations
    abs_sq_norm_resid_top = pd.DataFrame(residuals_norm_abs_sqrt).index[
        residuals_norm_abs_sqrt > s_thresh].to_list()
    for i in abs_sq_norm_resid_top:
        ax.annotate(fitted_y.index[i],
                    xy=(fitted_y[i], residuals_norm_abs_sqrt[i]))

    plt.savefig("Homoscadasticity.png")
def qq_plot(self, var_x=None, var_y=None):
    """ Creates a normal qq plot

    Input:
    - var_x: A list of predictor variable(s) (default=None)
    - var_y: A response variable (default=None)

    When either argument is missing, both are taken from
    ``self.get_predictors()`` / ``self.get_response()`` instead.

    Raises:
    - ValueError: if no predictors or response are available.
    """
    # priority: arguments var_x, var_y
    if var_x is None or var_y is None:
        var_x = self.get_predictors()
        var_y = self.get_response()
        if var_x is None or var_y is None:
            raise ValueError('No predictors or response assigned')
    self.reg(var_x, var_y)

    model = self.get_model()
    resid_norm = model.get_influence().resid_studentized_internal
    qq_plt = ProbPlot(resid_norm)
    theo = qq_plt.theoretical_quantiles
    sample = qq_plt.sample_quantiles

    plt.scatter(theo, sample, alpha=0.5)
    # reference line y = x; x/y are passed by keyword because
    # seaborn >= 0.12 no longer accepts them positionally
    sns.regplot(x=theo,
                y=theo,
                scatter=False,
                ci=False,
                lowess=True,
                line_kws={'color': 'red', 'lw': 1, 'alpha': 1})
    plt.title('Normal Q-Q')
    plt.xlabel('Theoretical Quantiles')
    plt.ylabel('Standardized Residuals')
def plot_lm(model_fit, data, key, n=3):
    """Draw four regression diagnostic plots on one 12x8 figure (2x2 grid).

    Panels: Residuals vs Fitted, Normal Q-Q, Scale-Location and
    Residuals vs Leverage (with Cook's distance contours).

    Parameters
    ----------
    model_fit : fitted model exposing ``fittedvalues``, ``resid``,
        ``params`` and ``get_influence()``. ``resid`` must support
        ``sort_values`` (i.e. the model was fitted on pandas data).
    data : data passed to ``sns.residplot`` for the first panel
    key : name of the response column in *data*
    n : int
        Number of most extreme observations to label in every panel.
    """
    import numpy as np
    import seaborn as sns
    import matplotlib.pyplot as plt
    from statsmodels.graphics.gofplots import ProbPlot

    # fitted values (need a constant term for intercept)
    model_fitted_y = model_fit.fittedvalues
    # model residuals
    model_residuals = model_fit.resid
    # influence measures are computed once and reused below
    influence = model_fit.get_influence()
    # normalized residuals
    model_norm_residuals = np.asarray(influence.resid_studentized_internal)
    # square root of the absolute normalized residuals
    model_norm_residuals_abs_sqrt = np.sqrt(np.abs(model_norm_residuals))
    # absolute residuals
    model_abs_resid = np.abs(model_residuals)
    # leverage, from statsmodels internals
    model_leverage = influence.hat_matrix_diag
    # cook's distance, from statsmodels internals
    model_cooks = influence.cooks_distance[0]
    # positional view of the fitted values, for annotations that work with
    # positions rather than index labels
    fitted_values = np.asarray(model_fitted_y)
    line_kws = {'color': 'red', 'lw': 1, 'alpha': 0.8}

    fig = plt.figure()
    fig.set_figheight(8)
    fig.set_figwidth(12)

    # first plot
    plot_lm_1 = plt.subplot(2, 2, 1)
    # x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
    # positionally.
    sns.residplot(x=model_fitted_y,
                  y=key,
                  data=data,
                  lowess=True,
                  scatter_kws={'alpha': 0.5},
                  line_kws=line_kws)
    plot_lm_1.set_title('Residuals vs Fitted')
    plot_lm_1.set_xlabel('Fitted values')
    plot_lm_1.set_ylabel('Residuals')

    # annotations (index labels of the largest absolute residuals)
    abs_resid = model_abs_resid.sort_values(ascending=False)
    for i in abs_resid[:n].index:
        plot_lm_1.annotate(i, xy=(model_fitted_y[i], model_residuals[i]))

    # second plot
    plot_lm_2 = plt.subplot(2, 2, 2)
    QQ = ProbPlot(model_norm_residuals)
    QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1, ax=plot_lm_2)
    plot_lm_2.set_title('Normal Q-Q')
    plot_lm_2.set_xlabel('Theoretical Quantiles')
    plot_lm_2.set_ylabel('Standardized Residuals')

    # annotations
    # The k-th smallest residual is plotted against the k-th theoretical
    # quantile, so the label position is looked up through the rank of the
    # observation among the sorted residuals.
    order = np.argsort(model_norm_residuals)
    rank = np.empty_like(order)
    rank[order] = np.arange(order.size)
    abs_norm_resid_top = np.flip(np.argsort(np.abs(model_norm_residuals)), 0)[:n]
    for i in abs_norm_resid_top:
        plot_lm_2.annotate(i,
                           xy=(QQ.theoretical_quantiles[rank[i]],
                               model_norm_residuals[i]))

    # third plot
    plot_lm_3 = plt.subplot(2, 2, 3)
    plt.scatter(model_fitted_y, model_norm_residuals_abs_sqrt, alpha=0.5)
    sns.regplot(x=model_fitted_y,
                y=model_norm_residuals_abs_sqrt,
                scatter=False,
                ci=False,
                lowess=True,
                line_kws=line_kws)
    plot_lm_3.set_title('Scale-Location')
    plot_lm_3.set_xlabel('Fitted values')
    plot_lm_3.set_ylabel(r'$\sqrt{|Standardized Residuals|}$')

    # annotations: positions are looked up positionally, so no label lookup
    # can fail for a non-default index
    abs_sq_norm_resid_top = np.flip(np.argsort(model_norm_residuals_abs_sqrt), 0)[:n]
    for i in abs_sq_norm_resid_top:
        plot_lm_3.annotate(i,
                           xy=(fitted_values[i],
                               model_norm_residuals_abs_sqrt[i]))

    # fourth plot
    plot_lm_4 = plt.subplot(2, 2, 4)
    plt.scatter(model_leverage, model_norm_residuals, alpha=0.5)
    sns.regplot(x=model_leverage,
                y=model_norm_residuals,
                scatter=False,
                ci=False,
                lowess=True,
                line_kws=line_kws)
    plot_lm_4.set_xlim(0, 0.20)
    plot_lm_4.set_ylim(-3, 5)
    plot_lm_4.set_title('Residuals vs Leverage')
    plot_lm_4.set_xlabel('Leverage')
    plot_lm_4.set_ylabel('Standardized Residuals')

    # annotations (observations with the largest Cook's distance)
    leverage_top = np.flip(np.argsort(model_cooks), 0)[:n]
    for i in leverage_top:
        plot_lm_4.annotate(i, xy=(model_leverage[i], model_norm_residuals[i]))

    # shenanigans for cook's distance contours
    def graph(formula, x_range, label=None):
        x = x_range
        y = formula(x)
        plt.plot(x, y, label=label, lw=1, ls='--', color='red')

    p = len(model_fit.params)  # number of model parameters

    graph(lambda x: np.sqrt((0.5 * p * (1 - x)) / x),
          np.linspace(0.001, 0.200, 50),
          'Cook\'s distance')  # 0.5 line
    graph(lambda x: np.sqrt((1 * p * (1 - x)) / x),
          np.linspace(0.001, 0.200, 50))  # 1 line
    plt.legend(loc='upper right')
    plt.xlim(xmax=np.max(model_cooks))
    plt.tight_layout()
# annotations: label the three largest absolute residuals on the first plot
abs_resid = lm_abs_resid.sort_values(ascending=False)
abs_resid_top_3 = abs_resid[:3]

for i in abs_resid_top_3.index:
    plot_lm_1.axes[0].annotate(i, xy=(lm_fitted_y[i], lm_residuals[i]))


# **QQ plot**

# In[15]:


QQ = ProbPlot(lm_norm_residuals)
plot_lm_2 = QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)

plot_lm_2.set_figheight(8)
plot_lm_2.set_figwidth(12)

plot_lm_2.axes[0].set_title('Normal Q-Q')
plot_lm_2.axes[0].set_xlabel('Theoretical Quantiles')
plot_lm_2.axes[0].set_ylabel('Standardized Residuals')

# annotations
abs_norm_resid = np.flip(np.argsort(np.abs(lm_norm_residuals)), 0)
abs_norm_resid_top_3 = abs_norm_resid[:3]

# The k-th smallest residual is plotted against the k-th theoretical quantile,
# so each label's x-coordinate comes from the observation's rank among the
# sorted residuals.
lm_norm_resid_values = np.asarray(lm_norm_residuals)
lm_norm_resid_order = np.argsort(lm_norm_resid_values)
lm_norm_resid_rank = np.empty_like(lm_norm_resid_order)
lm_norm_resid_rank[lm_norm_resid_order] = np.arange(lm_norm_resid_order.size)

for i in abs_norm_resid_top_3:
    plot_lm_2.axes[0].annotate(i,
                               xy=(QQ.theoretical_quantiles[lm_norm_resid_rank[i]],
                                   lm_norm_resid_values[i]))
def diagnostic_plots(x_true: np.ndarray, y_true: np.ndarray, y_predicted: np.ndarray) -> None:
    """Generate diagnostic plots for regression evaluation.

    Draws a 2x2 grid: Residuals vs Fitted, Normal Q-Q, Scale-Location and
    Residuals vs Leverage with Cook's distance contours. The residuals,
    studentized residuals, Cook's distances and hat-matrix diagonal come
    from ``regression_eval_metrics``.

    Source: Emre @ https://emredjan.github.io/blog/2017/07/11/emulating-r-plots-in-python/

    Args:
        x_true (np.ndarray): Known x-values. Its second dimension is used
            as the number of model parameters for the Cook's distance
            contours.
        y_true (np.ndarray): Known y-values.
        y_predicted (np.ndarray): Predicted y-values.
    """
    residuals, studentized_residuals, cooks_distance, hat_diag = regression_eval_metrics(
        x_true, y_true, y_predicted)

    fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(18, 10))
    plt.tight_layout(pad=5, w_pad=5, h_pad=5)
    trend_line_kws = {"color": "red", "lw": 1, "alpha": 0.8}

    # 1. residual plot
    resid_ax = axs[0, 0]
    sns.residplot(x=y_predicted,
                  y=residuals,
                  lowess=True,
                  scatter_kws={"alpha": 0.5},
                  line_kws=trend_line_kws,
                  ax=resid_ax)
    resid_ax.set_title("Residuals vs Fitted")
    resid_ax.set_xlabel("Fitted values")
    resid_ax.set_ylabel("Residuals")

    # 2. qq plot
    qq_ax = axs[0, 1]
    qq = ProbPlot(studentized_residuals)
    qq.qqplot(line="45", alpha=0.5, color="#2578B2", lw=0.5, ax=qq_ax)
    qq_ax.set_title("Normal Q-Q")
    qq_ax.set_xlabel("Theoretical Quantiles")
    qq_ax.set_ylabel("Standardized Residuals")

    # 3. scale-location plot
    scale_ax = axs[1, 0]
    studentized_residuals_abs_sqrt = np.sqrt(np.abs(studentized_residuals))
    scale_ax.scatter(y_predicted, studentized_residuals_abs_sqrt, alpha=0.5)
    # x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
    # positionally.
    sns.regplot(
        x=y_predicted,
        y=studentized_residuals_abs_sqrt,
        scatter=False,
        ci=False,
        lowess=True,
        line_kws=trend_line_kws,
        ax=scale_ax,
    )
    scale_ax.set_title("Scale-Location")
    scale_ax.set_xlabel("Fitted values")
    scale_ax.set_ylabel(r"$\sqrt{|Standardised Residuals|}$")

    # 4. leverage plot
    leverage_ax = axs[1, 1]
    leverage_ax.scatter(hat_diag, studentized_residuals, alpha=0.5)
    sns.regplot(x=hat_diag,
                y=studentized_residuals,
                scatter=False,
                ci=False,
                lowess=True,
                line_kws=trend_line_kws,
                ax=leverage_ax)
    leverage_ax.set_xlim(min(hat_diag), max(hat_diag))
    leverage_ax.set_ylim(min(studentized_residuals), max(studentized_residuals))
    leverage_ax.set_title("Residuals vs Leverage")
    leverage_ax.set_xlabel("Leverage")
    leverage_ax.set_ylabel("Standardised Residuals")

    # annotations (three observations with the largest Cook's distance)
    leverage_top_3 = np.flip(np.argsort(cooks_distance), 0)[:3]
    for i in leverage_top_3:
        leverage_ax.annotate(i, xy=(hat_diag[i], studentized_residuals[i]))

    def graph(formula, x_range, label=None):
        x = x_range
        y = formula(x)
        leverage_ax.plot(x, y, label=label, lw=1, ls="--", color="red")

    p = np.size(x_true, 1)  # number of model parameters

    graph(lambda x: np.sqrt((0.5 * p * (1 - x)) / x),
          np.linspace(0.001, max(hat_diag), 50),
          "Cook's distance")
    graph(lambda x: np.sqrt((1 * p * (1 - x)) / x),
          np.linspace(0.001, max(hat_diag), 50))
    leverage_ax.legend(loc="upper right")