def qq_plot(self, ax=None):
        """
        Standardized Residual vs Theoretical Quantile plot

        Used to visually check if residuals are normally distributed.
        Points spread along the diagonal line will suggest so.

        The three observations of ``self.residual_norm`` with the largest
        absolute value are labelled with their positional index.

        Parameters
        ----------
        ax : matplotlib Axes, optional
            Axes to draw on; a new figure and axes are created when omitted.

        Returns
        -------
        The axes the plot was drawn on.
        """
        if ax is None:
            _, ax = plt.subplots()

        QQ = ProbPlot(self.residual_norm)
        QQ.qqplot(line='45', alpha=0.5, lw=1, ax=ax)

        # annotations
        resid = np.asarray(self.residual_norm)
        # rank[k] is the position of observation k once the sample is sorted
        # in ascending order, i.e. the index of its point on the Q-Q curve.
        order = np.argsort(resid)
        rank = np.empty_like(order)
        rank[order] = np.arange(order.size)

        abs_norm_resid_top_3 = np.flip(np.argsort(np.abs(resid)), 0)[:3]
        for i in abs_norm_resid_top_3:
            # Look the quantile up by rank so that large *negative* residuals
            # are labelled at the lower-left end of the curve instead of at
            # the largest theoretical quantiles.
            ax.annotate(i,
                        xy=(QQ.theoretical_quantiles[rank[i]], resid[i]),
                        ha='right',
                        color='C3')

        ax.set_title('Normal Q-Q', fontweight="bold")
        ax.set_xlabel('Theoretical Quantiles')
        ax.set_ylabel('Standardized Residuals')
        return ax
Exemplo n.º 2
0
def QQ_plot(model_fit, data):
    """Draw a 5x5 normal Q-Q plot of the model's internally studentized residuals.

    The three observations with the largest absolute residual are labelled
    with their positional index. ``data`` is accepted but not used.

    Returns the axes of the created figure.
    """
    std_resid = model_fit.get_influence().resid_studentized_internal

    prob = ProbPlot(std_resid)
    figure = prob.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
    figure.set_figheight(5)
    figure.set_figwidth(5)

    ax = figure.axes[0]
    ax.set_title('Normal Q-Q')
    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('Standardized Residuals')

    # positional indices of the 3 most extreme observations
    most_extreme = np.argsort(np.abs(std_resid))[-3:]

    for obs in most_extreme:
        value = std_resid[obs]
        # first point on the curve whose sample quantile equals this residual
        point = np.where(prob.sample_quantiles == value)[0][0]
        ax.annotate(obs, xy=(prob.theoretical_quantiles[point], value))

    figure.tight_layout()
    return ax
Exemplo n.º 3
0
def test_param_unpacking():
    """Check ``ProbPlot.fit_params`` for frozen beta distributions.

    Each group lists equivalent ways of spelling the same shape / loc / scale
    arguments; all of them must yield the same ``fit_params``.
    """

    def fit_params(*args, **kwargs):
        # The data content is irrelevant here, only the distribution matters.
        return ProbPlot(np.empty(100), *args, **kwargs).fit_params

    # shape parameters only: loc and scale are reported as 0 and 1
    shapes_only = np.array([2.0, 3, 0, 1])
    assert_equal(fit_params(dist=stats.beta(2, 3)), shapes_only)
    assert_equal(fit_params(stats.beta(2, b=3)), shapes_only)
    assert_equal(fit_params(stats.beta(a=2, b=3)), shapes_only)

    # shape parameters plus loc
    with_loc = np.array([2.0, 3, 4, 1])
    assert_equal(fit_params(stats.beta(2, 3, 4)), with_loc)
    assert_equal(fit_params(stats.beta(a=2, b=3, loc=4)), with_loc)

    # shape parameters plus loc and scale
    with_loc_scale = np.array([2.0, 3, 4, 5])
    assert_equal(fit_params(stats.beta(2, 3, 4, 5)), with_loc_scale)
    assert_equal(fit_params(stats.beta(2, 3, 4, scale=5)), with_loc_scale)
    assert_equal(fit_params(stats.beta(2, 3, loc=4, scale=5)), with_loc_scale)
    assert_equal(
        fit_params(stats.beta(2, b=3, loc=4, scale=5)), with_loc_scale
    )
    assert_equal(
        fit_params(stats.beta(a=2, b=3, loc=4, scale=5)), with_loc_scale
    )
Exemplo n.º 4
0
def diagnostic_plots(X, y, model_fit=None):
    """
    Function to reproduce base diagnostic plots of an OLS model from R.

    As written, this function draws two of them, each on its own figure:
    "Residuals vs Fitted" and "Normal Q-Q".

    ---
    Inputs:

    X: The features used in building the linear regression model. It is
       joined with ``y`` through ``pd.concat``, so it has to be a pandas
       object.

    y: The target variable of the linear regression model (pandas
       series/dataframe). Its last column is used as the response in the
       residual plot.

    model_fit [optional]: a statsmodel.api.OLS model after regressing y on X. If not provided, will be
                          generated from X, y
    """

    if model_fit is None:
        model_fit = sm.OLS(y, sm.add_constant(X)).fit()

    # create dataframe from X, y for easier plot handling
    dataframe = pd.concat([X, y], axis=1)

    # model values
    model_fitted_y = model_fit.fittedvalues
    # normalized residuals
    model_norm_residuals = model_fit.get_influence().resid_studentized_internal

    # Residuals vs Fitted Plot
    plt.figure(figsize=(8, 5))
    # x / y are passed by keyword: recent seaborn releases reject them as
    # positional arguments.
    ax_resid = sns.residplot(x=model_fitted_y,
                             y=dataframe.columns[-1],
                             data=dataframe,
                             lowess=True,
                             scatter_kws={'alpha': 0.5},
                             line_kws={
                                 'color': 'red',
                                 'lw': 1,
                                 'alpha': 0.8
                             })

    ax_resid.set_title('Residuals vs Fitted')
    ax_resid.set_xlabel('Fitted values')
    ax_resid.set_ylabel('Residuals')

    # Normal Q-Q Plot
    QQ = ProbPlot(model_norm_residuals)
    plot_lm_2 = QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
    ax_qq = plot_lm_2.axes[0]
    ax_qq.set_title('Normal Q-Q')
    ax_qq.set_xlabel('Theoretical Quantiles')
    ax_qq.set_ylabel('Standardized Residuals')
Exemplo n.º 5
0
 def setup(self):
     """Fit OLS on the Longley data and build the probability plots under test."""
     self.data = dataset = sm.datasets.longley.load()
     # constant column is appended after the regressors
     dataset.exog = sm.add_constant(dataset.exog, prepend=False)
     self.mod_fit = fitted = sm.OLS(dataset.endog, dataset.exog).fit()
     self.res = fitted.resid
     # residuals against a t distribution with distargs=(4,)
     self.prbplt = ProbPlot(fitted.resid, dist=stats.t, distargs=(4,))
     # second, random sample with the same shape as the residual data
     comparison = np.random.normal(size=self.prbplt.data.shape)
     self.other_array = comparison
     self.other_prbplot = ProbPlot(comparison)
Exemplo n.º 6
0
    def plot_normality(self, ax, color, label):
        """Draw a normal Q-Q plot of ``self.residuals`` on ``ax`` and annotate it.

        ``color`` and ``label`` are forwarded to the plotted points.
        """
        point_style = {'alpha': 0.5, 'color': color, 'lw': 0.5, 'label': label}
        prob_plot = ProbPlot(self.residuals)
        prob_plot.qqplot(line='45', ax=ax, **point_style)

        self.annotate_residuals(ax)
Exemplo n.º 7
0
 def test_exceptions(self):
     """A frozen distribution combined with any of these options must raise."""
     conflicting_options = (
         {"fit": True},
         {"distargs": (8.5, 3.0)},
         {"loc": 8.5},
         {"scale": 3.0},
     )
     for extra in conflicting_options:
         with pytest.raises(ValueError):
             ProbPlot(
                 self.data,
                 dist=stats.norm(loc=8.5, scale=3.0),
                 **extra,
             )
    def qq_plot(self, ax):
        """Draw a normal Q-Q plot of the normalised residuals on ``ax``.

        The three observations with the largest absolute normalised residual
        are labelled with their positional index.
        """
        resid = np.asarray(self.residuals.normalised_residuals)

        QQ = ProbPlot(self.residuals.normalised_residuals)
        QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1, ax=ax)
        ax.set_title('Normal Q-Q')
        # label previously read "Quintiles", which is a different statistic
        ax.set_xlabel('Theoretical Quantiles')
        ax.set_ylabel('Standardised Residuals')

        # annotations
        # rank[k] is the position of observation k in the ascending-sorted
        # sample, i.e. the index of its point along the Q-Q curve.
        order = np.argsort(resid)
        rank = np.empty_like(order)
        rank[order] = np.arange(order.size)

        abs_norm_resid_top_3 = np.flip(np.argsort(np.abs(resid)), 0)[:3]

        for i in abs_norm_resid_top_3:
            # Using the rank keeps negative extremes at the lower-left end of
            # the curve rather than at the largest theoretical quantiles.
            ax.annotate(
                i,
                xy=(QQ.theoretical_quantiles[rank[i]], resid[i])
            )
Exemplo n.º 9
0
    def check_residual_normality(residuals_normalized):
        """Draw a normal Q-Q plot of ``residuals_normalized`` on a new figure.

        The three observations with the largest absolute value are labelled
        with their positional index.
        """
        qq = ProbPlot(residuals_normalized)
        plot_2 = qq.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
        ax = plot_2.axes[0]
        ax.set_title('Normal Q-Q')
        ax.set_xlabel('Theoretical Quantiles')
        ax.set_ylabel('Standardized Residuals')

        # annotations
        resid = np.asarray(residuals_normalized)
        # rank[k] is the position of observation k in the ascending-sorted
        # sample, i.e. the index of its point along the Q-Q curve.
        order = np.argsort(resid)
        rank = np.empty_like(order)
        rank[order] = np.arange(order.size)

        abs_norm_resid_top_3 = np.flip(np.argsort(np.abs(resid)), 0)[:3]
        for i in abs_norm_resid_top_3:
            # Rank lookup puts the label on the observation's own point, also
            # when the extreme residual is negative.
            ax.annotate(i,
                        xy=(qq.theoretical_quantiles[rank[i]], resid[i]))
Exemplo n.º 10
0
    def check_linearity_assumption(fitted_y, residuals):
        """Draw a "Residuals vs Fitted" plot and save it as ``ResVsFitted.png``.

        A lowess smoother is overlaid in red. The figure is written to the
        current working directory.

        NOTE: annotation of extreme residuals is not implemented here.
        """
        plt.figure()
        # x / y are passed by keyword: recent seaborn releases reject them as
        # positional arguments.
        ax = sns.residplot(x=fitted_y,
                           y=residuals,
                           lowess=True,
                           scatter_kws={'alpha': 0.5},
                           line_kws={
                               'color': 'red',
                               'lw': 1,
                               'alpha': 0.8
                           })

        ax.set_title('Residuals vs Fitted')
        ax.set_xlabel('Fitted values')
        ax.set_ylabel('Residuals')

        plt.savefig("ResVsFitted.png")
Exemplo n.º 11
0
def slm_plot_qq(fitted_model, ax=None, scolor='C0', lcolor='C1', lw=2, line=True,
                annotations=3):
    """Produce standard Q-Q plot.

    Parameters
    ----------
    fitted_model : fitted results object providing ``get_influence()`` and a
        ``resid`` attribute with an ``index`` (used for the labels).
    ax : axes to draw on, forwarded to ``seaborn.scatterplot``.
    scolor, lcolor : colours of the points and of the reference line.
    lw : width of the reference line.
    line : draw the reference line when true.
    annotations : number of most extreme residuals to label; falsy disables
        labelling.

    Returns
    -------
    The axes the plot was drawn on.
    """
    resids = fitted_model.get_influence().resid_studentized_internal
    pp = ProbPlot(resids)

    # x / y are passed by keyword: recent seaborn releases reject them as
    # positional arguments.
    ax = sns.scatterplot(x=pp.theoretical_quantiles, y=pp.sorted_data,
                         ax=ax, color=scolor, linewidth=0, alpha=0.7)
    if line:
        ends = pp.theoretical_quantiles[[0, -1]]
        ax.plot(ends, ends, color=lcolor, lw=lw)

    if annotations:
        # idxs: positions in the original residual order (used for labels);
        # jdxs: positions in the sorted sample (used for the coordinates).
        # Both are ordered by decreasing absolute value, so they pair up.
        idxs = pd.Series(resids).abs().nlargest(annotations).index
        jdxs = pd.Series(pp.sorted_data).abs().nlargest(annotations).index
        for idx, jdx in zip(idxs, jdxs):
            qq = pp.theoretical_quantiles[jdx]
            resid = pp.sorted_data[jdx]
            ax.annotate(fitted_model.resid.index[idx], (qq, resid))

    ax.set_title('Normal Q-Q')
    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('Standardised Residuals')

    return ax
Exemplo n.º 12
0
 def annotate_residuals(self, ax):
     """Label the three largest-magnitude residuals on a Q-Q plot drawn on ``ax``.

     Each label is the matching entry of ``self.temperature``.
     """
     qq = ProbPlot(self.residuals)
     resid = np.asarray(self.residuals)
     # rank[k] is the position of observation k in the ascending-sorted
     # sample, i.e. the index of its point along the Q-Q curve.
     order = np.argsort(resid)
     rank = np.empty_like(order)
     rank[order] = np.arange(order.size)

     top_3_residuals = np.flip(np.argsort(np.abs(resid)), 0)[:3]
     for i in top_3_residuals:
         # Mirroring the r-th largest quantile by the residual's sign is only
         # right when all extremes share one sign; the rank is always right.
         ax.annotate(self.temperature[i],
                     xy=(qq.theoretical_quantiles[rank[i]], resid[i]))
Exemplo n.º 13
0
def test_invalid_dist_config(close_figures):
    """Too many ``distargs`` for ``stats.t`` must raise a descriptive TypeError."""
    # GH 4226
    np.random.seed(5)
    longley = sm.datasets.longley.load(as_pandas=False)
    longley.exog = sm.add_constant(longley.exog, prepend=False)
    fitted = sm.OLS(longley.endog, longley.exog).fit()
    # the message is expected to spell out the attempted distribution call
    expected_message = r"dist\(0, 1, 4, loc=0, scale=1\)"
    with pytest.raises(TypeError, match=expected_message):
        ProbPlot(fitted.resid, stats.t, distargs=(0, 1, 4))
Exemplo n.º 14
0
 def setup(self):
     """Build a fitted t-distribution probability plot of Longley OLS residuals."""
     np.random.seed(5)
     self.data = dataset = sm.datasets.longley.load()
     # constant column is appended after the regressors
     dataset.exog = sm.add_constant(dataset.exog, prepend=False)
     self.mod_fit = fitted = sm.OLS(dataset.endog, dataset.exog).fit()
     self.prbplt = ProbPlot(
         fitted.resid,
         dist=stats.t,
         distargs=(4,),
         fit=True,
     )
     self.line = "r"
     # the parent setup runs last, after prbplt and line are in place
     super().setup()
Exemplo n.º 15
0
def qqplot(X, y, model_fit=None):
    """Draw a normal Q-Q plot of a model's internally studentized residuals.

    Parameters
    ----------
    X, y : regressors and response; only used to fit a model when
        ``model_fit`` is not given.
    model_fit : fitted results object providing ``get_influence()``.

    The five observations with the largest absolute residual are labelled
    with their positional index. Nothing is returned.
    """
    if model_fit is None:
        # NOTE(review): this relies on ``smf`` exposing OLS and add_constant;
        # confirm what ``smf`` is bound to in this module.
        model_fit = smf.OLS(y, smf.add_constant(X)).fit()
    model_norm_residuals = np.asarray(
        model_fit.get_influence().resid_studentized_internal)

    QQ = ProbPlot(model_norm_residuals)
    plot_lm_2 = QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
    ax = plot_lm_2.axes[0]
    ax.set_title('Normal Q-Q')
    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('Standardized Residuals')

    # annotations
    # rank[k] is the position of observation k in the ascending-sorted
    # sample, i.e. the index of its point along the Q-Q curve.
    order = np.argsort(model_norm_residuals)
    rank = np.empty_like(order)
    rank[order] = np.arange(order.size)

    abs_norm_resid_top_5 = np.flip(
        np.argsort(np.abs(model_norm_residuals)), 0)[:5]
    for i in abs_norm_resid_top_5:
        # Rank lookup puts the label on the observation's own point, also
        # when the extreme residual is negative.
        ax.annotate(i,
                    xy=(QQ.theoretical_quantiles[rank[i]],
                        model_norm_residuals[i]))
Exemplo n.º 16
0
def plot_qq(resid, title='', ax=None, z=2.807, strftime='%Y-%m-%d'):
    """Draw a fitted normal Q-Q plot of ``resid`` and report its outliers.

    Points whose standardized sample quantile exceeds ``z`` in absolute value
    are labelled with their index entry, formatted with ``strftime`` when
    that is truthy and with ``str`` otherwise.

    Returns a DataFrame of the outliers' raw and standardized values, indexed
    by their labels.
    """
    prob = ProbPlot(resid, fit=True)
    is_outlier = abs(prob.sample_quantiles) > z

    if not ax:
        ax = plt.gca()
    prob.qqplot(ax=ax, color='C0', alpha=.5)
    sm.qqline(ax=ax, line='45', fmt='r--', lw=1)

    # index labels in the same (ascending) order as the plotted points
    labels = resid.sort_values().index[is_outlier]
    xs = prob.theoretical_quantiles[is_outlier]
    ys = prob.sample_quantiles[is_outlier]
    for x, y, label in zip(xs, ys, labels):
        if strftime:
            text = label.strftime(strftime)
        else:
            text = str(label)
        ax.annotate(text, xy=(x, y), c='m')

    ax.set_title(title or 'Normal Q-Q')
    ax.set_ylabel('Standardized residuals')

    summary = {
        'residuals': prob.sorted_data[is_outlier],
        'standardized': prob.sample_quantiles[is_outlier],
    }
    return DataFrame(summary, index=labels)
Exemplo n.º 17
0
 def plotQQ(self, annot=0, ax=None):
     """Draw a normal Q-Q plot of ``self.df['residual_stud']``.

     Parameters
     ----------
     annot : int
         Number of largest-magnitude residuals to label with their index
         entry; 0 (the default) disables labelling.
     ax : axes to draw on; ``plt.subplot()`` is used when omitted.

     Returns
     -------
     The axes the plot was drawn on.
     """
     if ax is None:
         ax = plt.subplot()
     stud_resid = self.df['residual_stud']
     QQ = ProbPlot(stud_resid)
     QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1, ax=ax)
     ax.set_title('Normal Q-Q')
     ax.set_xlabel('Theoretical Quantiles')
     ax.set_ylabel('Standardized Residuals')
     # annotations
     if annot:
         values = np.asarray(stud_resid)
         # rank[k] is the position of observation k in the ascending-sorted
         # sample, i.e. the index of its point along the Q-Q curve.
         order = np.argsort(values)
         rank = np.empty_like(order)
         rank[order] = np.arange(order.size)

         top_positions = np.flip(np.argsort(np.abs(values)), 0)[:annot]
         for pos in top_positions:
             # Coordinates are taken by position; only the label text comes
             # from the index, so a non-default index cannot misplace it.
             ax.annotate(stud_resid.index[pos],
                         xy=(QQ.theoretical_quantiles[rank[pos]],
                             values[pos]))
     return ax
def QQ(model_norm_residuals):
    """Draw a 12x8 normal Q-Q plot of ``model_norm_residuals`` on a new figure.

    The three observations with the largest absolute value are labelled with
    their positional index. Nothing is returned.
    """
    resid = np.asarray(model_norm_residuals)

    # Named prob_plot so the local does not shadow this function's own name.
    prob_plot = ProbPlot(model_norm_residuals)
    plot_lm_2 = prob_plot.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)

    plot_lm_2.set_figheight(8)
    plot_lm_2.set_figwidth(12)

    ax = plot_lm_2.axes[0]
    ax.set_title('Normal Q-Q')
    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('Standardized Residuals')

    # annotations
    # rank[k] is the position of observation k in the ascending-sorted
    # sample, i.e. the index of its point along the Q-Q curve.
    order = np.argsort(resid)
    rank = np.empty_like(order)
    rank[order] = np.arange(order.size)

    abs_norm_resid_top_3 = np.flip(np.argsort(np.abs(resid)), 0)[:3]

    for i in abs_norm_resid_top_3:
        # Rank lookup puts the label on the observation's own point, also
        # when the extreme residual is negative.
        ax.annotate(i,
                    xy=(prob_plot.theoretical_quantiles[rank[i]], resid[i]))

    return
Exemplo n.º 19
0
def qqPlot(fitted):
    """Draw a 12x8 normal Q-Q plot of a model's internally studentized residuals.

    ``fitted`` must provide ``get_influence()``. The three observations with
    the largest absolute residual are labelled with their positional index.

    Returns the created figure.
    """
    # Computed once; the influence object was previously rebuilt for every
    # use, including inside the annotation loop.
    resid = np.asarray(fitted.get_influence().resid_studentized_internal)

    QQ = ProbPlot(resid)
    plot_lm_2 = QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)

    plot_lm_2.set_figheight(8)
    plot_lm_2.set_figwidth(12)

    ax = plot_lm_2.axes[0]
    ax.set_title('Normal Q-Q')
    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('Standardized Residuals')

    # annotations
    # rank[k] is the position of observation k in the ascending-sorted
    # sample, i.e. the index of its point along the Q-Q curve.
    order = np.argsort(resid)
    rank = np.empty_like(order)
    rank[order] = np.arange(order.size)

    abs_norm_resid_top_3 = np.flip(np.argsort(np.abs(resid)), 0)[:3]

    for i in abs_norm_resid_top_3:
        # Rank lookup puts the label on the observation's own point, also
        # when the extreme residual is negative.
        ax.annotate(i, xy=(QQ.theoretical_quantiles[rank[i]], resid[i]))
    return plot_lm_2
Exemplo n.º 20
0
class TestCompareSamplesDifferentSize:
    """Compare two probability plots built from samples of unequal size."""

    def setup(self):
        np.random.seed(5)

        def normal_probplot(size):
            # one draw from the same normal distribution per call
            sample = np.random.normal(loc=8.25, scale=3.25, size=size)
            return ProbPlot(sample)

        self.data1 = normal_probplot(37)
        self.data2 = normal_probplot(55)

    @pytest.mark.matplotlib
    def test_qqplot(self, close_figures):
        smaller, larger = self.data1, self.data2
        # works with the larger sample as ``other`` ...
        smaller.qqplot(other=larger)
        # ... and is rejected the other way round
        with pytest.raises(ValueError):
            larger.qqplot(other=smaller)

    @pytest.mark.matplotlib
    def test_ppplot(self, close_figures):
        smaller, larger = self.data1, self.data2
        # both directions are expected to work for the P-P plot
        smaller.ppplot(other=larger)
        larger.ppplot(other=smaller)
Exemplo n.º 21
0
    def setup(self):
        """Prepare a figure (when matplotlib is importable) and comparison data.

        Reads ``self.prbplt``, which has to be set before this runs.
        """
        try:
            import matplotlib.pyplot as plt

            figure, axes = plt.subplots()
            self.fig = figure
            self.ax = axes
        except ImportError:
            # best effort: without matplotlib no figure attributes are set
            pass

        # random sample with the same shape as the data under test
        comparison = np.random.normal(size=self.prbplt.data.shape)
        self.other_array = comparison
        self.other_prbplot = ProbPlot(comparison)
        self.plot_options = {
            "marker": "d",
            "markerfacecolor": "cornflowerblue",
            "markeredgecolor": "white",
            "alpha": 0.5,
        }
Exemplo n.º 22
0
    def check_residual_normality(fitted_y, t_residuals_normalized):
        """Draw a normal Q-Q plot of the residuals and save it as ``Normality.png``.

        Observations whose absolute residual exceeds the largest theoretical
        quantile are labelled with the matching entry of ``fitted_y.index``.
        The figure is written to the current working directory.
        """
        qq = ProbPlot(t_residuals_normalized)
        plot_2 = qq.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
        ax = plot_2.axes[0]
        ax.set_title('Normal Q-Q')
        ax.set_xlabel('Theoretical Quantiles')
        ax.set_ylabel('T Student Standardized Residuals')

        # annotations
        resid = np.asarray(t_residuals_normalized)
        # ranks[k] is the position of observation k in the ascending-sorted
        # sample. Every observation gets its own slot, ties included; a dense
        # ranking would give tied values one rank and shift every later
        # point to the wrong quantile.
        order = np.argsort(resid, kind='stable')
        ranks = np.empty_like(order)
        ranks[order] = np.arange(order.size)

        s_thresh = max(qq.theoretical_quantiles)
        abs_norm_resid_top = np.flatnonzero(np.abs(resid) > s_thresh)

        for i in abs_norm_resid_top:
            ax.annotate(fitted_y.index[i],
                        xy=(qq.theoretical_quantiles[ranks[i]], resid[i]))

        plt.savefig("Normality.png")
Exemplo n.º 23
0
def qq_plot(results, n_annotate=3):
    """Draw a 12x8 normal Q-Q plot of a model's internally studentized residuals.

    Parameters
    ----------
    results : fitted results object providing ``get_influence()``.
    n_annotate : int
        Number of largest-magnitude residuals to label with their positional
        index. Values outside ``0..len(residuals)`` are clamped.

    Returns
    -------
    The created figure. ``plt.close()`` is called before returning.
    """
    # normalized residuals
    model_norm_residuals = np.asarray(
        results.get_influence().resid_studentized_internal)

    QQ = ProbPlot(model_norm_residuals)
    fig = QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)

    fig.set_figheight(8)
    fig.set_figwidth(12)

    ax = fig.axes[0]
    ax.set_title('Normal Q-Q')
    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('Standardized Residuals')

    # annotations
    # rank[k] is the position of observation k in the ascending-sorted
    # sample, i.e. the index of its point along the Q-Q curve.
    order = np.argsort(model_norm_residuals)
    rank = np.empty_like(order)
    rank[order] = np.arange(order.size)

    abs_norm_resid = np.flip(np.argsort(np.abs(model_norm_residuals)), 0)
    # clamp at 0 as well: a negative count would slice from the wrong end
    n_annotate = max(0, min(n_annotate, len(abs_norm_resid)))
    abs_norm_resid_top_n = abs_norm_resid[:n_annotate]

    for i in abs_norm_resid_top_n:
        # Rank lookup puts the label on the observation's own point, also
        # when the extreme residual is negative.
        ax.annotate(i, xy=(QQ.theoretical_quantiles[rank[i]],
                           model_norm_residuals[i]))

    plt.close()
    return fig
Exemplo n.º 24
0
def normal_resid_distr(norm_resid, sample_n=None, height=2, width=4):
    '''
    This plot shows if residuals are normally distributed.
    Do residuals follow a straight line well or do they deviate severely?
    It's good if residuals are lined well on the straight dashed line.

    Parameters
    ----------
    norm_resid : residuals to plot; indexed with an integer array when
        sub-sampling is active.
    sample_n : int, optional
        When given and smaller than ``len(norm_resid)``, a random subset of
        this size (drawn without replacement) is plotted instead.
    height, width : figure size passed to ``set_figheight`` / ``set_figwidth``.

    The figure is displayed with ``plt.show()``; nothing is returned.
    '''
    # matplotlib 3.6 renamed the bundled seaborn style sheets to
    # "seaborn-v0_8*" and later dropped the old names; use the new name
    # whenever the installed matplotlib offers it.
    if 'seaborn-v0_8' in plt.style.available:
        plt.style.use('seaborn-v0_8')
    else:
        plt.style.use('seaborn')

    # if sampling value turned on then limit data in the output
    if sample_n is not None and len(norm_resid) > sample_n:
        i = np.random.choice(len(norm_resid), sample_n, replace=False)
        norm_resid = norm_resid[i]

    ## residual normal quantile plot
    norm_q_plot = ProbPlot(norm_resid)
    plot_lm_q = norm_q_plot.qqplot(line='45', alpha=0.2, color='#4C72B0', lw=1)
    plot_lm_q.set_figheight(height)
    plot_lm_q.set_figwidth(width)
    # labels
    ax = plot_lm_q.axes[0]
    ax.set_title('Normal Quantile')
    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('Normalized Residuals')
    plt.show()
Exemplo n.º 25
0
def update_qq_plot(predictors, response):
    """
    Normal QQ Plot

    Runs the regression through ``MYCLASS`` and returns a figure dictionary
    with two traces: the residual quantiles as markers and the identity line
    over the theoretical quantiles.
    """
    MYCLASS.reg(predictors, response)
    fitted = MYCLASS.get_model()
    studentized = fitted.get_influence().resid_studentized_internal

    prob = ProbPlot(studentized)
    theoretical = prob.theoretical_quantiles
    observed = prob.sample_quantiles

    points = go.Scatter(
        x=theoretical,
        y=observed,
        mode='markers',
        name='data',
        marker={'size': 8, 'opacity': 0.5},
    )
    # reference line y = x
    reference = go.Scatter(
        x=theoretical,
        y=theoretical,
        type='scatter',
        mode='lines',
        name='line',
        line={'width': 1, 'color': 'red'},
    )

    layout = {
        'title': 'Normal Q-Q',
        'xaxis': {'title': 'Theoretical Quantiles'},
        'yaxis': {'title': 'Standardized Residuals'},
        'plot_bgcolor': '#e6e6e6',
        'showlegend': False,
        'hovermode': 'closest',
    }
    return {'data': [points, reference], 'layout': layout}
Exemplo n.º 26
0
    def check_homoscedacticity(fitted_y, t_residuals_normalized):
        """Draw a Scale-Location plot and save it as ``Homoscadasticity.png``.

        Plots sqrt(|residual|) against the fitted values with a lowess
        smoother, and a dashed horizontal line at the square root of the
        largest theoretical quantile. Points above that line are labelled
        with the matching entry of ``fitted_y.index``. The figure is written
        to the current working directory.
        """
        qq = ProbPlot(t_residuals_normalized)
        s_thresh = np.sqrt(max(qq.theoretical_quantiles))

        # absolute squared normalized residuals
        residuals_norm_abs_sqrt = np.sqrt(
            np.abs(np.asarray(t_residuals_normalized)))

        plot_3 = plt.figure()
        plt.scatter(fitted_y, residuals_norm_abs_sqrt, alpha=0.5)
        # x / y are passed by keyword: recent seaborn releases reject them as
        # positional arguments.
        sns.regplot(x=fitted_y,
                    y=residuals_norm_abs_sqrt,
                    scatter=False,
                    ci=False,
                    lowess=True,
                    line_kws={
                        'color': 'red',
                        'lw': 1,
                        'alpha': 0.8
                    })
        ax = plot_3.axes[0]
        ax.set_title('Scale-Location')
        ax.set_xlabel('Fitted values')
        ax.set_ylabel(
            "$\\sqrt{|T Student Standardized Residuals|}$")
        ax.axhline(s_thresh, ls='--', color='black')

        # annotations
        above_threshold = np.flatnonzero(residuals_norm_abs_sqrt > s_thresh)
        for i in above_threshold:
            # ``i`` is a position, so the fitted value is read positionally;
            # a label lookup would go wrong for a non-default integer index.
            ax.annotate(fitted_y.index[i],
                        xy=(fitted_y.iloc[i],
                            residuals_norm_abs_sqrt[i]))
        plt.savefig("Homoscadasticity.png")
 def qq_plot(self, var_x=None, var_y=None):
     """
     Creates a normal qq plot
     Input:
         - var_x: A list of predictor variable(s) (default=None)
         - var_y: A response variable (default=None)

     When either argument is missing, both are taken from
     ``self.get_predictors()`` and ``self.get_response()`` instead. The
     regression is (re)run through ``self.reg`` before plotting.

     Raises:
         ValueError: if no predictors or response are available.
     """
     # priority: arguments var_x, var_y
     if var_x is None or var_y is None:
         var_x = self.get_predictors()
         var_y = self.get_response()
         if var_x is None or var_y is None:
             raise ValueError('No predictors or response assigned')
     self.reg(var_x, var_y)

     model = self.get_model()
     resid_norm = model.get_influence().resid_studentized_internal
     qq_plt = ProbPlot(resid_norm)
     theo = qq_plt.theoretical_quantiles
     sample = qq_plt.sample_quantiles
     plt.scatter(theo, sample, alpha=0.5)
     # Reference line: theoretical quantiles against themselves. x / y are
     # passed by keyword because recent seaborn releases reject them as
     # positional arguments.
     sns.regplot(x=theo,
                 y=theo,
                 scatter=False,
                 ci=False,
                 lowess=True,
                 line_kws={
                     'color': 'red',
                     'lw': 1,
                     'alpha': 1
                 })
     plt.title('Normal Q-Q')
     plt.xlabel('Theoretical Quantiles')
     plt.ylabel('Standardized Residuals')
Exemplo n.º 28
0
def plot_lm(model_fit, data, key, n=3):
    """Draw a 2x2 grid of regression diagnostic plots on a new 12x8 figure.

    Panels: Residuals vs Fitted, Normal Q-Q, Scale-Location and Residuals vs
    Leverage (with Cook's distance contours at 0.5 and 1).

    Parameters
    ----------
    model_fit : fitted results object providing ``fittedvalues``, ``resid``,
        ``params`` and ``get_influence()``. ``resid`` must be a pandas Series
        (``sort_values`` is called on its absolute value).
    data : passed to ``seaborn.residplot`` as ``data``.
    key : name of the response variable in ``data``.
    n : int
        Number of most extreme points to annotate in each panel (default 3).

    Nothing is returned; the figure is left as the current pyplot figure.
    """
    import numpy as np
    import seaborn as sns
    import matplotlib.pyplot as plt

    from statsmodels.graphics.gofplots import ProbPlot

    # the influence object is built once and shared by all derived measures
    influence = model_fit.get_influence()

    # fitted values (need a constant term for intercept)
    model_fitted_y = model_fit.fittedvalues
    # model residuals
    model_residuals = model_fit.resid
    # normalized residuals
    model_norm_residuals = np.asarray(influence.resid_studentized_internal)
    # absolute squared normalized residuals
    model_norm_residuals_abs_sqrt = np.sqrt(np.abs(model_norm_residuals))
    # absolute residuals
    model_abs_resid = np.abs(model_residuals)
    # leverage, from statsmodels internals
    model_leverage = influence.hat_matrix_diag
    # cook's distance, from statsmodels internals
    model_cooks = influence.cooks_distance[0]

    # positional view of the fitted values, for positional annotations
    fitted_values = np.asarray(model_fitted_y)

    fig = plt.figure()
    fig.set_figheight(8)
    fig.set_figwidth(12)

    lowess_line = {'color': 'red', 'lw': 1, 'alpha': 0.8}

    # first plot
    plot_lm_1 = plt.subplot(2, 2, 1)

    # x / y are passed by keyword throughout: recent seaborn releases reject
    # them as positional arguments.
    sns.residplot(x=model_fitted_y,
                  y=key,
                  data=data,
                  lowess=True,
                  scatter_kws={'alpha': 0.5},
                  line_kws=lowess_line,
                  ax=plot_lm_1)

    plot_lm_1.set_title('Residuals vs Fitted')
    plot_lm_1.set_xlabel('Fitted values')
    plot_lm_1.set_ylabel('Residuals')

    # annotations (by index label)
    abs_resid_top = model_abs_resid.sort_values(ascending=False).iloc[:n]

    for i in abs_resid_top.index:
        plot_lm_1.annotate(i, xy=(model_fitted_y[i], model_residuals[i]))

    # second plot
    plot_lm_2 = plt.subplot(2, 2, 2)
    QQ = ProbPlot(model_norm_residuals)
    QQ.qqplot(line='45',
              alpha=0.5,
              color='#4C72B0',
              lw=1,
              ax=plot_lm_2)

    plot_lm_2.set_title('Normal Q-Q')
    plot_lm_2.set_xlabel('Theoretical Quantiles')
    plot_lm_2.set_ylabel('Standardized Residuals')

    # annotations (by position)
    # rank[k] is the position of observation k in the ascending-sorted
    # sample, i.e. the index of its point along the Q-Q curve.
    order = np.argsort(model_norm_residuals)
    rank = np.empty_like(order)
    rank[order] = np.arange(order.size)

    abs_norm_resid_top = np.flip(
        np.argsort(np.abs(model_norm_residuals)), 0)[:n]

    for i in abs_norm_resid_top:
        # Rank lookup puts the label on the observation's own point, also
        # when the extreme residual is negative.
        plot_lm_2.annotate(i,
                           xy=(QQ.theoretical_quantiles[rank[i]],
                               model_norm_residuals[i]))

    # third plot
    plot_lm_3 = plt.subplot(2, 2, 3)

    plot_lm_3.scatter(model_fitted_y, model_norm_residuals_abs_sqrt, alpha=0.5)
    sns.regplot(x=model_fitted_y,
                y=model_norm_residuals_abs_sqrt,
                scatter=False,
                ci=False,
                lowess=True,
                line_kws=lowess_line,
                ax=plot_lm_3)

    plot_lm_3.set_title('Scale-Location')
    plot_lm_3.set_xlabel('Fitted values')
    # raw string: "\s" is not a valid escape sequence in a normal literal
    plot_lm_3.set_ylabel(r'$\sqrt{|Standardized Residuals|}$')

    # annotations (by position)
    abs_sq_norm_resid_top = np.flip(
        np.argsort(model_norm_residuals_abs_sqrt), 0)[:n]

    for i in abs_sq_norm_resid_top:
        # Positional read of the fitted value: a label lookup on the Series
        # could raise KeyError and the point was then silently skipped.
        plot_lm_3.annotate(i,
                           xy=(fitted_values[i],
                               model_norm_residuals_abs_sqrt[i]))

    # fourth plot
    plot_lm_4 = plt.subplot(2, 2, 4)

    plot_lm_4.scatter(model_leverage, model_norm_residuals, alpha=0.5)
    sns.regplot(x=model_leverage,
                y=model_norm_residuals,
                scatter=False,
                ci=False,
                lowess=True,
                line_kws=lowess_line,
                ax=plot_lm_4)

    plot_lm_4.set_xlim(0, 0.20)
    plot_lm_4.set_ylim(-3, 5)
    plot_lm_4.set_title('Residuals vs Leverage')
    plot_lm_4.set_xlabel('Leverage')
    plot_lm_4.set_ylabel('Standardized Residuals')

    # annotations: points with the largest Cook's distance
    leverage_top = np.flip(np.argsort(model_cooks), 0)[:n]

    for i in leverage_top:
        plot_lm_4.annotate(i, xy=(model_leverage[i], model_norm_residuals[i]))

    # shenanigans for cook's distance contours
    def graph(formula, x_range, label=None):
        plot_lm_4.plot(x_range, formula(x_range),
                       label=label, lw=1, ls='--', color='red')

    p = len(model_fit.params)  # number of model parameters

    graph(lambda x: np.sqrt((0.5 * p * (1 - x)) / x),
          np.linspace(0.001, 0.200, 50), 'Cook\'s distance')  # 0.5 line
    graph(lambda x: np.sqrt((1 * p * (1 - x)) / x),
          np.linspace(0.001, 0.200, 50))  # 1 line
    plot_lm_4.legend(loc='upper right')
    # NOTE(review): the upper x limit is set from the largest Cook's distance
    # although the x axis shows leverage; kept as is, confirm the intent.
    plot_lm_4.set_xlim(right=np.max(model_cooks))

    plt.tight_layout()

# annotations: label the three largest absolute residuals by index label
abs_resid = lm_abs_resid.sort_values(ascending=False)
abs_resid_top_3 = abs_resid[:3]

for i in abs_resid_top_3.index:
    plot_lm_1.axes[0].annotate(i, xy=(lm_fitted_y[i], lm_residuals[i]))


# **QQ plot**

# In[15]:


QQ = ProbPlot(lm_norm_residuals)
plot_lm_2 = QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)

plot_lm_2.set_figheight(8)
plot_lm_2.set_figwidth(12)

plot_lm_2.axes[0].set_title('Normal Q-Q')
plot_lm_2.axes[0].set_xlabel('Theoretical Quantiles')
plot_lm_2.axes[0].set_ylabel('Standardized Residuals')

# annotations: label the three largest absolute residuals by position
abs_norm_resid = np.flip(np.argsort(np.abs(lm_norm_residuals)), 0)
abs_norm_resid_top_3 = abs_norm_resid[:3]

# qq_rank[k] is the position of observation k in the ascending-sorted
# sample, i.e. the index of its point along the Q-Q curve.
qq_resid = np.asarray(lm_norm_residuals)
qq_order = np.argsort(qq_resid)
qq_rank = np.empty_like(qq_order)
qq_rank[qq_order] = np.arange(qq_order.size)

for i in abs_norm_resid_top_3:
    # Rank lookup puts the label on the observation's own point, also when
    # the extreme residual is negative.
    plot_lm_2.axes[0].annotate(
        i, xy=(QQ.theoretical_quantiles[qq_rank[i]], qq_resid[i]))
def diagnostic_plots(x_true: np.ndarray, y_true: np.ndarray,
                     y_predicted: np.ndarray) -> None:
    """Generate diagnostic plots for regression evaluation.

    Draws a 2x2 grid: Residuals vs Fitted, Normal Q-Q, Scale-Location and
    Residuals vs Leverage (with Cook's distance contours at 0.5 and 1). The
    residual statistics come from ``regression_eval_metrics``.

    Source: Emre @ https://emredjan.github.io/blog/2017/07/11/emulating-r-plots-in-python/

    Args:
        x_true (np.ndarray): Known x-values; its second dimension is used as
            the number of model parameters.
        y_true (np.ndarray): Known y-values.
        y_predicted (np.ndarray): Predicted y-values.
    """
    residuals, studentized_residuals, cooks_distance, hat_diag = regression_eval_metrics(
        x_true, y_true, y_predicted)

    fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(18, 10))
    plt.tight_layout(pad=5, w_pad=5, h_pad=5)

    lowess_line = {"color": "red", "lw": 1, "alpha": 0.8}

    # 1. residual plot
    sns.residplot(x=y_predicted,
                  y=residuals,
                  lowess=True,
                  scatter_kws={"alpha": 0.5},
                  line_kws=lowess_line,
                  ax=axs[0, 0])
    axs[0, 0].set_title("Residuals vs Fitted")
    axs[0, 0].set_xlabel("Fitted values")
    axs[0, 0].set_ylabel("Residuals")

    # 2. qq plot
    qq = ProbPlot(studentized_residuals)
    qq.qqplot(line="45", alpha=0.5, color="#2578B2", lw=0.5, ax=axs[0, 1])
    axs[0, 1].set_title("Normal Q-Q")
    axs[0, 1].set_xlabel("Theoretical Quantiles")
    axs[0, 1].set_ylabel("Standardized Residuals")

    # 3. scale-location plot
    studentized_residuals_abs_sqrt = np.sqrt(np.abs(studentized_residuals))
    axs[1, 0].scatter(y_predicted, studentized_residuals_abs_sqrt, alpha=0.5)
    # x / y by keyword, like the residplot call above: recent seaborn
    # releases reject them as positional arguments.
    sns.regplot(
        x=y_predicted,
        y=studentized_residuals_abs_sqrt,
        scatter=False,
        ci=False,
        lowess=True,
        line_kws=lowess_line,
        ax=axs[1, 0],
    )
    axs[1, 0].set_title("Scale-Location")
    axs[1, 0].set_xlabel("Fitted values")
    # raw string: "\s" is not a valid escape sequence in a normal literal
    axs[1, 0].set_ylabel(r"$\sqrt{|Standardised Residuals|}$")

    # 4. leverage plot
    axs[1, 1].scatter(hat_diag, studentized_residuals, alpha=0.5)
    sns.regplot(x=hat_diag,
                y=studentized_residuals,
                scatter=False,
                ci=False,
                lowess=True,
                line_kws=lowess_line,
                ax=axs[1, 1])
    axs[1, 1].set_xlim(min(hat_diag), max(hat_diag))
    axs[1, 1].set_ylim(min(studentized_residuals), max(studentized_residuals))
    axs[1, 1].set_title("Residuals vs Leverage")
    axs[1, 1].set_xlabel("Leverage")
    axs[1, 1].set_ylabel("Standardised Residuals")

    # annotations: the three points with the largest Cook's distance
    leverage_top_3 = np.flip(np.argsort(cooks_distance), 0)[:3]
    for i in leverage_top_3:
        axs[1, 1].annotate(i, xy=(hat_diag[i], studentized_residuals[i]))

    def graph(formula, x_range, label=None):
        # dashed red contour on the leverage panel
        axs[1, 1].plot(x_range, formula(x_range),
                       label=label, lw=1, ls="--", color="red")

    p = np.size(x_true, 1)  # number of model parameters

    graph(lambda x: np.sqrt((0.5 * p * (1 - x)) / x),
          np.linspace(0.001, max(hat_diag), 50), "Cook's distance")
    graph(lambda x: np.sqrt((1 * p * (1 - x)) / x),
          np.linspace(0.001, max(hat_diag), 50))
    axs[1, 1].legend(loc="upper right")