def qq_plot(self, ax=None):
        """
        Standardized Residual vs Theoretical Quantile plot.

        Used to visually check if residuals are normally distributed.
        Points spread along the diagonal line will suggest so. The three
        residuals with the largest absolute value are annotated with their
        positional index.

        Parameters
        ----------
        ax : matplotlib Axes, optional
            Axes to draw on. A new figure and axes are created when omitted.

        Returns
        -------
        The axes that were drawn on.
        """
        if ax is None:
            _, ax = plt.subplots()

        QQ = ProbPlot(self.residual_norm)
        QQ.qqplot(line='45', alpha=0.5, lw=1, ax=ax)

        # annotations
        # The Q-Q plot pairs the k-th smallest residual with the k-th
        # theoretical quantile, so each label must be placed at the quantile
        # matching the residual's rank. Using the r-th largest quantile (as
        # before) put labels for large negative residuals on the wrong side.
        # NOTE(review): assumes self.residual_norm is positionally indexable
        # (e.g. a numpy array) -- confirm against the class.
        resid = np.asarray(self.residual_norm)
        ascending = np.argsort(resid)
        rank_of = np.empty_like(ascending)
        rank_of[ascending] = np.arange(ascending.size)

        abs_norm_resid_top_3 = np.argsort(np.abs(resid))[::-1][:3]
        for i in abs_norm_resid_top_3:
            ax.annotate(i,
                        xy=(QQ.theoretical_quantiles[rank_of[i]], resid[i]),
                        ha='right',
                        color='C3')

        ax.set_title('Normal Q-Q', fontweight="bold")
        ax.set_xlabel('Theoretical Quantiles')
        ax.set_ylabel('Standardized Residuals')
        return ax
Exemplo n.º 2
0
    def plot_normality(self, ax, color, label):
        """Draw a normal Q-Q plot of ``self.residuals`` on ``ax``.

        ``color`` and ``label`` are forwarded to the Q-Q plot call; once the
        plot is drawn, ``self.annotate_residuals(ax)`` is invoked on the same
        axes.
        """
        plot_options = {
            'line': '45',
            'alpha': 0.5,
            'color': color,
            'lw': 0.5,
            'ax': ax,
            'label': label,
        }
        ProbPlot(self.residuals).qqplot(**plot_options)

        self.annotate_residuals(ax)
    def qq_plot(self, ax):
        """Draw a normal Q-Q plot of the normalised residuals on ``ax``.

        The three residuals with the largest absolute value are annotated
        with their positional index.
        """
        QQ = ProbPlot(self.residuals.normalised_residuals)
        QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1, ax=ax)
        ax.set_title('Normal Q-Q')
        # The label previously read "Quintiles", a typo for "Quantiles".
        ax.set_xlabel('Theoretical Quantiles')
        ax.set_ylabel('Standardised Residuals')

        # annotations
        # The k-th smallest residual is plotted against the k-th theoretical
        # quantile, so each label is anchored at the quantile matching the
        # residual's rank. Indexing the reversed quantiles by |residual| rank
        # misplaced labels for large negative residuals.
        # NOTE(review): assumes normalised_residuals is positionally
        # indexable (e.g. a numpy array) -- confirm.
        resid = np.asarray(self.residuals.normalised_residuals)
        ascending = np.argsort(resid)
        rank_of = np.empty_like(ascending)
        rank_of[ascending] = np.arange(ascending.size)

        abs_norm_resid_top_3 = np.argsort(np.abs(resid))[::-1][:3]
        for i in abs_norm_resid_top_3:
            ax.annotate(
                i,
                xy=(QQ.theoretical_quantiles[rank_of[i]], resid[i])
            )
Exemplo n.º 4
0
class TestCompareSamplesDifferentSize:
    """Q-Q and P-P comparisons between two ProbPlot samples of unequal size."""

    def setup_method(self):
        """Create two seeded normal samples of 37 and 55 observations."""
        np.random.seed(5)
        self.data1 = ProbPlot(np.random.normal(loc=8.25, scale=3.25, size=37))
        self.data2 = ProbPlot(np.random.normal(loc=8.25, scale=3.25, size=55))

    # pytest >= 8 no longer calls nose-style ``setup`` methods, which left
    # ``data1``/``data2`` undefined. ``setup_method`` is the native hook; the
    # old name is kept as an alias for anything that still calls it directly.
    setup = setup_method

    @pytest.mark.matplotlib
    def test_qqplot(self, close_figures):
        # Comparing the smaller sample against the larger one must succeed...
        self.data1.qqplot(other=self.data2)
        # ...while the reverse direction is expected to raise ValueError.
        with pytest.raises(ValueError):
            self.data2.qqplot(other=self.data1)

    @pytest.mark.matplotlib
    def test_ppplot(self, close_figures):
        # P-P plots are expected to work in both directions.
        self.data1.ppplot(other=self.data2)
        self.data2.ppplot(other=self.data1)
Exemplo n.º 5
0
def QQ_plot(model_fit, data):
    """Draw a 5x5 normal Q-Q plot of a model's internally studentized residuals.

    The three observations with the largest absolute residual are annotated
    with their positional index. ``data`` is accepted but not used.
    Returns the axes of the created figure.
    """
    studentized = model_fit.get_influence().resid_studentized_internal

    prob_plot = ProbPlot(studentized)
    figure = prob_plot.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)

    figure.set_figheight(5)
    figure.set_figwidth(5)

    axis = figure.axes[0]
    axis.set_title('Normal Q-Q')
    axis.set_xlabel('Theoretical Quantiles')
    axis.set_ylabel('Standardized Residuals')

    # Label the three most extreme observations (last entries of the
    # ascending |residual| ordering).
    by_magnitude = np.argsort(np.abs(studentized))
    for obs in by_magnitude[-3:]:
        value = studentized[obs]
        # Locate the residual among the sorted sample quantiles to find the
        # theoretical quantile it was plotted against.
        matches = np.where(prob_plot.sample_quantiles == value)
        rank = matches[0][0]
        axis.annotate(obs, xy=(prob_plot.theoretical_quantiles[rank], value))

    figure.tight_layout()
    return axis
Exemplo n.º 6
0
def diagnostic_plots(X, y, model_fit=None):
    """
    Draw the "Residuals vs Fitted" and "Normal Q-Q" diagnostic plots of an
    OLS model, in the style of R's base plots.

    ---
    Inputs:

    X: the features used to build the linear regression model.
       NOTE(review): X and y are passed to ``pd.concat``, so pandas objects
       appear to be required in practice -- confirm against callers.

    y: the target variable of the linear regression model (its column is
       taken to be the last column of the concatenated frame)

    model_fit [optional]: a statsmodel.api.OLS model after regressing y on X. If not provided, will be
                          generated from X, y
    """

    if model_fit is None:
        model_fit = sm.OLS(y, sm.add_constant(X)).fit()

    # create dataframe from X, y for easier plot handling
    dataframe = pd.concat([X, y], axis=1)

    # model values
    model_fitted_y = model_fit.fittedvalues
    # normalized residuals
    model_norm_residuals = model_fit.get_influence().resid_studentized_internal

    # Residuals vs Fitted Plot
    plt.figure(figsize=(8, 5))
    # x/y are passed by keyword: seaborn >= 0.12 only accepts ``data``
    # positionally, so the previous positional call raises there.
    resid_ax = sns.residplot(x=model_fitted_y,
                             y=dataframe.columns[-1],
                             data=dataframe,
                             lowess=True,
                             scatter_kws={'alpha': 0.5},
                             line_kws={
                                 'color': 'red',
                                 'lw': 1,
                                 'alpha': 0.8
                             })

    resid_ax.set_title('Residuals vs Fitted')
    resid_ax.set_xlabel('Fitted values')
    resid_ax.set_ylabel('Residuals')

    # Normal Q-Q Plot
    QQ = ProbPlot(model_norm_residuals)
    plot_lm_2 = QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
    qq_ax = plot_lm_2.axes[0]
    qq_ax.set_title('Normal Q-Q')
    qq_ax.set_xlabel('Theoretical Quantiles')
    qq_ax.set_ylabel('Standardized Residuals')
Exemplo n.º 7
0
def plot_qq(resid, title='', ax=None, z=2.807, strftime='%Y-%m-%d'):
    """Draw a normal Q-Q plot of ``resid`` and report its outliers.

    A ``ProbPlot`` is built with ``fit=True``; every point whose sample
    quantile exceeds ``z`` in absolute value is annotated and returned.

    Args:
        resid: residuals; must provide ``sort_values()`` and an index
            (pandas-style).
        title: axes title; 'Normal Q-Q' is used when empty.
        ax: axes to draw on; the current axes are used when falsy.
        z: absolute sample-quantile threshold for flagging an outlier.
        strftime: format applied to each outlier's index label via
            ``label.strftime``; when falsy, ``str(label)`` is used instead.

    Returns:
        DataFrame indexed by the outliers' labels with columns 'residuals'
        (sorted data values) and 'standardized' (sample quantiles).
    """
    prob = ProbPlot(resid, fit=True)
    is_outlier = abs(prob.sample_quantiles) > z
    if not ax:
        ax = plt.gca()
    prob.qqplot(ax=ax, color='C0', alpha=.5)
    sm.qqline(ax=ax, line='45', fmt='r--', lw=1)

    # Sorting resid lines its index up with the sorted quantile arrays.
    labels = resid.sort_values().index[is_outlier]
    xs = prob.theoretical_quantiles[is_outlier]
    ys = prob.sample_quantiles[is_outlier]
    for x, y, label in zip(xs, ys, labels):
        if strftime:
            text = label.strftime(strftime)
        else:
            text = str(label)
        ax.annotate(text, xy=(x, y), c='m')

    ax.set_title(title or 'Normal Q-Q')
    ax.set_ylabel('Standardized residuals')

    columns = {
        'residuals': prob.sorted_data[is_outlier],
        'standardized': prob.sample_quantiles[is_outlier],
    }
    return DataFrame(columns, index=labels)
Exemplo n.º 8
0
 def plotQQ(self, annot=0, ax=None):
     """Draw a normal Q-Q plot of ``self.df['residual_stud']``.

     annot: number of observations with the largest absolute residual to
            annotate with their index label (0 disables annotations).
     ax:    axes to draw on; ``plt.subplot()`` is used when omitted.

     Returns the axes that were drawn on.
     """
     if ax is None:
         ax = plt.subplot()
     resid = self.df['residual_stud']
     QQ = ProbPlot(resid)
     QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1, ax=ax)
     ax.set_title('Normal Q-Q')
     ax.set_xlabel('Theoretical Quantiles')
     ax.set_ylabel('Standardized Residuals')
     # annotations
     if annot:
         # Work positionally on a plain array so that ranks, values and
         # index labels stay aligned whatever the frame's index is.
         values = np.asarray(resid)
         # The k-th smallest residual is plotted against the k-th
         # theoretical quantile; rank_of maps an observation to that k.
         # Indexing the reversed quantiles by |residual| rank (as before)
         # misplaced labels for large negative residuals.
         ascending = np.argsort(values)
         rank_of = np.empty_like(ascending)
         rank_of[ascending] = np.arange(ascending.size)

         abs_norm_resid_top = np.argsort(np.abs(values))[::-1][:annot]
         for pos in abs_norm_resid_top:
             ax.annotate(resid.index[pos],
                         xy=(QQ.theoretical_quantiles[rank_of[pos]],
                             values[pos]))
     return ax
Exemplo n.º 9
0
    def check_residual_normality(residuals_normalized):
        """Draw a normal Q-Q plot of ``residuals_normalized`` in a new figure.

        The three residuals with the largest absolute value are annotated
        with their positional index. Nothing is returned.
        """
        qq = ProbPlot(residuals_normalized)
        plot_2 = qq.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
        qq_ax = plot_2.axes[0]
        qq_ax.set_title('Normal Q-Q')
        qq_ax.set_xlabel('Theoretical Quantiles')
        qq_ax.set_ylabel('Standardized Residuals')

        # annotations
        # The k-th smallest residual is plotted against the k-th theoretical
        # quantile, so each label is anchored at the quantile matching the
        # residual's rank. Indexing the reversed quantiles by |residual| rank
        # misplaced labels for large negative residuals.
        # NOTE(review): assumes residuals_normalized is positionally
        # indexable (e.g. a numpy array) -- confirm against callers.
        resid = np.asarray(residuals_normalized)
        ascending = np.argsort(resid)
        rank_of = np.empty_like(ascending)
        rank_of[ascending] = np.arange(ascending.size)

        abs_norm_resid_top_3 = np.argsort(np.abs(resid))[::-1][:3]
        for i in abs_norm_resid_top_3:
            qq_ax.annotate(i,
                           xy=(qq.theoretical_quantiles[rank_of[i]],
                               resid[i]))
Exemplo n.º 10
0
def qqplot(X, y, model_fit=None):
    """Draw a normal Q-Q plot of an OLS model's studentized residuals.

    When ``model_fit`` is not given, a model is fitted from ``X`` (with an
    added constant) and ``y``. The five observations with the largest
    absolute residual are annotated with their positional index. Nothing is
    returned.
    """
    if model_fit is None:
        # NOTE(review): relies on ``smf`` exposing ``OLS`` and
        # ``add_constant`` -- confirm which module the alias points to.
        model_fit = smf.OLS(y, smf.add_constant(X)).fit()
    model_norm_residuals = np.asarray(
        model_fit.get_influence().resid_studentized_internal)

    QQ = ProbPlot(model_norm_residuals)
    plot_lm_2 = QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
    qq_ax = plot_lm_2.axes[0]
    qq_ax.set_title('Normal Q-Q')
    qq_ax.set_xlabel('Theoretical Quantiles')
    qq_ax.set_ylabel('Standardized Residuals')

    # annotations
    # The k-th smallest residual is plotted against the k-th theoretical
    # quantile, so each label is anchored at the quantile matching the
    # residual's rank. Indexing the reversed quantiles by |residual| rank
    # misplaced labels for large negative residuals.
    ascending = np.argsort(model_norm_residuals)
    rank_of = np.empty_like(ascending)
    rank_of[ascending] = np.arange(ascending.size)

    abs_norm_resid_top_5 = np.argsort(np.abs(model_norm_residuals))[::-1][:5]
    for i in abs_norm_resid_top_5:
        qq_ax.annotate(i,
                       xy=(QQ.theoretical_quantiles[rank_of[i]],
                           model_norm_residuals[i]))
def QQ(model_norm_residuals):
    """Draw a 12x8 normal Q-Q plot of ``model_norm_residuals`` in a new figure.

    The three residuals with the largest absolute value are annotated with
    their positional index. Nothing is returned.
    """
    prob_plot = ProbPlot(model_norm_residuals)
    plot_lm_2 = prob_plot.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)

    plot_lm_2.set_figheight(8)
    plot_lm_2.set_figwidth(12)

    qq_ax = plot_lm_2.axes[0]
    qq_ax.set_title('Normal Q-Q')
    qq_ax.set_xlabel('Theoretical Quantiles')
    qq_ax.set_ylabel('Standardized Residuals')

    # annotations
    # The k-th smallest residual is plotted against the k-th theoretical
    # quantile, so each label is anchored at the quantile matching the
    # residual's rank. Indexing the reversed quantiles by |residual| rank
    # misplaced labels for large negative residuals.
    # NOTE(review): assumes model_norm_residuals is positionally indexable
    # (e.g. a numpy array) -- confirm against callers.
    resid = np.asarray(model_norm_residuals)
    ascending = np.argsort(resid)
    rank_of = np.empty_like(ascending)
    rank_of[ascending] = np.arange(ascending.size)

    abs_norm_resid_top_3 = np.argsort(np.abs(resid))[::-1][:3]
    for i in abs_norm_resid_top_3:
        qq_ax.annotate(i,
                       xy=(prob_plot.theoretical_quantiles[rank_of[i]],
                           resid[i]))

    return
Exemplo n.º 12
0
def qqPlot(fitted):
    """Draw a 12x8 normal Q-Q plot of a fitted model's studentized residuals.

    ``fitted`` must provide ``get_influence().resid_studentized_internal``.
    The three observations with the largest absolute residual are annotated
    with their positional index. Returns the created figure.
    """
    # Compute the influence measures once; the previous code called
    # get_influence() again for the ranking and for every annotation.
    resid = np.asarray(fitted.get_influence().resid_studentized_internal)

    QQ = ProbPlot(resid)
    plot_lm_2 = QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)

    plot_lm_2.set_figheight(8)
    plot_lm_2.set_figwidth(12)

    qq_ax = plot_lm_2.axes[0]
    qq_ax.set_title('Normal Q-Q')
    qq_ax.set_xlabel('Theoretical Quantiles')
    qq_ax.set_ylabel('Standardized Residuals')

    # annotations
    # The k-th smallest residual is plotted against the k-th theoretical
    # quantile, so each label is anchored at the quantile matching the
    # residual's rank. Indexing the reversed quantiles by |residual| rank
    # misplaced labels for large negative residuals.
    ascending = np.argsort(resid)
    rank_of = np.empty_like(ascending)
    rank_of[ascending] = np.arange(ascending.size)

    abs_norm_resid_top_3 = np.argsort(np.abs(resid))[::-1][:3]
    for i in abs_norm_resid_top_3:
        qq_ax.annotate(
            i,
            xy=(QQ.theoretical_quantiles[rank_of[i]], resid[i]))
    return plot_lm_2
Exemplo n.º 13
0
    def check_residual_normality(fitted_y, t_residuals_normalized):
        """Draw a normal Q-Q plot of ``t_residuals_normalized`` and save it.

        Residuals whose absolute value exceeds the largest theoretical
        quantile are annotated with the matching label from
        ``fitted_y.index``. The current figure is then written to
        "Normality.png".

        NOTE(review): ``fitted_y`` must expose ``.index``; the residuals are
        presumably a 1-D numpy array aligned with it -- confirm with callers.
        """
        qq = ProbPlot(t_residuals_normalized)
        plot_2 = qq.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
        plot_2.axes[0].set_title('Normal Q-Q')
        plot_2.axes[0].set_xlabel('Theoretical Quantiles')
        plot_2.axes[0].set_ylabel('T Student Standardized Residuals')

        # annotations

        # Zero-based dense rank of every residual; used below as the position
        # of that residual among the theoretical quantiles.
        # NOTE(review): dense ranking gives tied residuals the same rank
        # without leaving gaps, so ranks after a tie may fall short of the
        # true sorted position -- verify if ties are possible.
        df = pd.DataFrame(
            pd.DataFrame(t_residuals_normalized).set_index(fitted_y.index))
        df['ranks'] = df.rank(method='dense').astype(int) - 1
        ranks = df.ranks.values

        # Only residuals larger in magnitude than the largest theoretical
        # quantile are treated as worth labelling.
        s_thresh = max(qq.theoretical_quantiles)
        abs_norm_resid_top = pd.DataFrame(t_residuals_normalized).index[
            abs(t_residuals_normalized) > s_thresh].to_list()

        # ``r`` (the enumeration counter) is not used; ``i`` is the row
        # position of the residual being annotated.
        for r, i in enumerate(abs_norm_resid_top):
            plot_2.axes[0].annotate(fitted_y.index[i],
                                    xy=(qq.theoretical_quantiles[ranks[i]],
                                        t_residuals_normalized[i]))

        plt.savefig("Normality.png")
Exemplo n.º 14
0
def normal_resid_distr(norm_resid, sample_n=None, height=2, width=4):
    '''
    Show a normal quantile plot of the normalized residuals.

    This plot shows if residuals are normally distributed.
    Do residuals follow a straight line well or do they deviate severely?
    It's good if residuals are lined well on the straight dashed line.

    norm_resid: normalized residuals.
                NOTE(review): sampling indexes it with an integer array, so a
                numpy array is presumably expected -- confirm against callers.
    sample_n:   when given and smaller than len(norm_resid), only a random
                sample of that many residuals (without replacement) is plotted.
    height, width: figure size passed to set_figheight / set_figwidth.
    '''
    # Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and
    # later releases removed the old names; use whichever one is installed.
    if 'seaborn' in plt.style.available:
        plt.style.use('seaborn')
    else:
        plt.style.use('seaborn-v0_8')

    # if sampling value turned on then limit data in the output
    if sample_n is not None and len(norm_resid) > sample_n:
        i = np.random.choice(len(norm_resid), sample_n, replace=False)
        norm_resid = norm_resid[i]

    ## residual normal quantile plot
    norm_q_plot = ProbPlot(norm_resid)
    plot_lm_q = norm_q_plot.qqplot(line='45', alpha=0.2, color='#4C72B0', lw=1)
    plot_lm_q.set_figheight(height)
    plot_lm_q.set_figwidth(width)
    # labels
    plot_lm_q.axes[0].set_title('Normal Quantile')
    plot_lm_q.axes[0].set_xlabel('Theoretical Quantiles')
    plot_lm_q.axes[0].set_ylabel('Normalized Residuals')
    plt.show()
Exemplo n.º 15
0
def qq_plot(results, n_annotate=3):
    """Build a 12x8 normal Q-Q figure of a model's studentized residuals.

    ``results`` must provide ``get_influence().resid_studentized_internal``.
    The ``n_annotate`` observations with the largest absolute residual are
    annotated with their positional index. The current pyplot figure is
    closed before returning, and the figure object is returned to the caller.
    """
    # normalized residuals
    model_norm_residuals = np.asarray(
        results.get_influence().resid_studentized_internal)

    QQ = ProbPlot(model_norm_residuals)
    fig = QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)

    fig.set_figheight(8)
    fig.set_figwidth(12)

    qq_ax = fig.axes[0]
    qq_ax.set_title('Normal Q-Q')
    qq_ax.set_xlabel('Theoretical Quantiles')
    qq_ax.set_ylabel('Standardized Residuals')

    # annotations
    # The k-th smallest residual is plotted against the k-th theoretical
    # quantile, so each label is anchored at the quantile matching the
    # residual's rank. Indexing the reversed quantiles by |residual| rank
    # misplaced labels for large negative residuals.
    ascending = np.argsort(model_norm_residuals)
    rank_of = np.empty_like(ascending)
    rank_of[ascending] = np.arange(ascending.size)

    # Clamp to [0, n] so a negative count annotates nothing instead of
    # slicing from the end.
    n_annotate = max(0, min(n_annotate, model_norm_residuals.size))
    abs_norm_resid_top_n = np.argsort(
        np.abs(model_norm_residuals))[::-1][:n_annotate]

    for i in abs_norm_resid_top_n:
        qq_ax.annotate(i,
                       xy=(QQ.theoretical_quantiles[rank_of[i]],
                           model_norm_residuals[i]))

    plt.close()
    return fig
Exemplo n.º 16
0
def diagnostic_plots(model, figsize=(12, 7)):
    """Draw R-style OLS diagnostic plots on a 2x2 grid and show the figure.

    Panels: Residuals vs Fitted, Normal Q-Q, Scale-Location and Residuals vs
    Leverage (with Cook's distance contours at 0.5 and 1). The three most
    extreme observations are annotated in each panel.

    model: fitted regression results exposing ``fittedvalues``, ``resid``,
           ``params`` and ``get_influence()``.
           NOTE(review): ``resid`` must be a pandas Series (``.abs()`` and
           ``.sort_values()`` are called on it) -- confirm against callers.
    figsize: size of the whole figure.
    """
    # Silence warnings only while plotting. The previous bare simplefilter
    # call switched them off for the rest of the process.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        # Defining subplots
        plot, axes = plt.subplots(2, 2, figsize=figsize)
        axes = axes.ravel()

        # Influence measures are computed once and shared by all panels.
        influence = model.get_influence()
        # normalized residuals
        model_norm_residuals = np.asarray(influence.resid_studentized_internal)
        # square root of the absolute normalized residuals
        model_norm_residuals_abs_sqrt = np.sqrt(np.abs(model_norm_residuals))
        # leverage, from statsmodels internals
        model_leverage = influence.hat_matrix_diag
        # cook's distance, from statsmodels internals
        model_cooks = influence.cooks_distance[0]
        # positional view of the fitted values for positional annotations
        fitted_values = np.asarray(model.fittedvalues)

        # Plot 1
        # x/y are passed by keyword everywhere: seaborn >= 0.12 no longer
        # accepts them positionally.
        sns.residplot(
            x=model.fittedvalues,
            # residuals = target - predicted => target = resid + predicted
            y=model.resid + model.fittedvalues,
            lowess=True,
            scatter_kws={"alpha": 0.5},
            line_kws={
                "color": "red",
                "lw": 1,
                "alpha": 0.8
            },
            ax=axes[0])
        axes[0].set_title("Residuals vs Fitted")
        axes[0].set_xlabel("Fitted values")
        axes[0].set_ylabel("Residuals")
        # annotations (by index label)
        abs_resid = model.resid.abs().sort_values(ascending=False)
        for i in abs_resid.index[:3]:
            axes[0].annotate(i,
                             xy=(model.fittedvalues.loc[i],
                                 model.resid.loc[i]))

        # Plot 2
        QQ = ProbPlot(model_norm_residuals)
        QQ.qqplot(line="45", alpha=0.5, color="#4C72B0", lw=1, ax=axes[1])
        axes[1].set_title("Normal Q-Q")
        axes[1].set_xlabel("Theoretical Quantiles")
        axes[1].set_ylabel("Standardized Residuals")
        # annotations
        # The k-th smallest residual is plotted against the k-th theoretical
        # quantile, so each label is anchored at the quantile matching the
        # residual's rank. Indexing the reversed quantiles by |residual| rank
        # misplaced labels for large negative residuals.
        ascending = np.argsort(model_norm_residuals)
        rank_of = np.empty_like(ascending)
        rank_of[ascending] = np.arange(ascending.size)
        abs_norm_resid_top_3 = np.argsort(
            np.abs(model_norm_residuals))[::-1][:3]
        for i in abs_norm_resid_top_3:
            axes[1].annotate(i,
                             xy=(QQ.theoretical_quantiles[rank_of[i]],
                                 model_norm_residuals[i]))

        # Plot 3
        sns.scatterplot(x=model.fittedvalues,
                        y=model_norm_residuals_abs_sqrt,
                        alpha=0.5,
                        ax=axes[2])
        sns.regplot(x=model.fittedvalues,
                    y=model_norm_residuals_abs_sqrt,
                    scatter=False,
                    ci=False,
                    lowess=True,
                    line_kws={
                        "color": "red",
                        "lw": 1,
                        "alpha": 0.8
                    },
                    ax=axes[2])
        axes[2].set_title("Scale-Location")
        axes[2].set_xlabel("Fitted values")
        # Raw string: "\s" is an invalid escape in a normal string literal.
        axes[2].set_ylabel(r"$\sqrt{|Standardized Residuals|}$")
        # annotations (positional, so look the fitted value up by position
        # rather than by index label)
        abs_sq_norm_resid_top_3 = np.argsort(
            model_norm_residuals_abs_sqrt)[::-1][:3]
        for i in abs_sq_norm_resid_top_3:
            axes[2].annotate(i,
                             xy=(fitted_values[i],
                                 model_norm_residuals_abs_sqrt[i]))

        # Plot 4
        sns.scatterplot(x=model_leverage,
                        y=model_norm_residuals,
                        alpha=0.5,
                        ax=axes[3])
        sns.regplot(x=model_leverage,
                    y=model_norm_residuals,
                    scatter=False,
                    ci=False,
                    lowess=True,
                    line_kws={
                        "color": "red",
                        "lw": 1,
                        "alpha": 0.8
                    },
                    ax=axes[3])
        axes[3].set_xlim(0, max(model_leverage) + 0.01)
        axes[3].set_ylim(-3, 5)
        axes[3].set_title("Residuals vs Leverage")
        axes[3].set_xlabel("Leverage")
        axes[3].set_ylabel("Standardized Residuals")

        # annotations: the three observations with the largest Cook's distance
        leverage_top_3 = np.argsort(model_cooks)[::-1][:3]
        for i in leverage_top_3:
            axes[3].annotate(i,
                             xy=(model_leverage[i], model_norm_residuals[i]))

        p = len(model.params)  # number of model parameters

        # Cook's distance contours at levels 0.5 and 1
        x = np.linspace(0.001, max(model_leverage), 50)

        def cooks_contour(level):
            return np.sqrt((level * p * (1 - x)) / x)

        sns.lineplot(x=x,
                     y=cooks_contour(0.5),
                     label="Cook's distance",
                     ax=axes[3],
                     color='red')
        axes[3].lines[-1].set_linestyle("--")
        sns.lineplot(x=x, y=cooks_contour(1), ax=axes[3], color='red')
        # Previously only the first contour was dashed; dash both.
        axes[3].lines[-1].set_linestyle("--")
        axes[3].legend(loc='upper right')

        plot.tight_layout()  # so titles won't overlap x_labels
        plot.show()
def diagnostic_plots(X, y, model_fit=None):
    """
    Function to reproduce the 4 base plots of an OLS model from R.

    Four separate figures are created: Residuals vs Fitted, Normal Q-Q,
    Scale-Location and Residuals vs Leverage. The three most extreme
    observations are annotated in each.

    ---
    Inputs:

    X: the features used to build the linear regression model.
       NOTE(review): X and y are passed to ``pd.concat`` and the residuals
       are sorted with ``.sort_values()``, so pandas objects appear to be
       required in practice -- confirm against callers.

    y: the target variable of the linear regression model (its column is
       taken to be the last column of the concatenated frame)

    model_fit [optional]: a statsmodel.api.OLS model after regressing y on X. If not provided, will be
                          generated from X, y

    The Cook's distance contours are drawn through the ``graph`` helper,
    which is not defined in this function; it presumably plots onto the
    current axes.
    """

    if model_fit is None:
        model_fit = sm.OLS(y, sm.add_constant(X)).fit()

    # create dataframe from X, y for easier plot handling
    dataframe = pd.concat([X, y], axis=1)

    # model values
    model_fitted_y = model_fit.fittedvalues
    # positional view of the fitted values for positional annotations
    fitted_values = np.asarray(model_fitted_y)
    # model residuals
    model_residuals = model_fit.resid
    # influence measures, computed once and shared by all plots
    influence = model_fit.get_influence()
    # normalized residuals
    model_norm_residuals = np.asarray(influence.resid_studentized_internal)
    # square root of the absolute normalized residuals
    model_norm_residuals_abs_sqrt = np.sqrt(np.abs(model_norm_residuals))
    # absolute residuals
    model_abs_resid = np.abs(model_residuals)
    # leverage, from statsmodels internals
    model_leverage = influence.hat_matrix_diag
    # cook's distance, from statsmodels internals
    model_cooks = influence.cooks_distance[0]

    # Residuals vs Fitted Plot
    plt.figure()
    # x/y are passed by keyword everywhere: seaborn >= 0.12 no longer accepts
    # them positionally.
    ax_1 = sns.residplot(x=model_fitted_y,
                         y=dataframe.columns[-1],
                         data=dataframe,
                         lowess=True,
                         scatter_kws={'alpha': 0.5},
                         line_kws={
                             'color': 'red',
                             'lw': 1,
                             'alpha': 0.8
                         })

    ax_1.set_title('Residuals vs Fitted')
    ax_1.set_xlabel('Fitted values')
    ax_1.set_ylabel('Residuals')

    # annotations (by index label)
    abs_resid = model_abs_resid.sort_values(ascending=False)
    for i in abs_resid.index[:3]:
        ax_1.annotate(i,
                      xy=(model_fitted_y.loc[i], model_residuals.loc[i]))

    # Normal Q-Q Plot
    QQ = ProbPlot(model_norm_residuals)
    plot_lm_2 = QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
    ax_2 = plot_lm_2.axes[0]
    ax_2.set_title('Normal Q-Q')
    ax_2.set_xlabel('Theoretical Quantiles')
    ax_2.set_ylabel('Standardized Residuals')
    # annotations
    # The k-th smallest residual is plotted against the k-th theoretical
    # quantile, so each label is anchored at the quantile matching the
    # residual's rank. Indexing the reversed quantiles by |residual| rank
    # misplaced labels for large negative residuals.
    ascending = np.argsort(model_norm_residuals)
    rank_of = np.empty_like(ascending)
    rank_of[ascending] = np.arange(ascending.size)
    abs_norm_resid_top_3 = np.argsort(np.abs(model_norm_residuals))[::-1][:3]
    for i in abs_norm_resid_top_3:
        ax_2.annotate(i,
                      xy=(QQ.theoretical_quantiles[rank_of[i]],
                          model_norm_residuals[i]))

    # Scale vs Location Plot
    plot_lm_3 = plt.figure()
    plt.scatter(model_fitted_y, model_norm_residuals_abs_sqrt, alpha=0.5)
    sns.regplot(x=model_fitted_y,
                y=model_norm_residuals_abs_sqrt,
                scatter=False,
                ci=False,
                lowess=True,
                line_kws={
                    'color': 'red',
                    'lw': 1,
                    'alpha': 0.8
                })
    ax_3 = plot_lm_3.axes[0]
    ax_3.set_title('Scale-Location')
    ax_3.set_xlabel('Fitted values')
    # Raw string: "\s" is an invalid escape in a normal string literal.
    ax_3.set_ylabel(r'$\sqrt{|Standardized Residuals|}$')

    # annotations: iterate this plot's own top-3 (the Q-Q list was used here
    # by mistake) and look fitted values up by position, not by index label
    abs_sq_norm_resid_top_3 = np.argsort(
        model_norm_residuals_abs_sqrt)[::-1][:3]
    for i in abs_sq_norm_resid_top_3:
        ax_3.annotate(i,
                      xy=(fitted_values[i],
                          model_norm_residuals_abs_sqrt[i]))

    # Residuals vs Leverage Plot
    plot_lm_4 = plt.figure()
    plt.scatter(model_leverage, model_norm_residuals, alpha=0.5)
    sns.regplot(x=model_leverage,
                y=model_norm_residuals,
                scatter=False,
                ci=False,
                lowess=True,
                line_kws={
                    'color': 'red',
                    'lw': 1,
                    'alpha': 0.8
                })
    ax_4 = plot_lm_4.axes[0]
    ax_4.set_xlim(0, max(model_leverage) + 0.01)
    ax_4.set_ylim(-3, 5)
    ax_4.set_title('Residuals vs Leverage')
    ax_4.set_xlabel('Leverage')
    ax_4.set_ylabel('Standardized Residuals')

    # annotations: the three observations with the largest Cook's distance
    leverage_top_3 = np.argsort(model_cooks)[::-1][:3]
    for i in leverage_top_3:
        ax_4.annotate(i,
                      xy=(model_leverage[i],
                          model_norm_residuals[i]))

    p = len(model_fit.params)  # number of model parameters
    graph(lambda x: np.sqrt((0.5 * p * (1 - x)) / x),
          np.linspace(0.001, max(model_leverage), 50),
          'Cook\'s distance')  # 0.5 line
    graph(lambda x: np.sqrt((1 * p * (1 - x)) / x),
          np.linspace(0.001, max(model_leverage), 50))  # 1 line
    plot_lm_4.legend(loc='upper right')
Exemplo n.º 18
0
def diagnostic_plots(model_fit, streets, model_name, modelsave):
    """Draw four regression diagnostic plots on a 2x2 grid, show and save them.

    Panels: residuals vs fitted, normal Q-Q, scale-location and residuals vs
    leverage.

    model_fit:  fitted regression results exposing ``fittedvalues`` and
                ``get_influence()``.
    streets:    data passed to seaborn's residplot; its 'tickpermile' column
                is used as the response.
    model_name: appended to the figure's super-title.
    modelsave:  file name appended to the module-level ``image_loc`` to form
                the output path (``image_loc`` is defined outside this
                function).
    """
    model_fitted_y = model_fit.fittedvalues

    # influence measures, computed once for both quantities below
    influence = model_fit.get_influence()

    # normalized residuals
    model_norm_residuals = influence.resid_studentized_internal

    # square root of the absolute normalized residuals
    model_norm_residuals_abs_sqrt = np.sqrt(np.abs(model_norm_residuals))

    # leverage, from statsmodels internals
    model_leverage = influence.hat_matrix_diag

    # All four diagnostic plots in one place
    fig, axarr = plt.subplots(2, 2, figsize=(20, 20))
    QQ = ProbPlot(model_norm_residuals)

    # residuals
    # x/y are passed by keyword everywhere: seaborn >= 0.12 no longer accepts
    # them positionally.
    sns.residplot(x=model_fitted_y, y='tickpermile', data=streets,
                  lowess=True,
                  scatter_kws={'alpha': 0.5},
                  line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8},
                  ax=axarr[0, 0])
    axarr[0, 0].set_xlabel('Fitted')
    axarr[0, 0].set_ylabel('Residual')

    # Q-Q
    QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1, ax=axarr[0, 1])
    axarr[0, 1].set_xlabel('Theoretical Quantile')
    axarr[0, 1].set_ylabel('Residuals')
    axarr[0, 1].set_xlim(-4, 4)

    # scale - location
    axarr[1, 0].scatter(model_fitted_y, model_norm_residuals_abs_sqrt,
                        alpha=0.5)
    sns.regplot(x=model_fitted_y, y=model_norm_residuals_abs_sqrt,
                scatter=False,
                ci=False,
                lowess=True,
                line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8},
                ax=axarr[1, 0])
    axarr[1, 0].set_xlabel('Fitted')
    axarr[1, 0].set_ylabel('Standardized')

    # Leverage
    axarr[1, 1].scatter(model_leverage, model_norm_residuals, alpha=0.5)
    sns.regplot(x=model_leverage, y=model_norm_residuals,
                scatter=False,
                ci=False,
                lowess=True,
                line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8},
                ax=axarr[1, 1])
    axarr[1, 1].set_xlim(0, 0.1)
    axarr[1, 1].set_xlabel('Leverage')
    axarr[1, 1].set_ylabel('Standardized Residuals')

    fig.suptitle('Diagnostic plots for ' + model_name)
    # leave room at the top of the grid for the super-title
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    fig.show()
    fig.savefig(image_loc + modelsave)
    return
def diagnostic_plots(x_true: np.ndarray, y_true: np.ndarray,
                     y_predicted: np.ndarray) -> None:
    """Generate diagnostic plots for regression evaluation.

    Builds a 2x2 figure with four panels: residuals vs fitted, normal Q-Q,
    scale-location, and residuals vs leverage with Cook's distance contours
    at 0.5 and 1. The residuals, studentized residuals, Cook's distances and
    leverage values all come from ``regression_eval_metrics``. The figure is
    neither shown, saved nor returned.

    Source: Emre @ https://emredjan.github.io/blog/2017/07/11/emulating-r-plots-in-python/

    Args:
        x_true (np.ndarray): Known x-values. Must be 2-D; the size of its
            second axis is used as the number of model parameters.
        y_true (np.ndarray): Known y-values.
        y_predicted (np.ndarray): Predicted y-values.
    """
    residuals, studentized_residuals, cooks_distance, hat_diag = regression_eval_metrics(
        x_true, y_true, y_predicted)

    fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(18, 10))
    plt.tight_layout(pad=5, w_pad=5, h_pad=5)

    # Styling shared by every lowess trend line; each call receives its own
    # copy so no plotting function can affect the others.
    lowess_line_kws = {"color": "red", "lw": 1, "alpha": 0.8}

    # 1. residual plot
    ax_resid = axs[0, 0]
    sns.residplot(x=y_predicted,
                  y=residuals,
                  lowess=True,
                  scatter_kws={"alpha": 0.5},
                  line_kws=dict(lowess_line_kws),
                  ax=ax_resid)
    ax_resid.set_title("Residuals vs Fitted")
    ax_resid.set_xlabel("Fitted values")
    ax_resid.set_ylabel("Residuals")

    # 2. qq plot
    ax_qq = axs[0, 1]
    qq = ProbPlot(studentized_residuals)
    qq.qqplot(line="45", alpha=0.5, color="#2578B2", lw=0.5, ax=ax_qq)
    ax_qq.set_title("Normal Q-Q")
    ax_qq.set_xlabel("Theoretical Quantiles")
    ax_qq.set_ylabel("Standardized Residuals")

    # 3. scale-location plot
    ax_scale = axs[1, 0]
    studentized_residuals_abs_sqrt = np.sqrt(np.abs(studentized_residuals))
    ax_scale.scatter(y_predicted, studentized_residuals_abs_sqrt, alpha=0.5)
    # x/y are passed by keyword: recent seaborn releases reject them
    # positionally, and the residplot call above already uses keywords.
    sns.regplot(x=y_predicted,
                y=studentized_residuals_abs_sqrt,
                scatter=False,
                ci=False,
                lowess=True,
                line_kws=dict(lowess_line_kws),
                ax=ax_scale)
    ax_scale.set_title("Scale-Location")
    ax_scale.set_xlabel("Fitted values")
    # Raw string so that "\s" is not parsed as an (invalid) escape sequence.
    ax_scale.set_ylabel(r"$\sqrt{|Standardised Residuals|}$")

    # 4. leverage plot
    ax_leverage = axs[1, 1]
    max_leverage = max(hat_diag)
    ax_leverage.scatter(hat_diag, studentized_residuals, alpha=0.5)
    sns.regplot(x=hat_diag,
                y=studentized_residuals,
                scatter=False,
                ci=False,
                lowess=True,
                line_kws=dict(lowess_line_kws),
                ax=ax_leverage)
    ax_leverage.set_xlim(min(hat_diag), max_leverage)
    ax_leverage.set_ylim(min(studentized_residuals),
                         max(studentized_residuals))
    ax_leverage.set_title("Residuals vs Leverage")
    ax_leverage.set_xlabel("Leverage")
    ax_leverage.set_ylabel("Standardised Residuals")

    # annotations: label the three observations with the largest Cook's
    # distance by their position in the input arrays
    leverage_top_3 = np.flip(np.argsort(cooks_distance), 0)[:3]
    for i in leverage_top_3:
        ax_leverage.annotate(i, xy=(hat_diag[i], studentized_residuals[i]))

    p = np.size(x_true, 1)  # number of model parameters

    # Cook's distance contours (upper branch only); only the first one is
    # labelled so the legend shows a single entry.
    leverage_grid = np.linspace(0.001, max_leverage, 50)
    for level, label in ((0.5, "Cook's distance"), (1, None)):
        contour = np.sqrt((level * p * (1 - leverage_grid)) / leverage_grid)
        ax_leverage.plot(leverage_grid,
                         contour,
                         label=label,
                         lw=1,
                         ls="--",
                         color="red")
    ax_leverage.legend(loc="upper right")
Exemplo n.º 20
0
    def pplot_fit_per_hour_interarrival(self):
        """Fit exponential and Weibull models to hourly inter-arrival times.

        For every event in ``self.event_names`` whose ``self.source_map``
        entry is not ``None``, the mapped timestamp column is read from
        ``self.data_path``, parsed and sorted, and the gaps between
        consecutive timestamps (in seconds) become the inter-arrival times.
        For each entry of ``self.hour_ranges`` the gaps whose timestamp falls
        in that time-of-day window are fitted with ``st.expon`` and
        ``st.weibull_min`` (location fixed at 0), and a chi-square statistic
        is computed over 20 bins whose interior edges are the fitted
        distribution's 5%..95% quantiles. A histogram with the exponential
        fit and an exponential Q-Q plot are shown for every window, and after
        all events the windows with the largest and smallest exponential
        statistic are plotted again with both fits.

        Side effects: prints progress and summary lines, shows matplotlib
        figures, and calls ``np.seterr(divide='raise')``, which changes
        NumPy's global error handling and is not restored.

        Returns:
            tuple: ``(worst, best, event_stats)``. ``worst`` and ``best`` are
            dicts with keys ``"name"``, ``"expon"``, ``"weib"`` and ``"vals"``
            describing the windows with the highest / lowest exponential
            chi-square statistic (``"vals"`` stays ``None`` when no window was
            fitted). ``event_stats`` maps each event name to the list of
            exponential chi-square statistics of its fitted windows.
        """
        worst = {"name": None, "expon": 0, "weib": 0, "vals": None}
        best = {
            "name": None,
            "expon": math.inf,
            "weib": math.inf,
            "vals": None
        }
        # Per-event summaries. Nothing reads them at present: the
        # write_stats calls that consumed them are disabled (see below).
        expon_reject_obj = {"Dataset": self.name}
        weibull_reject_obj = {"Dataset": self.name}
        expon_reject_obj_2 = {"Dataset": self.name}
        weibull_reject_obj_2 = {"Dataset": self.name}
        f_test_reject_obj = {"Dataset": self.name}
        better_obj = {"Dataset": self.name}
        event_stats = {x: [] for x in self.event_names}

        # Probability levels of the 19 interior bin edges (20 bins in total).
        quantile_levels = [k / 20 for k in range(1, 20)]
        # Thresholds behind the ".10" / ".05" rejection counts printed below.
        # NOTE(review): these look like chi-square critical values for 19
        # degrees of freedom, which matches neither ddof value passed to
        # st.chisquare below -- confirm the intended degrees of freedom.
        comp = 27.204
        comp_2 = 30.144

        def chi_square_fit(dist, sample, ddof):
            """Fit *dist* with loc fixed at 0; return (chi-square stat, params)."""
            params = dist.fit(sample, floc=0)
            edges = dist(*params).ppf(quantile_levels)
            edges = np.insert(edges, 0, 0)
            edges = np.append(edges, max(sample))
            observed, bin_edges = np.histogram(sample,
                                               bins=edges,
                                               density=False)
            expected = len(sample) * np.diff(dist.cdf(bin_edges, *params))
            # Only the statistic is kept; the p-value is discarded.
            statistic, _ = st.chisquare(observed, f_exp=expected, ddof=ddof)
            return statistic, params

        def plot_extreme(record, line):
            """Plot histogram + both fits and both Q-Q plots for one window."""
            sample = record["vals"]
            fig1, ax1 = plt.subplots()
            fig, ax = plt.subplots(1, 2)
            args = st.expon.fit(sample, floc=0)
            wargs = st.weibull_min.fit(sample, floc=0)
            _, bin_edges, _ = ax1.hist(sample, bins=100, density=False)
            prob = ProbPlot(sample, st.expon, loc=args[0], scale=args[1])
            wprob = ProbPlot(sample,
                             st.weibull_min,
                             distargs=(wargs[0], ),
                             loc=wargs[1],
                             scale=wargs[2])
            prob.qqplot(ax=ax[0], line=line)
            wprob.qqplot(ax=ax[1], line=line)
            ax[1].set_title("Weibull QQ Plot")
            ax[0].set_title("Exponential QQ Plot")
            expected_values = len(sample) * np.diff(
                st.expon.cdf(bin_edges, *args))
            wexpected_values = len(sample) * np.diff(
                st.weibull_min.cdf(bin_edges, *wargs))
            print("{}-{}/{}".format(record["name"], record["expon"],
                                    record["weib"]))
            ax1.set_xlabel("Inter-arrival Times (Seconds)")
            ax1.set_ylabel("Frequency")
            ax1.plot(bin_edges[:-1], expected_values, label="Exponential Fit")
            ax1.plot(bin_edges[:-1], wexpected_values, label="Weibull Fit")
            ax1.legend()
            plt.tight_layout()
            plt.show()

        for event_name in tqdm_notebook(self.event_names):
            mapped_name = self.source_map[event_name]
            if mapped_name is None:
                continue
            all_columns = [self.unique_id, mapped_name]
            df = dd.read_csv(self.data_path, usecols=all_columns)
            df = df.dropna()
            df = df.compute()
            df[mapped_name] = pd.to_datetime(df[mapped_name],
                                             errors="coerce",
                                             infer_datetime_format=True)
            df = df.sort_values(by=mapped_name, ascending=True)
            df.index = df[mapped_name]
            # total_seconds() gives the whole gap; ``.dt.seconds`` would only
            # return the seconds component and wrap around for gaps of a day
            # or more.
            df['inter_arrival'] = (
                df[mapped_name] -
                df[mapped_name].shift()).dt.total_seconds().fillna(
                    np.float64(0))
            # Filter with a mask rather than dropping by index label: the
            # index holds timestamps, so a label-based drop would also remove
            # the first event of every duplicated timestamp.
            df = df[df['inter_arrival'] != 0]

            expon_reject = 0
            weib_reject = 0
            expon_reject_2 = 0
            weib_reject_2 = 0
            weib_stats = []
            expon_stats = []
            f_reject = 0
            for hour_range in self.hour_ranges:
                slice_name = "{}-{}-{}".format(self.name, event_name,
                                               hour_range)
                print(slice_name)
                hour_slice = df.between_time(*hour_range)
                total_counts = hour_slice['inter_arrival'].values
                if len(total_counts) == 0:
                    # Nothing to fit in this window; fitting or taking the
                    # maximum of an empty sample would raise.
                    continue

                ##### Exponential chi-square
                expon_chi_stat, args = chi_square_fit(st.expon,
                                                      total_counts,
                                                      ddof=2)
                event_stats[event_name].append(expon_chi_stat)
                expon_stats.append(expon_chi_stat)

                ##### Histogram with exponential fit + exponential Q-Q plot
                fig, ax = plt.subplots(1, 2)
                _, bin_edges, _ = ax[0].hist(total_counts,
                                             bins=100,
                                             density=False)
                expected_values = len(total_counts) * np.diff(
                    st.expon.cdf(bin_edges, *args))
                ax[0].plot(bin_edges[:-1],
                           expected_values,
                           label="Exponential Fit")
                ax[0].set_xlabel("Inter-arrival Times (Seconds)")
                ax[0].set_ylabel("Frequency")
                ax[0].legend()
                prob = ProbPlot(total_counts,
                                st.expon,
                                loc=args[0],
                                scale=args[1])
                prob.qqplot(ax=ax[1], line='45')
                plt.tight_layout()
                plt.show()

                ##### Compare Weibull_Min
                weib_chi_stat, _ = chi_square_fit(st.weibull_min,
                                                  total_counts,
                                                  ddof=4)
                weib_stats.append(weib_chi_stat)

                # NOTE(review): this sat at the top of a now-disabled
                # beta-prime comparison. It is kept, in its original position
                # ahead of f_test, because it changes NumPy's global
                # error state and later code may rely on that -- confirm.
                np.seterr(divide='raise')

                f_res = self.f_test(total_counts)
                if f_res == 1:
                    f_reject += 1

                if expon_chi_stat > comp:
                    expon_reject += 1
                if expon_chi_stat > comp_2:
                    expon_reject_2 += 1
                if weib_chi_stat > comp:
                    weib_reject += 1
                if weib_chi_stat > comp_2:
                    weib_reject_2 += 1
                if expon_chi_stat > worst["expon"]:
                    worst["name"] = slice_name
                    worst["expon"] = expon_chi_stat
                    worst["weib"] = weib_chi_stat
                    worst["vals"] = total_counts
                if expon_chi_stat < best["expon"]:
                    best["name"] = slice_name
                    best["expon"] = expon_chi_stat
                    best["weib"] = weib_chi_stat
                    best["vals"] = total_counts

            print(
                "Expon: {}-{}: Reject at .10 :{}, Reject .05:{}, f_reject: {},  mean:{}"
                .format(self.name, event_name, expon_reject,
                        expon_reject_2, f_reject, np.mean(expon_stats)))
            print("Weib: {}-{}: Reject at .10:{}, Reject .05:{}, mean:{}".
                  format(self.name, event_name, weib_reject, weib_reject_2,
                         np.mean(weib_stats)))
            # Relative change of the mean Weibull statistic with respect to
            # the mean exponential statistic.
            bet = (np.mean(weib_stats) -
                   np.mean(expon_stats)) / np.mean(expon_stats)
            print("Weib is {}% change from expon chi stat: {}-{}".format(
                100 * bet, self.name, event_name))
            expon_reject_obj_2[event_name] = expon_reject_2
            weibull_reject_obj_2[event_name] = weib_reject_2
            expon_reject_obj[event_name] = expon_reject
            weibull_reject_obj[event_name] = weib_reject
            f_test_reject_obj[event_name] = f_reject
            better_obj[event_name] = bet

        # Persisting the summaries is currently disabled:
        # self.write_stats("better", better_obj)
        # self.write_stats("f_reject", f_test_reject_obj)
        # self.write_stats("expon_reject_10", expon_reject_obj)
        # self.write_stats("weibull_reject_10", weibull_reject_obj)
        # self.write_stats("expon_reject_05", expon_reject_obj_2)
        # self.write_stats("weibull_reject_05", weibull_reject_obj_2)

        # "vals" is still None when no window was fitted (or no statistic
        # ever beat the initial bound); there is nothing to plot in that case.
        if worst["vals"] is not None:
            print("Worst")
            plot_extreme(worst, line='45')
        if best["vals"] is not None:
            print("Best")
            # NOTE(review): the best case uses a regression reference line
            # ('r') while the worst case uses the 45-degree line; kept as
            # written -- confirm whether the difference is intentional.
            plot_extreme(best, line='r')
        return worst, best, event_stats
Exemplo n.º 21
0
def plot_lm(model_fit, data, key, n=3):
    """Draw the four R-style ``plot(lm)`` diagnostic panels for a fitted model.

    Creates a new 12x8 inch figure with a 2x2 grid: residuals vs fitted,
    normal Q-Q, scale-location and residuals vs leverage (with Cook's
    distance contours at 0.5 and 1). The figure is neither shown nor
    returned.

    Args:
        model_fit: Fitted results object providing ``fittedvalues``,
            ``resid``, ``params`` and ``get_influence()``. ``resid`` and
            ``fittedvalues`` must be pandas Series (``sort_values`` and
            label-based lookups are used on them).
        data: Passed to ``sns.residplot`` as ``data``.
        key: Passed to ``sns.residplot`` as ``y``; the name of the response
            column in ``data``.
        n: Number of most extreme observations annotated in each panel.
    """
    import numpy as np
    import seaborn as sns
    import matplotlib.pyplot as plt

    from statsmodels.graphics.gofplots import ProbPlot

    # A single influence object serves all three derived quantities.
    influence = model_fit.get_influence()

    # fitted values (need a constant term for intercept)
    model_fitted_y = model_fit.fittedvalues
    # model residuals
    model_residuals = model_fit.resid
    # normalized (internally studentized) residuals
    model_norm_residuals = influence.resid_studentized_internal
    # square root of the absolute normalized residuals
    model_norm_residuals_abs_sqrt = np.sqrt(np.abs(model_norm_residuals))
    # absolute residuals
    model_abs_resid = np.abs(model_residuals)
    # leverage, from statsmodels internals
    model_leverage = influence.hat_matrix_diag
    # cook's distance, from statsmodels internals
    model_cooks = influence.cooks_distance[0]

    # The influence measures are plain arrays, so observations picked via
    # argsort are *positions*. Look fitted values up positionally as well
    # instead of treating positions as index labels.
    fitted_by_position = np.asarray(model_fitted_y)

    # Styling shared by all lowess trend lines (a copy is passed each time).
    lowess_line_kws = {'color': 'red', 'lw': 1, 'alpha': 0.8}

    fig = plt.figure()
    fig.set_figheight(8)
    fig.set_figwidth(12)

    # first plot: residuals vs fitted
    plot_lm_1 = plt.subplot(2, 2, 1)
    # x/y are passed by keyword: recent seaborn releases reject them
    # positionally.
    sns.residplot(x=model_fitted_y,
                  y=key,
                  data=data,
                  lowess=True,
                  scatter_kws={'alpha': 0.5},
                  line_kws=dict(lowess_line_kws),
                  ax=plot_lm_1)

    plot_lm_1.set_title('Residuals vs Fitted')
    plot_lm_1.set_xlabel('Fitted values')
    plot_lm_1.set_ylabel('Residuals')

    # annotations: here the labels are index labels of the residual Series
    abs_resid_top = model_abs_resid.sort_values(ascending=False)[:n]
    for label in abs_resid_top.index:
        plot_lm_1.annotate(label,
                           xy=(model_fitted_y.loc[label],
                               model_residuals.loc[label]))

    # second plot: normal Q-Q
    plot_lm_2 = plt.subplot(2, 2, 2)
    QQ = ProbPlot(model_norm_residuals)
    QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1, ax=plot_lm_2)

    plot_lm_2.set_title('Normal Q-Q')
    plot_lm_2.set_xlabel('Theoretical Quantiles')
    plot_lm_2.set_ylabel('Standardized Residuals')

    # annotations: the Q-Q plot pairs the k-th smallest residual with the
    # k-th theoretical quantile, so each label is placed at the quantile
    # matching its residual's rank. This keeps large negative residuals on
    # the left-hand side of the plot.
    quantile_rank = np.argsort(np.argsort(model_norm_residuals))
    abs_norm_resid_top = np.flip(np.argsort(np.abs(model_norm_residuals)),
                                 0)[:n]
    for i in abs_norm_resid_top:
        plot_lm_2.annotate(i,
                           xy=(QQ.theoretical_quantiles[quantile_rank[i]],
                               model_norm_residuals[i]))

    # third plot: scale-location
    plot_lm_3 = plt.subplot(2, 2, 3)

    plot_lm_3.scatter(model_fitted_y, model_norm_residuals_abs_sqrt,
                      alpha=0.5)
    sns.regplot(x=model_fitted_y,
                y=model_norm_residuals_abs_sqrt,
                scatter=False,
                ci=False,
                lowess=True,
                line_kws=dict(lowess_line_kws),
                ax=plot_lm_3)

    plot_lm_3.set_title('Scale-Location')
    plot_lm_3.set_xlabel('Fitted values')
    # Raw string so that "\s" is not parsed as an (invalid) escape sequence.
    plot_lm_3.set_ylabel(r'$\sqrt{|Standardized Residuals|}$')

    # annotations
    abs_sq_norm_resid_top = np.flip(
        np.argsort(model_norm_residuals_abs_sqrt), 0)[:n]
    for i in abs_sq_norm_resid_top:
        plot_lm_3.annotate(i,
                           xy=(fitted_by_position[i],
                               model_norm_residuals_abs_sqrt[i]))

    # fourth plot: residuals vs leverage
    plot_lm_4 = plt.subplot(2, 2, 4)

    plot_lm_4.scatter(model_leverage, model_norm_residuals, alpha=0.5)
    sns.regplot(x=model_leverage,
                y=model_norm_residuals,
                scatter=False,
                ci=False,
                lowess=True,
                line_kws=dict(lowess_line_kws),
                ax=plot_lm_4)

    # The x axis shows leverage, so its range follows the largest leverage
    # (not the largest Cook's distance).
    max_leverage = np.max(model_leverage)
    plot_lm_4.set_xlim(0, max_leverage + 0.01)
    plot_lm_4.set_ylim(-3, 5)
    plot_lm_4.set_title('Residuals vs Leverage')
    plot_lm_4.set_xlabel('Leverage')
    plot_lm_4.set_ylabel('Standardized Residuals')

    # annotations: observations with the largest Cook's distance
    leverage_top = np.flip(np.argsort(model_cooks), 0)[:n]
    for i in leverage_top:
        plot_lm_4.annotate(i, xy=(model_leverage[i], model_norm_residuals[i]))

    p = len(model_fit.params)  # number of model parameters

    # Cook's distance contours (upper branch only); only the first one is
    # labelled so the legend shows a single entry.
    leverage_grid = np.linspace(0.001, max_leverage, 50)
    for level, label in ((0.5, 'Cook\'s distance'), (1, None)):
        contour = np.sqrt((level * p * (1 - leverage_grid)) / leverage_grid)
        plot_lm_4.plot(leverage_grid,
                       contour,
                       label=label,
                       lw=1,
                       ls='--',
                       color='red')
    plot_lm_4.legend(loc='upper right')

    plt.tight_layout()
Exemplo n.º 22
0
def diagnostic_plots(model, df):
    """ Reproduces the 4 base plots of an OLS model in R.

    Creates a new 12x10 inch figure with a 2x2 grid: residuals vs fitted,
    normal Q-Q, scale-location and residuals vs leverage (with Cook's
    distance contours at 0.5 and 1). The three most extreme observations
    are annotated in each panel. The figure is neither shown nor returned.

    model = a statsmodel.api.OLS model after regression; its ``resid`` and
            ``fittedvalues`` must be pandas Series
    df    = dataframe used in regression (currently unused)
    """
    # A single influence object serves all three derived quantities.
    influence = model.get_influence()

    # normalized (internally studentized) residuals
    model_norm_residuals = influence.resid_studentized_internal
    # square root of the absolute normalized residuals
    model_norm_residuals_abs_sqrt = np.sqrt(np.abs(model_norm_residuals))
    # absolute residuals
    model_abs_resid = np.abs(model.resid)
    # leverage, from statsmodels internals
    model_leverage = influence.hat_matrix_diag
    # cook's distance, from statsmodels internals
    model_cooks = influence.cooks_distance[0]

    fitted = model.fittedvalues
    resid = model.resid
    # The influence measures are plain arrays, so observations picked via
    # argsort are *positions*. Look fitted values up positionally as well
    # instead of treating positions as index labels.
    fitted_by_position = np.asarray(fitted)

    # Styling shared by both lowess trend lines (a copy is passed each time).
    lowess_line_kws = {'color': 'red', 'lw': 1, 'alpha': 0.8}

    fig = plt.figure(figsize=[12, 10])
    ax1 = fig.add_subplot(2, 2, 1)
    ax2 = fig.add_subplot(2, 2, 2)
    ax3 = fig.add_subplot(2, 2, 3)
    ax4 = fig.add_subplot(2, 2, 4)
    plt.subplots_adjust(hspace=.3, wspace=.2)

    # --- residuals vs fitted ---
    ax1.scatter(fitted, resid, s=10)
    smoother = sm.nonparametric.lowess(resid, fitted, frac=.3)
    ax1.plot(smoother[:, 0], smoother[:, 1], 'red')
    ax1.set_title('Residuals vs Fitted')
    ax1.set_xlabel('Fitted values')
    ax1.set_ylabel('Residuals')
    ax1.grid()
    # annotations: here the labels are index labels of the residual Series
    abs_resid_top_3 = model_abs_resid.sort_values(ascending=False)[:3]
    for label in abs_resid_top_3.index:
        ax1.annotate(label, xy=(fitted.loc[label], resid.loc[label]))

    # --- normal Q-Q ---
    QQ = ProbPlot(model_norm_residuals)
    QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1, ax=ax2)
    ax2.set_title('Normal Q-Q')
    ax2.set_xlabel('Theoretical Quantiles')
    ax2.set_ylabel('Standardized Residuals')
    ax2.grid()
    # annotations: the Q-Q plot pairs the k-th smallest residual with the
    # k-th theoretical quantile, so each label is placed at the quantile
    # matching its residual's rank. This keeps large negative residuals on
    # the left-hand side of the plot.
    quantile_rank = np.argsort(np.argsort(model_norm_residuals))
    abs_norm_resid_top_3 = np.flip(np.argsort(np.abs(model_norm_residuals)),
                                   0)[:3]
    for i in abs_norm_resid_top_3:
        ax2.annotate(i,
                     xy=(QQ.theoretical_quantiles[quantile_rank[i]],
                         model_norm_residuals[i]))

    # --- scale-location ---
    ax3.scatter(fitted, model_norm_residuals_abs_sqrt, alpha=0.5)
    # x/y are passed by keyword: recent seaborn releases reject them
    # positionally.
    sns.regplot(x=fitted,
                y=model_norm_residuals_abs_sqrt,
                scatter=False,
                ax=ax3,
                ci=False,
                lowess=True,
                line_kws=dict(lowess_line_kws))
    ax3.set_title('Scale-Location')
    ax3.set_xlabel('Fitted values')
    ax3.set_ylabel(r'$\sqrt{|Standardized Residuals|}$')
    ax3.grid()
    # annotations
    abs_sq_norm_resid_top_3 = np.flip(
        np.argsort(model_norm_residuals_abs_sqrt), 0)[:3]
    for i in abs_sq_norm_resid_top_3:
        ax3.annotate(i,
                     xy=(fitted_by_position[i],
                         model_norm_residuals_abs_sqrt[i]))

    # --- residuals vs leverage ---
    max_leverage = max(model_leverage)
    ax4.scatter(model_leverage, model_norm_residuals, alpha=0.5)
    sns.regplot(x=model_leverage,
                y=model_norm_residuals,
                ax=ax4,
                scatter=False,
                ci=False,
                lowess=True,
                line_kws=dict(lowess_line_kws))
    ax4.set_xlim(0, max_leverage + 0.01)
    ax4.set_ylim(-3, 5)
    ax4.set_title('Residuals vs Leverage')
    ax4.set_xlabel('Leverage')
    ax4.set_ylabel('Standardized Residuals')
    ax4.grid()
    # annotations: observations with the largest Cook's distance
    leverage_top_3 = np.flip(np.argsort(model_cooks), 0)[:3]
    for i in leverage_top_3:
        ax4.annotate(i, xy=(model_leverage[i], model_norm_residuals[i]))

    p = len(model.params)  # number of model parameters

    # Cook's distance contours (upper branch only). Both contours span the
    # observed leverage range; only the first one is labelled so the legend
    # shows a single entry.
    leverage_grid = np.linspace(0.001, max_leverage, 50)
    for level, label in ((0.5, 'Cook\'s distance'), (1, None)):
        contour = np.sqrt((level * p * (1 - leverage_grid)) / leverage_grid)
        ax4.plot(leverage_grid,
                 contour,
                 label=label,
                 lw=1,
                 ls='--',
                 color='red')
    ax4.legend()
Exemplo n.º 23
0
def diagnostic_plots(x, y, model_fit=None):
    """Show four R-style regression diagnostic figures.

    Draws, each in its own figure: residuals vs fitted, normal Q-Q,
    scale-location and residuals vs leverage (with Cook's distance contours
    at 0.5 and 1), then calls ``plt.show()``. The three most extreme
    observations are annotated in each figure.

    Args:
        x: Predictors; concatenated with ``y`` via ``pd.concat`` and, when no
            model is supplied, passed to ``sm.add_constant`` for fitting.
        y: Response; must end up as the last column of the concatenated
            frame, whose name is handed to ``sns.residplot``.
        model_fit: Optional fitted results object. When ``None`` an OLS model
            of ``y`` on ``x`` plus a constant is fitted.
    """
    # Identity check: only fit a model when none was supplied.
    if model_fit is None:
        model_fit = sm.OLS(y, sm.add_constant(x)).fit()

    dataframe = pd.concat([x, y], axis=1)

    # A single influence object serves all three derived quantities.
    influence = model_fit.get_influence()
    model_fitted_y = model_fit.fittedvalues
    model_residuals = model_fit.resid
    model_norm_residuals = influence.resid_studentized_internal
    model_norm_residuals_abs_sqrt = np.sqrt(np.abs(model_norm_residuals))
    model_abs_resid = np.abs(model_residuals)
    model_leverage = influence.hat_matrix_diag
    model_cooks = influence.cooks_distance[0]

    # The influence measures are plain arrays, so observations picked via
    # argsort are *positions*. Look fitted values up positionally as well
    # instead of treating positions as index labels.
    fitted_by_position = np.asarray(model_fitted_y)

    # Styling shared by all lowess trend lines (a copy is passed each time).
    lowess_line_kws = {'color': 'red', 'lw': 1, 'alpha': 0.8}

    # --- figure 1: residuals vs fitted ---
    plot_lm_1 = plt.figure()
    # x/y are passed by keyword: recent seaborn releases reject them
    # positionally. residplot draws on (and returns) the current axes.
    ax_resid = sns.residplot(x=model_fitted_y,
                             y=dataframe.columns[-1],
                             data=dataframe,
                             lowess=True,
                             scatter_kws={'alpha': 0.5},
                             line_kws=dict(lowess_line_kws))

    ax_resid.set_title('Residuals vs Fitted')
    ax_resid.set_xlabel('Fitted values')
    ax_resid.set_ylabel('Residuals')
    # annotations: here the labels are index labels of the residual Series
    abs_resid_top_3 = model_abs_resid.sort_values(ascending=False)[:3]
    for label in abs_resid_top_3.index:
        ax_resid.annotate(label,
                          xy=(model_fitted_y.loc[label],
                              model_residuals.loc[label]))

    # --- figure 2: normal Q-Q (qqplot creates and returns its own figure) ---
    QQ = ProbPlot(model_norm_residuals)
    plot_lm_2 = QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
    ax_qq = plot_lm_2.axes[0]
    ax_qq.set_title('Normal Q-Q')
    ax_qq.set_xlabel('Theoretical Quantiles')
    ax_qq.set_ylabel('Standardized Residuals')
    # annotations: the Q-Q plot pairs the k-th smallest residual with the
    # k-th theoretical quantile, so each label is placed at the quantile
    # matching its residual's rank. This keeps large negative residuals on
    # the left-hand side of the plot.
    quantile_rank = np.argsort(np.argsort(model_norm_residuals))
    abs_norm_resid_top_3 = np.flip(np.argsort(np.abs(model_norm_residuals)),
                                   0)[:3]
    for i in abs_norm_resid_top_3:
        ax_qq.annotate(i,
                       xy=(QQ.theoretical_quantiles[quantile_rank[i]],
                           model_norm_residuals[i]))

    # --- figure 3: scale-location ---
    plot_lm_3 = plt.figure()
    ax_scale = plot_lm_3.gca()
    ax_scale.scatter(model_fitted_y, model_norm_residuals_abs_sqrt, alpha=0.5)
    sns.regplot(x=model_fitted_y,
                y=model_norm_residuals_abs_sqrt,
                scatter=False,
                ci=False,
                lowess=True,
                line_kws=dict(lowess_line_kws),
                ax=ax_scale)
    ax_scale.set_title('Scale-Location')
    ax_scale.set_xlabel('Fitted values')
    # Raw string so that "\s" is not parsed as an (invalid) escape sequence.
    ax_scale.set_ylabel(r'$\sqrt{|Standardized Residuals|}$')

    for i in abs_norm_resid_top_3:
        ax_scale.annotate(i,
                          xy=(fitted_by_position[i],
                              model_norm_residuals_abs_sqrt[i]))

    # --- figure 4: residuals vs leverage ---
    plot_lm_4 = plt.figure()
    ax_leverage = plot_lm_4.gca()
    max_leverage = max(model_leverage)
    ax_leverage.scatter(model_leverage, model_norm_residuals, alpha=0.5)
    sns.regplot(x=model_leverage,
                y=model_norm_residuals,
                scatter=False,
                ci=False,
                lowess=True,
                line_kws=dict(lowess_line_kws),
                ax=ax_leverage)
    ax_leverage.set_xlim(0, max_leverage + 0.01)
    ax_leverage.set_ylim(-3, 5)
    ax_leverage.set_title('Residuals vs Leverage')
    ax_leverage.set_xlabel('Leverage')
    ax_leverage.set_ylabel('Standardized Residuals')

    # annotations: observations with the largest Cook's distance
    leverage_top_3 = np.flip(np.argsort(model_cooks), 0)[:3]
    for i in leverage_top_3:
        ax_leverage.annotate(i,
                             xy=(model_leverage[i],
                                 model_norm_residuals[i]))

    p = len(model_fit.params)  # number of model parameters

    # Cook's distance contours (upper branch only), drawn directly here so
    # the function does not depend on an external ``graph`` helper. Only the
    # first contour is labelled so the legend shows a single entry.
    leverage_grid = np.linspace(0.001, max_leverage, 50)
    for level, label in ((0.5, 'Cook\'s distance'), (1, None)):
        contour = np.sqrt((level * p * (1 - leverage_grid)) / leverage_grid)
        ax_leverage.plot(leverage_grid,
                         contour,
                         label=label,
                         lw=1,
                         ls='--',
                         color='red')
    plot_lm_4.legend(loc='upper right')
    plt.show()
def diagnostic_plot(model_fit, df_name, response_variable):
    """Show the four classic diagnostic plots of a fitted linear model.

    Four separate figures are drawn and displayed with ``plt.show()``:

    1. Residuals vs Fitted: residuals with a lowess trend line. A visible
       pattern suggests a non-linear relationship the model did not capture.
    2. Normal Q-Q: studentized residuals against theoretical normal
       quantiles. Points far from the 45-degree line suggest non-normality.
    3. Scale-Location: sqrt(|studentized residuals|) against fitted values
       with a lowess trend line, used to judge homoscedasticity.
    4. Residuals vs Leverage: studentized residuals against leverage, with
       dashed Cook's distance contours at 0.5 and 1.

    In every plot the three most extreme observations are annotated with
    their index label (or position when the fitted values carry no index).

    Args:
        model_fit: Fitted statsmodels regression results. ``fittedvalues``,
            ``resid``, ``params`` and ``get_influence()`` are used.
        df_name: DataFrame handed to ``sns.residplot`` as ``data``.
        response_variable: Name of the response column in ``df_name``.

    Returns:
        None. Matplotlib rc settings changed here are reset with
        ``sns.reset_orig()`` before returning, also when plotting fails.
    """
    import numpy as np

    import seaborn as sns
    import matplotlib.pyplot as plt

    from statsmodels.graphics.gofplots import ProbPlot

    trend_kws = {'color': 'red', 'lw': 1, 'alpha': 0.8}

    try:
        # Matplotlib 3.6 renamed its bundled seaborn styles to
        # 'seaborn-v0_8*'; use whichever name this installation provides.
        for style_name in ('seaborn', 'seaborn-v0_8'):
            if style_name in plt.style.available:
                plt.style.use(style_name)
                break
        plt.rc('font', size=14)
        plt.rc('figure', titlesize=18)
        plt.rc('axes', labelsize=15)
        plt.rc('axes', titlesize=18)

        # ---- quantities shared by the plots ----
        fitted = model_fit.fittedvalues
        # Plain arrays so that argsort positions can index them directly,
        # whatever pandas index the training data had.
        fitted_values = np.asarray(fitted)
        residuals = np.asarray(model_fit.resid)
        labels = getattr(fitted, 'index', np.arange(len(fitted_values)))

        # One influence object serves all three derived measures.
        influence = model_fit.get_influence()
        norm_residuals = np.asarray(influence.resid_studentized_internal)
        norm_residuals_abs_sqrt = np.sqrt(np.abs(norm_residuals))
        leverage = np.asarray(influence.hat_matrix_diag)
        cooks = np.asarray(influence.cooks_distance[0])

        def top_positions(values, count=3):
            # positions of the `count` largest entries, largest first
            return np.argsort(values)[::-1][:count]

        def annotate(ax, positions, xs, ys):
            # label the observations at `positions` at their plotted location
            for pos in positions:
                ax.annotate(labels[pos], xy=(xs[pos], ys[pos]))

        # ---- 1. Residuals vs Fitted ----
        fig_resid = plt.figure(1)
        fig_resid.set_figheight(4)
        fig_resid.set_figwidth(6)
        ax_resid = fig_resid.gca()
        # x/y must be keywords: seaborn >= 0.12 rejects them positionally.
        sns.residplot(x=fitted, y=response_variable, data=df_name,
                      lowess=True,
                      scatter_kws={'alpha': 0.5},
                      line_kws=dict(trend_kws),
                      ax=ax_resid)
        ax_resid.set_title('Residuals vs Fitted')
        ax_resid.set_xlabel('Fitted values')
        ax_resid.set_ylabel('Residuals')
        annotate(ax_resid, top_positions(np.abs(residuals)),
                 fitted_values, residuals)

        # ---- 2. Normal Q-Q ----
        qq = ProbPlot(norm_residuals)
        fig_qq = qq.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
        fig_qq.set_figheight(4)
        fig_qq.set_figwidth(6)
        ax_qq = fig_qq.axes[0]
        ax_qq.set_title('Normal Q-Q')
        ax_qq.set_xlabel('Theoretical Quantiles')
        ax_qq.set_ylabel('Standardized Residuals')
        # The k-th smallest residual is plotted at the k-th theoretical
        # quantile, so scatter the quantiles back to observation order to get
        # each observation's real x-coordinate (also for negative outliers).
        qq_x = np.empty(len(norm_residuals))
        qq_x[np.argsort(norm_residuals)] = qq.theoretical_quantiles
        annotate(ax_qq, top_positions(np.abs(norm_residuals)),
                 qq_x, norm_residuals)

        # ---- 3. Scale-Location ----
        fig_scale = plt.figure(3)
        fig_scale.set_figheight(4)
        fig_scale.set_figwidth(6)
        ax_scale = fig_scale.gca()
        ax_scale.scatter(fitted_values, norm_residuals_abs_sqrt, alpha=0.5)
        sns.regplot(x=fitted_values, y=norm_residuals_abs_sqrt,
                    scatter=False,
                    ci=None,
                    lowess=True,
                    line_kws=dict(trend_kws),
                    ax=ax_scale)
        ax_scale.set_title('Scale-Location')
        ax_scale.set_xlabel('Fitted values')
        ax_scale.set_ylabel(r'$\sqrt{|Standardized Residuals|}$')
        annotate(ax_scale, top_positions(norm_residuals_abs_sqrt),
                 fitted_values, norm_residuals_abs_sqrt)

        # ---- 4. Residuals vs Leverage ----
        fig_lev = plt.figure(4)
        fig_lev.set_figheight(4)
        fig_lev.set_figwidth(6)
        ax_lev = fig_lev.gca()
        ax_lev.scatter(leverage, norm_residuals, alpha=0.5)
        sns.regplot(x=leverage, y=norm_residuals,
                    scatter=False,
                    ci=None,
                    lowess=True,
                    line_kws=dict(trend_kws),
                    ax=ax_lev)
        # Size the axes from the data so high-leverage points and large
        # residuals are never clipped; (-3, 5) stays the minimum y-range.
        max_leverage = leverage.max()
        ax_lev.set_xlim(0, max_leverage + 0.01)
        ax_lev.set_ylim(min(-3, norm_residuals.min() - 0.5),
                        max(5, norm_residuals.max() + 0.5))
        ax_lev.set_title('Residuals vs Leverage')
        ax_lev.set_xlabel('Leverage')
        ax_lev.set_ylabel('Standardized Residuals')
        # the most influential points are those with the largest Cook's distance
        annotate(ax_lev, top_positions(cooks), leverage, norm_residuals)

        # Cook's distance contours (upper branch) at levels 0.5 and 1.
        n_params = len(model_fit.params)
        lev_grid = np.linspace(0.001, max_leverage, 50)
        for level, label in ((0.5, "Cook's distance"), (1, None)):
            ax_lev.plot(lev_grid,
                        np.sqrt((level * n_params * (1 - lev_grid)) / lev_grid),
                        label=label, lw=1, ls='--', color='red')
        ax_lev.legend(loc='upper right')

        plt.show()
    finally:
        sns.reset_orig()
Exemplo n.º 25
0
def regression_diagnostics_plots(bet_filtered, name, fig_2=None):
    """Create the four regression diagnostic plots in a 2 x 2 grid.

    An OLS line is fitted to ``linear_y`` against ``pressure`` over the points
    ``min_i`` .. ``min_j`` (inclusive), and the Residuals-vs-Fitted,
    Normal Q-Q, Scale-Location and Residuals-vs-Leverage plots are drawn.

    Args:
        bet_filtered: A BETFilterAppliedResults object. Its ``min_i``,
            ``min_j``, ``pressure`` and ``linear_y`` attributes are read.
        name: A string, used in the figure title.
        fig_2: Optional Matplotlib Figure to draw into. A new figure is
            created when it is None.

    Returns:
        The Matplotlib figure holding the four subplots.
    """
    min_i = bet_filtered.min_i
    min_j = bet_filtered.min_j + 1  # slice end is exclusive; +1 keeps point min_j
    pressure = bet_filtered.pressure
    lin_q = bet_filtered.linear_y

    trend_kws = {'color': 'black', 'lw': 1, 'alpha': 0.75}

    def graph(formula, x_range, label=None, ax=None):
        """Plot a dashed Cook's distance line on ``ax`` (or the current axes)."""
        target = plt if ax is None else ax
        target.plot(x_range,
                    formula(x_range),
                    label=label,
                    lw=1,
                    ls='--',
                    color='black',
                    alpha=0.75)

    # OLS regression on the selected pressure range
    design = sm.add_constant(pressure)
    fit = sm.OLS(lin_q[min_i:min_j], design[min_i:min_j]).fit()
    influence = fit.get_influence()  # computed once, reused below
    # Plain arrays so the argsort positions used for annotations index
    # correctly even if the inputs were pandas objects with a shifted index.
    fit_values = np.asarray(fit.fittedvalues)
    fit_resid = np.asarray(fit.resid)
    fit_stud_resid = np.asarray(influence.resid_studentized_internal)
    fit_stud_resid_abs_sqrt = np.sqrt(np.abs(fit_stud_resid))
    fit_leverage = np.asarray(influence.hat_matrix_diag)
    fit_CD = np.asarray(influence.cooks_distance[0])

    # Make new figure
    if fig_2 is None:
        fig_2 = plt.figure(constrained_layout=False,
                           figsize=(6.29921, 9.52756))
    mpl.rc('font', family='Arial', size=9)
    fig_2.suptitle(f"BETSI Regression Diagnostics for {name}\n")

    # "Residual vs fitted" plot
    resid_vs_fit = fig_2.add_subplot(2, 2, 1)
    # x/y must be keywords: seaborn >= 0.12 rejects them positionally.
    sns.residplot(x=fit_values,
                  y=fit_resid,
                  lowess=True,
                  scatter_kws={
                      'alpha': .5,
                      'color': 'red'
                  },
                  line_kws=dict(trend_kws),
                  ax=resid_vs_fit)
    resid_vs_fit.set_title('Residuals vs Fitted', fontsize=11)
    resid_vs_fit.set_xlabel('Fitted Values')
    resid_vs_fit.locator_params(axis='x', nbins=4)
    resid_vs_fit.set_ylabel('Residuals')
    resid_vs_fit.tick_params(axis='both', which='major', labelsize=9)
    resid_vs_fit.tick_params(axis='both', which='minor', labelsize=9)

    # pad each axis by the full data range on both sides
    dfit_values = (fit_values.max() - fit_values.min()) * 1
    resid_vs_fit.set_xlim(fit_values.min() - dfit_values,
                          fit_values.max() + dfit_values)
    dfit_resid = (fit_resid.max() - fit_resid.min()) * 1
    resid_vs_fit.set_ylim(fit_resid.min() - dfit_resid,
                          fit_resid.max() + dfit_resid)

    # "Normal Q-Q" plot
    QQ = ProbPlot(fit_stud_resid)
    # Keep an explicit handle on the Q-Q axes instead of indexing
    # fig_2.axes, which is wrong when fig_2 already contains other axes.
    qq_ax = fig_2.add_subplot(2, 2, 2)
    QQ.qqplot(line='45',
              markerfacecolor='red',
              markeredgecolor='red',
              color='black',
              alpha=.3,
              lw=.5,
              ax=qq_ax)
    qq_ax.set_title('Normal Q-Q')
    qq_ax.set_xlabel('Theoretical Quantiles')
    qq_ax.set_ylabel('Studentized Residuals')
    qq_ax.tick_params(axis='both', which='major')
    qq_ax.tick_params(axis='both', which='minor')

    # The k-th smallest residual is plotted at the k-th theoretical quantile;
    # scatter the quantiles back to observation order to get the true
    # x-coordinate of each observation (also for negative outliers).
    qq_x = np.empty(len(fit_stud_resid))
    qq_x[np.argsort(fit_stud_resid)] = QQ.theoretical_quantiles
    abs_norm_resid_top_3 = np.argsort(np.abs(fit_stud_resid))[::-1][:3]
    for i in abs_norm_resid_top_3:
        qq_ax.annotate(i, xy=(qq_x[i], fit_stud_resid[i]), size=9)

    # "Scale-location" plot
    scale_loc = fig_2.add_subplot(2, 2, 3)
    scale_loc.scatter(fit_values, fit_stud_resid_abs_sqrt, alpha=.5, c='red')
    sns.regplot(x=fit_values,
                y=fit_stud_resid_abs_sqrt,
                scatter=False,
                ci=None,
                lowess=True,
                line_kws=dict(trend_kws),
                ax=scale_loc)
    scale_loc.set_title('Scale-Location')
    scale_loc.set_xlabel('Fitted Values')
    scale_loc.set_ylabel(r'$\mathregular{\sqrt{|Studentized\ Residuals|}}$')
    scale_loc.tick_params(axis='both', which='major', labelsize=9)
    scale_loc.tick_params(axis='both', which='minor', labelsize=9)

    abs_sq_norm_resid_top_3 = np.argsort(fit_stud_resid_abs_sqrt)[::-1][:3]
    for i in abs_sq_norm_resid_top_3:
        scale_loc.annotate(i,
                           xy=(fit_values[i], fit_stud_resid_abs_sqrt[i]),
                           size=11)

    scale_loc.set_xlim(fit_values.min() - .2 * fit_values.max(),
                       fit_values.max() + .2 * fit_values.max())
    scale_loc.locator_params(axis='x', nbins=4)

    # "Residuals vs leverage" plot
    res_vs_lev = fig_2.add_subplot(2, 2, 4)
    res_vs_lev.scatter(fit_leverage, fit_stud_resid, alpha=.5, color='red')
    sns.regplot(x=fit_leverage,
                y=fit_stud_resid,
                scatter=False,
                ci=None,
                lowess=True,
                line_kws=dict(trend_kws),
                ax=res_vs_lev)
    res_vs_lev.set_title('Residuals vs Leverage')
    res_vs_lev.set_xlabel('Leverage')
    res_vs_lev.set_ylabel('Studentized Residuals')
    res_vs_lev.tick_params(axis='both', which='major')
    res_vs_lev.tick_params(axis='both', which='minor')

    # annotate the three points with the largest Cook's distance
    leverage_top_3 = np.argsort(fit_CD)[::-1][:3]
    for i in leverage_top_3:
        res_vs_lev.annotate(i,
                            xy=(fit_leverage[i], fit_stud_resid[i]),
                            size=9)

    # Cook's distance contours at levels 0.5 and 1, upper and lower branch.
    n_params = len(fit.params)  # number of model parameters
    lev_grid = np.linspace(.001, fit_leverage.max(), 50)
    first_line = True
    for level in (.5, 1):
        for sign in (1, -1):
            graph(lambda lev, level=level, sign=sign: sign * np.sqrt(
                (level * n_params * (1 - lev)) / lev),
                  lev_grid,
                  "Cook's Distance" if first_line else None,
                  ax=res_vs_lev)
            first_line = False

    res_vs_lev.legend(prop={'size': 9})

    # Adjust the figure that was drawn on, not whichever figure pyplot
    # currently considers active.
    fig_2.subplots_adjust(bottom=0.07,
                          top=0.91,
                          hspace=.255,
                          wspace=0.315,
                          left=0.12,
                          right=0.92)

    return fig_2
Exemplo n.º 26
0
def residual_plot(model_fit, df, response_variable='Time'):
    """Draw the four classic diagnostic plots of a fitted linear model.

    Figures 1-4 hold, in order: Residuals vs Fitted, Normal Q-Q,
    Scale-Location and Residuals vs Leverage (with dashed Cook's distance
    contours at 0.5 and 1). In every plot the three most extreme
    observations are annotated with their index label (or position when the
    fitted values carry no index). The figures are not shown or saved here.

    Args:
        model_fit: Fitted statsmodels regression results. ``fittedvalues``,
            ``resid``, ``params`` and ``get_influence()`` are used.
        df: DataFrame handed to ``sns.residplot`` as ``data``.
        response_variable: Name of the response column in ``df``. Defaults
            to ``'Time'``, the column this function originally hard-coded.

    Returns:
        True, once all four figures have been drawn.
    """
    trend_kws = {'color': 'red', 'lw': 1, 'alpha': 0.8}

    # ---- quantities shared by the plots ----
    fitted = model_fit.fittedvalues
    # Plain arrays so that argsort positions can index them directly,
    # whatever pandas index the training data had.
    fitted_values = np.asarray(fitted)
    residuals = np.asarray(model_fit.resid)
    labels = getattr(fitted, 'index', np.arange(len(fitted_values)))

    # One influence object serves all three derived measures.
    influence = model_fit.get_influence()
    norm_residuals = np.asarray(influence.resid_studentized_internal)
    norm_residuals_abs_sqrt = np.sqrt(np.abs(norm_residuals))
    leverage = np.asarray(influence.hat_matrix_diag)
    cooks = np.asarray(influence.cooks_distance[0])

    def top_positions(values, count=3):
        # positions of the `count` largest entries, largest first
        return np.argsort(values)[::-1][:count]

    def annotate(ax, positions, xs, ys):
        # label the observations at `positions` at their plotted location
        for pos in positions:
            ax.annotate(labels[pos], xy=(xs[pos], ys[pos]))

    ## Residual plot
    fig_resid = plt.figure(1)
    fig_resid.set_figheight(8)
    fig_resid.set_figwidth(12)
    ax_resid = fig_resid.gca()
    # x/y must be keywords: seaborn >= 0.12 rejects them positionally.
    sns.residplot(x=fitted,
                  y=response_variable,
                  data=df,
                  lowess=True,
                  scatter_kws={'alpha': 0.5},
                  line_kws=dict(trend_kws),
                  ax=ax_resid)
    ax_resid.set_title('Residuals vs Fitted')
    ax_resid.set_xlabel('Fitted values')
    ax_resid.set_ylabel('Residuals')
    annotate(ax_resid, top_positions(np.abs(residuals)),
             fitted_values, residuals)

    ## Q-Q plot
    qq = ProbPlot(norm_residuals)
    fig_qq = qq.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
    fig_qq.set_figheight(8)
    fig_qq.set_figwidth(12)
    ax_qq = fig_qq.axes[0]
    ax_qq.set_title('Normal Q-Q')
    ax_qq.set_xlabel('Theoretical Quantiles')
    ax_qq.set_ylabel('Standardized Residuals')
    # The k-th smallest residual is plotted at the k-th theoretical quantile;
    # scatter the quantiles back to observation order to get the true
    # x-coordinate of each observation (also for negative outliers).
    qq_x = np.empty(len(norm_residuals))
    qq_x[np.argsort(norm_residuals)] = qq.theoretical_quantiles
    annotate(ax_qq, top_positions(np.abs(norm_residuals)),
             qq_x, norm_residuals)

    ## Scale Location Plot
    fig_scale = plt.figure(3)
    fig_scale.set_figheight(8)
    fig_scale.set_figwidth(12)
    ax_scale = fig_scale.gca()
    ax_scale.scatter(fitted_values, norm_residuals_abs_sqrt, alpha=0.5)
    sns.regplot(x=fitted_values,
                y=norm_residuals_abs_sqrt,
                scatter=False,
                ci=None,
                lowess=True,
                line_kws=dict(trend_kws),
                ax=ax_scale)
    ax_scale.set_title('Scale-Location')
    ax_scale.set_xlabel('Fitted values')
    ax_scale.set_ylabel(r'$\sqrt{|Standardized Residuals|}$')
    annotate(ax_scale, top_positions(norm_residuals_abs_sqrt),
             fitted_values, norm_residuals_abs_sqrt)

    ## Leverage plot
    fig_lev = plt.figure(4)
    fig_lev.set_figheight(8)
    fig_lev.set_figwidth(12)
    ax_lev = fig_lev.gca()
    ax_lev.scatter(leverage, norm_residuals, alpha=0.5)
    sns.regplot(x=leverage,
                y=norm_residuals,
                scatter=False,
                ci=None,
                lowess=True,
                line_kws=dict(trend_kws),
                ax=ax_lev)
    # Size the axes from the data so high-leverage points and large
    # residuals are never clipped; (-3, 5) stays the minimum y-range.
    max_leverage = leverage.max()
    ax_lev.set_xlim(0, max_leverage + 0.01)
    ax_lev.set_ylim(min(-3, norm_residuals.min() - 0.5),
                    max(5, norm_residuals.max() + 0.5))
    ax_lev.set_title('Residuals vs Leverage')
    ax_lev.set_xlabel('Leverage')
    ax_lev.set_ylabel('Standardized Residuals')
    # the most influential points are those with the largest Cook's distance
    annotate(ax_lev, top_positions(cooks), leverage, norm_residuals)

    # Cook's distance contours (upper branch) at levels 0.5 and 1.
    n_params = len(model_fit.params)  # number of model parameters
    lev_grid = np.linspace(0.001, max_leverage, 50)
    for level, label in ((0.5, "Cook's distance"), (1, None)):
        ax_lev.plot(lev_grid,
                    np.sqrt((level * n_params * (1 - lev_grid)) / lev_grid),
                    label=label, lw=1, ls='--', color='red')
    ax_lev.legend(loc='upper right')

    return True
Exemplo n.º 27
0
                                  line_kws={
                                      'color': 'red',
                                      'lw': 1,
                                      'alpha': 0.8
                                  })
# Title/label the residuals-vs-fitted axes and annotate the seven
# observations with the largest absolute residuals by their index label.
resid_ax = plot_lm_1.axes[0]
resid_ax.set_title('Bitcoin')
resid_ax.set_xlabel('Fitted Values')
resid_ax.set_ylabel('Residuals')
abs_resid = model_abs_resid.sort_values(ascending=False)
abs_resid_top_7 = abs_resid[:7]
for i in abs_resid_top_7.index:
    resid_ax.annotate(i, xy=(model_fitted_y[i], model_residuals[i]))
# savefig writes the *current* pyplot figure -- presumably plot_lm_1 here
plt.savefig('BTCfitres.png')

# Normal Q-Q plot of the normalized residuals (qqplot opens a new figure).
QQ = ProbPlot(model_norm_residuals)
plot_qq_1 = QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
plot_qq_1.set_figheight(8)
plot_qq_1.set_figwidth(12)
qq_ax = plot_qq_1.axes[0]
qq_ax.set_title('Normal Q-Q')
qq_ax.set_xlabel('Theoretical Quantiles')
qq_ax.set_ylabel('Standardized Residuals')
# annotations: the four largest |normalized residuals|
abs_norm_resid = np.flip(np.argsort(np.abs(model_norm_residuals)), 0)
abs_norm_resid_top_4 = abs_norm_resid[:4]
# The k-th smallest residual is plotted at the k-th theoretical quantile;
# scatter the quantiles back to observation order so each label sits on its
# own point (pairing with the largest quantiles misplaces negative outliers).
qq_x = np.empty(len(model_norm_residuals))
qq_x[np.argsort(model_norm_residuals)] = QQ.theoretical_quantiles
for i in abs_norm_resid_top_4:
    qq_ax.annotate(i, xy=(qq_x[i], model_norm_residuals[i]))
plt.savefig('BTCqq.png')

plot_cd_1 = plt.figure(4)
# annotations: label the three largest absolute residuals by index label
abs_resid = lm_abs_resid.sort_values(ascending=False)
abs_resid_top_3 = abs_resid[:3]

for i in abs_resid_top_3.index:
    plot_lm_1.axes[0].annotate(i, xy=(lm_fitted_y[i], lm_residuals[i]))


# **QQ plot**

# In[15]:


# Normal Q-Q plot of the normalized residuals (qqplot opens a new figure).
QQ = ProbPlot(lm_norm_residuals)
plot_lm_2 = QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)

plot_lm_2.set_figheight(8)
plot_lm_2.set_figwidth(12)

plot_lm_2.axes[0].set_title('Normal Q-Q')
plot_lm_2.axes[0].set_xlabel('Theoretical Quantiles')
plot_lm_2.axes[0].set_ylabel('Standardized Residuals')

# annotations: the three largest |normalized residuals|
abs_norm_resid = np.flip(np.argsort(np.abs(lm_norm_residuals)), 0)
abs_norm_resid_top_3 = abs_norm_resid[:3]

# The k-th smallest residual is plotted at the k-th theoretical quantile;
# scatter the quantiles back to observation order so each label sits on its
# own point (pairing with the largest quantiles misplaces negative outliers).
qq_x = np.empty(len(lm_norm_residuals))
qq_x[np.argsort(lm_norm_residuals)] = QQ.theoretical_quantiles

for i in abs_norm_resid_top_3:
    plot_lm_2.axes[0].annotate(i, xy=(qq_x[i], lm_norm_residuals[i]))
Exemplo n.º 29
0
# Fit an AR(12) model (ARIMA order p=12, d=0, q=0) to `series`
order = (12, 0, 0)
model = ARIMA(series, order=order)
model_fit = model.fit()
print(model_fit.summary())

# Run sequence: residuals in observation order
residuals = pd.DataFrame(model_fit.resid)
residuals.plot()
pyplot.title(str(order) + ' - Run Sequence')
pyplot.show()

# Kernel density estimate of the residuals, followed by summary statistics
residuals.plot(kind='kde')
pyplot.title(str(order) + ' - Kernel density estimaton')
pyplot.show()
print(residuals.describe())

# Lag plot
lag_plot(residuals)
pyplot.title(str(order) + ' - Lag Plot')
pyplot.show()

# Histogram
residuals.hist()
pyplot.title(str(order) + ' - Histogram')
pyplot.show()

# Normal probability plot.
# ProbPlot is documented to take a 1-D data array, so hand it the values of
# the single residual column rather than the (n, 1) DataFrame.
probplot = ProbPlot(residuals.iloc[:, 0].values)
probplot.qqplot()
pyplot.title(str(order) + ' - Normal Probability Plot')
pyplot.show()