Example #1
def eda_missing_summary(self, data=None, color=style.style_grey, display_df=False, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Creates vertical bar chart visualizing the percent of values missing for each feature.
            Optionally displays the underlying Pandas DataFrame.

        ---
        Parameters:
            data : Pandas DataFrame, default=None
                Pandas DataFrame containing independent variables. If left as None,
                the feature dataset provided to Machine during instantiation is used.
            color : str or color code, default=style.style_grey
                Bar color.
            display_df : boolean, default=False
                Controls whether to display summary data in Pandas DataFrame in addition to chart.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
    """
    # use dataset provided during instantiation if None
    if data is None:
        data = self.data

    # return missingness summary
    percent_missing = self.missing_summary(data)

    # if missingness summary is not empty, create the visualization
    if not percent_missing.empty:
        # optionally display DataFrame summary
        if display_df:
            display(percent_missing)

        # create prettierplot object
        p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_standard")

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="Percent missing by feature",
            y_shift=0.8,
            title_scale=0.8,
        )

        # add vertical bar chart to canvas
        p.bar_v(
            x=percent_missing.index,
            counts=percent_missing["Percent missing"],
            label_rotate=45 if len(percent_missing.index) <= 5 else 90,
            color=color,
            y_units="p",
            x_tick_wrap=False,
            ax=ax,
        )
    
    # if missingness summary is empty, just print "No Nulls"
    else:
        print("No nulls")
Example #2
def eda_missing_summary(self, training_data=True, color=style.style_grey, display_df=False, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Creates vertical bar chart visualizing the percent of values missing for each feature.
            Optionally displays the underlying Pandas DataFrame.

        ---
        Parameters:
            training_data : boolean, default=True
                Controls which dataset (training or validation) is used for visualization.
            color : str or color code, default=style.style_grey
                Bar color.
            display_df : boolean, default=False
                Controls whether to display summary data in Pandas DataFrame in addition to chart.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
    """
    # dynamically choose training data objects or validation data objects
    data, _, mlm_dtypes = self.training_or_validation_dataset(training_data)

    # return missingness summary
    percent_missing = self.missing_summary(training_data)

    # if missingness summary is not empty, create the visualization
    if not percent_missing.empty:
        # optionally display DataFrame summary
        if display_df:
            display(percent_missing)

        # create prettierplot object
        p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_standard")

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="Percent missing by feature",
            y_shift=0.8,
            title_scale=0.8,
        )

        # add vertical bar chart to canvas
        p.bar_v(
            x=percent_missing.index,
            counts=percent_missing["Percent missing"],
            label_rotate=45 if len(percent_missing.index) <= 5 else 90,
            color=color,
            y_units="p",
            x_tick_wrap=False,
            ax=ax,
        )

        ax.set_ylim([0,100])

    # if missingness summary is empty, just print "No Nulls"
    else:
        print("No nulls")
Example #3
def eda_skew_summary(self, data=None, color=style.style_grey, display_df=False, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Creates vertical bar chart visualizing the skew for each feature. Optionally
            displays the underlying Pandas DataFrame.

        ---
        Parameters:
            data : Pandas DataFrame, default=None
                Pandas DataFrame containing independent variables. If left as None,
                the feature dataset provided to Machine during instantiation is used.
            color : str or color code, default=style.style_grey
                Bar color.
            display_df : boolean, default=False
                Controls whether to display summary data in Pandas DataFrame along with chart.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
    """
    # use dataset provided during instantiation if None
    if data is None:
        data = self.data

    # return skewness summary
    skew_summary = self.skew_summary(data)

    # optionally display DataFrame summary
    if display_df:
        display(skew_summary)

    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_standard")

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Skew by feature",
        y_shift=0.8,
        title_scale=0.8,
    )

    # add vertical bar chart to canvas
    p.bar_v(
        x=skew_summary.index,
        counts=skew_summary["Skew"],
        label_rotate=45 if len(skew_summary.index) <= 5 else 90,
        color=color,
        y_units="fff",
        x_tick_wrap=False,
        ax=ax,
    )
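
A standalone sketch of the skew summary being plotted, assuming skew_summary returns a DataFrame indexed by feature with a "Skew" column (inferred from the bar_v call above; sample data is hypothetical):

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "right_skewed": np.random.exponential(scale=2.0, size=1000),
    "roughly_normal": np.random.normal(loc=0.0, scale=1.0, size=1000),
})

# per-feature skew, sorted from most to least skewed
skew_summary = df.skew().to_frame(name="Skew").sort_values("Skew", ascending=False)
print(skew_summary)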
Example #4
def eda_skew_summary(self, training_data=True, color=style.style_grey, display_df=False, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Creates vertical bar chart visualizing the skew for each feature. Optionally
            displays the underlying Pandas DataFrame.

        ---
        Parameters:
            training_data : boolean, default=True
                Controls which dataset (training or validation) is used for visualization.
            color : str or color code, default=style.style_grey
                Bar color.
            display_df : boolean, default=False
                Controls whether to display summary data in Pandas DataFrame along with chart.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
    """
    # dynamically choose training data objects or validation data objects
    data, _, mlm_dtypes = self.training_or_validation_dataset(training_data)

    # return skewness summary
    skew_summary = self.skew_summary(data)

    # optionally display DataFrame summary
    if display_df:
        display(skew_summary)

    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_standard")

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Skew by feature",
        y_shift=0.8,
        title_scale=0.8,
    )

    # add vertical bar chart to canvas
    p.bar_v(
        x=skew_summary.index,
        counts=skew_summary["Skew"],
        label_rotate=45 if len(skew_summary.index) <= 5 else 90,
        color=color,
        y_units="fff",
        x_tick_wrap=False,
        ax=ax,
    )
Example #5
def sample_plot(self, sample_space, n_iter, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Visualizes a single hyperopt theoretical distribution. Useful for helping to determine a
            distribution to use when setting up hyperopt distribution objects for actual parameter
            tuning.

        ---
        Parameters:
            sample_space : dictionary
                Dictionary of 'param name: hyperopt distribution object' key/value pairs. The name can
                be arbitrarily chosen, and the value is a defined hyperopt distribution.
            n_iter : int
                Number of iterations to draw from the theoretical distribution in order to
                visualize it. A higher number yields a more representative distribution but
                can take considerably longer to create.
            chart_scale : float, default=15
                Controls proportions of visualizations. Larger values scale the visual up in
                size, smaller values scale it down.

    """
    # iterate through each parameter
    for param in sample_space.keys():

        # draw n_iter samples from the theoretical distribution
        theoretical_dist = np.array(
            [sample(sample_space)[param] for _ in range(n_iter)]
        )

        # create prettierplot object
        p = PrettierPlot(chart_scale=chart_scale)

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="actual vs. theoretical plot\n* {}".format(param),
            y_shift=0.8,
            position=111,
        )

        # add kernel density plot to canvas
        p.kde_plot(
            theoretical_dist,
            color=style.style_grey,
            y_units="p",
            x_units="fff" if np.nanmax(theoretical_dist) <= 5.0 else "ff",
            ax=ax,
        )
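
A hedged usage sketch: sample here appears to be hyperopt's hyperopt.pyll.stochastic.sample, which draws one value from each distribution in the dictionary. Assuming that, a call might look like the following ('machine' is a hypothetical object exposing this method):

from hyperopt import hp

# hypothetical sample space: one entry per hyperparameter to visualize
sample_space = {
    "learning_rate": hp.uniform("learning_rate", 0.01, 0.3),
    "max_depth": hp.quniform("max_depth", 2, 10, 1),
}

# draw and visualize 500 samples per distribution
machine.sample_plot(sample_space=sample_space, n_iter=500)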
Example #6
def eda_transform_log1(self, data, name, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Creates a two-panel visualization. The left plot is the log + 1 transformed
            distribution overlaid on a normal distribution. The right plot is a log + 1
            adjusted QQ plot overlaid on a straight line.

        ---
        Parameters:
            data : Pandas Series
                Target variable data object.
            name : str
                Name of target variable.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
    """
    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="dist/kde - {} (log+1)".format(name),
        x_label="",
        y_label="",
        y_shift=0.8,
        position=223,
    )

    # add distribution / kernel density plot to canvas
    p.dist_plot(
        np.log1p(data), color=style.style_grey, fit=stats.norm, x_rotate=True, ax=ax
    )

    # turn off x and y ticks
    plt.xticks([])
    plt.yticks([])

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="probability plot - {} (log+1)".format(name),
        x_label="",
        y_label="",
        y_shift=0.8,
        position=224,
    )

    # add QQ / probability plot to canvas
    p.prob_plot(np.log1p(data), plot=ax)

    # turn off x and y ticks
    plt.xticks([])
    plt.yticks([])
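
The same two diagnostics can be produced with scipy and matplotlib directly; a minimal sketch with hypothetical right-skewed data:

import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

data = np.random.exponential(scale=10.0, size=500)
transformed = np.log1p(data)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

# log + 1 transformed distribution with a fitted normal curve
ax1.hist(transformed, bins=30, density=True, color="grey", alpha=0.6)
grid = np.linspace(transformed.min(), transformed.max(), 200)
ax1.plot(grid, stats.norm.pdf(grid, *stats.norm.fit(transformed)))
ax1.set_title("dist/kde - target (log+1)")

# QQ / probability plot of the transformed values
stats.probplot(transformed, plot=ax2)
plt.show()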
Example #7
def eda_cat_target_num_feat(self,
                            feature,
                            color_map="viridis",
                            outliers_out_of_scope=None,
                            legend_labels=None,
                            chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Creates exploratory data visualizations and statistical summaries for a numeric
            feature in the context of a categorical target.

        ---
        Parameters:
            feature : str
                Feature to visualize.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            outliers_out_of_scope : boolean, float or int, default=None
                Truncates the x-axis upper limit so that outliers are out of scope of the visualization.
                The x-axis upper limit is reset to the maximum non-outlier value.

                To identify outliers, the IQR is calculated, and values below the first quartile
                minus the IQR, or above the third quartile plus the IQR, are designated as outliers.
                If True is passed, the IQR that is subtracted/added is multiplied by 5. If a float
                or int is passed, the IQR is multiplied by that value. Higher values require
                observations to be more extreme before they are flagged as outliers.
            legend_labels : list, default=None
                Class labels displayed in plot legend.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates larger plots
                and increases visual elements proportionally.
    """
    ### data summaries
    ## bivariate roll_up table
    # combine feature column and target
    bi_df = pd.concat([self.data[feature], self.target], axis=1)

    # remove any rows with nulls
    bi_df = bi_df[bi_df[feature].notnull()]

    # bivariate summary statistics
    bi_summ_stats_df = pd.DataFrame(
        columns=["Class", "Count", "Proportion", "Mean", "StdDev"])

    # for each unique class label
    for labl in np.unique(self.target):

        # get feature values associated with single class label
        feature_slice = bi_df[bi_df[self.target.name] == labl][feature]

        # append summary statistics for feature values associated with class label
        bi_summ_stats_df = bi_summ_stats_df.append(
            {
                "Class": labl,
                "Count": len(feature_slice),
                "Proportion": len(feature_slice) / len(bi_df[feature]) * 100,
                "Mean": np.mean(feature_slice),
                "StdDev": np.std(feature_slice),
            },
            ignore_index=True,
        )

    # apply custom legend labels, or set dtype to int if column values are numeric
    if legend_labels is not None:
        bi_summ_stats_df["Class"] = legend_labels
    elif is_numeric_dtype(bi_summ_stats_df["Class"]):
        bi_summ_stats_df["Class"] = bi_summ_stats_df["Class"].astype(np.int)

    ## Feature summary
    describe_df = pd.DataFrame(bi_df[feature].describe()).reset_index()

    # add missing percentage
    describe_df = describe_df.append(
        {
            "index": "missing",
            feature: np.round(self.data.shape[0] - bi_df[feature].shape[0], 5),
        },
        ignore_index=True,
    )

    # add skew
    describe_df = describe_df.append(
        {
            "index":
            "skew",
            feature:
            np.round(stats.skew(bi_df[feature].values, nan_policy="omit"), 5),
        },
        ignore_index=True,
    )
    # add kurtosis
    describe_df = describe_df.append(
        {
            "index": "kurtosis",
            feature: stats.kurtosis(bi_df[feature].values, nan_policy="omit"),
        },
        ignore_index=True,
    )
    describe_df = describe_df.rename(columns={"index": ""})

    # execute z-test or t-test
    if len(np.unique(self.target)) == 2:
        s1 = bi_df[(bi_df[self.target.name] == bi_df[
            self.target.name].unique()[0])][feature]
        s2 = bi_df[(bi_df[self.target.name] == bi_df[
            self.target.name].unique()[1])][feature]
        if len(s1) > 30 and len(s2) > 30:

            # perform z-test, return z-statistic and p-value
            z, p_val = ztest(s1, s2)

            # add z-statistic and p-value to DataFrame
            stat_test_df = pd.DataFrame(
                data=[{
                    "z-test statistic": z,
                    "p-value": p_val
                }],
                columns=["z-test statistic", "p-value"],
                index=[feature],
            ).round(4)
        else:
            # perform t-test, return t-score and p-value
            t, p_val = stats.ttest_ind(s1, s2)

            # add t-statistic and p-value to DataFrame
            stat_test_df = pd.DataFrame(
                data=[{
                    "t-test statistic": t,
                    "p-value": p_val
                }],
                columns=["t-test statistic", "p-value"],
                index=[feature],
            ).round(4)

        # display summary tables
        self.df_side_by_side(
            dfs=(describe_df, bi_summ_stats_df, stat_test_df),
            names=[
                "Feature summary", "Feature vs. target summary",
                "Statistical test"
            ],
        )
    else:

        # display summary tables
        self.df_side_by_side(
            dfs=(describe_df, bi_summ_stats_df),
            names=["Feature summary", "Feature vs. target summary"],
        )

    ### visualizations
    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_standard")

    # if boolean is passed to outliers_out_of_scope
    if isinstance(outliers_out_of_scope, bool):
        # if outliers_out_of_scope = True
        if outliers_out_of_scope:

            # identify outliers using IQR method and an IQR step of 5
            outliers = self.outlier_IQR(self.data[feature], iqr_step=5)

            # reset x-axis minimum and maximum
            x_axis_min = self.data[feature].drop(index=outliers).min()
            x_axis_max = self.data[feature].drop(index=outliers).max()
    # if outliers_out_of_scope is a float or int
    elif isinstance(outliers_out_of_scope, float) or isinstance(
            outliers_out_of_scope, int):
        # identify outliers using IQR method and an IQR step equal to the float/int passed
        outliers = self.outlier_IQR(self.data[feature],
                                    iqr_step=outliers_out_of_scope)

        # reset x-axis minimum and maximum
        x_axis_min = self.data[feature].drop(index=outliers).min()
        x_axis_max = self.data[feature].drop(index=outliers).max()

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Feature distribution\n* {}".format(feature),
        title_scale=0.85,
        position=221,
    )

    ## dynamically determine precision of x-units
    # capture min and max feature values
    dist_min = bi_df[feature].values.min()
    dist_max = bi_df[feature].values.max()

    # determine x-units precision based on min and max values in feature
    if -3 < dist_min < 3 and -3 < dist_max < 3 and dist_max / dist_min < 10:
        x_units = "fff"
    elif -30 < dist_min < 30 and -30 < dist_max < 30 and dist_max / dist_min < 3:
        x_units = "fff"
    elif -5 < dist_min < 5 and -5 < dist_max < 5 and dist_max / dist_min < 10:
        x_units = "ff"
    elif -90 < dist_min < 90 and -90 < dist_max < 90 and dist_max / dist_min < 5:
        x_units = "ff"
    else:
        x_units = "f"

    # add distribution plot to canvas
    p.dist_plot(
        bi_df[feature].values,
        color=style.style_grey,
        y_units="f",
        x_units=x_units,
        ax=ax,
    )

    # optionally reset x-axis limits (only when outlier trimming was requested)
    if outliers_out_of_scope:
        plt.xlim(x_axis_min, x_axis_max)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Probability plot\n* {}".format(feature),
        title_scale=0.85,
        position=222,
    )

    # add QQ / probability plot to canvas
    p.prob_plot(
        x=bi_df[feature].values,
        plot=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Distribution by class\n* {}".format(feature),
        title_scale=0.85,
        position=223,
    )

    # x-units precision determined above still applies (same feature values)

    # generate color list
    color_list = style.color_gen(name=color_map,
                                 num=len(np.unique(self.target)))

    # add one distribution plot to canvas for each category class
    for ix, labl in enumerate(np.unique(bi_df[self.target.name].values)):
        p.dist_plot(
            bi_df[bi_df[self.target.name] == labl][feature].values,
            color=color_list[ix],
            y_units="f",
            x_units=x_units,
            legend_labels=legend_labels if legend_labels is not None else
            np.arange(len(np.unique(self.target))),
            alpha=0.4,
            bbox=(1.0, 1.0),
            ax=ax,
        )

    # optionally reset x-axis limits (only when outlier trimming was requested)
    if outliers_out_of_scope:
        plt.xlim(x_axis_min, x_axis_max)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Boxplot by class\n* {}".format(feature),
        title_scale=0.85,
        position=224,
    )

    # x-units precision determined above still applies (same feature values)

    # add horizontal box plot to canvas
    p.box_plot_h(x=feature,
                 y=self.target.name,
                 data=bi_df,
                 alpha=0.7,
                 x_units=x_units,
                 legend_labels=legend_labels,
                 bbox=(1.2, 1.0),
                 suppress_outliers=True,
                 ax=ax)

    # optionally reset x-axis limits (only when outlier trimming was requested)
    if outliers_out_of_scope:
        plt.xlim(x_axis_min - (x_axis_min * 0.1), x_axis_max)

    # apply position adjustment to subplots
    plt.subplots_adjust(bottom=-0.1)

    plt.show()
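
A minimal sketch of the IQR outlier rule the docstring describes, assuming outlier_IQR returns the index labels of flagged observations (an assumption consistent with the drop(index=outliers) calls above):

import pandas as pd

def outlier_iqr(series, iqr_step=1.5):
    # flag values below Q1 - step * IQR or above Q3 + step * IQR
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = q3 - q1
    mask = (series < q1 - iqr_step * iqr) | (series > q3 + iqr_step * iqr)
    return series[mask].index

outliers = outlier_iqr(pd.Series([1, 2, 3, 4, 100]), iqr_step=1.5)
print(outliers)  # index of the flagged value(s)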
Example #8
def eda_cat_target_cat_feat(self,
                            feature,
                            level_count_cap=50,
                            color_map="viridis",
                            legend_labels=None,
                            chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Creates exploratory data visualizations and statistical summaries for a category feature
            in the context of a categorical target.

        ---
        Parameters:
            feature : str
                Feature to visualize.
            level_count_cap : int, default=50
                Maximum number of unique levels in feature. If the number of levels exceeds the
                cap, then no visualization panel is produced.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            legend_labels : list, default=None
                Class labels displayed in plot legend.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
    """

    # if number of unique levels in feature is less than specified level_count_cap
    if (len(np.unique(self.data[self.data[feature].notnull()][feature].values))
            < level_count_cap):

        ### data summaries
        ## feature summary
        # create empty DataFrame
        uni_summ_df = pd.DataFrame(columns=[feature, "Count", "Proportion"])

        # capture unique values and count of those unique values
        unique_vals, unique_counts = np.unique(
            self.data[self.data[feature].notnull()][feature],
            return_counts=True)

        # append each unique value, count and proportion to DataFrame
        for i, j in zip(unique_vals, unique_counts):
            uni_summ_df = uni_summ_df.append(
                {
                    feature: i,
                    "Count": j,
                    "Proportion": j / np.sum(unique_counts) * 100,
                },
                ignore_index=True,
            )

        # sort DataFrame by "Proportion", descending
        uni_summ_df = uni_summ_df.sort_values(by=["Proportion"],
                                              ascending=False)

        # set values to int dtype where applicable to optimize displayed DataFrame
        uni_summ_df["Count"] = uni_summ_df["Count"].astype("int64")
        if is_numeric_dtype(uni_summ_df[feature]):
            uni_summ_df[feature] = uni_summ_df[feature].astype("int64")

        ## feature vs. target summary
        # combine feature column and target
        bi_df = pd.concat([self.data[feature], self.target], axis=1)

        # remove any rows with nulls
        bi_df = bi_df[bi_df[feature].notnull()]

        # groupby category feature and count the occurrences of target classes
        # for each level in category
        bi_summ_df = (
            bi_df.groupby([feature] +
                          [self.target.name]).size().reset_index().pivot(
                              columns=self.target.name,
                              index=feature,
                              values=0))

        # overwrite DataFrame index with actual class labels if provided
        bi_summ_df.columns = pd.Index(
            legend_labels) if legend_labels is not None else pd.Index(
                [i for i in bi_summ_df.columns.tolist()])
        bi_summ_df.reset_index(inplace=True)

        # fill nan's with zero
        fill_columns = bi_summ_df.iloc[:, 2:].columns
        bi_summ_df[fill_columns] = bi_summ_df[fill_columns].fillna(0)

        # set values to int dtype where applicable to optimize displayed DataFrame
        for column in bi_summ_df.columns:
            try:
                bi_summ_df[column] = bi_summ_df[column].astype(int)  # np.int alias is deprecated
            except ValueError:
                bi_summ_df[column] = bi_summ_df[column]

        ## proportion by category summary
        # combine feature column and target
        prop_df = pd.concat([self.data[feature], self.target], axis=1)

        # remove any rows with nulls
        prop_df = prop_df[prop_df[feature].notnull()]

        # calculate percent of 100 by class label
        prop_df = prop_df.groupby([feature, self.target.name
                                   ]).agg({self.target.name: {"count"}})
        prop_df = prop_df.groupby(
            level=0).apply(lambda x: 100 * x / float(x.sum()))
        prop_df = prop_df.reset_index()

        multiIndex = prop_df.columns
        singleIndex = [i[0] for i in multiIndex.tolist()]
        singleIndex[-1] = "Count"
        prop_df.columns = singleIndex
        prop_df = prop_df.reset_index(drop=True)

        prop_df = pd.pivot_table(prop_df,
                                 values=["Count"],
                                 columns=[feature],
                                 index=[self.target.name],
                                 aggfunc={"Count": np.mean})
        prop_df = prop_df.reset_index(drop=True)

        multiIndex = prop_df.columns
        singleIndex = []

        for column in multiIndex.tolist():
            try:
                singleIndex.append(int(column[1]))
            except ValueError:
                singleIndex.append(column[1])

        prop_df.columns = singleIndex
        prop_df = prop_df.reset_index(drop=True)

        # insert column to DataFrame with actual class labels if provided, otherwise use raw class labels in target
        prop_df.insert(loc=0,
                       column="Class",
                       value=legend_labels if legend_labels is not None else
                       np.unique(self.target))

        # fill nan's with zero
        fill_columns = prop_df.iloc[:, :].columns
        prop_df[fill_columns] = prop_df[fill_columns].fillna(0)

        # if there are only two class labels, perform z-test/t-test
        if len(np.unique(bi_df[bi_df[feature].notnull()][feature])) == 2:

            # total observations
            total_obs1 = bi_df[(bi_df[feature] == np.unique(
                bi_df[feature])[0])][feature].shape[0]
            total_obs2 = bi_df[(bi_df[feature] == np.unique(
                bi_df[feature])[1])][feature].shape[0]

            # total positive observations
            pos_obs1 = bi_df[(bi_df[feature] == np.unique(bi_df[feature])[0])
                             &
                             (bi_df[self.target.name] == 1)][feature].shape[0]
            pos_obs2 = bi_df[(bi_df[feature] == np.unique(bi_df[feature])[1])
                             &
                             (bi_df[self.target.name] == 1)][feature].shape[0]

            # perform z-test, return z-statistic and p-value
            z, p_val = proportions_ztest(count=(pos_obs1, pos_obs2),
                                         nobs=(total_obs1, total_obs2))

            # add z-statistic and p-value to DataFrame
            stat_test_df = pd.DataFrame(
                data=[{
                    "z-test statistic": z,
                    "p-value": p_val
                }],
                columns=["z-test statistic", "p-value"],
                index=[feature],
            ).round(4)

            # display summary tables
            self.df_side_by_side(
                dfs=(uni_summ_df, bi_summ_df, prop_df, stat_test_df),
                names=[
                    "Feature summary",
                    "Feature vs. target summary",
                    "Target proportion",
                    "Statistical test",
                ],
            )
            if "percent_positive" in bi_summ_df:
                bi_summ_df = bi_summ_df.drop(["percent_positive"], axis=1)

        else:
            # display summary tables
            self.df_side_by_side(
                dfs=(uni_summ_df, bi_summ_df, prop_df),
                names=[
                    "Feature summary", "Feature vs. target summary",
                    "Target proportion"
                ],
            )
            if "percent_positive" in bi_summ_df:
                bi_summ_df = bi_summ_df.drop(["percent_positive"], axis=1)

        ### visualizations
        # set label rotation angle based on level count and average label length
        len_unique_val = len(unique_vals)
        avg_len_unique_val = sum(len(str(val)) for val in unique_vals) / len(unique_vals)
        if len_unique_val <= 4 and avg_len_unique_val <= 12:
            rotation = 0
        elif len_unique_val >= 5 and len_unique_val <= 8 and avg_len_unique_val <= 8:
            rotation = 0
        elif len_unique_val >= 9 and len_unique_val <= 14 and avg_len_unique_val <= 4:
            rotation = 0
        else:
            rotation = 90

        # create prettierplot object
        p = PrettierPlot(chart_scale=chart_scale,
                         plot_orientation="wide_narrow")

        # add canvas to prettierplot object
        ax = p.make_canvas(title="Category counts\n* {}".format(feature),
                           position=131,
                           title_scale=0.82)

        # add treemap to canvas
        p.tree_map(
            counts=uni_summ_df["Count"].values,
            labels=uni_summ_df[feature].values,
            colors=style.color_gen(name=color_map,
                                   num=len(uni_summ_df[feature].values)),
            alpha=0.8,
            ax=ax,
        )

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="Category counts by target\n* {}".format(feature),
            position=132)

        # add faceted categorical plot to canvas
        p.facet_cat(
            df=bi_summ_df,
            feature=feature,
            label_rotate=rotation,
            color_map=color_map,
            bbox=(1.0, 1.15),
            alpha=0.8,
            legend_labels=legend_labels,
            x_units=None,
            ax=ax,
        )

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="Target proportion by category\n* {}".format(feature),
            position=133)

        # add stacked bar chart to canvas
        p.stacked_bar_h(
            df=prop_df.drop("Class", axis=1),
            bbox=(1.0, 1.15),
            legend_labels=legend_labels,
            color_map=color_map,
            alpha=0.8,
            ax=ax,
        )

        plt.show()
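
A standalone sketch of the two-proportion z-test used above, via statsmodels' proportions_ztest (counts are hypothetical):

from statsmodels.stats.proportion import proportions_ztest

# positive-class observations and total observations for the two feature levels
pos_obs = (45, 30)
total_obs = (200, 180)

z, p_val = proportions_ztest(count=pos_obs, nobs=total_obs)
print("z = {:.4f}, p-value = {:.4f}".format(z, p_val))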
Example #9
def binary_classification_panel(self,
                                model,
                                labels=None,
                                title_scale=1.0,
                                color_map="viridis",
                                random_state=1,
                                chart_scale=15,
                                save_objects=False):
    """
    Documentation:

        ---
        Description:
            Generate a panel of reports and visualizations summarizing the
            performance of a classification model.

        ---
        Parameters:
            model : model object
                Instantiated model object.
            labels : list, default=None
                Custom labels for confusion matrix axes. If left as None, the raw class
                values (0, 1, 2, ...) are used.
            title_scale : float, default=1.0
                Controls the scaling up (higher value) and scaling down (lower value) of the size
                of the main chart title, the x_axis title and the y_axis title.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            random_state : int, default=1
                Random number seed.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
            save_objects : boolean, default=False
                Controls whether visualizations and summary table are saved to the experiment directory.
    """
    if not save_objects:
        print("*" * 55)
        print(f"* Estimator: {model.estimator_name}")
        print(f"* Parameter set: {model.model_iter}")
        print("*" * 55)

        print("\n" + "*" * 55)
        print("Training data evaluation\n")

    ## training data
    # fit model on training data and generate predictions using training data
    y_pred = model.fit(self.training_features,
                       self.training_target).predict(self.training_features)

    # generate classification_report using training data
    report = classification_report(
        self.training_target,
        y_pred,
        target_names=labels
        if labels is not None else np.unique(self.training_target.values),
        output_dict=True,
    )

    df = pd.DataFrame(report).transpose()

    # save or display classification report
    if save_objects:
        csv_path = os.path.join(
            self.evaluation_classification_report_object_dir,
            f"{model.estimator_name}_train_classification_report.csv")
        df.to_csv(csv_path, index=False)

    else:
        display(df)

    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title=
        f"Confusion matrix - training data\nModel: {model.estimator_name}\nParameter set: {model.model_iter}",
        y_shift=0.4,
        x_shift=0.25,
        position=121,
        title_scale=title_scale,
    )

    # add confusion plot to canvas
    plot_confusion_matrix(
        estimator=model,
        X=self.training_features,
        y_true=self.training_target,
        display_labels=labels
        if labels is not None else np.unique(self.training_target.values),
        cmap=color_map,
        values_format=".0f",
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title=
        f"ROC curve - training data\nModel: {model.estimator_name}\nParameter set: {model.model_iter}",
        x_label="False positive rate",
        y_label="True positive rate",
        y_shift=0.35,
        position=122,
        title_scale=title_scale,
    )
    # add ROC curve to canvas
    p.roc_curve_plot(
        model=model,
        X_train=self.training_features,
        y_train=self.training_target,
        linecolor=style.style_grey,
        ax=ax,
    )
    plt.subplots_adjust(wspace=0.3)

    # save plots or show
    if save_objects:
        plot_path = os.path.join(
            self.evaluation_plots_object_dir,
            f"{model.estimator_name}_train_visualization.jpg")
        plt.tight_layout()
        plt.savefig(plot_path)
        plt.close()
    else:
        plt.show()

    ## validation data
    if not save_objects:
        print("\n" + "*" * 55)
        print("Validation data evaluation\n")

    # fit model on training data and generate predictions using validation data
    y_pred = model.fit(self.training_features,
                       self.training_target).predict(self.validation_features)

    # generate classification_report using validation data
    report = classification_report(
        self.validation_target,
        y_pred,
        target_names=labels
        if labels is not None else np.unique(self.training_target.values),
        output_dict=True,
    )

    df = pd.DataFrame(report).transpose()

    # save or display classification report
    if save_objects:
        csv_path = os.path.join(
            self.evaluation_classification_report_object_dir,
            f"{model.estimator_name}_validation_classification_report.csv")
        df.to_csv(csv_path, index=False)

    else:
        display(df)

    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title=
        f"Confusion matrix - validation data\nModel: {model.estimator_name}\nParameter set: {model.model_iter}",
        y_shift=0.4,
        x_shift=0.25,
        position=121,
        title_scale=title_scale,
    )

    # add confusion matrix to canvas
    plot_confusion_matrix(
        estimator=model,
        X=self.validation_features,
        y_true=self.validation_target,
        display_labels=labels
        if labels is not None else np.unique(self.training_target.values),
        cmap=color_map,
        values_format=".0f",
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title=
        f"ROC curve - validation data\nModel: {model.estimator_name}\nParameter set: {model.model_iter}",
        x_label="False positive rate",
        y_label="True positive rate",
        y_shift=0.35,
        position=122,
        title_scale=title_scale,
    )
    # add ROC curve to canvas
    p.roc_curve_plot(
        model=model,
        X_train=self.training_features,
        y_train=self.training_target,
        X_valid=self.validation_features,
        y_valid=self.validation_target,
        linecolor=style.style_grey,
        ax=ax,
    )
    plt.subplots_adjust(wspace=0.3)

    # save plots or show
    if save_objects:
        plot_path = os.path.join(
            self.evaluation_plots_object_dir,
            f"{model.estimator_name}_validation_visualization.jpg")
        plt.tight_layout()
        plt.savefig(plot_path)
        plt.close()
    else:
        plt.show()
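
The report-to-DataFrame step above in isolation; a runnable sketch with toy labels:

import pandas as pd
from sklearn.metrics import classification_report

y_true = [0, 1, 1, 0, 1, 0]
y_pred = [0, 1, 0, 0, 1, 1]

# output_dict=True returns a nested dict; transposing puts one class per row
report = classification_report(y_true, y_pred, output_dict=True)
df = pd.DataFrame(report).transpose()
print(df)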
Example #10
def regression_panel(self,
                     model,
                     X_train,
                     y_train,
                     X_valid=None,
                     y_valid=None,
                     n_folds=None,
                     title_scale=1.0,
                     color_map="viridis",
                     random_state=1,
                     chart_scale=15):
    """
    Documentation:
        Description:
            creates a set of residual plots and pandas DataFrames, where each row captures various summary statistics
            pertaining to a model's performance. generates residual plots and captures performance data for training
            and validation datasets. If no validation set is provided, then cross_validation is performed on the
            training dataset.
        Parameters:
            model : model object
                Instantiated model object.
            X_train : Pandas DataFrame
                Training data observations.
            y_train : Pandas Series
                Training target data.
            X_valid : Pandas DataFrame, default=None
                Validation data observations.
            y_valid : Pandas Series, default=None
                Validation target data.
            n_folds : int, default=None
                Number of cross-validation folds to use. If validation data is provided through
                X_valid/y_valid, n_folds is ignored.
            title_scale : float, default=1.0
                Controls the scaling up (higher value) and scaling down (lower value) of the size of
                the main chart title, the x_axis title and the y_axis title.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            random_state : int, default=1
                Random number seed.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates larger plots
                and increases visual elements proportionally.
    """

    print("*" * 55)
    print("* Estimator: {}".format(model.estimator_name))
    print("* Parameter set: {}".format(model.model_iter))
    print("*" * 55)

    print("\n" + "*" * 55)
    print("Training data evaluation")

    # fit model on training data
    model.fit(X_train.values, y_train.values)

    ## training dataset
    # generate predictions using training data and calculate residuals
    y_pred = model.predict(X_train.values)
    residuals = y_pred - y_train.values

    # create prettierplot object
    p = PrettierPlot(plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Residual plot - training data\nModel: {}\nParameter set: {}".
        format(
            model.estimator_name,
            model.model_iter,
        ),
        x_label="Predicted values",
        y_label="Residuals",
        y_shift=0.55,
        title_scale=title_scale,
        position=121,
    )

    # dynamically size precision of x-units based on magnitude of maximum
    # predicted values
    if -1 <= np.nanmax(y_pred) <= 1:
        x_units = "fff"
    elif -100 <= np.nanmax(y_pred) <= 100:
        x_units = "ff"
    else:
        x_units = "f"

    # dynamically size precision of y-units based on magnitude of maximum
    # predicted values
    if -0.1 <= np.nanmax(residuals) <= 0.1:
        y_units = "ffff"
    elif -1 <= np.nanmax(residuals) <= 1:
        y_units = "fff"
    elif -10 <= np.nanmax(residuals) <= 10:
        y_units = "ff"
    else:
        y_units = "f"

    # x tick label rotation
    if -10000 < np.nanmax(y_pred) < 10000:
        x_rotate = 0
    else:
        x_rotate = 45

    # add 2-dimensional scatter plot to canvas
    p.scatter_2d(
        x=y_pred,
        y=residuals,
        size=7,
        color=style.style_grey,
        y_units=y_units,
        x_units=x_units,
        ax=ax,
    )

    # plot horizontal line at y=0
    plt.hlines(y=0,
               xmin=np.min(y_pred),
               xmax=np.max(y_pred),
               color=style.style_grey,
               lw=2)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title=
        "Residual distribution - training data\nModel: {}\nParameter set: {}".
        format(
            model.estimator_name,
            model.model_iter,
        ),
        title_scale=title_scale,
        position=122,
    )

    # add distribution plot to canvas
    p.dist_plot(
        residuals,
        fit=stats.norm,
        color=style.style_grey,
        y_units="ff",
        x_units="fff",
        ax=ax,
    )
    plt.show()

    # generate regression_stats using training data and predictions
    results = self.regression_stats(
        model=model,
        y_true=y_train.values,
        y_pred=y_pred,
        feature_count=X_train.shape[1],
    )

    # create shell results DataFrame and append
    regression_results_summary = pd.DataFrame(columns=list(results.keys()))
    regression_results_summary = regression_results_summary.append(
        results, ignore_index=True)

    ## validation dataset
    # if validation data is provided...
    if X_valid is not None:
        print("\n" + "*" * 55)
        print("Training data evaluation")

        # generate predictions with validation data and calculate residuals
        y_pred = model.predict(X_valid.values)
        residuals = y_pred - y_valid.values

        # create prettierplot object
        p = PrettierPlot(plot_orientation="wide_narrow")

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="Residual plot - training data\nModel: {}\nParameter set: {}"
            .format(
                model.estimator_name,
                model.model_iter,
            ),
            x_label="Predicted values",
            y_label="Residuals",
            y_shift=0.55,
            title_scale=title_scale,
            position=121,
        )

        # add 2-dimensional scatter plot to canvas
        p.scatter_2d(
            x=y_pred,
            y=residuals,
            size=7,
            color=style.style_grey,
            y_units=y_units,
            x_units=x_units,
            ax=ax,
        )

        # plot horizontal line at y=0
        plt.hlines(y=0,
                   xmin=np.min(y_pred),
                   xmax=np.max(y_pred),
                   color=style.style_grey,
                   lw=2)

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title=
            "Residual distribution - validation data\nModel: {}\nParameter set: {}"
            .format(
                model.estimator_name,
                model.model_iter,
            ),
            title_scale=title_scale,
            position=122,
        )

        # add distribution plot to canvas
        p.dist_plot(
            residuals,
            fit=stats.norm,
            color=style.style_grey,
            y_units="ff",
            x_units="fff",
            ax=ax,
        )
        plt.show()

        # generate regression_stats using validation data and predictions
        results = self.regression_stats(
            model=model,
            y_true=y_valid.values,
            y_pred=y_pred,
            feature_count=X_valid.shape[1],
            data_type="validation",
        )

        # append results to regression_results_summary
        regression_results_summary = regression_results_summary.append(
            results, ignore_index=True)
        display(regression_results_summary)

    # if n_folds are provided, indicating cross-validation
    elif isinstance(n_folds, int):

        # generate cross-validation indices
        cv = list(
            KFold(n_splits=n_folds, shuffle=True,
                  random_state=random_state).split(X_train, y_train))

        print("\n" + "*" * 55)
        print("Cross validation evaluation")

        # iterate through cross-validation indices
        for i, (train_ix, valid_ix) in enumerate(cv):
            X_train_cv = X_train.iloc[train_ix]
            y_train_cv = y_train.iloc[train_ix]
            X_valid_cv = X_train.iloc[valid_ix]
            y_valid_cv = y_train.iloc[valid_ix]

            # fit model on training data and generate predictions using holdout observations
            y_pred = model.fit(X_train_cv.values,
                               y_train_cv.values).predict(X_valid_cv.values)

            # calculate residuals
            residuals = y_pred - y_valid_cv.values

            # create prettierplot object
            p = PrettierPlot(plot_orientation="wide_narrow")

            # add canvas to prettierplot object
            ax = p.make_canvas(
                title="Residual plot - CV fold {}\nModel: {}\nParameter set: {}"
                .format(
                    i + 1,
                    model.estimator_name,
                    model.model_iter,
                ),
                x_label="Predicted values",
                y_label="Residuals",
                y_shift=0.55,
                position=121,
                title_scale=title_scale,
            )

            # add 2-dimensional scatter plot to canvas
            p.scatter_2d(
                x=y_pred,
                y=residuals,
                size=7,
                color=style.style_grey,
                y_units=y_units,
                x_units=x_units,
                ax=ax,
            )

            # plot horizontal line at y=0
            plt.hlines(
                y=0,
                xmin=np.min(y_pred),
                xmax=np.max(y_pred),
                color=style.style_grey,
                lw=2,
            )

            # add canvas to prettierplot object
            ax = p.make_canvas(
                title=
                "Residual distribution - CV fold {}\nModel: {}\nParameter set: {}"
                .format(
                    i + 1,
                    model.estimator_name,
                    model.model_iter,
                ),
                title_scale=title_scale,
                position=122,
            )

            # add distribution plot to canvas
            p.dist_plot(
                residuals,
                fit=stats.norm,
                color=style.style_grey,
                y_units="ff",
                x_units="fff",
                ax=ax,
            )
            plt.show()

            # generate regression_stats using holdout observations and predictions
            results = self.regression_stats(
                model=model,
                y_true=y_valid_cv,
                y_pred=y_pred,
                feature_count=X_valid_cv.shape[1],
                data_type="validation",
                fold=i + 1,
            )

            # append results to regression_results_summary
            regression_results_summary = regression_results_summary.append(
                results, ignore_index=True)
        print("\n" + "*" * 55)
        print("Summary")

        display(regression_results_summary)
    else:
        display(regression_results_summary)
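
A condensed sketch of the cross-validation residual loop, with a hypothetical sklearn regressor standing in for model:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

X, y = make_regression(n_samples=200, n_features=5, noise=10.0, random_state=1)
model = LinearRegression()

# fit on each training split, predict the holdout, and inspect residuals
for i, (train_ix, valid_ix) in enumerate(KFold(n_splits=3, shuffle=True, random_state=1).split(X)):
    y_pred = model.fit(X[train_ix], y[train_ix]).predict(X[valid_ix])
    residuals = y_pred - y[valid_ix]
    print("fold {}: mean residual = {:.4f}".format(i + 1, residuals.mean()))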
Example #11
from prettierplot.plotter import PrettierPlot
from prettierplot import data
import numpy as np

df = data.attrition()

# capture unique EducationField values and frequency counts
unique_vals, unique_counts = np.unique(
    df[df["EducationField"].notnull()]["EducationField"], return_counts=True)

# create plotting instance
p = PrettierPlot(chart_scale=10)

# create Axes object and decorate
ax = p.make_canvas(title="Educational field category counts",
                   y_label="Category counts",
                   y_shift=0.47)

# add plots
p.bar_v(x=unique_vals, counts=unique_counts, label_rotate=45, x_tick_wrap=True,
        ax=ax)
Example #12
def eda_num_target_num_feat(self,
                            feature,
                            training_data=True,
                            color_map="viridis",
                            chart_scale=15,
                            save_plots=False):
    """
    Documentation:

        ---
        Description:
            Produces exploratory data visualizations and statistical summaries for a numeric
            feature in the context of a numeric target.

        ---
        Parameters:
            feature : str
                Feature to visualize.
            training_data : boolean, default=True
                Controls which dataset (training or validation) is used for visualization.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
            save_plots : boolean, default=False
                Controls whether plot images are saved to the experiment directory.
    """
    # dynamically choose training data objects or validation data objects
    data, target, mlm_dtypes = self.training_or_validation_dataset(
        training_data)

    ### data summaries
    ## feature summary
    # combine feature column and target
    bi_df = pd.concat([data[feature], target], axis=1)

    # remove any rows with nulls
    bi_df = bi_df[bi_df[feature].notnull()]

    # cast target as float
    bi_df[target.name] = bi_df[target.name].astype(float)

    # create summary statistic table
    describe_df = pd.DataFrame(bi_df[feature].describe()).reset_index()

    # add skew and kurtosis to describe_df
    describe_df = describe_df.append(
        {
            "index": "skew",
            feature: stats.skew(bi_df[feature].values, nan_policy="omit"),
        },
        ignore_index=True,
    )
    describe_df = describe_df.append(
        {
            "index": "kurtosis",
            feature: stats.kurtosis(bi_df[feature].values, nan_policy="omit"),
        },
        ignore_index=True,
    )
    describe_df = describe_df.rename(columns={"index": ""})

    # display summary tables
    display(describe_df)

    ### visualizations
    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(title=f"Feature distribution\n* {feature}",
                       position=131,
                       title_scale=1.2)

    # determine x- and y-units precision based on magnitude of max value
    feature_max = np.nanmax(bi_df[feature].values)
    if -1 <= feature_max <= 1:
        x_units = y_units = "fff"
    elif -10 <= feature_max <= 10:
        x_units = y_units = "ff"
    else:
        x_units = y_units = "f"

    # x rotation
    if -10000 < np.nanmax(bi_df[feature].values) < 10000:
        x_rotate = 0
    else:
        x_rotate = 45

    # add distribution plot to canvas
    p.dist_plot(
        bi_df[feature].values,
        color=style.style_grey,
        y_units=y_units,
        x_rotate=x_rotate,
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(title=f"Probability plot\n* {feature}", position=132)

    # add QQ / probability plot to canvas
    p.prob_plot(x=bi_df[feature].values, plot=ax)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title=f"Regression plot - feature vs. target\n* {feature}",
        position=133,
        title_scale=1.5)

    # add regression plot to canvas
    p.reg_plot(
        x=feature,
        y=target.name,
        data=bi_df,
        x_jitter=0.1,
        x_rotate=x_rotate,
        x_units=x_units,
        y_units=y_units,
        ax=ax,
    )

    # save plots or show
    if save_plots:
        plot_path = os.path.join(
            self.eda_object_dir,
            f"{feature}.jpg".replace("/", ""),
        )
        plt.tight_layout()
        plt.savefig(plot_path)
        plt.close()
    else:
        plt.show()
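
The describe/skew/kurtosis table in isolation; a minimal sketch with hypothetical data, using pd.concat in place of the deprecated DataFrame.append:

import numpy as np
import pandas as pd
from scipy import stats

feature = pd.Series(np.random.exponential(scale=3.0, size=500), name="spend")

# summary statistics plus skew and kurtosis, one row each
describe_df = feature.describe().to_frame().reset_index()
extras = pd.DataFrame({
    "index": ["skew", "kurtosis"],
    feature.name: [
        stats.skew(feature.values, nan_policy="omit"),
        stats.kurtosis(feature.values, nan_policy="omit"),
    ],
})
describe_df = pd.concat([describe_df, extras], ignore_index=True).rename(columns={"index": ""})
print(describe_df)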
예제 #13
def model_loss_plot(self, bayes_optim_summary, estimator_class, chart_scale=15, trim_outliers=True, outlier_control=1.5,
                    title_scale=0.7, color_map="viridis"):
    """
    Documentation:

        ---
        Description:
            Visualizes how the bayesian optimization loss changes across iterations.
            Extremely poor results are removed from the visualized dataset by two filters:
                1) Loss values worse than [loss mean + (2 x loss standard deviation)]
                2) Loss values worse than [median x outlier_control]. 'outlier_control' is a parameter
                   that can be set during function execution.

        ---
        Parameters:
            bayes_optim_summary : Pandas DataFrame
                Pandas DataFrame containing results from bayesian optimization process.
            estimator_class : str or sklearn api object
                Name of estimator to visualize.
            chart_scale : float, default=15
                Control chart proportions. Higher values scale up size of chart objects, lower
                values scale down size of chart objects.
            trim_outliers : boolean, default=True
                Remove extremely high (poor) results by trimming values where the loss is greater
                than 2 standard deviations away from the mean.
            outlier_control : float, default=1.5
                Controls enforcement of outlier trimming. Value is multiplied by median, and the resulting
                product is the cap placed on loss values. Values higher than this cap will be excluded.
                Lower values of outlier_control apply more extreme filtering to loss values.
            title_scale : float, default=0.7
                Controls the scaling up (higher value) and scaling down (lower value) of the size of
                the main chart title, the x_axis title and the y_axis title.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
    """
    # unpack bayes_optim_summary parameters for an estimator_class
    estimator_summary = self.unpack_bayes_optim_summary(
        bayes_optim_summary=bayes_optim_summary, estimator_class=estimator_class
    )

    # apply outlier trimming
    if trim_outliers:
        mean = estimator_summary["iter_loss"].mean()
        median = estimator_summary["iter_loss"].median()
        std = estimator_summary["iter_loss"].std()
        cap = mean + (2.0 * std)
        estimator_summary = estimator_summary[
            (estimator_summary["iter_loss"] < cap)
            & (estimator_summary["iter_loss"] < outlier_control * median)
        ]

    # create color list based on color_map
    color_list = style.color_gen(name=color_map, num=3)

    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Loss by iteration - {}".format(estimator_class),
        y_shift=0.8,
        position=111,
        title_scale=title_scale,
    )

    # add regression plot to canvas
    p.reg_plot(
        x="iteration",
        y="iter_loss",
        data=estimator_summary,
        y_units="ffff",
        line_color=color_list[0],
        dot_color=color_list[1],
        alpha=0.6,
        line_width=0.4,
        dot_size=10.0,
        ax=ax,
    )
    plt.show()
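
The two outlier filters applied above are easy to verify in isolation. A minimal sketch on a synthetic loss table; the column names mirror the "iteration" and "iter_loss" columns used by model_loss_plot, and the sample values are made up.

import numpy as np
import pandas as pd

# synthetic optimization losses with two extreme outliers
rng = np.random.default_rng(1)
summary = pd.DataFrame({"iteration": range(100), "iter_loss": rng.normal(0.30, 0.02, 100)})
summary.loc[[5, 40], "iter_loss"] = [5.0, 9.0]

# filter 1: cap at mean + 2 standard deviations; filter 2: cap at median * outlier_control
outlier_control = 1.5
mean = summary["iter_loss"].mean()
median = summary["iter_loss"].median()
std = summary["iter_loss"].std()
trimmed = summary[
    (summary["iter_loss"] < mean + 2.0 * std)
    & (summary["iter_loss"] < outlier_control * median)
]
print(f"kept {len(trimmed)} of {len(summary)} iterations")
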
Example #14
def model_param_plot(self, bayes_optim_summary, estimator_class, estimator_parameter_space, n_iter, chart_scale=15,
                    color_map="viridis", title_scale=1.2, show_single_str_params=False):
    """
    Documentation:

        ---
        Description:
            Visualizes hyperparameter optimization over all iterations. Compares the theoretical
            distribution to the distribution of values that were actually chosen, and visualizes
            how parameter value selections change over time.

        ---
        Parameters:
            bayes_optim_summary : Pandas DataFrame
                Pandas DataFrame containing results from bayesian optimization process.
            estimator_class : str or sklearn api object
                Name of estimator to visualize.
            estimator_parameter_space : dictionary of dictionaries
                Dictionary of nested dictionaries. Outer key is an estimator, and the corresponding value is
                a dictionary. Each nested dictionary contains 'parameter: value distribution' key/value
                pairs. The inner dictionary key specifies the parameter of the model to be tuned, and the
                value is a distribution of values from which trial values are drawn.
            n_iter : int
                Number of iterations to draw from the theoretical distribution in order to
                visualize it. A higher number leads to a more robust distribution but can take
                considerably longer to create.
            chart_scale : float, default=15
                Controls proportions of visualizations. Larger values scale the visuals up in
                size, smaller values scale them down.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            title_scale : float, default=1.2
                Controls the scaling up (higher value) and scaling down (lower value) of the size of
                the main chart title, the x_axis title and the y_axis title.
            show_single_str_params : boolean, default=False
                Controls whether to display visuals for string attributes where there is only one unique value,
                i.e. there was only one choice for the optimization procedure to choose from during each iteration.
    """
    # unpack bayes_optim_summary parameters for an estimator_class
    estimator_summary = self.unpack_bayes_optim_summary(
        bayes_optim_summary=bayes_optim_summary, estimator_class=estimator_class
    )

    # override None with string representation
    estimator_summary = estimator_summary.replace([None], "None")

    # subset estimator_parameter_space to space for the specified estimator_class
    estimator_space = estimator_parameter_space[estimator_class]

    print("*" * 100)
    print("* {}".format(estimator_class))
    print("*" * 100)

    # iterate through each parameter
    for param in estimator_space.keys():

        # sample from theoretical distribution n_iter times
        theoretical_dist = []
        for _ in range(n_iter):
            theoretical_dist.append(sample(estimator_space)[param])

        ## override None with string representation
        # theoretical distribution
        theoretical_dist = ["none" if v is None else v for v in theoretical_dist]
        theoretical_dist = np.array(theoretical_dist)

        # actual distribution
        actual_dist = estimator_summary[param].tolist()
        actual_dist = ["none" if v is None else v for v in actual_dist]
        actual_dist = np.array(actual_dist)

        # limit estimator_summary to "iteration" and current "param" columns
        actual_iter_df = estimator_summary[["iteration", param]]

        # identify how many values in param column are zero or one
        zeros_and_ones = (actual_iter_df[param].eq(True) | actual_iter_df[param].eq(False)).sum()

        # if param column only contains zeros and ones, store string representations of "TRUE" and "FALSE"
        if zeros_and_ones == actual_iter_df.shape[0]:
            actual_iter_df = actual_iter_df.replace({True: "TRUE", False: "FALSE"})

        # if theoretical distribution has dtype np.bool_, store string representations of "TRUE" and "FALSE"
        if isinstance(theoretical_dist[0], np.bool_):
            theoretical_dist = np.array(["TRUE" if i else "FALSE" for i in theoretical_dist.tolist()])

            estimator_summary = estimator_summary.replace([True], "TRUE")
            estimator_summary = estimator_summary.replace([False], "FALSE")

        # if theoretical distribution contains str data, then treat this as an object/category parameter
        if any(isinstance(d, str) for d in theoretical_dist):

            # generate color list for stripplot
            stripplot_color_list = style.color_gen(name=color_map, num=len(actual_iter_df[param].unique()) + 1)

            # generate color list for bar chart
            bar_color_list = style.color_gen(name=color_map, num=3)

            # identify unique values and associated count in theoretical distribution
            unique_vals_theo, unique_counts_theo = np.unique(theoretical_dist, return_counts=True)

            # proceed if the theoretical distribution has more than one unique value, or if
            # show_single_str_params is set to True
            if len(unique_vals_theo) > 1 or show_single_str_params:

                # identify unique values and associated count in actual distribution
                unique_vals_actual, unique_counts_actual = np.unique(actual_dist, return_counts=True)

                # store data in DataFrame
                df = pd.DataFrame({"param": unique_vals_actual, "Theoretical": unique_counts_theo, "Actual": unique_counts_actual})

                # create prettierplot object
                p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

                # add canvas to prettierplot object
                ax = p.make_canvas(
                    title="Selection vs. theoretical distribution\n* {0} - {1}".format(estimator_class, param),
                    y_shift=0.8,
                    position=121,
                    title_scale=title_scale,
                )

                # add faceted bar chart to canvas
                p.facet_cat(
                    df=df,
                    feature="param",
                    color_map=bar_color_list[:-1],
                    bbox=(1.0, 1.15),
                    alpha=1.0,
                    legend_labels=df.columns[1:].values,
                    x_units=None,
                    ax=ax,
                )

                # add canvas to prettierplot object
                ax = p.make_canvas(
                    title="Selection by iteration\n* {0} - {1}".format(estimator_class, param),
                    y_shift=0.5,
                    position=122,
                    title_scale=title_scale,
                )

                # add stripplot to canvas
                sns.stripplot(
                    x="iteration",
                    y=param,
                    data=estimator_summary,
                    jitter=0.3,
                    alpha=1.0,
                    size=0.7 * chart_scale,
                    palette=sns.color_palette(stripplot_color_list[:-1]),
                    ax=ax,
                ).set(xlabel=None, ylabel=None)

                # set tick label font size
                ax.tick_params(axis="both", colors=style.style_grey, labelsize=1.2 * chart_scale)

                plt.show()

        # otherwise treat it as a numeric parameter
        else:
            # cast "iteration" as an int and the param values as float
            convert_dict = {"iteration": int, param: float}
            actual_iter_df = actual_iter_df.astype(convert_dict)

            # create color map
            color_list = style.color_gen(name=color_map, num=3)

            # create prettierplot object
            p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

            # add canvas to prettierplot object
            ax = p.make_canvas(
                title="Selection vs. theoretical distribution\n* {0} - {1}".format(estimator_class, param),
                y_shift=0.8,
                position=121,
                title_scale=title_scale,
            )

            # dynamically set x-unit precision based on max value
            if -1.0 <= np.nanmax(theoretical_dist) <= 1.0:
                x_units = "fff"
            elif 1.0 < np.nanmax(theoretical_dist) <= 5.0:
                x_units = "ff"
            else:
                x_units = "f"

            # add kernel density plot for theoretical distribution to canvas
            p.kde_plot(
                theoretical_dist,
                color=color_list[0],
                y_units="ffff",
                x_units=x_units,
                line_width=0.4,
                bw=0.4,
                ax=ax,
            )

            # add kernel density plot for actual distribution to canvas
            p.kde_plot(
                actual_dist,
                color=color_list[1],
                y_units="ffff",
                x_units=x_units,
                line_width=0.4,
                bw=0.4,
                ax=ax,
            )

            ## create custom legend
            # create labels
            label_color = {}
            legend_labels = ["Theoretical", "Actual"]
            for ix, i in enumerate(legend_labels):
                label_color[i] = color_list[ix]

            # create legend Patches
            Patches = [Patch(color=v, label=k, alpha=1.0) for k, v in label_color.items()]

            # draw legend
            leg = plt.legend(
                handles=Patches,
                fontsize=1.1 * chart_scale,
                loc="upper right",
                markerscale=0.6 * chart_scale,
                ncol=1,
                bbox_to_anchor=(.95, 1.1),
            )

            # label font color
            for text in leg.get_texts():
                plt.setp(text, color="grey")

            # dynamically set y-unit precision based on max value
            if -1.0 <= np.nanmax(actual_iter_df[param]) <= 1.0:
                y_units = "fff"
            elif 1.0 < np.nanmax(actual_iter_df[param]) <= 5.0:
                y_units = "ff"
            else:
                y_units = "f"

            # add canvas to prettierplot object
            ax = p.make_canvas(
                title="Selection by iteration\n* {0} - {1}".format(estimator_class, param),
                y_shift=0.8,
                position=122,
                title_scale=title_scale,
            )

            # add regression plot to canvas
            p.reg_plot(
                x="iteration",
                y=param,
                data=actual_iter_df,
                y_units=y_units,
                x_units="f",
                line_color=color_list[0],
                line_width=0.4,
                dot_color=color_list[1],
                dot_size=10.0,
                alpha=0.6,
                ax=ax
            )
            plt.show()
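
The `sample` call above draws one trial from the parameter space. Assuming it is hyperopt's `hyperopt.pyll.stochastic.sample` (an assumption; the import is not shown in this excerpt), a minimal sketch of building a theoretical distribution for one hypothetical parameter:

import numpy as np
from hyperopt import hp
from hyperopt.pyll.stochastic import sample  # assumed source of the `sample` used above

# hypothetical parameter space for a single estimator
space = {
    "n_estimators": hp.quniform("n_estimators", 100, 1000, 50),
    "max_depth": hp.choice("max_depth", [3, 5, 7, None]),
}

# draw n_iter samples to approximate the theoretical distribution of one parameter
n_iter = 1000
theoretical_dist = np.array([sample(space)["n_estimators"] for _ in range(n_iter)])
print(theoretical_dist.min(), theoretical_dist.mean(), theoretical_dist.max())
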
Example #15
def eda_num_target_num_feat(self,
                            feature,
                            color_map="viridis",
                            chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Produces exploratory data visualizations and statistical summaries for a numeric
            feature in the context of a numeric target.

        ---
        Parameters:
            feature : str
                Feature to visualize.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
    """
    ### data summaries
    ## feature summary
    # combine feature column and target
    bi_df = pd.concat([self.data[feature], self.target], axis=1)

    # remove any rows with nulls
    bi_df = bi_df[bi_df[feature].notnull()]

    # cast target as float
    bi_df[self.target.name] = bi_df[self.target.name].astype(float)

    # create summary statistic table
    describe_df = pd.DataFrame(bi_df[feature].describe()).reset_index()

    # add skew and kurtosis to describe_df
    describe_df = pd.concat(
        [
            describe_df,
            pd.DataFrame(
                [
                    {"index": "skew", feature: stats.skew(bi_df[feature].values, nan_policy="omit")},
                    {"index": "kurtosis", feature: stats.kurtosis(bi_df[feature].values, nan_policy="omit")},
                ]
            ),
        ],
        ignore_index=True,
    )
    describe_df = describe_df.rename(columns={"index": ""})

    # display summary tables
    display(describe_df)

    ### visualizations
    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Feature distribution\n* {}".format(feature),
                       position=131,
                       title_scale=1.2)

    # determine x-units precision based on magnitude of max value
    if -1 <= np.nanmax(bi_df[feature].values) <= 1:
        x_units = "fff"
    elif -10 <= np.nanmax(bi_df[feature].values) <= 10:
        x_units = "ff"
    else:
        x_units = "f"

    # determine y-units precision based on magnitude of max value
    if -1 <= np.nanmax(bi_df[feature].values) <= 1:
        y_units = "fff"
    elif -10 <= np.nanmax(bi_df[feature].values) <= 10:
        y_units = "ff"
    else:
        y_units = "f"

    # x rotation
    if -10000 < np.nanmax(bi_df[feature].values) < 10000:
        x_rotate = 0
    else:
        x_rotate = 45

    # add distribution plot to canvas
    p.dist_plot(
        bi_df[feature].values,
        color=style.style_grey,
        y_units=y_units,
        x_rotate=x_rotate,
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Probability plot\n* {}".format(feature),
                       position=132)

    # add QQ / probability plot to canvas
    p.prob_plot(x=bi_df[feature].values, plot=ax)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Regression plot - feature vs. target\n* {}".format(feature),
        position=133,
        title_scale=1.5)

    # add regression plot to canvas
    p.reg_plot(
        x=feature,
        y=self.target.name,
        data=bi_df,
        x_jitter=0.1,
        x_rotate=x_rotate,
        x_units=x_units,
        y_units=y_units,
        ax=ax,
    )
    plt.show()
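
The x/y-units precision heuristic above repeats across several of these examples. A small helper capturing the same thresholds; the name `units_precision` is hypothetical, and the returned codes are the PrettierPlot-style format flags used throughout ("fff" = most decimal places, "f" = fewest).

import numpy as np

def units_precision(values):
    """Return a format code whose decimal precision shrinks as magnitude grows."""
    max_val = np.nanmax(values)
    if -1 <= max_val <= 1:
        return "fff"
    elif -10 <= max_val <= 10:
        return "ff"
    return "f"

print(units_precision([0.02, 0.4]))   # fff
print(units_precision([3.5, 9.1]))    # ff
print(units_precision([120, 4500]))   # f
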
Example #16
def eda_num_target_cat_feat(self,
                            feature,
                            level_count_cap=50,
                            color_map="viridis",
                            chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Produces exploratory data visualizations and statistical summaries for a category
            feature in the context of a numeric target.

        ---
        Parameters:
            feature : str
                Feature to visualize.
            level_count_cap : int, default=50
                Maximum number of unique levels in feature. If the number of levels exceeds the
                cap then the feature is skipped.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
    """
    # if number of unique levels in feature is less than specified level_count_cap
    if (len(np.unique(self.data[self.data[feature].notnull()][feature].values))
            < level_count_cap):

        ### data summaries
        ## feature summary
        # capture unique values and count of those unique values
        unique_vals, unique_counts = np.unique(
            self.data[self.data[feature].notnull()][feature],
            return_counts=True)

        # assemble each unique value, count and proportion in a DataFrame
        uni_summ_df = pd.DataFrame(
            {
                feature: unique_vals,
                "Count": unique_counts,
                "Proportion": unique_counts / np.sum(unique_counts) * 100,
            }
        )

        # sort DataFrame by "Proportion", descending
        uni_summ_df = uni_summ_df.sort_values(by=["Proportion"],
                                              ascending=False)

        # cast values to int dtype where applicable to optimize display
        if is_numeric_dtype(uni_summ_df[feature]):
            uni_summ_df[feature] = uni_summ_df[feature].astype("int64")
        uni_summ_df["Count"] = uni_summ_df["Count"].astype("int64")

        ## feature vs. target summary
        # combine feature column and target
        bi_df = pd.concat([self.data[feature], self.target], axis=1)

        # remove any rows with nulls
        bi_df = bi_df[bi_df[feature].notnull()]

        # cast target as float
        bi_df[self.target.name] = bi_df[self.target.name].astype(float)

        # create pivot table of target summary statistics, grouping by category feature
        bi_summ_piv_df = pd.pivot_table(
            bi_df,
            index=feature,
            aggfunc={
                self.target.name:
                [np.nanmin, np.nanmax, np.nanmean, np.nanmedian, np.nanstd]
            })
        multi_index = bi_summ_piv_df.columns
        single_index = pd.Index([i[1] for i in multi_index.tolist()])
        bi_summ_piv_df.columns = single_index
        bi_summ_piv_df.reset_index(inplace=True)
        bi_summ_piv_df = bi_summ_piv_df.rename(
            columns={
                "nanmin": "Min",
                "nanmax": "Max",
                "nanmean": "Mean",
                "nanmedian": "Median",
                "nanstd": "StdDev",
            })
        # fill nan's with zero
        fill_columns = bi_summ_piv_df.iloc[:, 1:].columns
        bi_summ_piv_df[fill_columns] = bi_summ_piv_df[fill_columns].fillna(0)

        # reorder columns
        bi_summ_piv_df = bi_summ_piv_df[[
            feature, "Mean", "Median", "StdDev", "Min", "Max"
        ]]

        # convert to int
        if is_numeric_dtype(bi_summ_piv_df[feature]):
            bi_summ_piv_df[feature] = bi_summ_piv_df[feature].astype("int64")

        # display summary tables
        self.df_side_by_side(
            dfs=(uni_summ_df, bi_summ_piv_df),
            names=["Feature summary", "Feature vs. target summary"],
        )

        ### visualizations
        # create prettierplot object
        p = PrettierPlot(chart_scale=chart_scale,
                         plot_orientation="wide_narrow")

        # add canvas to prettierplot object
        ax = p.make_canvas(title="Category counts\n* {}".format(feature),
                           position=131,
                           title_scale=1.0)

        # add treemap to canvas
        p.tree_map(
            counts=uni_summ_df["Count"].values,
            labels=uni_summ_df[feature].values,
            colors=style.color_gen(name=color_map,
                                   num=len(uni_summ_df[feature].values)),
            alpha=0.8,
            ax=ax,
        )

        # add canvas to prettierplot object
        ax = p.make_canvas(title="Feature distribution\n* {}".format(feature),
                           position=132)

        # error catching block for resorting labels
        try:
            sorted(unique_vals, key=int)
        except ValueError:
            pass
        else:
            # sort unique_vals/unique_counts for bar chart
            new_ix = [
                sorted(list(unique_vals), key=int).index(i)
                for i in list(unique_vals)
            ]
            unique_vals = np.array(sorted(list(unique_vals), key=int))
            unique_counts = np.array(
                [y for x, y in sorted(zip(new_ix, unique_counts))])

            # sort temporary data frame for box plot
            bi_df[feature] = bi_df[feature].astype(int)

        # dynamically set rotation angle based on the number of unique values and the
        # average length of the category labels
        len_unique_val = len(unique_vals)
        avg_len_unique_val = sum(map(len, map(str, unique_vals))) / len(unique_vals)
        if len_unique_val <= 4 and avg_len_unique_val <= 12:
            rotation = 0
        elif len_unique_val >= 5 and len_unique_val <= 8 and avg_len_unique_val <= 7.0:
            rotation = 0
        elif len_unique_val >= 9 and len_unique_val <= 14 and avg_len_unique_val <= 6:
            rotation = 0
        else:
            rotation = 30

        # represent x-axis tick labels as integers rather than floats
        x_values = list(map(str, unique_vals.tolist()))
        try:
            x_values = [int(float(x)) for x in x_values]
        except ValueError:
            pass

        # add bar chart to canvas
        p.bar_v(
            x=x_values,
            counts=unique_counts,
            label_rotate=rotation,
            color=style.style_grey,
            y_units="f",
            x_tick_wrap=True,
            ax=ax,
        )

        # hide every other label if total number of levels is greater than 40
        if len_unique_val > 40:
            n = 2
            [
                l.set_visible(False)
                for (i, l) in enumerate(ax.xaxis.get_ticklabels())
                if i % n != 0
            ]

        # add canvas to prettierplot object
        ax = p.make_canvas(title="Boxplot by category\n* {}".format(feature),
                           position=133)

        ## dynamically determine precision of y-units
        # capture min and max target values
        dist_min = bi_df[self.target.name].values.min()
        dist_max = bi_df[self.target.name].values.max()

        # determine y-units precision based on min and max values in target
        if -3 < dist_min < 3 and -3 < dist_max < 3 and dist_max / dist_min < 10:
            y_units = "fff"
        elif -30 < dist_min < 30 and -30 < dist_max < 30 and dist_max / dist_min < 3:
            y_units = "fff"
        elif -5 < dist_min < 5 and -5 < dist_max < 5 and dist_max / dist_min < 10:
            y_units = "ff"
        elif -90 < dist_min < 90 and -90 < dist_max < 90 and dist_max / dist_min < 5:
            y_units = "ff"
        else:
            y_units = "f"

        # add vertical box plot to canvas
        p.box_plot_v(
            x=feature,
            y=self.target.name,
            data=bi_df.sort_values([feature]),
            color=matplotlib.cm.get_cmap(name=color_map),
            label_rotate=rotation,
            y_units=y_units,
            ax=ax,
        )

        # hide every other label if total number of levels is greater than 40
        if len_unique_val > 40:
            n = 2
            [
                l.set_visible(False)
                for (i, l) in enumerate(ax.xaxis.get_ticklabels())
                if i % n != 0
            ]

        plt.show()
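
The feature-vs-target pivot built above can be reproduced standalone. A minimal sketch on synthetic data; the column names "segment" and "price" are hypothetical stand-ins for the categorical feature and numeric target.

import numpy as np
import pandas as pd

# synthetic categorical feature and numeric target
rng = np.random.default_rng(2)
bi_df = pd.DataFrame({
    "segment": rng.choice(["a", "b", "c"], size=300),
    "price": rng.normal(100, 20, size=300),
})

# per-category target summary, mirroring the nanmin/nanmax/nanmean/nanmedian/nanstd pivot
piv = pd.pivot_table(
    bi_df,
    index="segment",
    aggfunc={"price": [np.nanmin, np.nanmax, np.nanmean, np.nanmedian, np.nanstd]},
)
piv.columns = pd.Index([c[1] for c in piv.columns])
piv = piv.reset_index().rename(
    columns={"nanmin": "Min", "nanmax": "Max", "nanmean": "Mean", "nanmedian": "Median", "nanstd": "StdDev"}
)
print(piv[["segment", "Mean", "Median", "StdDev", "Min", "Max"]])
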
Example #17
def binary_classification_panel(self, model, X_train, y_train, X_valid=None, y_valid=None, labels=None,
                        n_folds=None, title_scale=1.0, color_map="viridis", random_state=1, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Generate a panel of reports and visualizations summarizing the
            performance of a classification model.

        ---
        Parameters:
            model : model object
                Instantiated model object.
            X_train : Pandas DataFrame
                Training data observations.
            y_train : Pandas Series
                Training target data.
            X_valid : Pandas DataFrame, default=None
                Validation data observations.
            y_valid : Pandas Series, default=None
                Validation target data.
            labels : list, default=None
                Custom labels for confusion matrix axes. If left as None, the labels
                default to 0, 1, 2...
            n_folds : int, default=None
                Number of cross-validation folds to use. If validation data is provided through
                X_valid/y_valid, n_folds is ignored.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            title_scale : float, default=1.0
                Controls the scaling up (higher value) and scaling down (lower value) of the size
                of the main chart title, the x_axis title and the y_axis title.
            random_state : int, default=1
                Random number seed.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
    """
    print("*" * 55)
    print("* Estimator: {}".format(model.estimator_name))
    print("* Parameter set: {}".format(model.model_iter))
    print("*" * 55)

    print("\n" + "*" * 55)
    print("Training data evaluation\n")

    ## training panel
    # fit model on training data and generate predictions using training data
    y_pred = model.fit(X_train, y_train).predict(X_train)

    # print and generate classification_report using training data
    print(
        classification_report(
            y_train,
            y_pred,
            target_names=labels if labels is not None else np.unique(y_train.values),
        )
    )

    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Confusion matrix - training data\nModel: {}\nParameter set: {}".format(
            model.estimator_name, model.model_iter
        ),
        y_shift=0.4,
        x_shift=0.25,
        position=121,
        title_scale=title_scale,
    )

    # add confusion plot to canvas
    plot_confusion_matrix(
        estimator=model,
        X=X_train,
        y_true=y_train,
        display_labels=labels if labels is not None else np.unique(y_train.values),
        cmap=color_map,
        values_format=".0f",
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="ROC curve - training data\nModel: {}\nParameter set: {}".format(
            model.estimator_name,
            model.model_iter,
        ),
        x_label="False positive rate",
        y_label="True positive rate",
        y_shift=0.35,
        position=122,
        title_scale=title_scale,
    )
    # add ROC curve to canvas
    p.roc_curve_plot(
        model=model,
        X_train=X_train,
        y_train=y_train,
        linecolor=style.style_grey,
        ax=ax,
    )
    plt.subplots_adjust(wspace=0.3)
    plt.show()

    # if validation data is provided
    if X_valid is not None:
        print("\n" + "*" * 55)
        print("Validation data evaluation\n")

        # fit model on training data and generate predictions using validation data
        y_pred = model.fit(X_train, y_train).predict(X_valid)

        # print and generate classification_report using training data
        print(
            classification_report(
                y_valid,
                y_pred,
                target_names=labels if labels is not None else np.unique(y_train.values),
            )
        )

        # create prettierplot object
        p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="Confusion matrix - validation data\nModel: {}\nParameter set: {}".format(
                model.estimator_name, model.model_iter
            ),
            y_shift=0.4,
            x_shift=0.25,
            position=121,
            title_scale=title_scale,
        )

        # add confusion matrix to canvas
        plot_confusion_matrix(
            estimator=model,
            X=X_valid,
            y_true=y_valid,
            display_labels=labels if labels is not None else np.unique(y_train.values),
            cmap=color_map,
            values_format=".0f",
            ax=ax,
        )

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="ROC curve - validation data\nModel: {}\nParameter set: {}".format(
                model.estimator_name,
                model.model_iter,
            ),
            x_label="False positive rate",
            y_label="True positive rate",
            y_shift=0.35,
            position=122,
            title_scale=title_scale,
        )
        # add ROC curve to canvas
        p.roc_curve_plot(
            model=model,
            X_train=X_train,
            y_train=y_train,
            X_valid=X_valid,
            y_valid=y_valid,
            linecolor=style.style_grey,
            ax=ax,
        )
        plt.subplots_adjust(wspace=0.3)
        plt.show()

    # if n_folds are provided, indicating cross-validation
    elif isinstance(n_folds, int):
        print("\n" + "*" * 55)
        print("Cross validation evaluation\n")

        # generate cross-validation indices
        cv = list(
            StratifiedKFold(
                n_splits=n_folds, shuffle=True, random_state=random_state
            ).split(X_train, y_train)
        )

        # generate colors
        color_list = style.color_gen(color_map, num=len(cv))

        # iterate through cross-validation indices
        for i, (train_ix, valid_ix) in enumerate(cv):
            print("\n" + "*" * 55)
            print("CV Fold {}\n".format(i + 1))

            X_train_cv = X_train.iloc[train_ix]
            y_train_cv = y_train.iloc[train_ix]
            X_valid_cv = X_train.iloc[valid_ix]
            y_valid_cv = y_train.iloc[valid_ix]

            # fit model on training data and generate predictions using holdout observations
            y_pred = model.fit(X_train_cv, y_train_cv).predict(X_valid_cv)

            # print and generate classification_report using holdout observations
            print(
                classification_report(
                    y_valid_cv,
                    y_pred,
                    target_names=labels if labels is not None else np.unique(y_train.values),
                )
            )

            # create prettierplot object
            p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

            # add canvas to prettierplot object
            ax = p.make_canvas(
                title="Confusion matrix - CV Fold {}\nModel: {}\nParameter set: {}".format(
                    i + 1, model.estimator_name, model.model_iter
                ),
                y_shift=0.4,
                x_shift=0.25,
                position=121,
                title_scale=title_scale,
            )

            # add confusion matrix to canvas
            plot_confusion_matrix(
                estimator=model,
                X=X_valid_cv,
                y_true=y_valid_cv,
                display_labels=labels if labels is not None else np.unique(y_train.values),
                cmap=color_map,
                values_format=".0f",
                ax=ax,
            )

            # add canvas to prettierplot object
            ax = p.make_canvas(
                title="ROC curve - CV Fold {}\nModel: {}\nParameter set: {}".format(
                    i + 1,
                    model.estimator_name,
                    model.model_iter,
                ),
                x_label="False positive rate",
                y_label="True positive rate",
                y_shift=0.35,
                position=122,
                title_scale=title_scale,
            )

            # add ROC curve to canvas
            p.roc_curve_plot(
                model=model,
                X_train=X_train_cv,
                y_train=y_train_cv,
                X_valid=X_valid_cv,
                y_valid=y_valid_cv,
                linecolor=style.style_grey,
                ax=ax,
            )
            plt.subplots_adjust(wspace=0.3)
            plt.show()
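
The cross-validation branch above follows a standard StratifiedKFold pattern. A minimal standalone sketch, substituting a scikit-learn toy dataset and LogisticRegression for the model object assumed by this method:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold

# synthetic data and a simple estimator standing in for `model`
X, y = make_classification(n_samples=500, random_state=1)
model = LogisticRegression(max_iter=1000)

# same fold construction as the cross-validation branch above
cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=1).split(X, y))

# fit on each fold's training indices, report on its holdout indices
for i, (train_ix, valid_ix) in enumerate(cv):
    y_pred = model.fit(X[train_ix], y[train_ix]).predict(X[valid_ix])
    print(f"CV Fold {i + 1}")
    print(classification_report(y[valid_ix], y_pred))
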