示例#1
0
def eda_num_target_num_feat(self,
                            feature,
                            color_map="viridis",
                            chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Produces exploratory data visualizations and statistical summaries for a numeric
            feature in the context of a numeric target. Displays a summary statistic table
            (describe() output extended with skew and kurtosis), then renders three panels:
            feature distribution, probability plot, and feature vs. target regression plot.

        ---
        Parameters:
            feature : str
                Feature to visualize. Must be a column in self.data.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots. Not referenced in the body of this method; kept
                for interface compatibility.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
    """
    ### data summaries
    ## feature summary
    # combine feature column and target
    bi_df = pd.concat([self.data[feature], self.target], axis=1)

    # remove any rows where the feature is null. copy so that the column assignment
    # below writes to an independent frame rather than a slice of the concat result
    bi_df = bi_df[bi_df[feature].notnull()].copy()

    # cast target as float
    bi_df[self.target.name] = bi_df[self.target.name].astype(float)

    # feature values and their maximum are reused by several steps below
    feature_values = bi_df[feature].values
    feature_max = np.nanmax(feature_values)

    # create summary statistic table
    describe_df = pd.DataFrame(bi_df[feature].describe()).reset_index()

    # add skew and kurtosis to describe_df. DataFrame.append was removed in pandas 2.0,
    # so the extra rows are built as a frame and concatenated instead
    shape_stats = pd.DataFrame(
        {
            "index": ["skew", "kurtosis"],
            feature: [
                float(stats.skew(feature_values, nan_policy="omit")),
                float(stats.kurtosis(feature_values, nan_policy="omit")),
            ],
        }
    )
    describe_df = pd.concat([describe_df, shape_stats], ignore_index=True)
    describe_df = describe_df.rename(columns={"index": ""})

    # display summary tables
    display(describe_df)

    ### visualizations
    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Feature distribution\n* {}".format(feature),
                       position=131,
                       title_scale=1.2)

    # determine x-units precision based on magnitude of max value
    if -1 <= feature_max <= 1:
        x_units = "fff"
    elif -10 <= feature_max <= 10:
        x_units = "ff"
    else:
        x_units = "f"

    # y-units precision follows the same rule applied to the same feature maximum
    # NOTE(review): the y axis of the regression plot is the target, yet precision is
    # derived from the feature - confirm whether the target maximum should be used
    y_units = x_units

    # rotate x tick labels only when values are wide enough to collide
    x_rotate = 0 if -10000 < feature_max < 10000 else 45

    # add distribution plot to canvas
    p.dist_plot(
        feature_values,
        color=style.style_grey,
        y_units=y_units,
        x_rotate=x_rotate,
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Probability plot\n* {}".format(feature),
                       position=132)

    # add QQ / probability plot to canvas
    p.prob_plot(x=feature_values, plot=ax)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Regression plot - feature vs. target\n* {}".format(feature),
        position=133,
        title_scale=1.5)

    # add regression plot to canvas
    p.reg_plot(
        x=feature,
        y=self.target.name,
        data=bi_df,
        x_jitter=0.1,
        x_rotate=x_rotate,
        x_units=x_units,
        y_units=y_units,
        ax=ax,
    )
    plt.show()
示例#2
0
def model_param_plot(self, bayes_optim_summary, estimator_class, estimator_parameter_space, n_iter, chart_scale=15,
                    color_map="viridis", title_scale=1.2, show_single_str_params=False):
    """
    Documentation:

        ---
        Definition:
            Visualize hyperparameter optimization over all iterations. Compares the theoretical
            distribution to the distribution of values that were actually chosen, and visualizes how
            parameter value selections change over time. String / boolean parameters are drawn as a
            faceted bar chart plus a strip plot; numeric parameters are drawn as overlaid kernel
            density plots plus a regression plot.

        ---
        Parameters:
            bayes_optim_summary : Pandas DataFrame
                Pandas DataFrame containing results from bayesian optimization process.
            estimator_class : str or sklearn api object
                Name of estimator to visualize.
            estimator_parameter_space : dictionary of dictionaries
                Dictionary of nested dictionaries. Outer key is an estimator, and the corresponding value is
                a dictionary. Each nested dictionary contains 'parameter: value distribution' key/value
                pairs. The inner dictionary key specifies the parameter of the model to be tuned, and the
                value is a distribution of values from which trial values are drawn.
            n_iter : int
                Number of draws taken from the theoretical distribution in order to visualize it.
                A higher number leads to a more robust distribution but takes longer to create.
            chart_scale : float, default=15
                Controls proportions of visualizations. larger values scale visual up in size, smaller values
                scale visual down in size.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            title_scale : float, default=1.2
                Controls the scaling up (higher value) and scaling down (lower value) of the size of
                the main chart title, the x_axis title and the y_axis title.
            show_single_str_params : boolean, default=False
                Controls whether to display visuals for string attributes where there is only one unique value,
                i.e. there was only one choice for the optimization procedure to choose from during each iteration.
    """
    def _precision_units(max_value):
        # pick the tick label precision code from the magnitude of the largest value.
        # always returns a value, including for maxima below -1.0 and for NaN
        magnitude = abs(max_value)
        if magnitude <= 1.0:
            return "fff"
        if magnitude <= 5.0:
            return "ff"
        return "f"

    # unpack bayes_optim_summary parameters for an estimator_class
    estimator_summary = self.unpack_bayes_optim_summary(
        bayes_optim_summary=bayes_optim_summary, estimator_class=estimator_class
    )

    # override None with string representation
    estimator_summary = estimator_summary.replace([None], "None")

    # subset estimator_parameter_space to space for the specified estimator_class
    estimator_space = estimator_parameter_space[estimator_class]

    print("*" * 100)
    print("* {}".format(estimator_class))
    print("*" * 100)

    # each draw covers every parameter in the space, so sample n_iter times once and
    # reuse the draws for all parameters instead of re-sampling per parameter
    theoretical_draws = [sample(estimator_space) for _ in range(n_iter)]

    # iterate through each parameter
    for param in estimator_space:

        ## override None with the same "None" string used in estimator_summary, so that
        ## theoretical and actual categories match
        # theoretical distribution
        theoretical_dist = np.array(
            ["None" if draw[param] is None else draw[param] for draw in theoretical_draws]
        )

        # actual distribution
        actual_dist = np.array(
            ["None" if v is None else v for v in estimator_summary[param].tolist()]
        )

        # limit estimator_summary to "iteration" and current "param" columns
        actual_iter_df = estimator_summary[["iteration", param]]

        # count values in param column that compare equal to True or False
        zeros_and_ones = (actual_iter_df[param].eq(True) | actual_iter_df[param].eq(False)).sum()

        # param column only contains such values, store string representations of "TRUE" and "FALSE"
        if zeros_and_ones == actual_iter_df.shape[0]:
            actual_iter_df = actual_iter_df.replace({True: "TRUE", False: "FALSE"})

        # if theoretical distribution has dtype -- np.bool_, store string representations of "TRUE" and "FALSE"
        if isinstance(theoretical_dist[0], np.bool_):
            theoretical_dist = np.array(["TRUE" if flag else "FALSE" for flag in theoretical_dist.tolist()])

            # apply the same labels to the actual selections so both distributions share categories.
            # values that are already strings are left untouched
            actual_dist = np.array(
                [("TRUE" if v else "FALSE") if isinstance(v, bool) else v for v in actual_dist.tolist()]
            )

            estimator_summary = estimator_summary.replace([True], "TRUE")
            estimator_summary = estimator_summary.replace([False], "FALSE")

        # if theoretical distribution contains str data, then treat this as an object/category parameter
        if any(isinstance(d, str) for d in theoretical_dist):

            # generate color list for stripplot
            stripplot_color_list = style.color_gen(name=color_map, num=len(actual_iter_df[param].unique()) + 1)

            # generate color list for bar chart
            bar_color_list = style.color_gen(name=color_map, num=3)

            # count occurrences of each unique value in theoretical distribution
            theo_counts = pd.Series(theoretical_dist).astype(str).value_counts()

            # plot when there is more than one unique theoretical value, or when single-value
            # parameters were explicitly requested
            if len(theo_counts) > 1 or show_single_str_params:

                # count occurrences of each unique value in actual distribution
                actual_counts = pd.Series(actual_dist).astype(str).value_counts()

                # align both sets of counts on the union of categories. a value that was
                # never selected (or never drawn) gets a count of zero rather than causing
                # a length mismatch or shifting counts onto the wrong category
                categories = sorted(set(theo_counts.index) | set(actual_counts.index))

                # store data in DataFrame
                df = pd.DataFrame(
                    {
                        "param": categories,
                        "Theorical": theo_counts.reindex(categories, fill_value=0).values,
                        "Actual": actual_counts.reindex(categories, fill_value=0).values,
                    }
                )

                # create prettierplot object
                p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

                # add canvas to prettierplot object
                ax = p.make_canvas(
                    title="Selection vs. theoretical distribution\n* {0} - {1}".format(estimator_class, param),
                    y_shift=0.8,
                    position=121,
                    title_scale=title_scale,
                )

                # add faceted bar chart to canvas
                p.facet_cat(
                    df=df,
                    feature="param",
                    color_map=bar_color_list[:-1],
                    bbox=(1.0, 1.15),
                    alpha=1.0,
                    legend_labels=df.columns[1:].values,
                    x_units=None,
                    ax=ax,
                )

                # add canvas to prettierplot object
                ax = p.make_canvas(
                    title="Selection by iteration\n* {0} - {1}".format(estimator_class, param),
                    y_shift=0.5,
                    position=122,
                    title_scale=title_scale,
                )

                # add strip plot to canvas
                sns.stripplot(
                    x="iteration",
                    y=param,
                    data=estimator_summary,
                    jitter=0.3,
                    alpha=1.0,
                    size=0.7 * chart_scale,
                    palette=sns.color_palette(stripplot_color_list[:-1]),
                    ax=ax,
                ).set(xlabel=None, ylabel=None)

                # set tick label font size
                ax.tick_params(axis="both", colors=style.style_grey, labelsize=1.2 * chart_scale)

                plt.show()

        # otherwise treat it as a numeric parameter
        else:
            # cast "iteration" as an int and the param values as float
            actual_iter_df = actual_iter_df.astype({"iteration": int, param: float})

            # create color map
            color_list = style.color_gen(name=color_map, num=3)

            # create prettierplot object
            p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

            # add canvas to prettierplot object
            ax = p.make_canvas(
                title="Selection vs. theoretical distribution\n* {0} - {1}".format(estimator_class, param),
                y_shift=0.8,
                position=121,
                title_scale=title_scale,
            )

            # dynamically set x-unit precision based on max value
            x_units = _precision_units(np.nanmax(theoretical_dist))

            # add kernel density plot for theoretical distribution to canvas
            p.kde_plot(
                theoretical_dist,
                color=color_list[0],
                y_units="ffff",
                x_units=x_units,
                line_width=0.4,
                bw=0.4,
                ax=ax,
            )

            # add kernel density plot for actual distribution to canvas
            p.kde_plot(
                actual_dist,
                color=color_list[1],
                y_units="ffff",
                x_units=x_units,
                line_width=0.4,
                bw=0.4,
                ax=ax,
            )

            ## create custom legend
            # pair each legend label with its color and build one patch per pair
            legend_labels = ["Theoretical", "Actual"]
            legend_patches = [
                Patch(color=color, label=label, alpha=1.0)
                for label, color in zip(legend_labels, color_list)
            ]

            # draw legend
            leg = plt.legend(
                handles=legend_patches,
                fontsize=1.1 * chart_scale,
                loc="upper right",
                markerscale=0.6 * chart_scale,
                ncol=1,
                bbox_to_anchor=(.95, 1.1),
            )

            # label font color
            for text in leg.get_texts():
                plt.setp(text, color="grey")

            # dynamically set y-unit precision based on max value
            y_units = _precision_units(np.nanmax(actual_iter_df[param]))

            # add canvas to prettierplot object
            ax = p.make_canvas(
                title="Selection by iteration\n* {0} - {1}".format(estimator_class, param),
                y_shift=0.8,
                position=122,
                title_scale=title_scale,
            )

            # add regression plot to canvas
            p.reg_plot(
                x="iteration",
                y=param,
                data=actual_iter_df,
                y_units=y_units,
                x_units="f",
                line_color=color_list[0],
                line_width=0.4,
                dot_color=color_list[1],
                dot_size=10.0,
                alpha=0.6,
                ax=ax
            )
            plt.show()
示例#3
0
def eda_num_target_num_feat(self,
                            feature,
                            training_data=True,
                            color_map="viridis",
                            chart_scale=15,
                            save_plots=False):
    """
    Documentation:

        ---
        Description:
            Produces exploratory data visualizations and statistical summaries for a numeric
            feature in the context of a numeric target. Displays a summary statistic table
            (describe() output extended with skew and kurtosis), then renders three panels:
            feature distribution, probability plot, and feature vs. target regression plot.
            The figure is either shown or written to the EDA directory as a .jpg file.

        ---
        Parameters:
            feature : str
                Feature to visualize.
            training_data : boolean, default=True
                Controls which dataset (training or validation) is used for visualization.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots. Not referenced in the body of this method; kept
                for interface compatibility.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
            save_plots : boolean, default=False
                Controls whether the figure is saved to self.eda_object_dir instead of being
                displayed.
    """
    # dynamically choose training data objects or validation data objects. the third
    # returned object is not needed here
    data, target, _ = self.training_or_validation_dataset(
        training_data)

    ### data summaries
    ## feature summary
    # combine feature column and target
    bi_df = pd.concat([data[feature], target], axis=1)

    # remove any rows where the feature is null. copy so that the column assignment
    # below writes to an independent frame rather than a slice of the concat result
    bi_df = bi_df[bi_df[feature].notnull()].copy()

    # cast target as float
    bi_df[target.name] = bi_df[target.name].astype(float)

    # feature values and their maximum are reused by several steps below
    feature_values = bi_df[feature].values
    feature_max = np.nanmax(feature_values)

    # create summary statistic table
    describe_df = pd.DataFrame(bi_df[feature].describe()).reset_index()

    # add skew and kurtosis to describe_df. DataFrame.append was removed in pandas 2.0,
    # so the extra rows are built as a frame and concatenated instead
    shape_stats = pd.DataFrame(
        {
            "index": ["skew", "kurtosis"],
            feature: [
                float(stats.skew(feature_values, nan_policy="omit")),
                float(stats.kurtosis(feature_values, nan_policy="omit")),
            ],
        }
    )
    describe_df = pd.concat([describe_df, shape_stats], ignore_index=True)
    describe_df = describe_df.rename(columns={"index": ""})

    # display summary tables
    display(describe_df)

    ### visualizations
    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(title=f"Feature distribution\n* {feature}",
                       position=131,
                       title_scale=1.2)

    # determine x-units precision based on magnitude of max value
    if -1 <= feature_max <= 1:
        x_units = "fff"
    elif -10 <= feature_max <= 10:
        x_units = "ff"
    else:
        x_units = "f"

    # y-units precision follows the same rule applied to the same feature maximum
    # NOTE(review): the y axis of the regression plot is the target, yet precision is
    # derived from the feature - confirm whether the target maximum should be used
    y_units = x_units

    # rotate x tick labels only when values are wide enough to collide
    x_rotate = 0 if -10000 < feature_max < 10000 else 45

    # add distribution plot to canvas
    p.dist_plot(
        feature_values,
        color=style.style_grey,
        y_units=y_units,
        x_rotate=x_rotate,
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(title=f"Probability plot\n* {feature}", position=132)

    # add QQ / probability plot to canvas
    p.prob_plot(x=feature_values, plot=ax)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title=f"Regression plot - feature vs. target\n* {feature}",
        position=133,
        title_scale=1.5)

    # add regression plot to canvas
    p.reg_plot(
        x=feature,
        y=target.name,
        data=bi_df,
        x_jitter=0.1,
        x_rotate=x_rotate,
        x_units=x_units,
        y_units=y_units,
        ax=ax,
    )

    # save plots or show
    if save_plots:
        # strip "/" from the file name so a feature name cannot introduce a subdirectory
        plot_path = os.path.join(
            self.eda_object_dir,
            f"{feature}.jpg".replace("/", ""),
        )
        plt.tight_layout()
        plt.savefig(plot_path)
        plt.close()
    else:
        plt.show()
示例#4
0
def model_loss_plot(self, bayes_optim_summary, estimator_class, chart_scale=15, trim_outliers=True, outlier_control=1.5,
                    title_scale=0.7, color_map="viridis"):
    """
    Documentation:

        ---
        Definition:
            Visualize how the bayesian optimization loss changes over time across all iterations.
            When trim_outliers is True, extremely poor results are removed from the visualized
            dataset by two filters.
                1) Loss values at or above [loss mean + (2 x loss standard deviation)]. Skipped
                   when fewer than two loss values exist, since the standard deviation is then
                   undefined.
                2) Loss values at or above a cap derived from the median and 'outlier_control'.
                   For a non-negative median the cap is [median * outlier_control]. For a
                   negative median the cap is [median + (outlier_control - 1) * |median|], so
                   that the cap sits on the same side of the median in both cases.

        ---
        Parameters:
            bayes_optim_summary : Pandas DataFrame
                Pandas DataFrame containing results from bayesian optimization process.
            estimator_class : str or sklearn api object
                Name of estimator to visualize.
            chart_scale : float, default=15
                Control chart proportions. Higher values scale up size of chart objects, lower
                values scale down size of chart objects.
            trim_outliers : boolean, default=True
                Remove extremely high (poor) results using the two filters described above.
            outlier_control : float: default=1.5
                Controls enforcement of outlier trimming through the median-based cap. Loss values
                at or above the cap are excluded. Lower values of outlier_control apply more
                extreme filtering to loss values.
            title_scale : float, default=0.7
                Controls the scaling up (higher value) and scaling down (lower value) of the size of
                the main chart title, the x_axis title and the y_axis title.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
    """
    # unpack bayes_optim_summary parameters for an estimator_class
    estimator_summary = self.unpack_bayes_optim_summary(
        bayes_optim_summary=bayes_optim_summary, estimator_class=estimator_class
    )

    # apply outlier trimming
    if trim_outliers:
        losses = estimator_summary["iter_loss"]
        median = losses.median()

        # median-based cap. multiplying a negative median by outlier_control would move the
        # cap below the median and discard the better half of the runs, so for negative
        # medians the cap is offset upward by the same relative amount instead
        if median >= 0:
            median_cap = outlier_control * median
        else:
            median_cap = median + (outlier_control - 1.0) * abs(median)

        keep_mask = losses < median_cap

        # standard deviation is NaN with fewer than two observations, which would make
        # every comparison False and drop all rows, so only apply this filter when defined
        if losses.count() > 1:
            std_cap = losses.mean() + (2.0 * losses.std())
            keep_mask = keep_mask & (losses < std_cap)

        estimator_summary = estimator_summary[keep_mask]

    # create color list based on color_map
    color_list = style.color_gen(name=color_map, num=3)

    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Loss by iteration - {}".format(estimator_class),
        y_shift=0.8,
        position=111,
        title_scale=title_scale,
    )

    # add regression plot to canvas
    p.reg_plot(
        x="iteration",
        y="iter_loss",
        data=estimator_summary,
        y_units="ffff",
        line_color=color_list[0],
        dot_color=color_list[1],
        alpha=0.6,
        line_width=0.4,
        dot_size=10.0,
        ax=ax,
    )
    plt.show()