예제 #1
0
def eda_cat_target_cat_feat(self,
                            feature,
                            level_count_cap=50,
                            color_map="viridis",
                            legend_labels=None,
                            chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Creates exploratory data visualizations and statistical summaries for a category feature
            in the context of a categorical target. Displays three summary tables (level counts,
            level-by-class counts and class proportions per level), adds a two-sample proportions
            z-test when the feature has exactly two levels, and draws a three-chart panel (tree map,
            faceted counts by target, stacked proportion bars).

        ---
        Parameters:
            feature : str
                Feature to visualize. Read from self.data; the target is read from self.target.
            level_count_cap : int, default=50
                Cap on the number of unique non-null levels in feature. Output is only produced
                when the number of levels is strictly below the cap.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            legend_labels : list, default=None
                Class labels displayed in plot legend. When provided, also replaces the raw class
                labels in the summary tables.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.

        ---
        Returns:
            None. Nothing is displayed when the feature has no non-null values or when its number
            of levels is not below level_count_cap.
    """
    # unique non-null levels and their counts; computed once and reused below
    non_null = self.data.loc[self.data[feature].notnull(), feature]
    unique_vals, unique_counts = np.unique(non_null, return_counts=True)

    # nothing to summarize for an all-null feature, and too many levels make the panel unreadable
    if len(unique_vals) == 0 or len(unique_vals) >= level_count_cap:
        return

    ### data summaries
    ## feature summary
    # one row per level with its count and its share of all non-null observations
    uni_summ_df = pd.DataFrame({
        feature: unique_vals,
        "Count": unique_counts,
        "Proportion": unique_counts / unique_counts.sum() * 100,
    })

    # sort DataFrame by "Proportion", descending
    uni_summ_df = uni_summ_df.sort_values(by=["Proportion"], ascending=False)

    # set values to int dtype where applicable to optimize
    uni_summ_df["Count"] = uni_summ_df["Count"].astype("int64")
    if is_numeric_dtype(uni_summ_df[feature]):
        uni_summ_df[feature] = uni_summ_df[feature].astype("int64")

    ## feature vs. target summary
    # combine feature column and target, then remove any rows with a null feature value
    bi_df = pd.concat([self.data[feature], self.target], axis=1)
    bi_df = bi_df[bi_df[feature].notnull()]

    # count the occurrences of each target class within each level of the feature
    bi_summ_df = (
        bi_df.groupby([feature, self.target.name])
        .size()
        .reset_index()
        .pivot(columns=self.target.name, index=feature, values=0)
    )

    # overwrite class columns with actual class labels if provided
    bi_summ_df.columns = pd.Index(
        legend_labels if legend_labels is not None else bi_summ_df.columns.tolist()
    )
    bi_summ_df.reset_index(inplace=True)

    # fill nan's with zero in every class-count column (column 0 holds the feature levels)
    fill_columns = bi_summ_df.columns[1:]
    bi_summ_df[fill_columns] = bi_summ_df[fill_columns].fillna(0)

    # set values to int dtype where applicable to optimize displayed DataFrame;
    # columns that cannot be converted (e.g. string levels) are left untouched
    for column in bi_summ_df.columns:
        try:
            bi_summ_df[column] = bi_summ_df[column].astype(int)
        except ValueError:
            pass

    ## proportion by category summary
    # count observations per (level, class) pair
    prop_df = bi_df.groupby([feature, self.target.name]).agg(
        {self.target.name: ["count"]}
    )

    # convert counts to percent of 100 within each feature level
    level_totals = prop_df.groupby(level=0).transform("sum")
    prop_df = 100 * prop_df / level_totals
    prop_df = prop_df.reset_index()

    # flatten the two-level column index; the last column holds the percentages
    flat_columns = [column[0] for column in prop_df.columns.tolist()]
    flat_columns[-1] = "Count"
    prop_df.columns = flat_columns
    prop_df = prop_df.reset_index(drop=True)

    # reshape so that rows are target classes and columns are feature levels
    prop_df = pd.pivot_table(prop_df,
                             values=["Count"],
                             columns=[feature],
                             index=[self.target.name],
                             aggfunc={"Count": "mean"})
    prop_df = prop_df.reset_index(drop=True)

    # flatten the column index again, using integer level names where possible
    level_names = []
    for column in prop_df.columns.tolist():
        try:
            level_names.append(int(column[1]))
        except ValueError:
            level_names.append(column[1])

    prop_df.columns = level_names
    prop_df = prop_df.reset_index(drop=True)

    # insert column with actual class labels if provided, otherwise use raw class labels in target
    prop_df.insert(loc=0,
                   column="Class",
                   value=legend_labels if legend_labels is not None else
                   np.unique(self.target))

    # fill nan's with zero
    prop_df = prop_df.fillna(0)

    # summary tables to display, in order
    summary_dfs = [uni_summ_df, bi_summ_df, prop_df]
    summary_names = [
        "Feature summary",
        "Feature vs. target summary",
        "Target proportion",
    ]

    # if the feature has exactly two levels, compare the share of positive
    # observations (target == 1) between the two levels with a z-test
    if len(unique_vals) == 2:
        in_level1 = bi_df[feature] == unique_vals[0]
        in_level2 = bi_df[feature] == unique_vals[1]
        is_positive = bi_df[self.target.name] == 1

        # total observations
        total_obs1 = int(in_level1.sum())
        total_obs2 = int(in_level2.sum())

        # total positive observations
        pos_obs1 = int((in_level1 & is_positive).sum())
        pos_obs2 = int((in_level2 & is_positive).sum())

        # perform z-test, return z-statistic and p-value
        z, p_val = proportions_ztest(count=(pos_obs1, pos_obs2),
                                     nobs=(total_obs1, total_obs2))

        # add z-statistic and p-value to DataFrame
        stat_test_df = pd.DataFrame(
            data=[{
                "z-test statistic": z,
                "p-value": p_val
            }],
            columns=["z-test statistic", "p-value"],
            index=[feature],
        ).round(4)

        summary_dfs.append(stat_test_df)
        summary_names.append("Statistical test")

    # display summary tables
    self.df_side_by_side(dfs=tuple(summary_dfs), names=summary_names)

    if "percent_positive" in bi_summ_df:
        bi_summ_df = bi_summ_df.drop(["percent_positive"], axis=1)

    ### visualizations
    # set label rotation angle based on the number of levels and the average
    # number of characters per level label
    len_unique_val = len(unique_vals)
    avg_len_unique_val = sum(len(str(val)) for val in unique_vals) / len_unique_val
    if ((len_unique_val <= 4 and avg_len_unique_val <= 12)
            or (5 <= len_unique_val <= 8 and avg_len_unique_val <= 8)
            or (9 <= len_unique_val <= 14 and avg_len_unique_val <= 4)):
        rotation = 0
    else:
        rotation = 90

    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale,
                     plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Category counts\n* {}".format(feature),
                       position=131,
                       title_scale=0.82)

    # add treemap to canvas
    p.tree_map(
        counts=uni_summ_df["Count"].values,
        labels=uni_summ_df[feature].values,
        colors=style.color_gen(name=color_map,
                               num=len(uni_summ_df[feature].values)),
        alpha=0.8,
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Category counts by target\n* {}".format(feature),
        position=132)

    # add faceted categorical plot to canvas
    p.facet_cat(
        df=bi_summ_df,
        feature=feature,
        label_rotate=rotation,
        color_map=color_map,
        bbox=(1.0, 1.15),
        alpha=0.8,
        legend_labels=legend_labels,
        x_units=None,
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Target proportion by category\n* {}".format(feature),
        position=133)

    # add stacked bar chart to canvas
    p.stacked_bar_h(
        df=prop_df.drop("Class", axis=1),
        bbox=(1.0, 1.15),
        legend_labels=legend_labels,
        color_map=color_map,
        alpha=0.8,
        ax=ax,
    )

    plt.show()
예제 #2
0
def pair_plot(self, df, columns=None, target=None, diag_kind="auto", legend_labels=None, drop_na=True,
                    bbox=(2.0, 1.0), alpha=0.7, color_map="viridis"):
    """
    Documentation:

        ---
        Description:
            Create pair plot that produces a grid of scatter plots for all unique pairs of
            number features and a series of KDE or histogram plots along the diagonal.

        ---
        Parameters:
            df : Pandas DataFrame
                Pandas DataFrame containing data of interest.
            columns : list, default=None
                List of strings describing columns in Pandas DataFrame to be visualized. If None,
                all columns are visualized.
            target : Pandas Series, default=None
                Introduce third dimension to scatter plots through a color hue that differentiates
                dots based on the category value. Merged onto df by index; the Series must be named.
            diag_kind : str, default='auto'
                Type of plot created along diagonal.
            legend_labels : list, default=None
                List containing strings of custom labels to display in legend. If None, the unique
                non-null values of target are used.
            drop_na : boolean, default=True
                Controls whether rows with null values are dropped.
            bbox : tuple of floats, default=(2.0, 1.0)
                Coordinates for determining legend position.
            alpha : float, default=0.7
                Controls transparency of objects. Accepts value between 0.0 and 1.0.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots and to the custom legend.
    """

    def _wrap_label(text):
        # replace underscores with spaces and wrap at 12 characters per line
        return "\n".join(textwrap.wrap(str(text).replace("_", " "), 12))

    # custom plot formatting settings for this particular chart.
    with plt.rc_context(
        {
            "axes.titlesize": 3.5 * self.chart_scale,
            "axes.labelsize": 1.5 * self.chart_scale,  # axis title font size
            "xtick.labelsize": 1.2 * self.chart_scale,
            "xtick.major.size": 0.5 * self.chart_scale,
            "xtick.major.width": 0.05 * self.chart_scale,
            "xtick.color": style.style_grey,
            "ytick.labelsize": 1.2 * self.chart_scale,
            "ytick.major.size": 0.5 * self.chart_scale,
            "ytick.major.width": 0.05 * self.chart_scale,
            "ytick.color": style.style_grey,
            "figure.facecolor": style.style_white,
            "axes.facecolor": style.style_white,
            "axes.spines.left": False,
            "axes.spines.bottom": False,
            "axes.edgecolor": style.style_grey,
            "axes.grid": False,
        }
    ):
        # optionally drop rows with nulls
        if drop_na:
            df = df.dropna()

        # optionally limit to a subset of columns
        if columns is not None:
            df = df[columns]

        # merge df with target if target is provided
        if target is not None:
            df = df.merge(target, left_index=True, right_index=True)

        has_target = target is not None

        # features to plot exclude the target column; compare names by value, not identity
        plot_vars = (
            [column for column in df.columns if column != target.name]
            if has_target
            else df.columns
        )

        # one color per target class when a hue is requested
        palette = (
            sns.color_palette(
                style.color_gen(color_map, num=len(np.unique(target)))
            )
            if has_target
            else None
        )

        # create pair plot
        g = sns.pairplot(
            data=df.dropna() if has_target else df,
            vars=plot_vars,
            hue=target.name if has_target else None,
            diag_kind=diag_kind,
            height=0.2 * self.chart_scale,
            plot_kws={
                "s": 2.0 * self.chart_scale,
                "edgecolor": None,
                "linewidth": 1,
                "alpha": alpha,
                "marker": "o",
                "facecolor": None if has_target else style.style_grey,
            },
            diag_kws={
                "facecolor": style.style_white if has_target else style.style_grey,
                "linewidth": 2,
            },
            palette=palette,
        )

        # plot formatting: wrap, rotate, pad and recolor the axis labels of every subplot
        for ax in g.axes.flat:
            ax.set_xlabel(_wrap_label(ax.get_xlabel()), rotation=40, ha="right")
            ax.set_ylabel(_wrap_label(ax.get_ylabel()), rotation=40, ha="right")
            ax.xaxis.labelpad = 20
            ax.yaxis.labelpad = 40
            ax.xaxis.label.set_color(style.style_grey)
            ax.yaxis.label.set_color(style.style_grey)

        # adjust subplot relative positioning
        plt.subplots_adjust(hspace=0.0, wspace=0.0)

        # add custom legend describing hue labels
        if has_target:
            # replace the legend seaborn created with a custom one
            if g._legend is not None:
                g._legend.remove()

            ## create custom legend
            # create labels
            if legend_labels is None:
                legend_labels = np.unique(df[df[target.name].notnull()][target.name])
            else:
                legend_labels = np.array(legend_labels)

            # generate colors from the same color map used for the plot itself
            color_list = style.color_gen(color_map, num=len(legend_labels))

            # create legend Patches, one per label
            patches = [
                Patch(color=patch_color, label=patch_label, alpha=alpha)
                for patch_label, patch_color in zip(legend_labels, color_list)
            ]

            # legend size scales with the number of subplots in the grid
            grid_scale = np.log1p(len(g.axes.flat))

            # draw legend
            leg = plt.legend(
                handles=patches,
                fontsize=0.6 * self.chart_scale * grid_scale,
                loc="upper right",
                markerscale=0.15 * self.chart_scale * grid_scale,
                ncol=1,
                bbox_to_anchor=bbox,
            )

            # label font color
            for text in leg.get_texts():
                plt.setp(text, color="grey")
예제 #3
0
def binary_classification_panel(self, model, X_train, y_train, X_valid=None, y_valid=None, labels=None,
                        n_folds=None, title_scale=1.0, color_map="viridis", random_state=1, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Generate a panel of reports and visualizations summarizing the
            performance of a classification model. A classification report, confusion
            matrix and ROC curve are always produced for the training data. The same
            panel is then produced either for the validation data (if provided) or
            for each cross-validation fold (if n_folds is provided).

        ---
        Parameters:
            model : model object
                Instantiated model object. Must expose estimator_name and model_iter
                attributes, which are used in printed headers and chart titles. The model
                is refit for every panel, so it is left fit on the data of the last panel.
            X_train : Pandas DataFrame
                Training data observations.
            y_train : Pandas Series
                Training target data.
            X_valid : Pandas DataFrame, default=None
                Validation data observations.
            y_valid : Pandas Series, default=None
                Validation target data.
            labels : list, default=None
                Custom labels for the classification report and confusion matrix axes.
                If left as none, the unique values in y_train are used.
            n_folds : int, default=None
                Number of cross-validation folds to use. If validation data is provided through
                X_valid/y_valid, n_folds is ignored.
            title_scale : float, default=1.0
                Controls the scaling up (higher value) and scaling down (lower value) of the size
                of the main chart title, the x_axis title and the y_axis title.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            random_state : int, default=1
                Random number seed used when shuffling the cross-validation folds.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
    """
    banner = "*" * 55

    # class labels shown on the confusion matrix axes
    display_labels = labels if labels is not None else np.unique(y_train.values)

    # classification_report needs string class names; raw integer classes are not accepted
    report_names = [str(name) for name in display_labels]

    def _render_panel(data_desc, X_fit, y_fit, X_eval, y_eval, holdout):
        # Fit on (X_fit, y_fit), then report and plot performance on (X_eval, y_eval).
        # holdout=True marks the evaluation data as unseen data for the ROC plot.
        y_pred = model.fit(X_fit, y_fit).predict(X_eval)

        # print and generate classification_report using the evaluation data
        print(
            classification_report(
                y_eval,
                y_pred,
                target_names=report_names,
            )
        )

        # create prettierplot object
        p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="Confusion matrix - {}\nModel: {}\nParameter set: {}".format(
                data_desc, model.estimator_name, model.model_iter
            ),
            y_shift=0.4,
            x_shift=0.25,
            position=121,
            title_scale=title_scale,
        )

        # add confusion matrix to canvas
        plot_confusion_matrix(
            estimator=model,
            X=X_eval,
            y_true=y_eval,
            display_labels=display_labels,
            cmap=color_map,
            values_format=".0f",
            ax=ax,
        )

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="ROC curve - {}\nModel: {}\nParameter set: {}".format(
                data_desc,
                model.estimator_name,
                model.model_iter,
            ),
            x_label="False positive rate",
            y_label="True positive rate",
            y_shift=0.35,
            position=122,
            title_scale=title_scale,
        )

        # the ROC plot only receives validation data when evaluating on unseen data
        roc_kwargs = {"X_valid": X_eval, "y_valid": y_eval} if holdout else {}

        # add ROC curve to canvas
        p.roc_curve_plot(
            model=model,
            X_train=X_fit,
            y_train=y_fit,
            linecolor=style.style_grey,
            ax=ax,
            **roc_kwargs
        )
        plt.subplots_adjust(wspace=0.3)
        plt.show()

    print(banner)
    print("* Estimator: {}".format(model.estimator_name))
    print("* Parameter set: {}".format(model.model_iter))
    print(banner)

    ## training panel
    print("\n" + banner)
    print("Training data evaluation\n")
    _render_panel("training data", X_train, y_train, X_train, y_train, holdout=False)

    # if validation data is provided
    if X_valid is not None:
        print("\n" + banner)
        print("Validation data evaluation\n")
        _render_panel("validation data", X_train, y_train, X_valid, y_valid, holdout=True)

    # if n_folds are provided, indicating cross-validation
    elif isinstance(n_folds, int):
        print("\n" + banner)
        print("Cross validation evaluation\n")

        # generate stratified cross-validation indices
        splitter = StratifiedKFold(
            n_splits=n_folds, shuffle=True, random_state=random_state
        )

        # iterate through cross-validation indices, one panel per fold
        for fold, (train_ix, valid_ix) in enumerate(splitter.split(X_train, y_train), start=1):
            print("\n" + banner)
            print("CV Fold {}\n".format(fold))

            _render_panel(
                "CV Fold {}".format(fold),
                X_train.iloc[train_ix],
                y_train.iloc[train_ix],
                X_train.iloc[valid_ix],
                y_train.iloc[valid_ix],
                holdout=True,
            )
예제 #4
0
def scatter_2d_hue(self, x, y, target, label, df=None, x_units="f", x_ticks=None, y_units="f", y_ticks=None,
                        plot_buffer=True, size=10, axis_limits=True, color=style.style_grey, facecolor="w",
                        bbox=(1.2, 0.9), color_map="viridis", alpha=0.8, x_rotate=None, ax=None):
    """
    Documentation:

        ---
        Description:
            Create 2-dimensional scatter plot with a third dimension represented as a color hue in the
            scatter dots. One group of dots is drawn per unique value in target.

        ---
        Parameters:
            x : array or string
                Either 1-dimensional array of values or a column name in a Pandas DataFrame.
            y : array or string
                Either 1-dimensional array of values or a column name in a Pandas DataFrame.
            target : array or string
                Either 1-dimensional array of values or a column name in a Pandas DataFrame.
            label : list
                Labels corresponding to color hue, matched by position to the sorted unique values
                of target. If None, the unique target values are used and no legend is drawn.
            df : Pandas DataFrame, default=None
                Pandas DataFrame containing data to plot. Can be any size - plotted columns will be
                chosen by columns names specified in x, y and target parameters.
            x_units : str, default='f'
                Determines unit of measurement for x-axis tick labels. 'f' displays float. 'p' displays
                percentages, 'd' displays dollars. Repeat character (e.g 'ff' or 'ddd') for additional
                decimal places.
            x_ticks : array, default=None
                Custom x-tick labels.
            y_units : str, default='f'
                Determines unit of measurement for y-axis tick labels. 'f' displays float. 'p' displays
                percentages, 'd' displays dollars. Repeat character (e.g 'ff' or 'ddd') for additional
                decimal places.
            y_ticks : array, default=None
                Custom y-tick labels.
            plot_buffer : bool, default=True
                Controls whether dynamic plot buffer function is executed.
            size : int or float, default=10
                Size of scattered dots.
            axis_limits : bool, default=True
                Controls whether dynamic axis limit setting function is executed.
            color : str (color code of some sort), default=style.style_grey
                Currently unused. Dot colors are generated from color_map, one per target value.
            facecolor : str (color code of some sort), default='w'
                Face color of scattered dots
            bbox : tuple of floats, default=(1.2, 0.9)
                Coordinates for determining legend position.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            alpha : float, default=0.8
                Controls transparency of objects. Accepts value between 0.0 and 1.0.
            x_rotate : int, default=None
                Rotates x-axis tick mark labels x degrees.
            ax : axes object, default=None
                Axis object for the visualization. Defaults to self.ax.
    """
    if ax is None:
        ax = self.ax

    # if a Pandas DataFrame is passed to function, look up x, y and target by column name
    if df is not None:
        x = df[x].values
        y = df[y].values
        target = df[target].values

    # keep the three dimensions as separate flat arrays so that a non-numeric
    # target cannot change the dtype of the plotted coordinates
    x_vals = np.asarray(x).ravel()
    y_vals = np.asarray(y).ravel()
    target_vals = np.asarray(target).ravel()

    # unique target values
    target_ids = np.unique(target_vals)

    # generate color list
    color_list = style.color_gen(name=color_map, num=len(target_ids))

    # fall back to the raw target values when no custom labels are given
    hue_names = label if label is not None else target_ids

    # loop through sets of target values, labels and colors to create 2_d scatter with hue
    for target_id, target_name, hue_color in zip(target_ids, hue_names, color_list):
        in_group = target_vals == target_id
        ax.scatter(
            x=x_vals[in_group],
            y=y_vals[in_group],
            color=hue_color,
            label=target_name,
            s=size * self.chart_scale,
            alpha=alpha,
            facecolor=facecolor,
            linewidth=0.234 * self.chart_scale,
        )

    # add legend to figure
    if label is not None:
        ax.legend(
            loc="upper right",
            bbox_to_anchor=bbox,
            ncol=1,
            frameon=True,
            fontsize=1.1 * self.chart_scale,
        )

    # optionally set axis lower / upper limits
    if axis_limits:
        x_min, x_max, y_min, y_max = util.util_set_axes(x=x_vals, y=y_vals)
        ax.axis([x_min, x_max, y_min, y_max])

    # optionally create smaller buffer around plot area to prevent cutting off elements
    if plot_buffer:
        util.util_plot_buffer(ax=ax, x=0.02, y=0.02)

    # optionally creates custom x-tick labels
    if x_ticks is not None:
        ax.set_xticks(x_ticks)

    # optionally creates custom y-tick labels
    if y_ticks is not None:
        ax.set_yticks(y_ticks)

    # style x and y ticklabels; unit formatting is delegated to the label
    # formatter utility called below (presumably including percentages - confirm)
    ax.set_yticklabels(
        ax.get_yticklabels(),
        rotation=0,
        fontsize=1.0 * self.chart_scale,
        color=style.style_grey,
    )

    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=0,
        fontsize=1.0 * self.chart_scale,
        color=style.style_grey,
    )

    # use label formatter utility function to customize chart labels
    util.util_label_formatter(ax=ax, x_units=x_units, y_units=y_units, x_rotate=x_rotate)
예제 #5
0
def dist_plot(self, x, color, x_units="f", y_units="f", fit=None, kde=False, x_rotate=None, alpha=0.8,
                    bbox=(1.2, 0.9), legend_labels=None, color_map="viridis", ax=None):
    """
    Documentation:

        ---
        Description:
            Creates distribution plot for numeric variable. Optionally overlays a kernel density
            estimation curve and/or a fitted distribution curve, and optionally draws a custom
            legend built from 'legend_labels'.

        ---
        Parameters:
            x : array
                Data for plotting.
            color : str (some sort of color code)
                Color of bars and KDE lines.
            x_units : str, default='f'
                Determines unit of measurement for x-axis tick labels. 'f' displays float. 'p' displays
                percentages, 'd' displays dollars. Repeat character (e.g 'ff' or 'ddd') for additional
                decimal places. Passed through to util.util_label_formatter.
            y_units : str, default='f'
                Determines unit of measurement for y-axis tick labels. 'f' displays float. 'p' displays
                percentages, 'd' displays dollars. Repeat character (e.g 'ff' or 'ddd') for additional
                decimal places. Passed through to util.util_label_formatter.
            fit : random variable object, default=None
                Allows for the addition of another curve. Utilizing 'norm' overlays a normal distribution
                over the distribution bar chart. Useful for seeing how well, or not, the distribution tracks
                with a specified distribution.
            kde : boolean, default=False
                Controls whether kernel density is plotted over distribution.
            x_rotate : int, default=None
                Rotates x-axis tick mark labels x degrees.
            alpha : float, default=0.8
                Controls transparency of objects. Accepts value between 0.0 and 1.0.
            bbox : tuple of floats, default=(1.2, 0.9)
                Coordinates for determining legend position.
            legend_labels : list, default=None
                Custom legend labels. When None, no legend is drawn.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map used to generate one legend color per entry in 'legend_labels'.
            ax : axes object, default=None
                Axis object for the visualization. Falls back to self.ax when None.
    """
    if ax is None:
        ax = self.ax

    # create distribution plot with an optional fit curve
    sns.distplot(
        a=x,
        kde=kde,
        color=color,
        axlabel=False,
        fit=fit,
        kde_kws={"lw": 0.2 * self.chart_scale},
        hist_kws={"alpha": alpha},
        ax=ax,
    )

    # tick label font size
    ax.tick_params(axis="both", colors=style.style_grey, labelsize=1.2 * self.chart_scale)

    # restyle the existing tick labels. The label lists are passed through unchanged:
    # multiplying a list of Text objects by 100 only repeats the list, it does not scale
    # values, so unit handling is left entirely to util.util_label_formatter below.
    ax.set_yticklabels(
        ax.get_yticklabels(),
        rotation=0,
        fontsize=1.1 * self.chart_scale,
        color=style.style_grey,
    )

    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=0,
        fontsize=1.1 * self.chart_scale,
        color=style.style_grey,
    )

    # use label formatter utility function to customize chart labels
    util.util_label_formatter(
        ax=ax, x_units=x_units, y_units=y_units, x_rotate=x_rotate
    )

    ## create custom legend (only when labels were supplied)
    if legend_labels is not None:
        legend_labels = np.array(legend_labels)

        # generate one color per label
        color_list = style.color_gen(color_map, num=len(legend_labels))

        # map each label to its color; duplicate labels collapse to a single legend entry
        label_color = {}
        for ix, label in enumerate(legend_labels):
            label_color[label] = color_list[ix]

        # create legend Patches
        patches = [Patch(color=v, label=k, alpha=alpha) for k, v in label_color.items()]

        # draw legend
        leg = plt.legend(
            handles=patches,
            fontsize=1.0 * self.chart_scale,
            loc="upper right",
            markerscale=0.5 * self.chart_scale,
            ncol=1,
            bbox_to_anchor=bbox,
        )

        # label font color
        for text in leg.get_texts():
            plt.setp(text, color="grey")
예제 #6
0
def stacked_bar_h(self, df, label_rotate=0, x_units="p", alpha=0.8, color_map="viridis", bbox=(1.2,0.9),
                    legend_labels=None, ax=None):
    """
    Documentation:

        ---
        Description:
            Create horizontal stacked bar plot.

        ---
        Parameters:
            df : Pandas DataFrame
                Data to plot. Each column becomes one horizontal bar (one category on the y-axis),
                and each row becomes one stacked segment. Rows are looked up by the integer index
                labels 0..n-1.
            label_rotate : float or int, default=0
                Number of degrees to rotate the x-tick labels.
                NOTE(review): accepted for API compatibility but not applied by this function.
            x_units : str, default='p'
                Determines unit of measurement for x-axis tick labels. 's' displays string. 'f'
                displays float. 'p' displays percentages, 'd' displays dollars. Repeat character
                (e.g 'ff' or 'ddd') for additional decimal places.
            alpha : float, default=0.8
                Controls transparency of bars. Accepts value between 0.0 and 1.0.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            bbox : tuple of floats, default=(1.2, 0.9)
                Coordinates for determining legend position.
            legend_labels : list, default=None
                Custom legend labels. When None, the integers 0..n-1 are used.
            ax : axes object, default=None
                Axis object for the visualization. Falls back to self.ax when None.
    """
    if ax is None:
        ax = self.ax

    # define class label count and bar color list (one color per row of df)
    y = np.arange(len(df.index))
    color_list = style.color_gen(color_map, num=len(y))

    # define category positions (one bar per column of df)
    category_levels = np.arange(len(df.columns))

    # plot stacked bars
    for class_label, color in zip(np.arange(len(y)), color_list):
        # first category starts at zero
        if class_label == 0:
            plt.barh(
                y=category_levels,
                width=df.loc[class_label],
                color=color,
                alpha=alpha,
            )
        # stack all additional categories on the sum of the previous categories
        else:
            plt.barh(
                y=category_levels,
                width=df.loc[class_label],
                left=df.drop([ix for ix in df.index if ix >= class_label]).sum(axis=0),
                color=color,
                alpha=alpha,
            )

    # restyle x-axis tick labels. The label list is passed through unchanged: multiplying a
    # list of Text objects by 100 only repeats the list, so percentage handling is left to
    # util.util_label_formatter below.
    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=0,
        color=style.style_grey,
    )

    ## create custom legend
    if legend_labels is None:
        legend_labels = np.arange(len(color_list))
    else:
        legend_labels = np.array(legend_labels)

    # define colors
    label_color = {}
    for ix, label in enumerate(legend_labels):
        label_color[label] = color_list[ix]

    # create legend Patches
    patches = [Patch(color=v, label=k, alpha=alpha) for k, v in label_color.items()]

    # draw legend
    leg = plt.legend(
        handles=patches,
        fontsize=0.95 * self.chart_scale,
        loc="upper right",
        markerscale=0.3 * self.chart_scale,
        ncol=1,
        bbox_to_anchor=bbox,
    )

    # label font color
    for text in leg.get_texts():
        plt.setp(text, color="grey")

    # use label formatter utility function to customize chart labels
    util.util_label_formatter(ax=ax, x_units=x_units)

    # overwrite y-axis labels with category labels, shown as integers when every column
    # name converts cleanly. The builtin int is used because the np.int alias no longer
    # exists in current NumPy releases.
    try:
        columns = df.columns.map(int)
    except (TypeError, ValueError):
        columns = df.columns

    # dynamically size y-labels
    if 7 < len(category_levels) <= 10:
        ax.tick_params(axis="y", colors=style.style_grey, labelsize=0.9 * self.chart_scale)
    elif 10 < len(category_levels) <= 20:
        ax.tick_params(axis="y", colors=style.style_grey, labelsize=0.75 * self.chart_scale)
    elif len(category_levels) > 20:
        ax.tick_params(axis="y", colors=style.style_grey, labelsize=0.6 * self.chart_scale)

    ax.tick_params(axis="x", colors=style.style_grey, labelsize=1.2 * self.chart_scale)

    # wrap long y-tick labels
    plt.yticks(
        category_levels,
        [
            "\n".join(textwrap.wrap(str(i).replace("_", " "), 12))
            for i in columns
        ],
    )
예제 #7
0
def facet_two_cat_point(self,
                        df,
                        x,
                        y,
                        split,
                        cat_col=None,
                        cat_row=None,
                        bbox=None,
                        aspect=1,
                        alpha=0.8,
                        height=4,
                        legend_labels=None,
                        color_map="viridis"):
    """
    Documentation:

        ---
        Description:
            Draws seaborn pointplots of 'y' against the categorical variable 'x', colored by
            'split', on a FacetGrid that can be faceted by up to two further categorical
            variables (one along the columns, one along the rows). A custom legend for the
            'split' levels is added afterwards.

        ---
        Parameters:
            df : Pandas DataFrame
                Pandas DataFrame containing data for plotting.
            x : str
                Categorical variable to plot along x-axis.
            y : str
                Variable plotted along y-axis.
            split : str
                Categorical variable used as the hue within each facet.
            cat_col : str, default=None
                Categorical variable faceted along the column axis.
            cat_row : str, default=None
                Categorical variable faceted along the row axis.
            bbox : tuple of floats, default=None
                Coordinates for determining legend position.
            aspect : float, default=1
                Higher values create wider plot, lower values create narrow plot, while
                keeping height constant.
            alpha : float, default=0.8
                Controls transparency of objects. Accepts value between 0.0 and 1.0.
            height : float, default=4
                Height in inches of each facet.
            legend_labels : list, default=None
                Custom legend labels. When None, the unique non-null values of 'split' are used.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
    """
    # build the grid of facets
    grid = sns.FacetGrid(df,
                         row=cat_row,
                         col=cat_col,
                         aspect=aspect,
                         height=height,
                         margin_titles=True)

    # sorted, de-duplicated levels for the x-axis and for the hue variable
    x_levels = df[x].sort_values().drop_duplicates().values.tolist()
    split_levels = df[split].sort_values().drop_duplicates().values.tolist()

    # one palette color per unique value of the hue variable
    split_palette = sns.color_palette(
        style.color_gen(color_map, num=len(np.unique(df[split].values))))

    # draw a pointplot in every facet
    grid.map(
        sns.pointplot,
        x,
        y,
        split,
        order=x_levels,
        hue_order=split_levels,
        palette=split_palette,
        alpha=alpha,
        ci=None,
    )

    # shared text styling for axis labels / titles and for tick labels
    label_style = {"fontsize": 1.05 * self.chart_scale, "color": style.style_grey}
    tick_style = {"fontsize": 0.8 * self.chart_scale, "color": style.style_grey}

    # restyle labels, titles and tick labels of every facet
    for ax in grid.axes.flat:
        _ = ax.set_ylabel(ax.get_ylabel(), rotation=90, **label_style)
        _ = ax.set_xlabel(ax.get_xlabel(), rotation=0, **label_style)
        _ = ax.set_title(ax.get_title(), rotation=0, **label_style)

        # resize y tick labels, when there are any
        if len(ax.get_yticklabels()) > 0:
            _ = ax.set_yticklabels(ax.get_yticklabels(), rotation=0, **tick_style)

        # resize x tick labels, when there are any
        if len(ax.get_xticklabels()) > 0:
            _ = ax.set_xticklabels(ax.get_xticklabels(), rotation=0, **tick_style)

        # facets without extra text objects need no further work
        if not ax.texts:
            continue

        # the first text object holds the right-hand margin (row) title; redraw it with
        # the chart styling at the same position, then drop the original
        margin_title = ax.texts[0]
        ax.text(
            margin_title.get_unitless_position()[0],
            margin_title.get_unitless_position()[1],
            margin_title.get_text(),
            transform=ax.transAxes,
            va="center",
            fontsize=1.05 * self.chart_scale,
            color=style.style_grey,
            rotation=-90,
        )
        ax.texts[0].remove()

    ## create custom legend
    # default the labels to the non-null levels of the hue variable
    legend_labels = (np.unique(df[df[split].notnull()][split])
                     if legend_labels is None else np.array(legend_labels))

    # pair every label with a generated color
    color_list = style.color_gen(color_map, num=len(legend_labels))
    label_color = {label: color_list[ix] for ix, label in enumerate(legend_labels)}

    # one legend Patch per label
    patches = []
    for label, patch_color in label_color.items():
        patches.append(Patch(color=patch_color, label=label, alpha=alpha))

    # draw legend
    leg = plt.legend(
        handles=patches,
        fontsize=1.0 * self.chart_scale,
        loc="upper right",
        markerscale=0.5 * self.chart_scale,
        ncol=1,
        bbox_to_anchor=bbox,
    )

    # label font color
    for legend_text in leg.get_texts():
        plt.setp(legend_text, color="grey")
예제 #8
0
def facet_cat(self,
              df,
              feature,
              label_rotate=0,
              x_units="s",
              y_units="f",
              bbox=(1.2, 0.9),
              alpha=0.8,
              legend_labels=None,
              color_map="viridis",
              ax=None):
    """
    Documentation:

        ---
        Description:
            Creates a grouped bar plot. The first column of 'df' supplies the category levels
            shown along the x-axis, and every remaining column is drawn as its own series of
            bars next to the others.

        ---
        Parameters:
            df : Pandas DataFrame
                Pandas DataFrame containing data for plotting. First column holds the category
                levels; each additional column holds one series of bar heights.
            feature : str
                Name of column that contains the category values to be used for faceting.
                NOTE(review): not read by the implementation - the first column of 'df' is
                always used for the category levels.
            label_rotate : float or int, default=0
                Number of degrees to rotate the x-tick labels.
            x_units : str, default='s'
                Determines unit of measurement for x-axis tick labels. 's' displays string. 'f' displays
                float. 'p' displays percentages, 'd' displays dollars. Repeat character (e.g 'ff' or 'ddd')
                for additional decimal places. The passed value is overridden: 'f' is used when the
                first column of 'df' has float dtype, otherwise 's'.
            y_units : str, default='f'
                Determines unit of measurement for y-axis tick labels. 's' displays string. 'f' displays
                float. 'p' displays percentages, 'd' displays dollars. Repeat character (e.g 'ff' or 'ddd')
                for additional decimal places.
            bbox : tuple of floats, default=(1.2, 0.9)
                Coordinates for determining legend position.
            alpha : float, default=0.8
                Controls transparency of objects. Accepts value between 0.0 and 1.0.
            legend_labels : list, default=None
                Custom legend labels. When None, the integers 0..n-1 are used.
            color_map : str specifying built-in matplotlib colormap, or sequence of colors, default="viridis"
                Color map applied to plots. A string is expanded into one color per bar series;
                any other sequence is used directly as the list of colors.
            ax : axes object, default=None
                Axis object for the visualization. Falls back to self.ax when None.
    """
    if ax is None:
        ax = self.ax

    # number of category levels (rows) and their integer positions on the x-axis
    level_count = df.shape[0]
    ixs = np.arange(level_count)
    bar_width = 0.35

    # one list of bar heights per column after the first. A dedicated loop name is used so
    # the 'feature' argument is not overwritten.
    feature_dict = {}
    for column in df.columns[1:]:
        feature_dict[column] = df[column].values.tolist()

    # generate color list; any non-string sequence (list, tuple, array) is used as given
    if isinstance(color_map, str):
        color_list = style.color_gen(name=color_map,
                                     num=len(feature_dict.keys()))
    else:
        color_list = list(color_map)

    # draw each series, offset by one bar width per series
    for feature_ix, (k, v) in enumerate(feature_dict.items()):
        plt.bar(
            ixs + (bar_width * feature_ix),
            v,
            bar_width,
            alpha=alpha,
            color=color_list[feature_ix],
            label=str(k),
        )

    # wrap long x-tick labels
    plt.xticks(
        ixs + bar_width / 2,
        [
            "\n".join(textwrap.wrap(str(i).replace("_", " "), 12))
            for i in df.iloc[:, 0].values
        ],
    )
    plt.xticks(rotation=label_rotate)

    ## create custom legend
    # create labels
    if legend_labels is None:
        legend_labels = np.arange(len(color_list))
    else:
        legend_labels = np.array(legend_labels)

    # define colors
    label_color = {}
    for ix, label in enumerate(legend_labels):
        label_color[label] = color_list[ix]

    # create legend Patches
    patches = [
        Patch(color=v, label=k, alpha=alpha) for k, v in label_color.items()
    ]

    # draw legend
    leg = plt.legend(
        handles=patches,
        fontsize=0.95 * self.chart_scale,
        loc="upper right",
        markerscale=0.3 * self.chart_scale,
        ncol=1,
        bbox_to_anchor=bbox,
    )

    # label font color
    for text in leg.get_texts():
        plt.setp(text, color="grey")

    ### general formatting
    # if data is float dtype, then format as a number. Comparing against the builtin float
    # matches what the removed np.float alias compared against.
    if df.iloc[:, 0].values.dtype == float:
        x_units = "f"
    # otherwise represent data as a string
    else:
        x_units = "s"

    # use label formatter utility function to customize chart labels
    util.util_label_formatter(ax=ax, x_units=x_units, y_units=y_units)

    # tick label font size
    ax.tick_params(axis="both",
                   colors=style.style_grey,
                   labelsize=1.2 * self.chart_scale)

    # dynamically set x-axis label size from the number of category levels; every series
    # has exactly one value per row of df, so the row count is used directly
    if 7 < level_count <= 10:
        ax.tick_params(axis="x",
                       colors=style.style_grey,
                       labelsize=0.9 * self.chart_scale)
    elif 10 < level_count <= 20:
        ax.tick_params(axis="x",
                       colors=style.style_grey,
                       labelsize=0.75 * self.chart_scale)
    elif level_count > 20:
        ax.tick_params(axis="x",
                       colors=style.style_grey,
                       labelsize=0.6 * self.chart_scale)
예제 #9
0
def facet_two_cat_bar(self,
                      df,
                      x,
                      y,
                      split,
                      x_units=None,
                      y_units=None,
                      bbox=None,
                      alpha=0.8,
                      legend_labels=None,
                      filter_nan=True,
                      color_map="viridis",
                      ax=None):
    """
    Documentation:

        ---
        Description:
            Creates a bar plot of 'y' for each level of the categorical variable 'x', with the
            bars within each level separated by a second categorical variable, 'split'.

        ---
        Parameters:
            df : Pandas DataFrame
                Pandas DataFrame containing data for plotting.
            x : str
                Categorical variable to plot along x-axis.
            y : str
                Variable plotted along y-axis.
            split : str
                Categorical variable used as the hue within each level of 'x'. When None, bars
                are not split and no legend is drawn.
            x_units : str, default=None
                Determines unit of measurement for x-axis tick labels. 's' displays string. 'f' displays
                float. 'p' displays percentages, 'd' displays dollars. Repeat character (e.g 'ff' or 'ddd')
                for additional decimal places.
            y_units : str, default=None
                Determines unit of measurement for y-axis tick labels. 's' displays string. 'f' displays
                float. 'p' displays percentages, 'd' displays dollars. Repeat character (e.g 'ff' or 'ddd')
                for additional decimal places.
            bbox : tuple of floats, default=None
                Coordinates for determining legend position.
            alpha : float, default=0.8
                Controls transparency of the legend patches. Accepts value between 0.0 and 1.0.
            legend_labels : list, default=None
                Custom legend labels. When None, the sorted unique non-null values of 'split'
                are used.
            filter_nan : bool, default=True
                Remove records that have a null value in the column specified by the 'x' parameter.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to both the bars and the legend.
            ax : axes object, default=None
                Axis object for the visualization. Falls back to self.ax when None.
    """
    if ax is None:
        ax = self.ax

    # remove nans from x columns
    if filter_nan:
        df = df.dropna(subset=[x])

    # hue palette and ordering only exist when a split variable is given. The palette is
    # generated from color_map so that bar colors agree with the legend drawn below.
    if split is not None:
        palette = sns.color_palette(
            style.color_gen(color_map, num=len(np.unique(df[split].values))))
        hue_order = df[split].sort_values().drop_duplicates().values.tolist()
    else:
        palette = None
        hue_order = None

    # create bar plot
    g = sns.barplot(
        x=x,
        y=y,
        hue=split,
        data=df,
        palette=palette,
        order=df[x].sort_values().drop_duplicates().values.tolist(),
        hue_order=hue_order,
        ax=ax,
        ci=None,
    )

    # format x-tick labels
    g.set_xticklabels(
        g.get_xticklabels(),
        rotation=0,
        fontsize=1.05 * self.chart_scale,
        color=style.style_grey,
    )
    # format y-tick labels. The label list is passed through unchanged: multiplying a list
    # of Text objects by 100 only repeats the list, and testing "p" in y_units fails when
    # y_units is None; unit handling is left to util.util_label_formatter below.
    g.set_yticklabels(
        g.get_yticklabels(),
        rotation=0,
        fontsize=1.05 * self.chart_scale,
        color=style.style_grey,
    )
    # format x-axis label
    g.set_xlabel(
        g.get_xlabel(),
        rotation=0,
        fontsize=1.35 * self.chart_scale,
        color=style.style_grey,
    )
    # format y-axis label
    g.set_ylabel(
        g.get_ylabel(),
        rotation=90,
        fontsize=1.35 * self.chart_scale,
        color=style.style_grey,
    )
    # format title
    g.set_title(
        g.get_title(),
        rotation=0,
        fontsize=1.5 * self.chart_scale,
        color=style.style_grey,
    )

    ## create custom legend
    # create labels
    if split is not None:
        if legend_labels is None:
            legend_labels = (df[df[split].notnull()][split].sort_values().
                             drop_duplicates().values.tolist())
        else:
            legend_labels = np.array(legend_labels)

        # generate colors
        color_list = style.color_gen(color_map, num=len(legend_labels))

        label_color = {}
        for ix, label in enumerate(legend_labels):
            label_color[label] = color_list[ix]

        # create legend Patches
        patches = [
            Patch(color=v, label=k, alpha=alpha)
            for k, v in label_color.items()
        ]

        # draw legend
        leg = plt.legend(
            handles=patches,
            fontsize=1.25 * self.chart_scale,
            loc="upper right",
            markerscale=0.5 * self.chart_scale,
            ncol=1,
            bbox_to_anchor=bbox,
        )

        # label font color
        for text in leg.get_texts():
            plt.setp(text, color="grey")

    # use label formatter utility function to customize chart labels, with or without a split
    util.util_label_formatter(ax=ax, x_units=x_units, y_units=y_units)
예제 #10
0
def model_param_plot(self, bayes_optim_summary, estimator_class, estimator_parameter_space, n_iter, chart_scale=15,
                    color_map="viridis", title_scale=1.2, show_single_str_params=False):
    """
    Documentation:

        ---
        Description:
            Visualize hyperparameter optimization over all iterations. Compares theoretical distribution to
            the distribution of values that were actually chosen, and visualizes how parameter value
            selections change over time. One pair of charts is shown per parameter.

        ---
        Parameters:
            bayes_optim_summary : Pandas DataFrame
                Pandas DataFrame containing results from bayesian optimization process.
            estimator_class : str or sklearn api object
                Name of estimator to visualize.
            estimator_parameter_space : dictionary of dictionaries
                Dictionary of nested dictionaries. Outer key is an estimator, and the corresponding value is
                a dictionary. Each nested dictionary contains 'parameter: value distribution' key/value
                pairs. The inner dictionary key specifies the parameter of the model to be tuned, and the
                value is a distribution of values from which trial values are drawn.
            n_iter : int
                Number of iterations to draw from theoretical distribution in order to visualize the
                theoretical distribution. Higher number leads to more robust distribution but can take
                considerably longer to create.
            chart_scale : float, default=15
                Controls proportions of visualizations. Larger values scale visual up in size, smaller values
                scale visual down in size.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            title_scale : float, default=1.2
                Controls the scaling up (higher value) and scaling down (lower value) of the size of
                the main chart title, the x-axis title and the y-axis title.
            show_single_str_params : boolean, default=False
                Controls whether to display visuals for string attributes where there is only one unique value,
                i.e. there was only one choice for the optimization procedure to choose from during each iteration.
    """
    # unpack bayes_optim_summary parameters for an estimator_class
    estimator_summary = self.unpack_bayes_optim_summary(
        bayes_optim_summary=bayes_optim_summary, estimator_class=estimator_class
    )

    # override None with string representation
    estimator_summary = estimator_summary.replace([None], "None")

    # subset estimator_parameter_space to space for the specified estimator_class
    estimator_space = estimator_parameter_space[estimator_class]

    print("*" * 100)
    print("* {}".format(estimator_class))
    print("*" * 100)

    # iterate through each parameter
    for param in estimator_space.keys():

        # sample from theoretical distribution for n_iters
        theoretical_dist = [sample(estimator_space)[param] for _ in range(n_iter)]

        ## override None with string representation
        # theoretical distribution
        theoretical_dist = ["none" if v is None else v for v in theoretical_dist]
        theoretical_dist = np.array(theoretical_dist)

        # actual distribution
        actual_dist = estimator_summary[param].tolist()
        actual_dist = ["none" if v is None else v for v in actual_dist]
        actual_dist = np.array(actual_dist)

        # limit estimator_summary to "iteration" and current "param" columns
        actual_iter_df = estimator_summary[["iteration", param]]

        # identify how many values in param column are zero or one
        zeros_and_ones = (actual_iter_df[param].eq(True) | actual_iter_df[param].eq(False)).sum()

        # param column only contains zeros and ones, store string representations of "TRUE" and "FALSE"
        if zeros_and_ones == actual_iter_df.shape[0]:
            actual_iter_df = actual_iter_df.replace({True: "TRUE", False: "FALSE"})

        # if theoretical distribution has dtype -- np.bool_, store string representations of "TRUE" and "FALSE"
        if isinstance(theoretical_dist[0], np.bool_):
            theoretical_dist = np.array(["TRUE" if i == True else "FALSE" for i in theoretical_dist.tolist()])

            # NOTE(review): this rewrites every column of the shared summary, so it also
            # affects the parameters visited in later iterations of this loop - confirm intended.
            estimator_summary = estimator_summary.replace([True], "TRUE")
            estimator_summary = estimator_summary.replace([False], "FALSE")

        # if theoretical distribution contains str data, then treat this as an object/category parameter
        if any(isinstance(d, str) for d in theoretical_dist):

            # generate color list for stripplot
            stripplot_color_list = style.color_gen(name=color_map, num=len(actual_iter_df[param].unique()) + 1)

            # generate color list for bar chart
            bar_color_list = style.color_gen(name=color_map, num=3)

            # identify unique values and associated count in theoretical distribution
            unique_vals_theo, unique_counts_theo = np.unique(theoretical_dist, return_counts=True)

            # plot when the theoretical distribution has more than one unique value, or when the
            # caller explicitly asked to see single-value string parameters
            if len(unique_vals_theo) > 1 or show_single_str_params:

                # identify unique values and associated count in actual distribution
                unique_vals_actual, unique_counts_actual = np.unique(actual_dist, return_counts=True)

                # store data in DataFrame, tolerating levels that were never selected
                df = _level_count_frame(
                    unique_vals_theo, unique_counts_theo, unique_vals_actual, unique_counts_actual
                )

                # create prettierplot object
                p = PrettierPlot(chart_scale=chart_scale, plot_orientation = "wide_narrow")

                # add canvas to prettierplot object
                ax = p.make_canvas(
                    title="Selection vs. theoretical distribution\n* {0} - {1}".format(estimator_class, param),
                    y_shift=0.8,
                    position=121,
                    title_scale=title_scale,
                )

                # add faceted bar chart to canvas
                p.facet_cat(
                    df=df,
                    feature="param",
                    color_map=bar_color_list[:-1],
                    bbox=(1.0, 1.15),
                    alpha=1.0,
                    legend_labels=df.columns[1:].values,
                    x_units=None,
                    ax=ax,
                )

                # add canvas to prettierplot object
                ax = p.make_canvas(
                    title="Selection by iteration\n* {0} - {1}".format(estimator_class, param),
                    y_shift=0.5,
                    position=122,
                    title_scale=title_scale,
                )

                # add stripplot to canvas
                sns.stripplot(
                    x="iteration",
                    y=param,
                    data=estimator_summary,
                    jitter=0.3,
                    alpha=1.0,
                    size=0.7 * chart_scale,
                    palette=sns.color_palette(stripplot_color_list[:-1]),
                    ax=ax,
                ).set(xlabel=None, ylabel=None)

                # set tick label font size
                ax.tick_params(axis="both", colors=style.style_grey, labelsize=1.2 * chart_scale)

                plt.show()

        # otherwise treat it as a numeric parameter
        else:
            # cast "iteration" as an int and the param values as float
            convert_dict = {"iteration": int, param: float}
            actual_iter_df = actual_iter_df.astype(convert_dict)

            # create color map
            color_list = style.color_gen(name=color_map, num=3)

            # create prettierplot object
            p = PrettierPlot(chart_scale=chart_scale, plot_orientation = "wide_narrow")

            # add canvas to prettierplot object
            ax = p.make_canvas(
                title="Selection vs. theoretical distribution\n* {0} - {1}".format(estimator_class, param),
                y_shift=0.8,
                position=121,
                title_scale=title_scale,
            )

            # dynamically set x-unit precision based on max value
            x_units = _units_for_max_value(np.nanmax(theoretical_dist))

            # add kernel density plot for theoretical distribution to canvas
            p.kde_plot(
                theoretical_dist,
                color=color_list[0],
                y_units="ffff",
                x_units=x_units,
                line_width=0.4,
                bw=0.4,
                ax=ax,
            )

            # add kernel density plot for actual distribution to canvas
            p.kde_plot(
                actual_dist,
                color=color_list[1],
                y_units="ffff",
                x_units=x_units,
                line_width=0.4,
                bw=0.4,
                ax=ax,
            )

            ## create custom legend
            # create labels
            label_color = {}
            legend_labels = ["Theoretical", "Actual"]
            for ix, i in enumerate(legend_labels):
                label_color[i] = color_list[ix]

            # create legend Patches
            patches = [Patch(color=v, label=k, alpha=1.0) for k, v in label_color.items()]

            # draw legend
            leg = plt.legend(
                handles=patches,
                fontsize=1.1 * chart_scale,
                loc="upper right",
                markerscale=0.6 * chart_scale,
                ncol=1,
                bbox_to_anchor=(.95, 1.1),
            )

            # label font color
            for text in leg.get_texts():
                plt.setp(text, color="grey")

            # dynamically set y-unit precision based on max value
            y_units = _units_for_max_value(np.nanmax(actual_iter_df[param]))

            # add canvas to prettierplot object
            ax = p.make_canvas(
                title="Selection by iteration\n* {0} - {1}".format(estimator_class, param),
                y_shift=0.8,
                position=122,
                title_scale=title_scale,
            )

            # add regression plot to canvas
            p.reg_plot(
                x="iteration",
                y=param,
                data=actual_iter_df,
                y_units=y_units,
                x_units="f",
                line_color=color_list[0],
                line_width=0.4,
                dot_color=color_list[1],
                dot_size=10.0,
                alpha=0.6,
                ax=ax
            )
            plt.show()


def _units_for_max_value(max_value):
    """
    Documentation:

        ---
        Description:
            Choose the tick label unit code for an axis from its largest value. More 'f'
            characters request more decimal places.

        ---
        Parameters:
            max_value : float
                Largest value displayed on the axis.

        ---
        Returns:
            units : str
                'fff' when -1.0 <= max_value <= 1.0, 'ff' when 1.0 < max_value <= 5.0, and 'f'
                for every other value (above 5.0, below -1.0, or NaN), so a unit code is
                always defined.
    """
    if -1.0 <= max_value <= 1.0:
        return "fff"
    if 1.0 < max_value <= 5.0:
        return "ff"
    return "f"


def _level_count_frame(theo_levels, theo_counts, actual_levels, actual_counts):
    """
    Documentation:

        ---
        Description:
            Build the three-column DataFrame ('param', 'Theoretical', 'Actual') that feeds the
            selection vs. theoretical bar chart.

            When both distributions have the same number of unique levels, the counts are paired
            by sorted position and labelled with the actual levels. When the numbers differ - for
            example because a level was never selected - the levels are matched by their
            case-insensitive string form and missing counts are filled with zero.

        ---
        Parameters:
            theo_levels : array
                Sorted unique levels of the theoretical distribution.
            theo_counts : array
                Count of each level in 'theo_levels'.
            actual_levels : array
                Sorted unique levels of the actual distribution.
            actual_counts : array
                Count of each level in 'actual_levels'.

        ---
        Returns:
            level_counts : Pandas DataFrame
                One row per level with columns 'param', 'Theoretical' and 'Actual'.
    """
    # equal number of levels: pair counts position by position
    if len(theo_levels) == len(actual_levels):
        return pd.DataFrame(
            {"param": actual_levels, "Theoretical": theo_counts, "Actual": actual_counts}
        )

    display_label = {}
    theo_lookup = {}
    actual_lookup = {}

    # accumulate counts per case-insensitive key; the first spelling seen is displayed
    for lookup, levels, counts in (
        (theo_lookup, theo_levels, theo_counts),
        (actual_lookup, actual_levels, actual_counts),
    ):
        for level, count in zip(np.asarray(levels).tolist(), np.asarray(counts).tolist()):
            key = str(level).casefold()
            display_label.setdefault(key, str(level))
            lookup[key] = lookup.get(key, 0) + count

    keys = sorted(display_label)
    return pd.DataFrame(
        {
            "param": [display_label[key] for key in keys],
            "Theoretical": [theo_lookup.get(key, 0) for key in keys],
            "Actual": [actual_lookup.get(key, 0) for key in keys],
        }
    )
예제 #11
0
def decision_region(self,
                    x,
                    y,
                    estimator,
                    test_idx=None,
                    resolution=0.1,
                    bbox=(1.2, 0.9),
                    color_map="viridis",
                    ax=None):
    """
    Documentation:

        ---
        Description:
            Create 2-dimensional chart with shading used to highlight decision regions.
            The estimator is asked to predict every point of a regular grid that spans
            the range of both features (padded by 1 on each side), the predictions are
            drawn as filled contours, and the observations are scattered on top, one
            color per unique label.

        ---
        Parameters:
            x : array
                m x 2 array containing 2 features. Column 0 is drawn on the x-axis and
                column 1 on the y-axis.
            y : array
                m x 1 array containing labels for observations.
            estimator : sklearn model
                Estimator used to create decision regions. Only its predict method is called.
            test_idx : tuple, list or array, default=None
                Optional row indices of observations to be highlighted as test examples.
                None or an empty collection disables the highlighting.
            resolution : float, default=0.1
                Step size of the grid passed into np.meshgrid. Smaller steps give a finer
                grid but take longer to generate because predictions have to be generated
                for each point on the grid.
            bbox : tuple of floats, default=(1.2, 0.9)
                Coordinates for determining legend position.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            ax : axes object, default=None
                Axis object for the visualization. If None, the current pyplot axes is used.
    """
    # draw on the supplied axes, falling back to whatever pyplot considers current
    if ax is None:
        ax = plt.gca()

    # unique class labels drive both the number of colors and the scatter groups
    classes = np.unique(y)

    # generate color list and matching color map for the shaded regions
    color_list = style.color_gen(name=color_map, num=len(classes))
    cmap = ListedColormap(color_list)

    # pad the feature ranges by 1 so the outermost observations are not on the border
    x1_min, x1_max = x[:, 0].min() - 1, x[:, 0].max() + 1
    x2_min, x2_max = x[:, 1].min() - 1, x[:, 1].max() + 1

    # generate meshgrid indices
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))

    # generate predictions for all points on grid and restore the grid shape
    z = estimator.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    z = z.reshape(xx1.shape)

    # shade the decision regions and clamp the axes to the grid
    ax.contourf(xx1, xx2, z, alpha=0.3, cmap=cmap)
    ax.set_xlim(xx1.min(), xx1.max())
    ax.set_ylim(xx2.min(), xx2.max())

    # plot samples, one scatter call per class so each class gets a legend entry
    for idx, cl in enumerate(classes):
        members = y == cl
        ax.scatter(
            x=x[members, 0],
            y=x[members, 1],
            alpha=1.0,
            c=color_list[idx],
            marker=style.style_markers[1],
            label=cl,
            s=12.5 * self.chart_scale,
        )

    # highlight test samples; an explicit None/size check is used because the truth
    # value of a multi-element NumPy index array is ambiguous and would raise
    if test_idx is not None and np.size(test_idx) > 0:
        x_test = x[test_idx, :]
        ax.scatter(
            x_test[:, 0],
            x_test[:, 1],
            facecolor="none",
            edgecolor="white",
            alpha=1.0,
            linewidth=1.4,
            marker="o",
            s=12.75 * self.chart_scale,
            label="test set",
        )

    # add legend to figure
    ax.legend(
        loc="upper right",
        bbox_to_anchor=bbox,
        ncol=1,
        frameon=True,
        fontsize=1.1 * self.chart_scale,
    )

    plt.tight_layout()
예제 #12
0
def model_loss_plot(self, bayes_optim_summary, estimator_class, chart_scale=15, trim_outliers=True, outlier_control=1.5,
                    title_scale=0.7, color_map="viridis"):
    """
    Documentation:

        ---
        Description:
            Plot the bayesian optimization loss of a single estimator against the iteration
            number, as a regression plot on a single canvas. When trimming is enabled, only
            rows that pass both of the following filters are plotted:
                1) iter_loss is below [loss mean + (2 x loss standard deviation)]
                2) iter_loss is below [loss median x outlier_control]

        ---
        Parameters:
            bayes_optim_summary : Pandas DataFrame
                Pandas DataFrame containing results from bayesian optimization process.
            estimator_class : str or sklearn api object
                Name of estimator to visualize. Also used in the chart title.
            chart_scale : float, default=15
                Control chart proportions. Higher values scale up size of chart objects, lower
                values scale down size of chart objects.
            trim_outliers : boolean, default=True
                If True, apply the two loss filters described above before plotting.
            outlier_control : float, default=1.5
                Multiplier applied to the median loss to form the second cap. Only used when
                trim_outliers is True. Lower values filter loss values more aggressively.
            title_scale : float, default=0.7
                Controls the scaling up (higher value) and scaling down (lower value) of the size of
                the main chart title, the x_axis title and the y_axis title.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
    """
    # pull the per-iteration results that belong to the requested estimator
    summary = self.unpack_bayes_optim_summary(
        bayes_optim_summary=bayes_optim_summary, estimator_class=estimator_class
    )

    # optionally drop rows whose loss exceeds either cap
    if trim_outliers:
        losses = summary["iter_loss"]
        std_cap = losses.mean() + (2.0 * losses.std())
        median_cap = outlier_control * losses.median()
        summary = summary[(losses < std_cap) & (losses < median_cap)]

    # three colors are generated; the first two color the line and the dots
    palette = style.color_gen(name=color_map, num=3)

    # single-panel canvas on a fresh prettierplot object
    p = PrettierPlot(chart_scale=chart_scale)
    canvas = p.make_canvas(
        title="Loss by iteration - {}".format(estimator_class),
        y_shift=0.8,
        position=111,
        title_scale=title_scale,
    )

    # loss versus iteration, drawn as a regression plot
    reg_plot_options = {
        "x": "iteration",
        "y": "iter_loss",
        "data": summary,
        "y_units": "ffff",
        "line_color": palette[0],
        "dot_color": palette[1],
        "alpha": 0.6,
        "line_width": 0.4,
        "dot_size": 10.0,
        "ax": canvas,
    }
    p.reg_plot(**reg_plot_options)
    plt.show()
예제 #13
0
def box_plot_h(self, x, y, data, color=style.style_grey, x_units="f", bbox=(1.05, 1), color_map="viridis",
                        suppress_outliers=False, alpha=0.8, legend_labels=None, ax=None):
    """
    Documentation:

        ---
        Description:
            Create horizontal box plots. Useful for evaluating a categorical variable on the
            y-axis vs. a numeric variable on the x-axis. One box is drawn per level of the
            categorical variable, and a custom legend maps each level to its color.

        ---
        Parameters:
            x : str
                Name of numeric variable, drawn along the x-axis.
            y : str
                Name of categorical variable. Its unique values determine the number of
                boxes, the number of colors and the default legend labels.
            data : Pandas DataFrame
                Pandas DataFrame including both x and y data.
            color : str (some sort of color code), default=style.style_grey
                Kept for interface compatibility. Not referenced by the function body; box
                colors are generated from color_map.
            x_units : str, default='f'
                Determines unit of measurement for x-axis tick labels. 's' displays string. 'f' displays
                float. 'p' displays percentages, 'd' displays dollars. Repeat character (e.g 'ff' or 'ddd')
                for additional decimal places.
            bbox : tuple of floats, default=(1.05, 1.0)
                Coordinates for determining legend position.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            suppress_outliers : boolean, default=False
                If True, outlier points (fliers) are removed from the box/whisker plots.
            alpha : float, default=0.8
                Controls transparency of boxes and legend patches. Accepts value between 0.0 and 1.0.
            legend_labels : list, default=None
                Custom legend labels. If None, the unique values of the y column are used.
            ax : axes object, default=None
                Axis object for the visualization. If None, self.ax is used.
    """
    if ax is None:
        ax = self.ax

    # unique levels of the categorical variable
    levels = np.unique(data[y].values)

    # create horizontal box plot; showfliers is the negation of suppress_outliers so
    # that suppress_outliers=True actually hides the outlier points
    sns.boxplot(
        x=x,
        y=y,
        hue=y,
        data=data,
        orient="h",
        palette=sns.color_palette(style.color_gen(color_map, num=len(levels))),
        showfliers=not suppress_outliers,
        ax=ax,
    ).set(xlabel=None, ylabel=None)

    # fade box plot figures by reducing alpha
    plt.setp(ax.artists, alpha=alpha)

    # the legend identifies each box, so the y-axis itself is hidden
    ax.yaxis.set_visible(False)

    # tick label font size
    ax.tick_params(axis="both", colors=style.style_grey, labelsize=1.2 * self.chart_scale)

    # use label formatter utility function to customize chart labels
    util.util_label_formatter(ax=ax, x_units=x_units)

    ## custom legend
    # use legend labels if provided, otherwise use unique values in y column
    if legend_labels is None:
        legend_labels = levels
    else:
        legend_labels = np.array(legend_labels)

    # pair each label with a generated color
    color_list = style.color_gen(color_map, num=len(legend_labels))
    label_color = dict(zip(legend_labels, color_list))

    # create legend Patches
    patches = [Patch(color=v, label=k, alpha=alpha) for k, v in label_color.items()]

    # draw legend on the same axes as the box plot
    leg = ax.legend(
        handles=patches,
        fontsize=1.0 * self.chart_scale,
        loc="upper right",
        markerscale=0.5 * self.chart_scale,
        ncol=1,
        bbox_to_anchor=bbox,
    )

    # label font color
    for text in leg.get_texts():
        plt.setp(text, color="grey")
예제 #14
0
def box_plot_v(self, x, y, data, color, label_rotate=0, y_units="f", color_map="viridis", alpha=0.8,
                        suppress_outliers=False, ax=None):
    """
    Documentation:

        ---
        Description:
            Create vertical box plots. Useful for evaluating a numeric variable on the y-axis
            versus several different category segments on the x-axis. The x-axis tick label
            size shrinks as the number of category levels grows.

        ---
        Parameters:
            x : str
                Name of categorical variable. Its unique values determine the number of colors.
            y : str
                Name of numeric variable.
            data : Pandas DataFrame
                Pandas DataFrame including both x and y data.
            color : str
                Kept for interface compatibility. Not referenced by the function body; box
                colors are generated from color_map.
            label_rotate : float or int, default=0
                Number of degrees to rotate the x-tick labels.
            y_units : str, default='f'
                Determines unit of measurement for y-axis tick labels. 's' displays string. 'f'
                displays float. 'p' displays percentages, 'd' displays dollars. Repeat character
                (e.g 'ff' or 'ddd') for additional decimal places.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            alpha : float, default=0.8
                Controls transparency of objects. Accepts value between 0.0 and 1.0.
            suppress_outliers : boolean, default=False
                If True, outlier points (fliers) are removed from the box/whisker plots.
            ax : axes object, default=None
                Axis object for the visualization. If None, self.ax is used.
    """
    if ax is None:
        ax = self.ax

    # unique levels of the categorical variable
    levels = np.unique(data[x].values)

    # create vertical box plot; showfliers is the negation of suppress_outliers so
    # that suppress_outliers=True actually hides the outlier points
    sns.boxplot(
        x=x,
        y=y,
        data=data,
        orient="v",
        palette=sns.color_palette(style.color_gen(color_map, num=len(levels))),
        showfliers=not suppress_outliers,
        ax=ax,
    ).set(xlabel=None, ylabel=None)

    # shrink x-axis tick labels as the number of levels grows
    if len(levels) > 20:
        x_label_factor = 0.9
    elif len(levels) > 10:
        x_label_factor = 1.0
    else:
        x_label_factor = 1.2

    # tick label color and font size; x-tick labels are also rotated here, on the
    # target axes rather than on whichever axes pyplot considers current
    ax.tick_params(axis="y", colors=style.style_grey, labelsize=1.2 * self.chart_scale)
    ax.tick_params(
        axis="x",
        colors=style.style_grey,
        labelsize=x_label_factor * self.chart_scale,
        labelrotation=label_rotate,
    )

    # fade box plot figures by reducing alpha.
    plt.setp(ax.artists, alpha=alpha)

    ax.yaxis.set_visible(True)

    # use label formatter utility function to customize chart labels
    util.util_label_formatter(ax=ax, y_units=y_units)
예제 #15
0
def eda_cat_target_num_feat(self,
                            feature,
                            color_map="viridis",
                            outliers_out_of_scope=None,
                            legend_labels=None,
                            chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Creates exploratory data visualizations and statistical summaries for a number
            feature in the context of a categorical target. Rows where the feature is null
            are excluded. Summary tables are displayed first (feature summary, per-class
            summary and, for a two-class target, a z-test or t-test), followed by a 2 x 2
            panel: feature distribution, probability plot, distribution by class and
            box plot by class.

        ---
        Parameters:
            feature : str
                Feature to visualize.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            outliers_out_of_scope : boolean, float or int, default=None
                Truncates the x-axis limits of the distribution and box plot panels so that
                outliers are out of scope of the visualization. The limits are reset to the
                minimum and maximum of the values that remain once the observations flagged
                by self.outlier_IQR are dropped.

                If True is passed, self.outlier_IQR is called with an IQR step of 5. If a float
                or int is passed, that value is used as the IQR step. If None or False is
                passed, the x-axis limits are left untouched.
            legend_labels : list, default=None
                Class labels displayed in plot legend and in the per-class summary table.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates larger plots
                and increases visual elements proportionally.
    """
    target_name = self.target.name
    class_labels = np.unique(self.target)

    ### data summaries
    ## bivariate roll_up table
    # combine feature column and target, then remove any rows with a null feature value
    bi_df = pd.concat([self.data[feature], self.target], axis=1)
    bi_df = bi_df[bi_df[feature].notnull()]
    feature_values = bi_df[feature].values

    # bivariate summary statistics, one row per unique class label. rows are collected
    # in a list and converted once because DataFrame.append no longer exists in pandas 2.x
    summary_rows = []
    for labl in class_labels:

        # get feature values associated with single class label
        feature_slice = bi_df[bi_df[target_name] == labl][feature]

        summary_rows.append(
            {
                "Class": labl,
                "Count": len(feature_slice),
                "Proportion": len(feature_slice) / len(bi_df[feature]) * 100,
                "Mean": np.mean(feature_slice),
                "StdDev": np.std(feature_slice),
            }
        )
    bi_summ_stats_df = pd.DataFrame(
        summary_rows, columns=["Class", "Count", "Proportion", "Mean", "StdDev"])

    # apply custom legend labels, or set dtype to int if column values are numeric.
    # the builtin int is used because the np.int alias was removed from NumPy
    if legend_labels is not None:
        bi_summ_stats_df["Class"] = legend_labels
    elif is_numeric_dtype(bi_summ_stats_df["Class"]):
        bi_summ_stats_df["Class"] = bi_summ_stats_df["Class"].astype(int)

    ## Feature summary
    describe_df = pd.DataFrame(bi_df[feature].describe()).reset_index()

    # add missing count, skew and kurtosis beneath the describe() statistics
    extra_stats_df = pd.DataFrame(
        [
            {
                "index": "missing",
                feature: self.data.shape[0] - bi_df[feature].shape[0],
            },
            {
                "index": "skew",
                feature: np.round(stats.skew(feature_values, nan_policy="omit"), 5),
            },
            {
                "index": "kurtosis",
                feature: stats.kurtosis(feature_values, nan_policy="omit"),
            },
        ]
    )
    describe_df = pd.concat([describe_df, extra_stats_df], ignore_index=True)
    describe_df = describe_df.rename(columns={"index": ""})

    # execute z-test or t-test. both classes must still be present after null removal,
    # otherwise there is no second sample to compare against
    observed_classes = bi_df[target_name].unique()
    if len(class_labels) == 2 and len(observed_classes) == 2:
        s1 = bi_df[bi_df[target_name] == observed_classes[0]][feature]
        s2 = bi_df[bi_df[target_name] == observed_classes[1]][feature]

        # z-test when both samples have more than 30 observations, t-test otherwise
        if len(s1) > 30 and len(s2) > 30:
            stat_name = "z-test statistic"
            statistic, p_val = ztest(s1, s2)
        else:
            stat_name = "t-test statistic"
            statistic, p_val = stats.ttest_ind(s1, s2)

        # add test statistic and p-value to DataFrame
        stat_test_df = pd.DataFrame(
            data=[{
                stat_name: statistic,
                "p-value": p_val
            }],
            columns=[stat_name, "p-value"],
            index=[feature],
        ).round(4)

        # display summary tables
        self.df_side_by_side(
            dfs=(describe_df, bi_summ_stats_df, stat_test_df),
            names=[
                "Feature summary", "Feature vs. target summary",
                "Statistical test"
            ],
        )
    else:

        # display summary tables
        self.df_side_by_side(
            dfs=(describe_df, bi_summ_stats_df),
            names=["Feature summary", "Feature vs. target summary"],
        )

    ### visualizations
    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_standard")

    # translate outliers_out_of_scope into an IQR step. bool is checked first because
    # bool is a subclass of int; False and None both mean "do not truncate"
    if isinstance(outliers_out_of_scope, bool):
        iqr_step = 5 if outliers_out_of_scope else None
    elif isinstance(outliers_out_of_scope, (int, float)):
        iqr_step = outliers_out_of_scope
    else:
        iqr_step = None

    # x-axis limits stay None unless outliers were actually evaluated, so the
    # plt.xlim calls below can never reference limits that were not computed
    x_axis_limits = None
    if iqr_step is not None:
        outliers = self.outlier_IQR(self.data[feature], iqr_step=iqr_step)
        in_scope_values = self.data[feature].drop(index=outliers)
        x_axis_limits = (in_scope_values.min(), in_scope_values.max())

    ## dynamically determine precision of x-units
    # the feature values do not change between panels, so this is computed once
    dist_min = feature_values.min()
    dist_max = feature_values.max()

    # a minimum of zero has no finite max/min ratio; infinity fails every ratio
    # test below, which selects the coarsest precision without dividing by zero
    ratio = dist_max / dist_min if dist_min != 0 else np.inf

    def _both_within(bound):
        # True when the minimum and maximum both lie strictly inside (-bound, bound)
        return -bound < dist_min < bound and -bound < dist_max < bound

    if (_both_within(3) and ratio < 10) or (_both_within(30) and ratio < 3):
        x_units = "fff"
    elif (_both_within(5) and ratio < 10) or (_both_within(90) and ratio < 5):
        x_units = "ff"
    else:
        x_units = "f"

    ## panel 1 - feature distribution
    ax = p.make_canvas(
        title="Feature distribution\n* {}".format(feature),
        title_scale=0.85,
        position=221,
    )
    p.dist_plot(
        feature_values,
        color=style.style_grey,
        y_units="f",
        x_units=x_units,
        ax=ax,
    )

    # optionally reset x-axis limits
    if x_axis_limits is not None:
        plt.xlim(*x_axis_limits)

    ## panel 2 - QQ / probability plot
    ax = p.make_canvas(
        title="Probability plot\n* {}".format(feature),
        title_scale=0.85,
        position=222,
    )
    p.prob_plot(
        x=feature_values,
        plot=ax,
    )

    ## panel 3 - distribution by class
    ax = p.make_canvas(
        title="Distribution by class\n* {}".format(feature),
        title_scale=0.85,
        position=223,
    )

    # generate color list
    color_list = style.color_gen(name=color_map, num=len(class_labels))

    # add one distribution plot to canvas for each category class
    for ix, labl in enumerate(np.unique(bi_df[target_name].values)):
        p.dist_plot(
            bi_df[bi_df[target_name] == labl][feature].values,
            color=color_list[ix],
            y_units="f",
            x_units=x_units,
            legend_labels=legend_labels if legend_labels is not None else
            np.arange(len(class_labels)),
            alpha=0.4,
            bbox=(1.0, 1.0),
            ax=ax,
        )

    # optionally reset x-axis limits
    if x_axis_limits is not None:
        plt.xlim(*x_axis_limits)

    ## panel 4 - box plot by class
    ax = p.make_canvas(
        title="Boxplot by class\n* {}".format(feature),
        title_scale=0.85,
        position=224,
    )
    p.box_plot_h(x=feature,
                 y=target_name,
                 data=bi_df,
                 alpha=0.7,
                 x_units=x_units,
                 legend_labels=legend_labels,
                 bbox=(1.2, 1.0),
                 suppress_outliers=True,
                 ax=ax)

    # optionally reset x-axis limits, padding the lower limit by 10% of its magnitude.
    # abs() keeps the padding pointing left when the minimum is negative
    if x_axis_limits is not None:
        x_axis_min, x_axis_max = x_axis_limits
        plt.xlim(x_axis_min - (abs(x_axis_min) * 0.1), x_axis_max)

    # apply position adjustment to subplots
    plt.subplots_adjust(bottom=-0.1)

    plt.show()
예제 #16
0
def facet_cat_num_hist(self,
                       df,
                       cat_row,
                       cat_col,
                       num_col,
                       split,
                       bbox=None,
                       aspect=1,
                       height=4,
                       alpha=0.8,
                       legend_labels=None,
                       x_units="f",
                       y_units="f",
                       color_map="viridis"):
    """
    Documentation:

        ---
        Description:
            Creates histograms of one numeric variable, and each can optionally be split by a category to
            show two or more distributions. Allows for faceting by up to two category variables along the
            column and/or row axes of the figure. Axis labels, titles, tick labels and the row margin
            titles are restyled to the chart scale, and a custom legend is drawn when a split variable
            is provided.

        ---
        Parameters:
            df : Pandas DataFrame
                Pandas DataFrame containing data for plotting.
            cat_row : str
                Categorical variable faceted along the row axis.
            cat_col : str
                Categorical variable faceted along the column axis.
            num_col : str
                Number variable to plot along x_axis.
            split : str or None
                Categorical variable on which to differentiate the num_col variable. If None,
                the histograms are not split and no legend is drawn.
            bbox : tuple of floats, default=None
                Coordinates for determining legend position.
            aspect : float, default=1
                Higher values create wider plot, lower values create narrow plot, while
                keeping height constant.
            height : float, default=4
                Height in inches of each facet.
            alpha : float, default=0.8
                Controls transparency of objects. Accepts value between 0.0 and 1.0.
            legend_labels : list, default=None
                Custom legend labels. If None, the sorted unique non-null values of the split
                column are used.
            x_units : str, default='f'
                Kept for interface compatibility. Not referenced by the function body.
            y_units : str, default='f'
                Kept for interface compatibility. Not referenced by the function body.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.

    """
    # hue ordering and palette only make sense when a split variable is given. both are
    # left as None otherwise so that split=None does not try to index df with None
    if split is not None:
        hue_order = df[split].sort_values().drop_duplicates().values.tolist()
        palette = sns.color_palette(
            style.color_gen(color_map, num=len(np.unique(df[split].values))))
    else:
        hue_order = None
        palette = None

    # create FacetGrid object
    g = sns.FacetGrid(
        df,
        row=cat_row,
        col=cat_col,
        hue=split,
        hue_order=hue_order,
        palette=palette,
        despine=True,
        height=height,
        aspect=aspect,
        margin_titles=True,
    )

    # map histogram to FacetGrid object
    g.map(
        plt.hist,
        num_col,
        alpha=alpha,
    )

    # shared text styling for axis labels / titles and for tick labels
    label_style = {"fontsize": 1.05 * self.chart_scale, "color": style.style_grey}
    tick_style = {
        "rotation": 0,
        "fontsize": 0.8 * self.chart_scale,
        "color": style.style_grey,
    }

    # format x and y tick labels, x and y labels, and facet titles
    for ax in g.axes.flat:
        ax.set_ylabel(ax.get_ylabel(), rotation=90, **label_style)
        ax.set_xlabel(ax.get_xlabel(), rotation=0, **label_style)
        ax.set_title(ax.get_title(), rotation=0, **label_style)

        # resize y tick labels
        y_tick_labels = ax.get_yticklabels()
        if len(y_tick_labels) > 0:
            ax.set_yticklabels(y_tick_labels, **tick_style)

        # resize x tick labels
        x_tick_labels = ax.get_xticklabels()
        if len(x_tick_labels) > 0:
            ax.set_xticklabels(x_tick_labels, **tick_style)

        if ax.texts:
            # the first text object contains the right ylabel (row margin title) text;
            # it is redrawn with the chart styling and the original is removed
            margin_title = ax.texts[0]
            pos_x, pos_y = margin_title.get_unitless_position()
            ax.text(
                pos_x,
                pos_y,
                margin_title.get_text(),
                transform=ax.transAxes,
                va="center",
                rotation=-90,
                **label_style
            )
            margin_title.remove()

    ## create custom legend
    if split is not None:
        # create labels
        if legend_labels is None:
            legend_labels = (df[df[split].notnull()][split].sort_values().
                             drop_duplicates().values.tolist())
        else:
            legend_labels = np.array(legend_labels)

        # pair each label with a generated color
        color_list = style.color_gen(color_map, num=len(legend_labels))
        label_color = dict(zip(legend_labels, color_list))

        # create legend Patches
        patches = [
            Patch(color=v, label=k, alpha=alpha)
            for k, v in label_color.items()
        ]

        # draw legend
        leg = plt.legend(
            handles=patches,
            fontsize=1.0 * self.chart_scale,
            loc="upper right",
            markerscale=0.5 * self.chart_scale,
            ncol=1,
            bbox_to_anchor=bbox,
        )

        # label font color
        for text in leg.get_texts():
            plt.setp(text, color="grey")
예제 #17
0
def eda_num_target_cat_feat(self,
                            feature,
                            level_count_cap=50,
                            color_map="viridis",
                            chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Produces exploratory data visualizations and statistical summaries for a category
            feature in the context of a numeric target. Displays two summary tables side by side
            (count / proportion of each level, and min / max / mean / median / standard deviation
            of the target for each level), then draws a three-panel figure: a tree map of level
            counts, a bar chart of level counts and a box plot of the target by level.

        ---
        Parameters:
            feature : str
                Feature to visualize.
            level_count_cap : int, default=50
                Upper bound on the number of unique non-null levels in feature. Output is only
                produced when the number of levels is strictly less than this value. Features
                with no non-null values are skipped as well.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
    """
    # capture unique non-null levels and the number of occurrences of each level
    feature_values = self.data[self.data[feature].notnull()][feature]
    unique_vals, unique_counts = np.unique(feature_values, return_counts=True)

    # skip the feature when there is nothing to summarize or when the number of levels
    # reaches level_count_cap
    if len(unique_vals) == 0 or len(unique_vals) >= level_count_cap:
        return

    ### data summaries
    ## feature summary
    # build the summary in a single step. DataFrame.append, previously used to add one row
    # at a time, was removed in pandas 2.0
    uni_summ_df = pd.DataFrame({
        feature: unique_vals,
        "Count": unique_counts,
        "Proportion": unique_counts / unique_counts.sum() * 100,
    })

    # sort DataFrame by "Proportion", descending
    uni_summ_df = uni_summ_df.sort_values(by=["Proportion"], ascending=False)

    # set values to int dtype where applicable to optimize
    # NOTE(review): a numeric feature with fractional levels is truncated by this cast, the
    # same way it is for the feature vs. target summary below - confirm this is intended
    if is_numeric_dtype(uni_summ_df[feature]):
        uni_summ_df[feature] = uni_summ_df[feature].astype("int64")
    uni_summ_df["Count"] = uni_summ_df["Count"].astype("int64")

    ## feature vs. target summary
    # combine feature column and target, keeping only rows where the feature is not null.
    # copy so that the column assignments below act on an independent frame
    bi_df = pd.concat([self.data[feature], self.target], axis=1)
    bi_df = bi_df[bi_df[feature].notnull()].copy()

    # cast target as float
    bi_df[self.target.name] = bi_df[self.target.name].astype(float)

    # create pivot table of target summary statistics, grouping by category feature
    bi_summ_piv_df = pd.pivot_table(
        bi_df,
        index=feature,
        aggfunc={
            self.target.name:
            [np.nanmin, np.nanmax, np.nanmean, np.nanmedian, np.nanstd]
        })

    # flatten the two-level column index, keeping only the statistic name
    bi_summ_piv_df.columns = pd.Index(
        [col[1] for col in bi_summ_piv_df.columns.tolist()])
    bi_summ_piv_df = bi_summ_piv_df.reset_index()
    bi_summ_piv_df = bi_summ_piv_df.rename(
        columns={
            "nanmin": "Min",
            "nanmax": "Max",
            "nanmean": "Mean",
            "nanmedian": "Median",
            "nanstd": "StdDev",
        })

    # fill nan's with zero in every statistic column (all columns after the feature column)
    stat_columns = bi_summ_piv_df.columns[1:]
    bi_summ_piv_df[stat_columns] = bi_summ_piv_df[stat_columns].fillna(0)

    # reorder columns
    bi_summ_piv_df = bi_summ_piv_df[[
        feature, "Mean", "Median", "StdDev", "Min", "Max"
    ]].copy()

    # convert to int
    if is_numeric_dtype(bi_summ_piv_df[feature]):
        bi_summ_piv_df[feature] = bi_summ_piv_df[feature].astype("int64")

    # display summary tables
    self.df_side_by_side(
        dfs=(uni_summ_df, bi_summ_piv_df),
        names=["Feature summary", "Feature vs. target summary"],
    )

    ### visualizations
    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale,
                     plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Category counts\n* {}".format(feature),
                       position=131,
                       title_scale=1.0)

    # add treemap to canvas
    p.tree_map(
        counts=uni_summ_df["Count"].values,
        labels=uni_summ_df[feature].values,
        colors=style.color_gen(name=color_map,
                               num=len(uni_summ_df[feature].values)),
        alpha=0.8,
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Feature distribution\n* {}".format(feature),
                       position=132)

    # when every level can be interpreted as an integer, order levels (and their counts)
    # by integer value. otherwise keep the order returned by np.unique
    try:
        int_keys = [int(val) for val in unique_vals]
    except ValueError:
        pass
    else:
        # sorted() is stable, so levels sharing an integer key keep their relative order
        order = sorted(range(len(int_keys)), key=int_keys.__getitem__)
        unique_vals = unique_vals[order]
        unique_counts = unique_counts[order]

        # cast feature to int so the box plot categories sort numerically as well
        bi_df[feature] = bi_df[feature].astype(int)

    # dynamically set rotation angle based on number of unique values and the length of the
    # category labels.
    # NOTE(review): the length measure is the character count of the array's printed form
    # (brackets, quotes and separators included) divided by the number of levels, not the
    # mean label length. the thresholds below may be tuned to this measure - confirm before
    # changing it
    len_unique_val = len(unique_vals)
    avg_len_unique_val = len(str(unique_vals)) / len_unique_val
    if len_unique_val <= 4 and avg_len_unique_val <= 12:
        rotation = 0
    elif 5 <= len_unique_val <= 8 and avg_len_unique_val <= 7.0:
        rotation = 0
    elif 9 <= len_unique_val <= 14 and avg_len_unique_val <= 6:
        rotation = 0
    else:
        rotation = 30

    # represent x-axis tick labels as integers rather than floats
    x_values = list(map(str, unique_vals.tolist()))
    try:
        x_values = [int(float(x)) for x in x_values]
    except ValueError:
        pass

    # add bar chart to canvas
    p.bar_v(
        x=x_values,
        counts=unique_counts,
        label_rotate=rotation,
        color=style.style_grey,
        y_units="f",
        x_tick_wrap=True,
        ax=ax,
    )

    # hide every other label if total number of levels is greater than 40
    if len_unique_val > 40:
        for tick_ix, tick_label in enumerate(ax.xaxis.get_ticklabels()):
            if tick_ix % 2 != 0:
                tick_label.set_visible(False)

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Boxplot by category\n* {}".format(feature),
                       position=133)

    ## dynamically determine precision of y-units
    # capture min and max target values
    dist_min = bi_df[self.target.name].values.min()
    dist_max = bi_df[self.target.name].values.max()

    # ratio of max to min. a zero minimum is treated as an infinite ratio, which avoids a
    # division-by-zero warning and falls through to the coarsest precision below
    dist_ratio = dist_max / dist_min if dist_min != 0 else float("inf")

    # determine y-units precision based on min and max values of the target
    if -3 < dist_min < 3 and -3 < dist_max < 3 and dist_ratio < 10:
        y_units = "fff"
    elif -30 < dist_min < 30 and -30 < dist_max < 30 and dist_ratio < 3:
        y_units = "fff"
    elif -5 < dist_min < 5 and -5 < dist_max < 5 and dist_ratio < 10:
        y_units = "ff"
    elif -90 < dist_min < 90 and -90 < dist_max < 90 and dist_ratio < 5:
        y_units = "ff"
    else:
        y_units = "f"

    # add vertical box plot to canvas. plt.get_cmap is used because matplotlib.cm.get_cmap
    # was removed in matplotlib 3.9
    p.box_plot_v(
        x=feature,
        y=self.target.name,
        data=bi_df.sort_values([feature]),
        color=plt.get_cmap(color_map),
        label_rotate=rotation,
        y_units=y_units,
        ax=ax,
    )

    # hide every other label if total number of levels is greater than 40
    if len_unique_val > 40:
        for tick_ix, tick_label in enumerate(ax.xaxis.get_ticklabels()):
            if tick_ix % 2 != 0:
                tick_label.set_visible(False)

    plt.show()
예제 #18
0
def multi_line(self,
               x,
               y,
               label=None,
               df=None,
               linecolor=None,
               linestyle=None,
               bbox=(1.2, 0.9),
               x_units="f",
               x_ticks=None,
               y_units="f",
               y_ticks=None,
               marker_on=False,
               plot_buffer=False,
               axis_limits=False,
               color_map="viridis",
               ax=None):
    """
    Documentation:

        ---
        Description:
            Create single plot with multiple lines. One line is drawn for each column of y,
            and all lines are plotted against the same x data.

        ---
        Parameters:
            x : array or string
                Either 1-dimensional array of values, a multidimensional array of values, a list of columns
                in a Pandas DataFrame, or a column name in a Pandas DataFrame. When df is provided,
                the x values are currently always taken from df.index and this argument is not used.
            y : array or string
                Either 1-dimensional array of values, a multidimensional array of values, a list of columns
                in a Pandas DataFrame, or a column name in a Pandas DataFrame.
            label : list of strings, default=None
                Custom legend label for each line. The legend is only drawn when labels are provided.
            df : Pandas DataFrame, default=None
                Pandas DataFrame containing data to plot. Can be any size, as plotted columns will be chosen
                by columns names specified in the y parameter.
            linecolor : str, default=None
                Line colors. If None, utilizes color_map
            linestyle : str, default=None
                Line style. If None, utilizes the first entry of style.style_line_style.
            bbox : tuple, default=(1.2, 0.9)
                Coordinates for determining legend position.
            x_units : str, default='f'
                Determines unit of measurement for x-axis tick labels. 's' displays string. 'f' displays float.
                'p' displays percentages, 'd' displays dollars. Repeat character (e.g 'ff' or 'ddd') for
                additional decimal places.
            x_ticks : array, default=None
                Custom x-tick labels.
            y_units : str, default='f'
                Determines unit of measurement for y-axis tick labels. 's' displays string. 'f' displays float.
                'p' displays percentages, 'd' displays dollars. Repeat character (e.g 'ff' or 'ddd') for
                additional decimal places. When 'p' is present, y values are multiplied by 100 before
                plotting.
            y_ticks : array, default=None
                Custom y-tick labels.
            marker_on : bool, default=False
                Controls whether to show line with markers for each data element.
            plot_buffer : bool, default=False
                Controls whether dynamic plot buffer function is executed to ensure visual elements are
                not cut-off at the figure borders.
            axis_limits : bool, default=False
                Controls whether dynamic axis limit setting function is executed.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            ax : axes object, default=None
                Axis object for the visualization. If None, utilizes self.ax.
    """
    if ax is None:
        ax = self.ax

    # if a Pandas DataFrame is passed to function, create x and y arrays from the DataFrame
    if df is not None:
        # NOTE(review): the previous isinstance(df.index, Index) check was always true, so
        # x has always come from the index and the x column name was never used. behavior is
        # kept as-is - confirm whether df[x] should be honored when x names a column
        x = df.index.values
        y = df[y].values
    else:
        # convert list-like inputs to arrays
        x = np.asarray(x)
        y = np.asarray(y)

        if x.ndim == 1:
            x = x.reshape(-1, 1)

    # a single series (1-dimensional array or single column name) is treated as one line
    if y.ndim == 1:
        y = y.reshape(-1, 1)

    # generate color list, one color per line
    color_list = style.color_gen(name=color_map, num=y.shape[1])

    # settings shared by every line
    as_percent = "p" in y_units
    line_style = linestyle if linestyle is not None else style.style_line_style[0]

    # add multiple lines to plot, drawing on the requested axes
    for ix, y_col in enumerate(y.T):
        ax.plot(
            x,
            y_col * 100 if as_percent else y_col,
            color=linecolor if linecolor is not None else color_list[ix],
            linestyle=line_style,
            linewidth=0.247 * self.chart_scale,
            label=label[ix] if label is not None else None,
            marker="." if marker_on else None,
            markersize=17 if marker_on else None,
            markerfacecolor="w" if marker_on else None,
            markeredgewidth=2.2 if marker_on else None,
        )

    # add legend to figure
    if label is not None:
        ax.legend(
            loc="upper right",
            bbox_to_anchor=bbox,
            ncol=1,
            frameon=True,
            fontsize=1.1 * self.chart_scale,
        )

    # optionally set axis lower / upper limits
    if axis_limits:
        x_min, x_max, y_min, y_max = util.util_set_axes(x=x, y=y)
        ax.axis([x_min, x_max, y_min, y_max])

    # optionally create smaller buffer around plot area to prevent cutting off elements
    if plot_buffer:
        util.util_plot_buffer(ax=ax, x=0.02, y=0.02)

    # optionally creates custom x-tick labels
    if x_ticks is not None:
        ax.set_xticks(x_ticks)

    # optionally creates custom y-tick labels
    if y_ticks is not None:
        ax.set_yticks(y_ticks)

    # restyle x and y ticklabels. the labels are passed through unchanged: multiplying the
    # list of labels by 100 only repeated it, which breaks the one-label-per-tick
    # requirement when custom ticks are set
    ax.set_yticklabels(
        ax.get_yticklabels(),
        rotation=0,
        fontsize=1.1 * self.chart_scale,
        color=style.style_grey,
    )

    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=0,
        fontsize=1.1 * self.chart_scale,
        color=style.style_grey,
    )

    # axis tick label formatting
    util.util_label_formatter(ax=ax, x_units=x_units, y_units=y_units)