Exemplo n.º 1
0
def eda_num_target_cat_feat(self,
                            feature,
                            level_count_cap=50,
                            color_map="viridis",
                            chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Produces exploratory data visualizations and statistical summaries for a category
            feature in the context of a numeric target.

        ---
        Parameters:
            feature : str
                Feature to visualize.
            level_count_cap : int, default=50
                Maximum number of unique levels in feature. If the number of levels exceeds the
                cap then the feature is skipped.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
    """
    # if number of unique levels in feature is less than specified level_count_cap
    if (len(np.unique(self.data[self.data[feature].notnull()][feature].values))
            < level_count_cap):

        ### data summaries
        ## feature summary
        # create empty DataFrame
        uni_summ_df = pd.DataFrame(columns=[feature, "Count", "Proportion"])

        # capture unique values and count of those unique values
        unique_vals, unique_counts = np.unique(
            self.data[self.data[feature].notnull()][feature],
            return_counts=True)

        # append each unique value, count and proportion to DataFrame
        for i, j in zip(unique_vals, unique_counts):
            uni_summ_df = uni_summ_df.append(
                {
                    feature: i,
                    "Count": j,
                    "Proportion": j / np.sum(unique_counts) * 100
                },
                ignore_index=True,
            )

        # sort DataFrame by "Proportion", descending
        uni_summ_df = uni_summ_df.sort_values(by=["Proportion"],
                                              ascending=False)

        # set values to int dtype where applicable to optimize
        if is_numeric_dtype(uni_summ_df[feature]):
            uni_summ_df[feature] = uni_summ_df[feature].astype("int64")
        uni_summ_df["Count"] = uni_summ_df["Count"].astype("int64")

        ## feature vs. target summary
        # combine feature column and target
        bi_df = pd.concat([self.data[feature], self.target], axis=1)

        # remove any rows with nulls
        bi_df = bi_df[bi_df[feature].notnull()]

        # cast target as float
        bi_df[self.target.name] = bi_df[self.target.name].astype(float)

        # create pivot table of target summary statistics, grouping by category feature
        bi_summ_piv_df = pd.pivot_table(
            bi_df,
            index=feature,
            aggfunc={
                self.target.name:
                [np.nanmin, np.nanmax, np.nanmean, np.nanmedian, np.nanstd]
            })
        multi_index = bi_summ_piv_df.columns
        single_index = pd.Index([i[1] for i in multi_index.tolist()])
        bi_summ_piv_df.columns = single_index
        bi_summ_piv_df.reset_index(inplace=True)
        bi_summ_piv_df = bi_summ_piv_df.rename(
            columns={
                "nanmin": "Min",
                "nanmax": "Max",
                "nanmean": "Mean",
                "nanmedian": "Median",
                "nanstd": "StdDev",
            })
        # fill nan's with zero
        fill_columns = bi_summ_piv_df.iloc[:, 1:].columns
        bi_summ_piv_df[fill_columns] = bi_summ_piv_df[fill_columns].fillna(0)

        # reorder column
        bi_summ_piv_df = bi_summ_piv_df[[
            feature, "Mean", "Median", "StdDev", "Min", "Max"
        ]]

        # convert to int
        if is_numeric_dtype(bi_summ_piv_df[feature]):
            bi_summ_piv_df[feature] = bi_summ_piv_df[feature].astype("int64")

        # display summary tables
        self.df_side_by_side(
            dfs=(uni_summ_df, bi_summ_piv_df),
            names=["Feature summary", "Feature vs. target summary"],
        )

        ### visualizations
        # create prettierplot object
        p = PrettierPlot(chart_scale=chart_scale,
                         plot_orientation="wide_narrow")

        # add canvas to prettierplot object
        ax = p.make_canvas(title="Category counts\n* {}".format(feature),
                           position=131,
                           title_scale=1.0)

        # add treemap to canvas
        p.tree_map(
            counts=uni_summ_df["Count"].values,
            labels=uni_summ_df[feature].values,
            colors=style.color_gen(name=color_map,
                                   num=len(uni_summ_df[feature].values)),
            alpha=0.8,
            ax=ax,
        )

        # add canvas to prettierplot object
        ax = p.make_canvas(title="Feature distribution\n* {}".format(feature),
                           position=132)

        # error catching block for resorting labels
        try:
            sorted(unique_vals, key=int)
        except ValueError:
            pass
        else:
            # sort unique_vals/unique_counts for bar chart
            new_ix = [
                sorted(list(unique_vals), key=int).index(i)
                for i in list(unique_vals)
            ]
            unique_vals = np.array(sorted(list(unique_vals), key=int))
            unique_counts = np.array(
                [y for x, y in sorted(zip(new_ix, unique_counts))])

            # sort temporary data frame for box plot
            bi_df[feature] = bi_df[feature].astype(int)

        # dynamically set rotation angle based on number unique values and maximum length of
        # category labels.
        len_unique_val = len(unique_vals)
        avg_len_unique_val = sum(map(len, str(unique_vals))) / len(unique_vals)
        if len_unique_val <= 4 and avg_len_unique_val <= 12:
            rotation = 0
        elif len_unique_val >= 5 and len_unique_val <= 8 and avg_len_unique_val <= 7.0:
            rotation = 0
        elif len_unique_val >= 9 and len_unique_val <= 14 and avg_len_unique_val <= 6:
            rotation = 0
        else:
            rotation = 30

        # represent x-axis tick labels as integers rather than floats
        x_values = list(map(str, unique_vals.tolist()))
        try:
            x_values = [int(float(x)) for x in x_values]
        except ValueError:
            pass

        # add bar chart to canvas
        p.bar_v(
            x=x_values,
            counts=unique_counts,
            label_rotate=rotation,
            color=style.style_grey,
            y_units="f",
            x_tick_wrap=True,
            ax=ax,
        )

        # hide every other label if total number of levels is greater than 40
        if len_unique_val > 40:
            n = 2
            [
                l.set_visible(False)
                for (i, l) in enumerate(ax.xaxis.get_ticklabels())
                if i % n != 0
            ]

        # add canvas to prettierplot object
        ax = p.make_canvas(title="Boxplot by category\n* {}".format(feature),
                           position=133)

        ## dynamically determine precision of y-units
        # capture min and max feature values
        dist_min = bi_df[self.target.name].values.min()
        dist_max = bi_df[self.target.name].values.max()

        # determine y-units precision based on min and max values in feature
        if -3 < dist_min < 3 and -3 < dist_max < 3 and dist_max / dist_min < 10:
            y_units = "fff"
        elif -30 < dist_min < 30 and -30 < dist_max < 30 and dist_max / dist_min < 3:
            y_units = "fff"
        elif -5 < dist_min < 5 and -5 < dist_max < 5 and dist_max / dist_min < 10:
            y_units = "ff"
        elif -90 < dist_min < 90 and -90 < dist_max < 90 and dist_max / dist_min < 5:
            y_units = "ff"
        else:
            y_units = "f"

        # add vertical box plot to canvas
        p.box_plot_v(
            x=feature,
            y=self.target.name,
            data=bi_df.sort_values([feature]),
            color=matplotlib.cm.get_cmap(name=color_map),
            label_rotate=rotation,
            y_units=y_units,
            ax=ax,
        )

        # hide every other label if total number of levels is greater than 40
        if len_unique_val > 40:
            n = 2
            [
                l.set_visible(False)
                for (i, l) in enumerate(ax.xaxis.get_ticklabels())
                if i % n != 0
            ]

        plt.show()
Exemplo n.º 2
0
def eda_cat_target_cat_feat(self,
                            feature,
                            level_count_cap=50,
                            color_map="viridis",
                            legend_labels=None,
                            chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Creates exploratory data visualizations and statistical summaries for a category feature
            in the context of a categorical target.

        ---
        Parameters:
            feature : str
                Feature to visualize.
            level_count_cap : int, default=50
                Maximum number of unique levels in feature. If the number of levels exceeds the
                cap, then no visualization panel is produced.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            legend_labels : list, default=None
                Class labels displayed in plot legend.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
    """

    # if number of unique levels in feature is less than specified level_count_cap
    if (len(np.unique(self.data[self.data[feature].notnull()][feature].values))
            < level_count_cap):

        ### data summaries
        ## feature summary
        # create empty DataFrame
        uni_summ_df = pd.DataFrame(columns=[feature, "Count", "Proportion"])

        # capture unique values and count of those unique values
        unique_vals, unique_counts = np.unique(
            self.data[self.data[feature].notnull()][feature],
            return_counts=True)

        # append each unique value, count and proportion to DataFrame
        for i, j in zip(unique_vals, unique_counts):
            uni_summ_df = uni_summ_df.append(
                {
                    feature: i,
                    "Count": j,
                    "Proportion": j / np.sum(unique_counts) * 100,
                },
                ignore_index=True,
            )

        # sort DataFrame by "Proportion", descending
        uni_summ_df = uni_summ_df.sort_values(by=["Proportion"],
                                              ascending=False)

        # set values to int dtype where applicable to optimize
        uni_summ_df["Count"] = uni_summ_df["Count"].astype("int64")
        if is_numeric_dtype(uni_summ_df[feature]):
            uni_summ_df[feature] = uni_summ_df[feature].astype("int64")

        ## feature vs. target summary
        # combine feature column and target
        bi_df = pd.concat([self.data[feature], self.target], axis=1)

        # remove any rows with nulls
        bi_df = bi_df[bi_df[feature].notnull()]

        # groupby category feature and count the occurrences of target classes
        # for each level in category
        bi_summ_df = (
            bi_df.groupby([feature] +
                          [self.target.name]).size().reset_index().pivot(
                              columns=self.target.name,
                              index=feature,
                              values=0))

        # overwrite DataFrame index with actual class labels if provided
        bi_summ_df.columns = pd.Index(
            legend_labels) if legend_labels is not None else pd.Index(
                [i for i in bi_summ_df.columns.tolist()])
        bi_summ_df.reset_index(inplace=True)

        # fill nan's with zero
        fill_columns = bi_summ_df.iloc[:, 2:].columns
        bi_summ_df[fill_columns] = bi_summ_df[fill_columns].fillna(0)

        # set values to int dtype where applicable to optimize displayed DataFrame
        for column in bi_summ_df.columns:
            try:
                bi_summ_df[column] = bi_summ_df[column].astype(np.int)
            except ValueError:
                bi_summ_df[column] = bi_summ_df[column]

        ## proportion by category summary
        # combine feature column and target
        prop_df = pd.concat([self.data[feature], self.target], axis=1)

        # remove any rows with nulls
        prop_df = prop_df[prop_df[feature].notnull()]

        # calculate percent of 100 by class label
        prop_df = prop_df.groupby([feature, self.target.name
                                   ]).agg({self.target.name: {"count"}})
        prop_df = prop_df.groupby(
            level=0).apply(lambda x: 100 * x / float(x.sum()))
        prop_df = prop_df.reset_index()

        multiIndex = prop_df.columns
        singleIndex = [i[0] for i in multiIndex.tolist()]
        singleIndex[-1] = "Count"
        prop_df.columns = singleIndex
        prop_df = prop_df.reset_index(drop=True)

        prop_df = pd.pivot_table(prop_df,
                                 values=["Count"],
                                 columns=[feature],
                                 index=[self.target.name],
                                 aggfunc={"Count": np.mean})
        prop_df = prop_df.reset_index(drop=True)

        multiIndex = prop_df.columns
        singleIndex = []

        for column in multiIndex.tolist():
            try:
                singleIndex.append(int(column[1]))
            except ValueError:
                singleIndex.append(column[1])

        prop_df.columns = singleIndex
        prop_df = prop_df.reset_index(drop=True)

        # insert column to DataFrame with actual class labels if provided, otherwise use raw class labels in target
        prop_df.insert(loc=0,
                       column="Class",
                       value=legend_labels if legend_labels is not None else
                       np.unique(self.target))

        # fill nan's with zero
        fill_columns = prop_df.iloc[:, :].columns
        prop_df[fill_columns] = prop_df[fill_columns].fillna(0)

        # if there are only two class labels, perform z-test/t-test
        if len(np.unique(bi_df[bi_df[feature].notnull()][feature])) == 2:

            # total observations
            total_obs1 = bi_df[(bi_df[feature] == np.unique(
                bi_df[feature])[0])][feature].shape[0]
            total_obs2 = bi_df[(bi_df[feature] == np.unique(
                bi_df[feature])[1])][feature].shape[0]

            # total positive observations
            pos_obs1 = bi_df[(bi_df[feature] == np.unique(bi_df[feature])[0])
                             &
                             (bi_df[self.target.name] == 1)][feature].shape[0]
            pos_obs2 = bi_df[(bi_df[feature] == np.unique(bi_df[feature])[1])
                             &
                             (bi_df[self.target.name] == 1)][feature].shape[0]

            # perform z-test, return z-statistic and p-value
            z, p_val = proportions_ztest(count=(pos_obs1, pos_obs2),
                                         nobs=(total_obs1, total_obs2))

            # add z-statistic and p-value to DataFrame
            stat_test_df = pd.DataFrame(
                data=[{
                    "z-test statistic": z,
                    "p-value": p_val
                }],
                columns=["z-test statistic", "p-value"],
                index=[feature],
            ).round(4)

            # display summary tables
            self.df_side_by_side(
                dfs=(uni_summ_df, bi_summ_df, prop_df, stat_test_df),
                names=[
                    "Feature summary",
                    "Feature vs. target summary",
                    "Target proportion",
                    "Statistical test",
                ],
            )
            if "percent_positive" in bi_summ_df:
                bi_summ_df = bi_summ_df.drop(["percent_positive"], axis=1)

        else:
            # display summary tables
            self.df_side_by_side(
                dfs=(uni_summ_df, bi_summ_df, prop_df),
                names=[
                    "Feature summary", "Feature vs. target summary",
                    "Target proportion"
                ],
            )
            if "percent_positive" in bi_summ_df:
                bi_summ_df = bi_summ_df.drop(["percent_positive"], axis=1)

        ### visualizations
        # set label rotation angle
        len_unique_val = len(unique_vals)
        avg_len_unique_val = sum(map(len, str(unique_vals))) / len(unique_vals)
        if len_unique_val <= 4 and avg_len_unique_val <= 12:
            rotation = 0
        elif len_unique_val >= 5 and len_unique_val <= 8 and avg_len_unique_val <= 8:
            rotation = 0
        elif len_unique_val >= 9 and len_unique_val <= 14 and avg_len_unique_val <= 4:
            rotation = 0
        else:
            rotation = 90

        # create prettierplot object
        p = PrettierPlot(chart_scale=chart_scale,
                         plot_orientation="wide_narrow")

        # add canvas to prettierplot object
        ax = p.make_canvas(title="Category counts\n* {}".format(feature),
                           position=131,
                           title_scale=0.82)

        # add treemap to canvas
        p.tree_map(
            counts=uni_summ_df["Count"].values,
            labels=uni_summ_df[feature].values,
            colors=style.color_gen(name=color_map,
                                   num=len(uni_summ_df[feature].values)),
            alpha=0.8,
            ax=ax,
        )

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="Category counts by target\n* {}".format(feature),
            position=132)

        # add faceted categorical plot to canvas
        p.facet_cat(
            df=bi_summ_df,
            feature=feature,
            label_rotate=rotation,
            color_map=color_map,
            bbox=(1.0, 1.15),
            alpha=0.8,
            legend_labels=legend_labels,
            x_units=None,
            ax=ax,
        )

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="Target proportion by category\n* {}".format(feature),
            position=133)

        # add stacked bar chart to canvas
        p.stacked_bar_h(
            df=prop_df.drop("Class", axis=1),
            bbox=(1.0, 1.15),
            legend_labels=legend_labels,
            color_map=color_map,
            alpha=0.8,
            ax=ax,
        )

        plt.show()