Пример #1
0
def eda_missing_summary(self, data=None, color=style.style_grey, display_df=False, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Draws a vertical bar chart of the "Percent missing" value of each feature in the
            summary returned by self.missing_summary. If that summary is empty, prints
            "No nulls" and draws nothing. Can also display the summary itself.

        ---
        Parameters:
            data : Pandas DataFrame, default=None
                Dataset to summarize. When None, self.data is used.
            color : str or color code, default=style.style_grey
                Bar color.
            display_df : boolean, default=False
                When True, the summary is displayed before the chart is drawn.
            chart_scale : int or float, default=15
                Passed to PrettierPlot to control the size and proportions of the chart.
    """
    # fall back on the dataset stored on the object when none is passed in
    dataset = self.data if data is None else data

    # build the missingness summary
    missing = self.missing_summary(dataset)

    # nothing is missing, so there is nothing to chart
    if missing.empty:
        print("No nulls")
        return

    # optionally show the summary table
    if display_df:
        display(missing)

    # set up the plotting object and its canvas
    plotter = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_standard")
    canvas = plotter.make_canvas(
        title="Percent missing by feature",
        y_shift=0.8,
        title_scale=0.8,
    )

    # tilt tick labels fully upright once there are more than five features
    features = missing.index
    tick_rotation = 90 if len(features) > 5 else 45

    # draw the bars
    plotter.bar_v(
        x=features,
        counts=missing["Percent missing"],
        label_rotate=tick_rotation,
        color=color,
        y_units="p",
        x_tick_wrap=False,
        ax=canvas,
    )
Пример #2
0
def eda_missing_summary(self, training_data=True, color=style.style_grey, display_df=False, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Draws a vertical bar chart of the "Percent missing" value of each feature in the
            summary returned by self.missing_summary, with the y-axis fixed to the range
            0 to 100. If that summary is empty, prints "No nulls" and draws nothing. Can
            also display the summary itself.

        ---
        Parameters:
            training_data : boolean, default=True
                Controls which dataset (training or validation) is used for visualization.
            color : str or color code, default=style.style_grey
                Bar color.
            display_df : boolean, default=False
                When True, the summary is displayed before the chart is drawn.
            chart_scale : int or float, default=15
                Passed to PrettierPlot to control the size and proportions of the chart.
    """
    # resolve the training or validation data objects
    # NOTE(review): none of the unpacked values are used below - confirm whether this
    # call is still required
    data, _, mlm_dtypes = self.training_or_validation_dataset(training_data)

    # build the missingness summary
    missing = self.missing_summary(training_data)

    # nothing is missing, so there is nothing to chart
    if missing.empty:
        print("No nulls")
        return

    # optionally show the summary table
    if display_df:
        display(missing)

    # set up the plotting object and its canvas
    plotter = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_standard")
    canvas = plotter.make_canvas(
        title="Percent missing by feature",
        y_shift=0.8,
        title_scale=0.8,
    )

    # tilt tick labels fully upright once there are more than five features
    features = missing.index
    tick_rotation = 90 if len(features) > 5 else 45

    # draw the bars
    plotter.bar_v(
        x=features,
        counts=missing["Percent missing"],
        label_rotate=tick_rotation,
        color=color,
        y_units="p",
        x_tick_wrap=False,
        ax=canvas,
    )

    # pin the y-axis to the range 0 to 100
    canvas.set_ylim([0,100])
Пример #3
0
def eda_skew_summary(self, data=None, color=style.style_grey, display_df=False, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Draws a vertical bar chart of the "Skew" value of each feature in the summary
            returned by self.skew_summary. Can also display the summary itself.

        ---
        Parameters:
            data : Pandas DataFrame, default=None
                Dataset to summarize. When None, self.data is used.
            color : str, color code, default=style.style_grey
                Bar color.
            display_df : boolean, default=False
                When True, the summary is displayed before the chart is drawn.
            chart_scale : int or float, default=15
                Passed to PrettierPlot to control the size and proportions of the chart.
    """
    # fall back on the dataset stored on the object when none is passed in
    dataset = self.data if data is None else data

    # build the skewness summary
    skew_table = self.skew_summary(dataset)

    # optionally show the summary table
    if display_df:
        display(skew_table)

    # set up the plotting object and its canvas
    plotter = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_standard")
    canvas = plotter.make_canvas(
        title="Skew by feature",
        y_shift=0.8,
        title_scale=0.8,
    )

    # tilt tick labels fully upright once there are more than five features
    features = skew_table.index
    tick_rotation = 90 if len(features) > 5 else 45

    # draw the bars
    plotter.bar_v(
        x=features,
        counts=skew_table["Skew"],
        label_rotate=tick_rotation,
        color=color,
        y_units="fff",
        x_tick_wrap=False,
        ax=canvas,
    )
Пример #4
0
def eda_skew_summary(self, training_data=True, color=style.style_grey, display_df=False, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Draws a vertical bar chart of the "Skew" value of each feature in the summary
            returned by self.skew_summary for the selected dataset. Can also display the
            summary itself.

        ---
        Parameters:
            training_data : boolean, default=True
                Controls which dataset (training or validation) is used for visualization.
            color : str, color code, default=style.style_grey
                Bar color.
            display_df : boolean, default=False
                When True, the summary is displayed before the chart is drawn.
            chart_scale : int or float, default=15
                Passed to PrettierPlot to control the size and proportions of the chart.
    """
    # resolve the training or validation data objects; only the dataset is used here
    # NOTE(review): mlm_dtypes is unpacked but never used in this method
    dataset, _, mlm_dtypes = self.training_or_validation_dataset(training_data)

    # build the skewness summary
    skew_table = self.skew_summary(dataset)

    # optionally show the summary table
    if display_df:
        display(skew_table)

    # set up the plotting object and its canvas
    plotter = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_standard")
    canvas = plotter.make_canvas(
        title="Skew by feature",
        y_shift=0.8,
        title_scale=0.8,
    )

    # tilt tick labels fully upright once there are more than five features
    features = skew_table.index
    tick_rotation = 90 if len(features) > 5 else 45

    # draw the bars
    plotter.bar_v(
        x=features,
        counts=skew_table["Skew"],
        label_rotate=tick_rotation,
        color=color,
        y_units="fff",
        x_tick_wrap=False,
        ax=canvas,
    )
Пример #5
0
def eda_num_target_cat_feat(self,
                            feature,
                            level_count_cap=50,
                            color_map="viridis",
                            chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Produces exploratory data visualizations and statistical summaries for a category
            feature in the context of a numeric target. Two summary tables are displayed
            (level counts with proportions, and target statistics by level), followed by a
            three-panel figure: a tree map of level counts, a bar chart of level counts and
            a box plot of the target by level. Rows where the feature is null are excluded.
            Nothing is produced when the feature has no non-null values or when its number
            of unique levels is greater than or equal to level_count_cap.

        ---
        Parameters:
            feature : str
                Feature to visualize. Must be a column of self.data.
            level_count_cap : int, default=50
                Maximum number of unique levels in feature. If the number of levels reaches
                or exceeds the cap then the feature is skipped.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
    """
    # capture the unique non-null levels and their counts once; they are reused by the
    # level cap check, the feature summary table and the bar chart
    unique_vals, unique_counts = np.unique(
        self.data[self.data[feature].notnull()][feature],
        return_counts=True)

    # skip features with nothing to summarize or with too many levels
    if len(unique_vals) == 0 or len(unique_vals) >= level_count_cap:
        return

    ### data summaries
    ## feature summary
    # build the summary in a single step; DataFrame.append was removed in pandas 2.0
    uni_summ_df = pd.DataFrame({
        feature: unique_vals,
        "Count": unique_counts,
        "Proportion": unique_counts / unique_counts.sum() * 100,
    })

    # sort DataFrame by "Proportion", descending
    uni_summ_df = uni_summ_df.sort_values(by=["Proportion"], ascending=False)

    # set values to int dtype where applicable to optimize
    if is_numeric_dtype(uni_summ_df[feature]):
        uni_summ_df[feature] = uni_summ_df[feature].astype("int64")
    uni_summ_df["Count"] = uni_summ_df["Count"].astype("int64")

    ## feature vs. target summary
    # combine feature column and target
    bi_df = pd.concat([self.data[feature], self.target], axis=1)

    # remove any rows with nulls; copy so the column assignments below act on an
    # independent frame rather than on a slice
    bi_df = bi_df[bi_df[feature].notnull()].copy()

    # cast target as float
    bi_df[self.target.name] = bi_df[self.target.name].astype(float)

    # create pivot table of target summary statistics, grouping by category feature
    bi_summ_piv_df = pd.pivot_table(
        bi_df,
        index=feature,
        aggfunc={
            self.target.name:
            [np.nanmin, np.nanmax, np.nanmean, np.nanmedian, np.nanstd]
        })

    # flatten the (target, statistic) column MultiIndex down to the statistic name
    multi_index = bi_summ_piv_df.columns
    single_index = pd.Index([i[1] for i in multi_index.tolist()])
    bi_summ_piv_df.columns = single_index
    bi_summ_piv_df.reset_index(inplace=True)
    bi_summ_piv_df = bi_summ_piv_df.rename(
        columns={
            "nanmin": "Min",
            "nanmax": "Max",
            "nanmean": "Mean",
            "nanmedian": "Median",
            "nanstd": "StdDev",
        })

    # fill nan's with zero
    fill_columns = bi_summ_piv_df.iloc[:, 1:].columns
    bi_summ_piv_df[fill_columns] = bi_summ_piv_df[fill_columns].fillna(0)

    # reorder column
    bi_summ_piv_df = bi_summ_piv_df[[
        feature, "Mean", "Median", "StdDev", "Min", "Max"
    ]]

    # convert to int
    if is_numeric_dtype(bi_summ_piv_df[feature]):
        bi_summ_piv_df[feature] = bi_summ_piv_df[feature].astype("int64")

    # display summary tables
    self.df_side_by_side(
        dfs=(uni_summ_df, bi_summ_piv_df),
        names=["Feature summary", "Feature vs. target summary"],
    )

    ### visualizations
    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale,
                     plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Category counts\n* {}".format(feature),
                       position=131,
                       title_scale=1.0)

    # add treemap to canvas
    p.tree_map(
        counts=uni_summ_df["Count"].values,
        labels=uni_summ_df[feature].values,
        colors=style.color_gen(name=color_map,
                               num=len(uni_summ_df[feature].values)),
        alpha=0.8,
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Feature distribution\n* {}".format(feature),
                       position=132)

    # when every level can be read as an integer, order the levels (and their counts)
    # numerically; otherwise keep the order returned by np.unique
    try:
        order = sorted(range(len(unique_vals)),
                       key=lambda ix: int(unique_vals[ix]))
    except ValueError:
        pass
    else:
        # reorder unique_vals/unique_counts for bar chart
        unique_vals = unique_vals[order]
        unique_counts = unique_counts[order]

        # cast temporary data frame feature to int so the box plot sorts numerically
        bi_df[feature] = bi_df[feature].astype(int)

    # dynamically set rotation angle based on number of unique values and the average
    # length of the category labels. the length is measured per label, not on the string
    # representation of the whole array
    len_unique_val = len(unique_vals)
    avg_len_unique_val = (
        sum(len(str(val)) for val in unique_vals) / len_unique_val)
    if len_unique_val <= 4 and avg_len_unique_val <= 12:
        rotation = 0
    elif 5 <= len_unique_val <= 8 and avg_len_unique_val <= 7.0:
        rotation = 0
    elif 9 <= len_unique_val <= 14 and avg_len_unique_val <= 6:
        rotation = 0
    else:
        rotation = 30

    # represent x-axis tick labels as integers rather than floats
    x_values = list(map(str, unique_vals.tolist()))
    try:
        x_values = [int(float(x)) for x in x_values]
    except ValueError:
        pass

    # add bar chart to canvas
    p.bar_v(
        x=x_values,
        counts=unique_counts,
        label_rotate=rotation,
        color=style.style_grey,
        y_units="f",
        x_tick_wrap=True,
        ax=ax,
    )

    # hide every other label if total number of levels is greater than 40
    if len_unique_val > 40:
        for i, tick_label in enumerate(ax.xaxis.get_ticklabels()):
            if i % 2 != 0:
                tick_label.set_visible(False)

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Boxplot by category\n* {}".format(feature),
                       position=133)

    ## dynamically determine precision of y-units
    # capture min and max target values
    dist_min = bi_df[self.target.name].values.min()
    dist_max = bi_df[self.target.name].values.max()

    # a zero minimum leaves the ratio undefined; treat it as unbounded so the coarsest
    # precision is chosen without triggering a divide-by-zero warning
    dist_ratio = dist_max / dist_min if dist_min != 0 else float("inf")

    # determine y-units precision based on min and max values of the target
    if -3 < dist_min < 3 and -3 < dist_max < 3 and dist_ratio < 10:
        y_units = "fff"
    elif -30 < dist_min < 30 and -30 < dist_max < 30 and dist_ratio < 3:
        y_units = "fff"
    elif -5 < dist_min < 5 and -5 < dist_max < 5 and dist_ratio < 10:
        y_units = "ff"
    elif -90 < dist_min < 90 and -90 < dist_max < 90 and dist_ratio < 5:
        y_units = "ff"
    else:
        y_units = "f"

    # add vertical box plot to canvas. plt.get_cmap is used because
    # matplotlib.cm.get_cmap was removed in matplotlib 3.9
    p.box_plot_v(
        x=feature,
        y=self.target.name,
        data=bi_df.sort_values([feature]),
        color=plt.get_cmap(name=color_map),
        label_rotate=rotation,
        y_units=y_units,
        ax=ax,
    )

    # hide every other label if total number of levels is greater than 40
    if len_unique_val > 40:
        for i, tick_label in enumerate(ax.xaxis.get_ticklabels()):
            if i % 2 != 0:
                tick_label.set_visible(False)

    plt.show()
Пример #6
0
from prettierplot.plotter import PrettierPlot
from prettierplot import data
import numpy as np

# load the attrition dataset from prettierplot's data module
df = data.attrition()

# tally how often each non-null EducationField value occurs
education_field = df["EducationField"].dropna()
unique_vals, unique_counts = np.unique(education_field, return_counts=True)

# create plotting instance
p = PrettierPlot(chart_scale=10)

# create the Axes object with a title and y-axis label
ax = p.make_canvas(
    title="Educational field category counts",
    y_label="Category counts",
    y_shift=0.47,
)

# draw one vertical bar per category
p.bar_v(
    x=unique_vals,
    counts=unique_counts,
    label_rotate=45,
    x_tick_wrap=True,
)