Exemplo n.º 1
0
def mixed_data_features(df, add_nans=False):
    """Compute per-feature parameters for a mixed numerical/categorical dataset.

    Each column is inspected: discrete columns are summarized by their
    category fractions, continuous columns by their dynamic range and
    distribution moments.

    Parameters
    ----------
    df : pandas DataFrame
        Dataset containing the features to analyze (numerical and/or
        categorical columns).
    add_nans : bool
        When True, categorical features with null values get an extra
        "NaNs" category holding the null fraction.

    Returns
    -------
    dict of dicts
        One entry per column name. Numerical entries hold "min", "max",
        "mean" and "sigma"; categorical entries hold a "categories" dict
        mapping each category to its observed fraction.
    """
    result = {}
    for name in df:
        # Helpers expect a single-column DataFrame, hence the double brackets.
        single_col = df[[name]]
        if is_discrete(df[name]):
            params = find_categories(single_col, add_nans=add_nans)
        else:
            params = dynamical_range(single_col)
        result[name] = params[name]

    return result
Exemplo n.º 2
0
def compute_conditional_metric(
    grouping_col,
    true_labs,
    pred_labs,
    metric,
    as_categorical=False,
    num_bins=10,
    quantile=False,
):
    """Evaluate a metric within groups defined by a column.

    Groups are either the unique values of `grouping_col` (categorical
    treatment) or bins partitioning its range (continuous treatment).

    Parameters
    ----------
    grouping_col : Series
        Column whose values define the groups.
    true_labs : Series
        True labels for the test dataset.
    pred_labs : Series
        Model-predicted labels for the test dataset.
    metric : function
        Evaluation metric of the form f(y_true, y_pred) accepting Series
        of true and predicted labels.
    as_categorical : bool
        Treat the grouping column as categorical (group on unique values)?
        Ignored for non-numeric columns, which are always categorical.
    num_bins : int
        Number of bins for a numeric grouping column.
    quantile : bool
        Use quantile-based bin widths (`True`) rather than equal spacing
        (`False`) for a numeric column.

    Returns
    -------
    ConditionalMetricResult
    """

    labels = DataFrame({"y_true": true_labs, "y_pred": pred_labs})
    # Non-numeric columns can only be grouped on their unique values.
    categorical = as_categorical or is_discrete(grouping_col)
    if categorical:
        groups, edges = grouping_col, grouping_col.unique()
    else:
        groups, edges = get_bins(grouping_col, num_bins, quantile)

    metric_per_group = labels.groupby(groups).apply(
        lambda grp: metric(grp["y_true"], grp["y_pred"])
    )

    return ConditionalMetricResult(
        vals=metric_per_group,
        bins=Series(edges),
        categorical=categorical,
        num_bins=num_bins,
        quantile=quantile,
    )
Exemplo n.º 3
0
def find_categories(df, add_nans=False):
    """Returns the categories of the dataset features.

    Parameters
    ----------
    df : pandas DataFrame
        The dataset with all the categorical features to analyze.
    add_nans : bool
        If True the sampler adds a "NaNs" category for the features that have
        any null values and assigns it the appropriate fraction.

    Returns
    -------
    dict of dicts
        A dictionary with an entry per dataset feature (dictionary keys are the
        column names), where each feature entry contains a nested dictionary
        with its categories and the fraction of each category present in the
        analyzed dataset (the nested dictionary key for this information is
        "categories", which is also a dictionary with one entry per category).
    """
    categories_dict = {}
    for feature in df:
        # Only discrete (categorical) columns are summarized.
        if not is_discrete(df[feature]):
            continue

        # Drop rows where this feature is null before counting categories.
        df_no_nans = df[df[feature].notnull()]

        if add_nans:
            # Fractions are taken over the full column length so that the
            # "NaNs" category plus the real categories sum to 1.
            nan_fraction = df[feature].isnull().sum() / len(df)
            total_length = len(df)
        else:
            nan_fraction = 0
            total_length = len(df_no_nans)

        # Compute value_counts() once (it was previously recomputed for
        # every category, making this loop quadratic in category count).
        counts = df_no_nans[feature].value_counts()
        fractions = {category: count / total_length for category, count in counts.items()}
        if add_nans and nan_fraction != 0:
            fractions["NaNs"] = nan_fraction

        categories_dict[feature] = {"categories": fractions}

    return categories_dict
Exemplo n.º 4
0
    def compute(self, **kwargs):
        """Compute the evaluation for the given datasets.

        Parameters
        ----------
        kwargs:
            On-the-fly overrides to the config option values for the computation.

        Returns
        -------
        SpatialDistributionResult
        """
        config = PrescConfig(self._config)["evaluations"]["spatial_distribution"]
        if kwargs:
            config.set(kwargs)

        # Resolve which feature columns take part in the distance computation.
        included = config["features_include"].get()
        excluded = config["features_exclude"].get()
        columns = include_exclude_list(
            self._test_dataset.feature_names, included=included, excluded=excluded
        )

        # Partition the selected columns into numerical vs categorical.
        numerical, categorical = [], []
        for col in columns:
            target = categorical if is_discrete(self._test_dataset.features[col]) else numerical
            target.append(col)

        # Per-column distance metrics, chosen according to column type.
        metrics_num, metrics_categ = _get_distance_metrics_by_column(
            numerical, categorical, config
        )

        return compute_spatial_distribution(
            test_features=self._test_dataset.features,
            test_labs_true=self._test_dataset.labels,
            test_labs_pred=self._test_pred,
            base_features=self._train_dataset.features,
            base_labs=self._train_dataset.labels,
            numerical_dist_metric=metrics_num,
            categorical_dist_metric=metrics_categ,
            summary=config["summary_agg"].get(),
        )
Exemplo n.º 5
0
def compute_conditional_distribution(
    data_col, true_labs, pred_labs, as_categorical=False, binning="fd", common_bins=True
):
    """Compute a distributional summary.

    The metric is computed within unique values of the grouping column
    (categorical) or within bins partitioning its range (continuous).

    Parameters
    ----------
    data_col :
        A column of data from a test dataset.
    true_labs : Series
        A series of true labels for the test dataset.
    pred_labs : Series
        A series of labels predicted by a model for the test dataset.
    as_categorical : bool
        Should the data column be treated as categorical, ie. binned
        on its unique values? If it is not numeric, this param is ignored.
    binning : str
        Binning scheme to use for a numerical column, passed to `numpy.histogram`.
        Can be a fixed number of bins or a string indicating a binning scheme
    common_bins : bool
        Should the bins be computed over the entire column and shared
        across groups (`True`) or computed within each group (`False`)

    Returns
    -------
    ConditionalDistributionResult
    """

    # Counts/histograms are computed within each (true label, predicted label)
    # pair; the data column itself becomes the innermost grouping level.
    grouping = [true_labs, pred_labs]
    if is_discrete(data_col):
        # Non-numeric columns can only be treated categorically.
        as_categorical = True
    if as_categorical:
        grouping.append(data_col)
        # One count per (y_true, y_pred, category) combination.
        distribs = data_col.groupby(grouping).size()
        if common_bins:
            # Extend the index in each label group to include all data values
            data_vals = distribs.index.get_level_values(-1).unique()
            y_vals = distribs.index.droplevel(-1).unique()
            full_ind = MultiIndex.from_tuples(
                [(yt, yp, x) for yt, yp in y_vals.values for x in data_vals],
                names=distribs.index.names,
            )
            # Categories absent from a label group get an explicit count of 0.
            distribs = distribs.reindex(index=full_ind, fill_value=0)
            bin_edges = Series(data_vals)
        else:
            # Convert the innermost index level to a Series of bin edges.
            bin_edges = distribs.rename(None).reset_index(level=-1).iloc[:, 0]
    else:
        if common_bins:
            # Compute bin edges once over the full column so every label
            # group is histogrammed on the same grid.
            bins = histogram_bin_edges(data_col, bins=binning)
        else:
            # Pass the scheme through so each group derives its own edges.
            bins = binning
        # distribs will be a series with values (<hist_values>, <bin_edges>)
        distribs = data_col.groupby(grouping).apply(lambda x: histogram(x, bins=bins))
        bin_edges = distribs.map(lambda x: x[1])
        # Build a (y_true, y_pred, bin_label) index so the per-bin counts can
        # be exploded into one flat Series.
        bin_ind_tuples = []
        for y in distribs.index:
            bin_ind_tuples.extend(
                [(y[0], y[1], x) for x in _histogram_bin_labels(bin_edges.loc[y])]
            )
        index_with_bins = MultiIndex.from_tuples(
            bin_ind_tuples, names=distribs.index.names + [None]
        )
        # Flatten the per-group count arrays into a single Series keyed by
        # label pair and bin label.
        distribs = Series(
            distribs.map(lambda x: x[0]).explode().values, index=index_with_bins
        )
        if common_bins:
            # Retain the unique bin edges as an array
            bin_edges = Series(bin_edges.iloc[0])

    return ConditionalDistributionResult(
        vals=distribs,
        bins=Series(bin_edges),
        categorical=as_categorical,
        binning=binning,
        common_bins=common_bins,
    )