예제 #1
0
def split_data(dtype,
               x,
               y,
               special_codes=None,
               cat_cutoff=None,
               user_splits=None,
               check_input=True,
               outlier_detector=None,
               outlier_params=None,
               fix_lb=None,
               fix_ub=None,
               class_weight=None,
               sample_weight=None):
    """Split data into clean, missing and special values data.

    Samples are partitioned into three groups: missing (``x`` or ``y`` is
    null), special (``x`` is one of ``special_codes``) and clean (everything
    else). For numerical variables the clean group can additionally be
    filtered by an outlier detector and by fixed lower/upper bounds. For
    categorical variables without user-provided splits, infrequent
    categories can be moved to an "others" group and the remaining
    categories are transformed with ``categorical_transform``.

    Parameters
    ----------
    dtype : str
        The variable data type. Supported data types are "numerical" for
        continuous and ordinal variables and "categorical" for categorical
        and nominal variables.

    x : array-like, shape = (n_samples)
        Data samples, where n_samples is the number of samples.

    y : array-like, shape = (n_samples)
        Target vector relative to x.

    special_codes : array-like or None, optional (default=None)
        List of special codes. Use special codes to specify the data values
        that must be treated separately.

    cat_cutoff : float or None, optional (default=None)
        Generate bin others with categories in which the fraction of
        occurrences is below the  ``cat_cutoff`` value. This option is
        available when ``dtype`` is "categorical".

    user_splits : array-like or None, optional (default=None)
        The list of pre-binning split points when ``dtype`` is "numerical" or
        the list of prebins when ``dtype`` is "categorical". In this function
        it is only used to decide whether the categorical processing is
        applied (it is skipped when ``user_splits`` is not None).

    check_input : bool, (default=True)
        If False, the input arrays x and y will not be checked.

    outlier_detector : str or None (default=None)
        The outlier detection method. Supported methods are "range" to use
        the interquartile range based method or "zscore" to use the modified
        Z-score method. Only applied when ``dtype`` is "numerical".

    outlier_params : dict or None (default=None)
        Dictionary of parameters to pass to the outlier detection method.

    fix_lb : float or None (default=None)
        Lower bound or minimum admissible value. Only applied when ``dtype``
        is "numerical".

    fix_ub : float or None (default=None)
        Upper bound or maximum admissible value. Only applied when ``dtype``
        is "numerical".

    class_weight : object or None (default=None)
        Forwarded as first argument to ``compute_class_weight``. When not
        None, each sample weight is multiplied by the weight computed for
        the class of its target value.

    sample_weight : array-like or None (default=None)
        Per-sample weights, validated against ``x`` with
        ``_check_sample_weight``.

    Returns
    -------
    x_clean : array, shape = (n_clean)
        Clean data samples

    y_clean : array, shape = (n_clean)
        Clean target samples.

    x_missing : array, shape = (n_missing)
        Missing data samples.

    y_missing : array, shape = (n_missing)
        Missing target samples.

    x_special : array, shape = (n_special)
        Special data samples. Empty list if ``special_codes`` is None.

    y_special : array, shape = (n_special)
        Special target samples. Empty list if ``special_codes`` is None.

    y_others : array, shape = (n_others)
        Others target samples. Empty list unless the categorical cutoff is
        applied.

    categories : array, shape (n_categories)
        List of categories. Empty list unless the categorical processing is
        applied.

    others : array, shape (n_other_categories)
        List of other categories. Empty list unless the categorical cutoff
        is applied.

    sw_clean : array, shape = (n_clean)
        Sample weights of the clean samples.

    sw_missing : array, shape = (n_missing)
        Sample weights of the missing samples.

    sw_special : array, shape = (n_special)
        Sample weights of the special samples. Empty list if
        ``special_codes`` is None.

    sw_others : array, shape = (n_others)
        Sample weights of the others samples. Empty list unless the
        categorical cutoff is applied.

    Raises
    ------
    ValueError
        If ``outlier_detector`` is not a supported method, if ``fix_lb`` or
        ``fix_ub`` is not a number, or if ``fix_lb > fix_ub``.

    TypeError
        If ``outlier_params`` is neither a dict nor None.
    """
    if outlier_detector is not None:
        if outlier_detector not in ("range", "zscore"):
            raise ValueError('Invalid value for outlier_detector. Allowed '
                             'string values are "range" and "zscore".')

        if outlier_params is not None:
            if not isinstance(outlier_params, dict):
                raise TypeError("outlier_params must be a dict or None; "
                                "got {}.".format(outlier_params))

    if fix_lb is not None:
        if not isinstance(fix_lb, numbers.Number):
            raise ValueError("fix_lb must be a number; got {}.".format(fix_lb))

    if fix_ub is not None:
        if not isinstance(fix_ub, numbers.Number):
            raise ValueError("fix_ub must be a number; got {}.".format(fix_ub))

    if fix_lb is not None and fix_ub is not None:
        if fix_lb > fix_ub:
            raise ValueError("fix_lb must be <= fix_ub; got {} > {}.".format(
                fix_lb, fix_ub))

    if check_input:
        x = check_array(x,
                        ensure_2d=False,
                        dtype=None,
                        force_all_finite='allow-nan')

        y = check_array(y, ensure_2d=False, dtype=None, force_all_finite=True)

        check_consistent_length(x, y)

    x = np.asarray(x)
    y = np.asarray(y)

    sample_weight = _check_sample_weight(sample_weight, x, dtype=x.dtype)

    if class_weight is not None:
        classes = np.unique(y)
        le = LabelEncoder()
        # Keyword arguments: recent scikit-learn releases no longer accept
        # ``classes`` and ``y`` positionally.
        class_weight_ = compute_class_weight(class_weight, classes=classes,
                                             y=y)
        # Build a new array instead of multiplying in place: the validated
        # sample_weight may be the caller's own array, which must not be
        # silently rescaled.
        sample_weight = sample_weight * class_weight_[le.fit_transform(y)]

    # pd.isnull handles numeric as well as object/string arrays, so a single
    # code path is enough. (The previous dtype test was always true and its
    # alternative branch called a non-existent NumPy function.)
    missing_mask = pd.isnull(x) | pd.isnull(y)

    if special_codes is None:
        clean_mask = ~missing_mask

        x_clean = x[clean_mask]
        y_clean = y[clean_mask]
        x_missing = x[missing_mask]
        y_missing = y[missing_mask]
        x_special = []
        y_special = []
        sw_clean = sample_weight[clean_mask]
        sw_missing = sample_weight[missing_mask]
        sw_special = []
    else:
        special_mask = pd.Series(x).isin(special_codes).values

        clean_mask = ~missing_mask & ~special_mask

        x_clean = x[clean_mask]
        y_clean = y[clean_mask]
        x_missing = x[missing_mask]
        y_missing = y[missing_mask]
        x_special = x[special_mask]
        y_special = y[special_mask]
        sw_clean = sample_weight[clean_mask]
        sw_missing = sample_weight[missing_mask]
        sw_special = sample_weight[special_mask]

    if dtype == "numerical":
        if outlier_detector is not None:
            if outlier_detector == "range":
                detector = RangeDetector()
            elif outlier_detector == "zscore":
                detector = ModifiedZScoreDetector()

            if outlier_params is not None:
                detector.set_params(**outlier_params)

            # Samples flagged by the detector are dropped from the clean set.
            mask_outlier = detector.fit(x_clean).get_support()
            x_clean = x_clean[~mask_outlier]
            y_clean = y_clean[~mask_outlier]
            sw_clean = sw_clean[~mask_outlier]

        if fix_lb is not None or fix_ub is not None:
            # The two-sided case must be tested first; otherwise the upper
            # bound is ignored whenever a lower bound is also given.
            if fix_lb is not None and fix_ub is not None:
                mask = (x_clean >= fix_lb) & (x_clean <= fix_ub)
            elif fix_lb is not None:
                mask = x_clean >= fix_lb
            else:
                mask = x_clean <= fix_ub

            x_clean = x_clean[mask]
            y_clean = y_clean[mask]
            sw_clean = sw_clean[mask]

    if dtype == "categorical" and user_splits is None:
        if cat_cutoff is not None:
            mask_others, others = categorical_cutoff(x_clean, y_clean,
                                                     cat_cutoff)

            y_others = y_clean[mask_others]
            sw_others = sw_clean[mask_others]
            x_clean = x_clean[~mask_others]
            y_clean = y_clean[~mask_others]
            sw_clean = sw_clean[~mask_others]
        else:
            y_others = []
            others = []
            sw_others = []

        categories, x_clean = categorical_transform(x_clean, y_clean)

        return (x_clean, y_clean, x_missing, y_missing, x_special, y_special,
                y_others, categories, others, sw_clean, sw_missing, sw_special,
                sw_others)
    else:
        return (x_clean, y_clean, x_missing, y_missing, x_special, y_special,
                [], [], [], sw_clean, sw_missing, sw_special, [])
예제 #2
0
def split_data(dtype, x, y, special_codes=None, cat_cutoff=None,
               user_splits=None, check_input=True):
    """Split data into clean, missing and special values data.

    Samples are partitioned into three groups: missing (``x`` or ``y`` is
    null), special (``x`` is one of ``special_codes``) and clean (everything
    else). For categorical variables without user-provided splits,
    infrequent categories can be moved to an "others" group and the
    remaining categories are transformed with ``categorical_transform``.

    Parameters
    ----------
    dtype : str
        The variable data type. The categorical processing is applied only
        when it equals "categorical".

    x : array-like, shape = (n_samples)
        Data samples, where n_samples is the number of samples.

    y : array-like, shape = (n_samples)
        Target vector relative to x.

    special_codes : array-like or None (default=None)
        List of special values to be considered.

    cat_cutoff : float or None (default=None)
        Cutoff forwarded to ``categorical_cutoff`` to select the categories
        grouped as "others". Only used when ``dtype`` is "categorical" and
        ``user_splits`` is None.

    user_splits : array-like or None (default=None)
        User-provided splits. In this function it is only used to decide
        whether the categorical processing is applied (it is skipped when
        ``user_splits`` is not None).

    check_input : bool, (default=True)
        If False, the input arrays x and y will not be checked.

    Returns
    -------
    x_clean : array, shape = (n_clean)
        Clean data samples

    y_clean : array, shape = (n_clean)
        Clean target samples.

    x_missing : array, shape = (n_missing)
        Missing data samples.

    y_missing : array, shape = (n_missing)
        Missing target samples.

    x_special : array, shape = (n_special)
        Special data samples. Empty list if ``special_codes`` is None.

    y_special : array, shape = (n_special)
        Special target samples. Empty list if ``special_codes`` is None.

    y_others : array, shape = (n_others)
        Target samples of the "others" group. Empty list unless the
        categorical cutoff is applied.

    categories : array
        Categories returned by ``categorical_transform``. Empty list unless
        the categorical processing is applied.

    others : array
        Categories grouped as "others" by ``categorical_cutoff``. Empty list
        unless the categorical cutoff is applied.
    """
    if check_input:
        x = check_array(x, ensure_2d=False, dtype=None,
                        force_all_finite='allow-nan')

        y = check_array(y, ensure_2d=False, dtype=None,
                        force_all_finite=True)

        check_consistent_length(x, y)

    x = np.asarray(x)
    y = np.asarray(y)

    # pd.isnull handles numeric as well as object/string arrays, so a single
    # code path is enough. (The previous dtype test was always true and its
    # alternative branch called a non-existent NumPy function.)
    missing_mask = pd.isnull(x) | pd.isnull(y)

    if special_codes is None:
        clean_mask = ~missing_mask
        x_special = []
        y_special = []
    else:
        special_mask = pd.Series(x).isin(special_codes).values
        clean_mask = ~missing_mask & ~special_mask
        x_special = x[special_mask]
        y_special = y[special_mask]

    x_clean = x[clean_mask]
    y_clean = y[clean_mask]
    x_missing = x[missing_mask]
    y_missing = y[missing_mask]

    if dtype == "categorical" and user_splits is None:
        if cat_cutoff is not None:
            mask_others, others = categorical_cutoff(
                x_clean, y_clean, cat_cutoff)

            y_others = y_clean[mask_others]
            x_clean = x_clean[~mask_others]
            y_clean = y_clean[~mask_others]
        else:
            y_others = []
            others = []

        categories, x_clean = categorical_transform(x_clean, y_clean)

        return (x_clean, y_clean, x_missing, y_missing, x_special, y_special,
                y_others, categories, others)
    else:
        return (x_clean, y_clean, x_missing, y_missing, x_special, y_special,
                [], [], [])
예제 #3
0
def transform_continuous_target(splits, dtype, x, n_records, sums,
                                special_codes, categories, cat_others,
                                metric_special, metric_missing, user_splits,
                                check_input):
    """Map each sample of ``x`` to the metric value of its bin.

    The metric value of a bin is ``sums / n_records`` for that bin. Clean
    samples get the value of the bin they fall in; special and missing
    samples get either an empirical value taken from the extra entries of
    ``sums`` / ``n_records`` or the value given by ``metric_special`` /
    ``metric_missing``.

    Parameters
    ----------
    splits : array-like
        Split points passed to ``np.digitize`` when ``dtype`` is
        "numerical"; otherwise forwarded to ``bin_categorical``.

    dtype : str
        "numerical" selects the split-point binning; any other value selects
        the categorical binning.

    x : array-like, shape = (n_samples)
        Data samples to transform.

    n_records : array
        Number of records per bin. When a metric is "empirical", entry
        ``n_bins`` is used for the special bin and entry ``n_bins + 1`` for
        the missing bin.

    sums : array
        Per-bin sums, laid out like ``n_records``.

    special_codes : array-like or None
        Values of ``x`` treated as special.

    categories, cat_others, user_splits
        Forwarded to ``bin_categorical`` (categorical case only).

    metric_special : float or str
        Value assigned to special samples, or "empirical".

    metric_missing : float or str
        Value assigned to missing samples, or "empirical".

    check_input : bool
        If True, ``x`` is validated with ``check_array``.

    Returns
    -------
    x_transform : array, shape = (n_samples)
        Transformed samples. In the categorical case, samples not contained
        in any bin keep the initial value 0.
    """
    _check_metric_special_missing(metric_special, metric_missing)

    if check_input:
        x = check_array(x,
                        ensure_2d=False,
                        dtype=None,
                        force_all_finite='allow-nan')

    x = np.asarray(x)

    # pd.isnull handles numeric as well as object/string arrays. (The
    # previous dtype test was always true and its alternative branch called
    # a non-existent NumPy function.)
    missing_mask = pd.isnull(x)

    if special_codes is None:
        special_mask = None
        clean_mask = ~missing_mask
    else:
        special_mask = pd.Series(x).isin(special_codes).values
        clean_mask = ~missing_mask & ~special_mask

    x_clean = x[clean_mask]

    if dtype == "numerical":
        if len(splits):
            indices = np.digitize(x_clean, splits, right=False)
        else:
            # No split points: every clean sample belongs to bin 0.
            indices = np.zeros(x_clean.shape)
        n_bins = len(splits) + 1
    else:
        bins = bin_categorical(splits, categories, cat_others, user_splits)
        n_bins = len(bins)

    # The entries beyond n_bins (special and missing) are only needed when
    # at least one of the two metrics is empirical.
    if "empirical" not in (metric_special, metric_missing):
        n_records = n_records[:n_bins]
        sums = sums[:n_bins]

    metric_value = sums / n_records

    x_transform = np.zeros(x.shape)

    if dtype == "numerical":
        x_clean_transform = np.zeros(x_clean.shape)
        for i in range(n_bins):
            mask = (indices == i)
            x_clean_transform[mask] = metric_value[i]

        x_transform[clean_mask] = x_clean_transform
    else:
        x_p = pd.Series(x)
        for i in range(n_bins):
            mask = x_p.isin(bins[i])
            x_transform[mask] = metric_value[i]

    # Test the mask rather than the truthiness of special_codes, which
    # raises for multi-element NumPy arrays.
    if special_mask is not None:
        if metric_special == "empirical":
            x_transform[special_mask] = metric_value[n_bins]
        else:
            x_transform[special_mask] = metric_special

    if metric_missing == "empirical":
        x_transform[missing_mask] = metric_value[n_bins + 1]
    else:
        x_transform[missing_mask] = metric_missing

    return x_transform
예제 #4
0
def transform_binary_target(splits,
                            dtype,
                            x,
                            n_nonevent,
                            n_event,
                            special_codes,
                            categories,
                            cat_others,
                            metric,
                            metric_special,
                            metric_missing,
                            user_splits,
                            check_input=False):
    """Map each sample of ``x`` to the event rate or WoE of its bin.

    Clean samples get the metric value of the bin they fall in; special and
    missing samples get either an empirical value taken from the extra
    entries of ``n_event`` / ``n_nonevent`` or the value given by
    ``metric_special`` / ``metric_missing``.

    Parameters
    ----------
    splits : array-like
        Split points passed to ``np.digitize`` when ``dtype`` is
        "numerical"; otherwise forwarded to ``bin_categorical``.

    dtype : str
        "numerical" selects the split-point binning; any other value selects
        the categorical binning.

    x : array-like, shape = (n_samples)
        Data samples to transform.

    n_nonevent : array
        Number of non-event records per bin. When a metric is "empirical",
        entry ``n_bins`` is used for the special bin and entry
        ``n_bins + 1`` for the missing bin.

    n_event : array
        Number of event records per bin, laid out like ``n_nonevent``.

    special_codes : array-like or None
        Values of ``x`` treated as special.

    categories, cat_others, user_splits
        Forwarded to ``bin_categorical`` (categorical case only).

    metric : str
        Either "event_rate" or "woe".

    metric_special : float or str
        Value assigned to special samples, or "empirical".

    metric_missing : float or str
        Value assigned to missing samples, or "empirical".

    check_input : bool (default=False)
        If True, ``x`` is validated with ``check_array``.

    Returns
    -------
    x_transform : array, shape = (n_samples)
        Transformed samples. In the categorical case, samples not contained
        in any bin keep the initial value 0.

    Raises
    ------
    ValueError
        If ``metric`` is not "event_rate" or "woe".
    """
    if metric not in ("event_rate", "woe"):
        raise ValueError('Invalid value for metric. Allowed string '
                         'values are "event_rate" and "woe".')

    _check_metric_special_missing(metric_special, metric_missing)

    if check_input:
        x = check_array(x,
                        ensure_2d=False,
                        dtype=None,
                        force_all_finite='allow-nan')

    x = np.asarray(x)

    # pd.isnull handles numeric as well as object/string arrays. (The
    # previous dtype test was always true and its alternative branch called
    # a non-existent NumPy function.)
    missing_mask = pd.isnull(x)

    if special_codes is None:
        special_mask = None
        clean_mask = ~missing_mask
    else:
        special_mask = pd.Series(x).isin(special_codes).values
        clean_mask = ~missing_mask & ~special_mask

    x_clean = x[clean_mask]

    if dtype == "numerical":
        if len(splits):
            indices = np.digitize(x_clean, splits, right=False)
        else:
            # No split points: every clean sample belongs to bin 0.
            indices = np.zeros(x_clean.shape)
        n_bins = len(splits) + 1
    else:
        bins = bin_categorical(splits, categories, cat_others, user_splits)
        n_bins = len(bins)

    # Compute event rate and WoE. Totals are taken over all entries, before
    # the optional truncation to the first n_bins entries below.
    n_records = n_event + n_nonevent
    t_n_nonevent = n_nonevent.sum()
    t_n_event = n_event.sum()

    if "empirical" not in (metric_special, metric_missing):
        n_event = n_event[:n_bins]
        n_nonevent = n_nonevent[:n_bins]
        n_records = n_records[:n_bins]

    # default woe and event rate is 0 (bins without events or non-events)
    mask = (n_event > 0) & (n_nonevent > 0)
    event_rate = np.zeros(len(n_records))
    woe = np.zeros(len(n_records))
    event_rate[mask] = n_event[mask] / n_records[mask]
    constant = np.log(t_n_event / t_n_nonevent)
    # 1 / event_rate - 1 equals n_nonevent / n_event for the bin.
    woe[mask] = np.log(1 / event_rate[mask] - 1) + constant

    if metric == "woe":
        metric_value = woe
    else:
        metric_value = event_rate

    x_transform = np.zeros(x.shape)

    if dtype == "numerical":
        x_clean_transform = np.zeros(x_clean.shape)
        for i in range(n_bins):
            mask = (indices == i)
            x_clean_transform[mask] = metric_value[i]

        x_transform[clean_mask] = x_clean_transform
    else:
        x_p = pd.Series(x)
        for i in range(n_bins):
            mask = x_p.isin(bins[i])
            x_transform[mask] = metric_value[i]

    # Test the mask rather than the truthiness of special_codes, which
    # raises for multi-element NumPy arrays.
    if special_mask is not None:
        if metric_special == "empirical":
            x_transform[special_mask] = metric_value[n_bins]
        else:
            x_transform[special_mask] = metric_special

    if metric_missing == "empirical":
        x_transform[missing_mask] = metric_value[n_bins + 1]
    else:
        x_transform[missing_mask] = metric_missing

    return x_transform
예제 #5
0
def transform_multiclass_target(splits,
                                x,
                                n_event,
                                special_codes,
                                metric,
                                metric_special,
                                metric_missing,
                                check_input=False):
    """Map each sample of ``x`` to the (weighted) mean WoE of its bin.

    A one-vs-all WoE is computed per class with
    ``transform_event_rate_to_woe`` and aggregated over classes. Clean
    samples get the aggregated value of the bin they fall in; special and
    missing samples get either an empirical value taken from the extra rows
    of ``n_event`` or the value given by ``metric_special`` /
    ``metric_missing``.

    Parameters
    ----------
    splits : array-like
        Split points passed to ``np.digitize``.

    x : array-like, shape = (n_samples)
        Data samples to transform.

    n_event : array, 2-D
        Number of records per bin (rows) and class (columns). When a metric
        is "empirical", row ``n_bins`` is used for the special bin and row
        ``n_bins + 1`` for the missing bin.

    special_codes : array-like or None
        Values of ``x`` treated as special.

    metric : str
        "mean_woe" for the plain mean over classes or "weighted_mean_woe"
        for the mean weighted by the total number of records per class.

    metric_special : float or str
        Value assigned to special samples, or "empirical".

    metric_missing : float or str
        Value assigned to missing samples, or "empirical".

    check_input : bool (default=False)
        If True, ``x`` is validated with ``check_array``.

    Returns
    -------
    x_transform : array, shape = (n_samples)
        Transformed samples.

    Raises
    ------
    ValueError
        If ``metric`` is not "mean_woe" or "weighted_mean_woe".
    """
    if metric not in ("mean_woe", "weighted_mean_woe"):
        raise ValueError('Invalid value for metric. Allowed string '
                         'values are "mean_woe" and "weighted_mean_woe".')

    _check_metric_special_missing(metric_special, metric_missing)

    if check_input:
        x = check_array(x,
                        ensure_2d=False,
                        dtype=None,
                        force_all_finite='allow-nan')

    x = np.asarray(x)

    # pd.isnull handles numeric as well as object/string arrays. (The
    # previous dtype test was always true and its alternative branch called
    # a non-existent NumPy function.)
    missing_mask = pd.isnull(x)

    if special_codes is None:
        special_mask = None
        clean_mask = ~missing_mask
    else:
        special_mask = pd.Series(x).isin(special_codes).values
        clean_mask = ~missing_mask & ~special_mask

    x_clean = x[clean_mask]

    if len(splits):
        indices = np.digitize(x_clean, splits, right=False)
    else:
        # No split points: every clean sample belongs to bin 0.
        indices = np.zeros(x_clean.shape)
    n_bins = len(splits) + 1

    # Build non-event to compute one-vs-all WoE: for each class, the
    # non-events of a bin are the records of all the other classes.
    n_classes = n_event.shape[1]
    n_records = np.tile(n_event.sum(axis=1), (n_classes, 1)).T
    n_nonevent = n_records - n_event
    t_n_nonevent = n_nonevent.sum(axis=0)
    t_n_event = n_event.sum(axis=0)

    # The rows beyond n_bins (special and missing) are only needed when at
    # least one of the two metrics is empirical.
    if "empirical" not in (metric_special, metric_missing):
        n_event = n_event[:n_bins, :]
        n_nonevent = n_nonevent[:n_bins, :]
        n_records = n_records[:n_bins, :]

    event_rate = n_event / n_records
    woe = np.zeros(n_event.shape)

    for i in range(n_classes):
        woe[:, i] = transform_event_rate_to_woe(event_rate[:, i],
                                                t_n_nonevent[i], t_n_event[i])

    if metric == "mean_woe":
        metric_value = woe.mean(axis=1)
    elif metric == "weighted_mean_woe":
        metric_value = np.average(woe, weights=t_n_event, axis=1)

    x_transform = np.zeros(x.shape)

    x_clean_transform = np.zeros(x_clean.shape)
    for i in range(n_bins):
        mask = (indices == i)
        x_clean_transform[mask] = metric_value[i]

    x_transform[clean_mask] = x_clean_transform

    # Test the mask rather than the truthiness of special_codes, which
    # raises for multi-element NumPy arrays.
    if special_mask is not None:
        if metric_special == "empirical":
            x_transform[special_mask] = metric_value[n_bins]
        else:
            x_transform[special_mask] = metric_special

    if metric_missing == "empirical":
        x_transform[missing_mask] = metric_value[n_bins + 1]
    else:
        x_transform[missing_mask] = metric_missing

    return x_transform