import numbers

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.utils import check_array
from sklearn.utils import check_consistent_length
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.validation import _check_sample_weight

# Note: RangeDetector, ModifiedZScoreDetector, categorical_cutoff,
# categorical_transform, bin_categorical, transform_event_rate_to_woe and
# _check_metric_special_missing are used below but not defined in this
# excerpt; they are assumed to be provided elsewhere in the package.


def split_data(dtype, x, y, special_codes=None, cat_cutoff=None,
               user_splits=None, check_input=True, outlier_detector=None,
               outlier_params=None, fix_lb=None, fix_ub=None,
               class_weight=None, sample_weight=None):
    """Split data into clean, missing and special values data.

    Parameters
    ----------
    dtype : str, optional (default="numerical")
        The variable data type. Supported data types are "numerical" for
        continuous and ordinal variables and "categorical" for categorical
        and nominal variables.

    x : array-like, shape = (n_samples)
        Data samples, where n_samples is the number of samples.

    y : array-like, shape = (n_samples)
        Target vector relative to x.

    special_codes : array-like or None, optional (default=None)
        List of special codes. Use special codes to specify the data values
        that must be treated separately.

    cat_cutoff : float or None, optional (default=None)
        Generate bin others with categories in which the fraction of
        occurrences is below the ``cat_cutoff`` value. This option is
        available when ``dtype`` is "categorical".

    user_splits : array-like or None, optional (default=None)
        The list of pre-binning split points when ``dtype`` is "numerical"
        or the list of prebins when ``dtype`` is "categorical".

    check_input : bool (default=True)
        If False, the input arrays x and y will not be checked.

    outlier_detector : str or None, optional (default=None)
        The outlier detection method. Supported methods are "range" to use
        the interquartile range based method or "zscore" to use the modified
        Z-score method.

    outlier_params : dict or None, optional (default=None)
        Dictionary of parameters to pass to the outlier detection method.

    fix_lb : float or None, optional (default=None)
        Lower bound or minimum admissible value.

    fix_ub : float or None, optional (default=None)
        Upper bound or maximum admissible value.

    class_weight : dict, "balanced" or None, optional (default=None)
        Weights associated with classes in the form ``{class_label: weight}``.
        If None, all classes are supposed to have weight one.

    sample_weight : array-like, shape = (n_samples) or None (default=None)
        Array of weights assigned to individual samples. If None, samples
        are equally weighted.

    Returns
    -------
    x_clean : array, shape = (n_clean)
        Clean data samples.

    y_clean : array, shape = (n_clean)
        Clean target samples.

    x_missing : array, shape = (n_missing)
        Missing data samples.

    y_missing : array, shape = (n_missing)
        Missing target samples.

    x_special : array, shape = (n_special)
        Special data samples.

    y_special : array, shape = (n_special)
        Special target samples.

    y_others : array, shape = (n_others)
        Others target samples.

    categories : array, shape = (n_categories)
        List of categories.

    others : array, shape = (n_other_categories)
        List of other categories.

    sw_clean : array, shape = (n_clean)
        Clean sample weights.

    sw_missing : array, shape = (n_missing)
        Missing sample weights.

    sw_special : array, shape = (n_special)
        Special sample weights.

    sw_others : array, shape = (n_others)
        Others sample weights.
    """
    if outlier_detector is not None:
        if outlier_detector not in ("range", "zscore"):
            raise ValueError('Invalid value for outlier_detector. Allowed '
                             'string values are "range" and "zscore".')

        if outlier_params is not None:
            if not isinstance(outlier_params, dict):
                raise TypeError("outlier_params must be a dict or None; "
                                "got {}.".format(outlier_params))

    if fix_lb is not None:
        if not isinstance(fix_lb, numbers.Number):
            raise ValueError("fix_lb must be a number; got {}.".format(fix_lb))

    if fix_ub is not None:
        if not isinstance(fix_ub, numbers.Number):
            raise ValueError("fix_ub must be a number; got {}.".format(fix_ub))

    if fix_lb is not None and fix_ub is not None:
        if fix_lb > fix_ub:
            raise ValueError("fix_lb must be <= fix_ub; got fix_lb={} and "
                             "fix_ub={}.".format(fix_lb, fix_ub))

    if check_input:
        x = check_array(x, ensure_2d=False, dtype=None,
                        force_all_finite='allow-nan')

        y = check_array(y, ensure_2d=False, dtype=None,
                        force_all_finite=True)

        check_consistent_length(x, y)

    x = np.asarray(x)
    y = np.asarray(y)

    sample_weight = _check_sample_weight(sample_weight, x, dtype=x.dtype)

    if class_weight is not None:
        classes = np.unique(y)
        le = LabelEncoder()
        class_weight_ = compute_class_weight(class_weight, classes=classes,
                                             y=y)
        sample_weight *= class_weight_[le.fit_transform(y)]

    if x.dtype == object or y.dtype == object:
        missing_mask = pd.isnull(x) | pd.isnull(y)
    else:
        missing_mask = np.isnan(x) | np.isnan(y)

    if special_codes is None:
        clean_mask = ~missing_mask

        x_clean = x[clean_mask]
        y_clean = y[clean_mask]
        x_missing = x[missing_mask]
        y_missing = y[missing_mask]
        x_special = []
        y_special = []

        sw_clean = sample_weight[clean_mask]
        sw_missing = sample_weight[missing_mask]
        sw_special = []
    else:
        special_mask = pd.Series(x).isin(special_codes).values

        clean_mask = ~missing_mask & ~special_mask

        x_clean = x[clean_mask]
        y_clean = y[clean_mask]
        x_missing = x[missing_mask]
        y_missing = y[missing_mask]
        x_special = x[special_mask]
        y_special = y[special_mask]

        sw_clean = sample_weight[clean_mask]
        sw_missing = sample_weight[missing_mask]
        sw_special = sample_weight[special_mask]

    if dtype == "numerical":
        if outlier_detector is not None:
            if outlier_detector == "range":
                detector = RangeDetector()
            elif outlier_detector == "zscore":
                detector = ModifiedZScoreDetector()

            if outlier_params is not None:
                detector.set_params(**outlier_params)

            mask_outlier = detector.fit(x_clean).get_support()
            x_clean = x_clean[~mask_outlier]
            y_clean = y_clean[~mask_outlier]
            sw_clean = sw_clean[~mask_outlier]

        if fix_lb is not None or fix_ub is not None:
            if fix_lb is not None and fix_ub is not None:
                mask = (x_clean >= fix_lb) & (x_clean <= fix_ub)
            elif fix_lb is not None:
                mask = x_clean >= fix_lb
            else:
                mask = x_clean <= fix_ub

            x_clean = x_clean[mask]
            y_clean = y_clean[mask]
            sw_clean = sw_clean[mask]

    if dtype == "categorical" and user_splits is None:
        if cat_cutoff is not None:
            mask_others, others = categorical_cutoff(x_clean, y_clean,
                                                     cat_cutoff)

            y_others = y_clean[mask_others]
            sw_others = sw_clean[mask_others]
            x_clean = x_clean[~mask_others]
            y_clean = y_clean[~mask_others]
            sw_clean = sw_clean[~mask_others]
        else:
            y_others = []
            others = []
            sw_others = []

        categories, x_clean = categorical_transform(x_clean, y_clean)

        return (x_clean, y_clean, x_missing, y_missing, x_special, y_special,
                y_others, categories, others, sw_clean, sw_missing,
                sw_special, sw_others)
    else:
        return (x_clean, y_clean, x_missing, y_missing, x_special, y_special,
                [], [], [], sw_clean, sw_missing, sw_special, [])
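

# Minimal usage sketch for split_data (illustration only, not part of the
# library API). It assumes NaN encodes missing values and uses -9999 as a
# purely hypothetical special code:
#
#     x = np.array([1.0, 2.5, np.nan, -9999.0, 3.2, 4.8])
#     y = np.array([0, 1, 0, 1, 0, 1])
#     (x_clean, y_clean, x_missing, y_missing, x_special, y_special,
#      y_others, categories, others, sw_clean, sw_missing, sw_special,
#      sw_others) = split_data("numerical", x, y, special_codes=[-9999])
#     # x_clean   -> array([1. , 2.5, 3.2, 4.8])
#     # x_missing -> array([nan])
#     # x_special -> array([-9999.])

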
def split_data(dtype, x, y, special_codes=None, cat_cutoff=None,
               user_splits=None, check_input=True):
    """Split data into clean, missing and special values data.

    Parameters
    ----------
    dtype : str, optional (default="numerical")
        The variable data type. Supported data types are "numerical" for
        continuous and ordinal variables and "categorical" for categorical
        and nominal variables.

    x : array-like, shape = (n_samples)
        Data samples, where n_samples is the number of samples.

    y : array-like, shape = (n_samples)
        Target vector relative to x.

    special_codes : array-like or None (default=None)
        List of special values to be considered.

    cat_cutoff : float or None, optional (default=None)
        Generate bin others with categories in which the fraction of
        occurrences is below the ``cat_cutoff`` value. This option is
        available when ``dtype`` is "categorical".

    user_splits : array-like or None, optional (default=None)
        The list of pre-binning split points when ``dtype`` is "numerical"
        or the list of prebins when ``dtype`` is "categorical".

    check_input : bool (default=True)
        If False, the input arrays x and y will not be checked.

    Returns
    -------
    x_clean : array, shape = (n_clean)
        Clean data samples.

    y_clean : array, shape = (n_clean)
        Clean target samples.

    x_missing : array, shape = (n_missing)
        Missing data samples.

    y_missing : array, shape = (n_missing)
        Missing target samples.

    x_special : array, shape = (n_special)
        Special data samples.

    y_special : array, shape = (n_special)
        Special target samples.

    y_others : array, shape = (n_others)
        Others target samples.

    categories : array, shape = (n_categories)
        List of categories.

    others : array, shape = (n_other_categories)
        List of other categories.
    """
    if check_input:
        x = check_array(x, ensure_2d=False, dtype=None,
                        force_all_finite='allow-nan')

        y = check_array(y, ensure_2d=False, dtype=None,
                        force_all_finite=True)

        check_consistent_length(x, y)

    x = np.asarray(x)
    y = np.asarray(y)

    if x.dtype == object or y.dtype == object:
        missing_mask = pd.isnull(x) | pd.isnull(y)
    else:
        missing_mask = np.isnan(x) | np.isnan(y)

    if special_codes is None:
        clean_mask = ~missing_mask

        x_clean = x[clean_mask]
        y_clean = y[clean_mask]
        x_missing = x[missing_mask]
        y_missing = y[missing_mask]
        x_special = []
        y_special = []
    else:
        special_mask = pd.Series(x).isin(special_codes).values
        clean_mask = ~missing_mask & ~special_mask

        x_clean = x[clean_mask]
        y_clean = y[clean_mask]
        x_missing = x[missing_mask]
        y_missing = y[missing_mask]
        x_special = x[special_mask]
        y_special = y[special_mask]

    if dtype == "categorical" and user_splits is None:
        if cat_cutoff is not None:
            mask_others, others = categorical_cutoff(x_clean, y_clean,
                                                     cat_cutoff)
            y_others = y_clean[mask_others]
            x_clean = x_clean[~mask_others]
            y_clean = y_clean[~mask_others]
        else:
            y_others = []
            others = []

        categories, x_clean = categorical_transform(x_clean, y_clean)

        return (x_clean, y_clean, x_missing, y_missing, x_special, y_special,
                y_others, categories, others)
    else:
        return (x_clean, y_clean, x_missing, y_missing, x_special, y_special,
                [], [], [])
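

# Sketch of the "others" grouping on the categorical path (illustrative only;
# categorical_cutoff and categorical_transform are defined elsewhere in the
# package, so the exact return types are assumed). With cat_cutoff=0.2,
# categories whose fraction of occurrences falls below 20% go to bin others:
#
#     x = np.array(["A"] * 6 + ["B"] * 3 + ["C"], dtype=object)
#     y = np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0])
#     out = split_data("categorical", x, y, cat_cutoff=0.2)
#     # "C" occurs in 10% of the samples, so it is expected to end up in the
#     # returned `others` group rather than in `categories`.

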
def transform_continuous_target(splits, dtype, x, n_records, sums,
                                special_codes, categories, cat_others,
                                metric_special, metric_missing, user_splits,
                                check_input):
    _check_metric_special_missing(metric_special, metric_missing)

    if check_input:
        x = check_array(x, ensure_2d=False, dtype=None,
                        force_all_finite='allow-nan')

    x = np.asarray(x)

    if x.dtype == object:
        missing_mask = pd.isnull(x)
    else:
        missing_mask = np.isnan(x)

    if special_codes is None:
        clean_mask = ~missing_mask
    else:
        special_mask = pd.Series(x).isin(special_codes).values
        clean_mask = ~missing_mask & ~special_mask

    x_clean = x[clean_mask]

    if dtype == "numerical":
        if len(splits):
            indices = np.digitize(x_clean, splits, right=False)
        else:
            indices = np.zeros(x_clean.shape)

        n_bins = len(splits) + 1
    else:
        bins = bin_categorical(splits, categories, cat_others, user_splits)
        n_bins = len(bins)

    if "empirical" not in (metric_special, metric_missing):
        n_records = n_records[:n_bins]
        sums = sums[:n_bins]

    # Per-bin mean of the continuous target.
    metric_value = sums / n_records

    x_transform = np.zeros(x.shape)

    if dtype == "numerical":
        x_clean_transform = np.zeros(x_clean.shape)

        for i in range(n_bins):
            mask = (indices == i)
            x_clean_transform[mask] = metric_value[i]

        x_transform[clean_mask] = x_clean_transform
    else:
        x_p = pd.Series(x)
        for i in range(n_bins):
            mask = x_p.isin(bins[i])
            x_transform[mask] = metric_value[i]

    if special_codes:
        if metric_special == "empirical":
            x_transform[special_mask] = metric_value[n_bins]
        else:
            x_transform[special_mask] = metric_special

    if metric_missing == "empirical":
        x_transform[missing_mask] = metric_value[n_bins + 1]
    else:
        x_transform[missing_mask] = metric_missing

    return x_transform
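

# Numerical sketch of the mapping above (illustrative values only): the bin
# metric is the mean of the continuous target, sums / n_records, and each
# clean sample is replaced by the mean of the bin it falls into.
#
#     splits    = np.array([10.0, 20.0])   # 3 bins: (-inf,10), [10,20), [20,inf)
#     n_records = np.array([5.0, 3.0, 2.0])
#     sums      = np.array([10.0, 9.0, 8.0])
#     # metric_value = sums / n_records = [2.0, 3.0, 4.0]
#     # np.digitize([5, 15, 25], splits) -> [0, 1, 2], so
#     # x = [5, 15, 25] transforms to [2.0, 3.0, 4.0].

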
def transform_binary_target(splits, dtype, x, n_nonevent, n_event,
                            special_codes, categories, cat_others, metric,
                            metric_special, metric_missing, user_splits,
                            check_input=False):
    if metric not in ("event_rate", "woe"):
        raise ValueError('Invalid value for metric. Allowed string '
                         'values are "event_rate" and "woe".')

    _check_metric_special_missing(metric_special, metric_missing)

    if check_input:
        x = check_array(x, ensure_2d=False, dtype=None,
                        force_all_finite='allow-nan')

    x = np.asarray(x)

    if x.dtype == object:
        missing_mask = pd.isnull(x)
    else:
        missing_mask = np.isnan(x)

    if special_codes is None:
        clean_mask = ~missing_mask
    else:
        special_mask = pd.Series(x).isin(special_codes).values
        clean_mask = ~missing_mask & ~special_mask

    x_clean = x[clean_mask]

    if dtype == "numerical":
        if len(splits):
            indices = np.digitize(x_clean, splits, right=False)
        else:
            indices = np.zeros(x_clean.shape)

        n_bins = len(splits) + 1
    else:
        bins = bin_categorical(splits, categories, cat_others, user_splits)
        n_bins = len(bins)

    # Compute event rate and WoE
    n_records = n_event + n_nonevent
    t_n_nonevent = n_nonevent.sum()
    t_n_event = n_event.sum()

    if "empirical" not in (metric_special, metric_missing):
        n_event = n_event[:n_bins]
        n_nonevent = n_nonevent[:n_bins]
        n_records = n_records[:n_bins]

    # Default WoE and event rate are 0 for bins without events or non-events.
    mask = (n_event > 0) & (n_nonevent > 0)
    event_rate = np.zeros(len(n_records))
    woe = np.zeros(len(n_records))
    event_rate[mask] = n_event[mask] / n_records[mask]
    constant = np.log(t_n_event / t_n_nonevent)
    woe[mask] = np.log(1 / event_rate[mask] - 1) + constant

    if metric == "woe":
        metric_value = woe
    else:
        metric_value = event_rate

    x_transform = np.zeros(x.shape)

    if dtype == "numerical":
        x_clean_transform = np.zeros(x_clean.shape)

        for i in range(n_bins):
            mask = (indices == i)
            x_clean_transform[mask] = metric_value[i]

        x_transform[clean_mask] = x_clean_transform
    else:
        x_p = pd.Series(x)
        for i in range(n_bins):
            mask = x_p.isin(bins[i])
            x_transform[mask] = metric_value[i]

    if special_codes:
        if metric_special == "empirical":
            x_transform[special_mask] = metric_value[n_bins]
        else:
            x_transform[special_mask] = metric_special

    if metric_missing == "empirical":
        x_transform[missing_mask] = metric_value[n_bins + 1]
    else:
        x_transform[missing_mask] = metric_missing

    return x_transform
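

# Sanity check of the WoE identity used above (illustrative counts): since
# log(1/event_rate_i - 1) = log(n_nonevent_i / n_event_i), adding
# constant = log(t_n_event / t_n_nonevent) yields the usual per-bin WoE
# log((n_nonevent_i / t_n_nonevent) / (n_event_i / t_n_event)).
#
#     n_event    = np.array([20.0, 10.0])
#     n_nonevent = np.array([80.0, 90.0])
#     # event_rate = [0.2, 0.1], t_n_event = 30, t_n_nonevent = 170
#     # bin 0: log(1/0.2 - 1) + log(30/170) = log(4) - 1.7346 ≈ -0.3483
#     #        log((80/170) / (20/30))                        ≈ -0.3483

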
def transform_multiclass_target(splits, x, n_event, special_codes, metric,
                                metric_special, metric_missing,
                                check_input=False):
    if metric not in ("mean_woe", "weighted_mean_woe"):
        raise ValueError('Invalid value for metric. Allowed string '
                         'values are "mean_woe" and "weighted_mean_woe".')

    _check_metric_special_missing(metric_special, metric_missing)

    if check_input:
        x = check_array(x, ensure_2d=False, dtype=None,
                        force_all_finite='allow-nan')

    x = np.asarray(x)

    if x.dtype == object:
        missing_mask = pd.isnull(x)
    else:
        missing_mask = np.isnan(x)

    if special_codes is None:
        clean_mask = ~missing_mask
    else:
        special_mask = pd.Series(x).isin(special_codes).values
        clean_mask = ~missing_mask & ~special_mask

    x_clean = x[clean_mask]

    if len(splits):
        indices = np.digitize(x_clean, splits, right=False)
    else:
        indices = np.zeros(x_clean.shape)

    n_bins = len(splits) + 1

    # Build non-event counts to compute one-vs-all WoE.
    n_classes = n_event.shape[1]
    n_records = np.tile(n_event.sum(axis=1), (n_classes, 1)).T
    n_nonevent = n_records - n_event

    t_n_nonevent = n_nonevent.sum(axis=0)
    t_n_event = n_event.sum(axis=0)

    if "empirical" not in (metric_special, metric_missing):
        n_event = n_event[:n_bins, :]
        n_nonevent = n_nonevent[:n_bins, :]
        n_records = n_records[:n_bins, :]

    event_rate = n_event / n_records

    woe = np.zeros(n_event.shape)
    for i in range(n_classes):
        woe[:, i] = transform_event_rate_to_woe(event_rate[:, i],
                                                t_n_nonevent[i],
                                                t_n_event[i])

    if metric == "mean_woe":
        metric_value = woe.mean(axis=1)
    elif metric == "weighted_mean_woe":
        metric_value = np.average(woe, weights=t_n_event, axis=1)

    x_transform = np.zeros(x.shape)
    x_clean_transform = np.zeros(x_clean.shape)

    for i in range(n_bins):
        mask = (indices == i)
        x_clean_transform[mask] = metric_value[i]

    x_transform[clean_mask] = x_clean_transform

    if special_codes:
        if metric_special == "empirical":
            x_transform[special_mask] = metric_value[n_bins]
        else:
            x_transform[special_mask] = metric_special

    if metric_missing == "empirical":
        x_transform[missing_mask] = metric_value[n_bins + 1]
    else:
        x_transform[missing_mask] = metric_missing

    return x_transform
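

# Illustration of the two aggregation modes above for a single bin with three
# classes (made-up numbers): one-vs-all WoE row = [0.2, -0.4, 0.1] and class
# totals t_n_event = [50, 30, 20].
#
#     # "mean_woe":          (0.2 - 0.4 + 0.1) / 3                    ≈ -0.033
#     # "weighted_mean_woe": (0.2*50 - 0.4*30 + 0.1*20) / (50+30+20)  =  0.0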