def encode_with_label(self, X: pd.Series, y: pd.Series) -> pd.Series:
    """Replace every category in ``X`` with the mean of ``y`` for that category.

    Both inputs are first passed through ``make_series``. The per-category
    means are stored in ``self.discrete_encoding`` under ``X.name`` and the
    mapped copy of ``X`` is returned.
    """
    X = make_series(X)
    y = make_series(y)

    # mean of y within each distinct value of X
    positive_rate = y.groupby(X).mean()

    # keep the lookup table around so transform() can reuse it
    self.discrete_encoding[X.name] = positive_rate

    return X.map(positive_rate)
def test_latest():
    """Check ``merge.latest`` on constant series of shrinking length.

    The series built for lag ``k`` holds the constant ``k`` over the first
    ``n_times - k`` steps. The expected merge carries, at every step, the
    value of the highest-lag series that still covers that step.
    """
    n_times, deepest_lag = 7, 4

    # third positional argument of make_series is the lag itself
    # (presumably used as the series name -- confirm in utils)
    candidates = [
        make_series(np.ones((n_times - lag, )) * lag, range(n_times - lag),
                    lag) for lag in range(deepest_lag + 1)
    ]

    expected = make_series([4, 4, 4, 3, 2, 1, 0], range(n_times))
    merged = merge.latest(candidates)
    assert np.allclose(expected, merged)
def test_earliest():
    """Check ``merge.earliest`` on constant series of growing length.

    The series built for lag ``k`` holds the constant ``k`` over the first
    ``n_times - (deepest_lag - k)`` steps, so higher lags cover more steps.
    The expected merge carries, at every step, the value of the lowest-lag
    series that covers that step.
    """
    n_times, deepest_lag = 7, 4

    candidates = []
    for lag in range(deepest_lag + 1):
        length = n_times - (deepest_lag - lag)
        constant = np.ones((length, )) * lag
        candidates.append(make_series(constant, range(length), lag))

    expected = make_series([0, 0, 0, 1, 2, 3, 4], range(n_times))
    merged = merge.earliest(candidates)
    assert np.allclose(expected, merged)
def test_diff_linear_noninc():
    """Compare ``fill.diff_linear(INPUT_LIST)`` with hand-computed series.

    Each expected series spans ``range(TIMES)``: the known leading values
    followed by the constant the fill is expected to put in the tail.
    """
    known_lag1 = np.linspace(0, TIMES - 3, TIMES - 2)
    known_lag2 = np.linspace(1, TIMES - 3, TIMES - 3)

    expected_lag0 = make_series(np.zeros(TIMES), range(TIMES), 0)
    expected_lag1 = make_series(
        np.concatenate([known_lag1, [4.6666666666666661] * 2]),
        range(TIMES), 1)
    expected_lag2 = make_series(
        np.concatenate([known_lag2, [5.0] * 3]), range(TIMES), 2)
    expected_output_list = [expected_lag0, expected_lag1, expected_lag2]

    output = fill.diff_linear(INPUT_LIST)
    for actual, wanted in zip(output, expected_output_list):
        assert np.allclose(actual, wanted)
def is_monotonic(i, strict=True, ignore_na=True) -> bool:
    """Check whether an iterable is monotonic in either direction.

    Args:
        i: Values to check; converted with ``make_series`` before use.
        strict: When true, consecutive values must strictly increase or
            strictly decrease, so ties (and constant sequences with at
            least two values) are not monotonic. When false, ties are
            allowed in both directions.
        ignore_na: When true, missing consecutive differences are dropped
            before checking. When false, any missing difference makes the
            result ``False`` because NaN satisfies no ordering comparison.

    Returns:
        ``True`` if every remaining consecutive difference has the same
        direction (or there are no differences left to check), otherwise
        ``False``.
    """
    i = make_series(i)

    # the first difference has no predecessor and is always missing
    diff = i.diff()[1:]
    if ignore_na:
        diff = diff[diff.notnull()]

    # Test each direction with its own comparison. Negating a single
    # "increasing" mask would make the strictness apply to only one
    # direction (strict mode would accept ties on the way down, while
    # non-strict mode would reject them).
    if strict:
        increasing, decreasing = diff > 0, diff < 0
    else:
        increasing, decreasing = diff >= 0, diff <= 0

    return bool(increasing.all() or decreasing.all())
def test_diff_mean_noninc():
    """Compare ``fill.diff_mean(INPUT_LIST)`` (default mode) with fixed series.

    Each expected series spans ``range(TIMES)``: the known leading values
    followed by the constant the fill is expected to put in the tail.
    """
    known_lag1 = np.linspace(0, TIMES - 3, TIMES - 2)
    known_lag2 = np.linspace(1, TIMES - 3, TIMES - 3)

    expected_lag0 = make_series(np.zeros(TIMES), range(TIMES), 0)
    expected_lag1 = make_series(
        np.concatenate([known_lag1, [3.5] * 2]), range(TIMES), 1)
    expected_lag2 = make_series(
        np.concatenate([known_lag2, [4.0] * 3]), range(TIMES), 2)
    expected_output_list = [expected_lag0, expected_lag1, expected_lag2]

    output = fill.diff_mean(INPUT_LIST)
    for actual, wanted in zip(output, expected_output_list):
        assert np.allclose(actual, wanted)
def test_diff_mean_inc():
    """Compare ``fill.diff_mean(INPUT_LIST, inc=True)`` with fixed series.

    Each expected series spans ``range(TIMES)``: the known leading values
    followed by the tail values expected from the incremental mode.
    """
    known_lag1 = np.linspace(0, TIMES - 3, TIMES - 2)
    known_lag2 = np.linspace(1, TIMES - 3, TIMES - 3)

    expected_lag0 = make_series(np.zeros(TIMES), range(TIMES), 0)
    expected_lag1 = make_series(
        np.concatenate([known_lag1, [3.5] * 2]), range(TIMES), 1)
    expected_lag2 = make_series(
        np.concatenate([known_lag2, [8, 4.5, 4.5]]), range(TIMES), 2)
    expected_output_list = [expected_lag0, expected_lag1, expected_lag2]

    output = fill.diff_mean(INPUT_LIST, inc=True)
    for actual, wanted in zip(output, expected_output_list):
        assert np.allclose(actual, wanted)
def test_mixer():
    """Check that ``update.create_mixer`` blends two update rules by weight.

    ``update.pick`` (index 2) is mixed at 0.3 with ``update.noop`` at 0.7
    and applied to one random loss series per entry of ``MODELS``. The
    expected weights are 0.44 for the picked model and 0.14 for the others.
    """
    # noop defaults to uniform weights
    uniform_fn = update.noop
    pick_fn = partial(update.pick, index=2)
    mixer = update.create_mixer([pick_fn, uniform_fn], [0.3, 0.7])

    losses = []
    for model in MODELS:
        random_loss = make_series(
            np.random.rand(10), range(10), extra_attrs={"model": model})
        losses.append(random_loss)

    weights = mixer(losses)

    expected_weights = make_weights([0.14, 0.14, 0.44, 0.14, 0.14], MODELS)
    assert np.allclose(expected_weights, weights)
def _fit(self, X, y, **fit_parmas):
    """Fit one feature and return its cutoff points.

    Returns:
        ``None`` when the column is left as it is (a categorical column
        while ``bin_cat_cols`` is off, or a column whose bin count is
        already below ``max_bin``); ``[-np.inf]`` when there is nothing to
        group or every label is identical; otherwise the keys of the
        merged value-to-labels mapping.

    Raises:
        ValueError: if ``X`` is neither numeric nor listed in
            ``categorical_cols``.

    Side effects:
        Sets ``self.expected_ratio`` and resets ``self._chisquare_cache``
        on the full path; categorical columns additionally go through
        ``self.encode_with_label``.
    """
    # the name is only looked up when the dtype check fails
    if not (is_numeric_dtype(X) or X.name in self.categorical_cols):
        message = 'Column {} is not numeric and not in categorical_cols.'
        raise ValueError(message.format(X.name))

    y = force_zero_one(y)
    X = make_series(X)
    y = make_series(y)

    # discrete features are replaced by the positive ratio of y
    if X.name in self.categorical_cols:
        # with bin_cat_cols switched off categorical columns stay untouched
        if not self.bin_cat_cols:
            return None
        X = self.encode_with_label(X, y)

    # number of bins = number of cutoff points - 1
    n_bins = X.nunique() - 1

    # already fewer bins than max_bin: leave the column alone
    if n_bins < self.max_bin:
        return None

    # optional coarse pre-binning to speed up the merging below
    if self.prebin and n_bins > self.prebin:
        X, _ = equal_frequency_binning(X, n=self.prebin, encode=False)

    # {feature value: list of labels observed for that value}
    mapping = y.groupby(X).apply(list).to_dict()
    if not mapping:
        return [-np.inf]

    # overall ratio of positive labels across all groups
    label_groups = mapping.values()
    n_positive = sum(sum(labels) for labels in label_groups)
    n_total = sum(len(labels) for labels in label_groups)
    self.expected_ratio = n_positive / n_total

    # a ratio of 0 or 1 means a single group is enough: every non-null
    # value will be encoded into 1
    if self.expected_ratio in (0, 1):
        return [-np.inf]

    # chi-square merging until the bin count fits max_bin
    while len(mapping) - 1 > self.max_bin:
        mapping = self.merge_chisquare(mapping)
    n_bins = len(mapping) - 1

    # keep merging until merge_purity reports it is done, so that every
    # bin ends up with mixed labels
    if self.force_mix_label and n_bins > 1:
        while True:
            mapping, is_pure = self.merge_purity(mapping)
            if is_pure:
                break

    # keep merging while the bins are not monotonic
    if self.force_monotonic:
        while (len(mapping) - 1 > 2
               and not self.is_monotonic_post_bin(mapping)):
            mapping = self.merge_chisquare(mapping)

    # drop the chi-square cache built up during merging
    self._chisquare_cache = {}
    return mapping.keys()
import xarray as xr import numpy as np from ledge import fill from utils import make_series TIMES = 10 INPUT_LIST = [ make_series(np.zeros(TIMES), range(TIMES), 0), make_series(np.linspace(0, TIMES - 3, TIMES - 2), range(TIMES - 2), 1), make_series(np.linspace(1, TIMES - 3, TIMES - 3), range(TIMES - 3), 2) ] def test_diff_mean_noninc(): """ Check if the non incremental version works fine """ expected_output_list = [ make_series(np.zeros(TIMES), range(TIMES), 0), make_series( np.concatenate([np.linspace(0, TIMES - 3, TIMES - 2), [3.5] * 2]), range(TIMES), 1), make_series( np.concatenate([np.linspace(1, TIMES - 3, TIMES - 3), [4.0] * 3]), range(TIMES), 2), ] output = fill.diff_mean(INPUT_LIST) for o, eo in zip(output, expected_output_list):