Example #1
 def encode_with_label(self, X: pd.Series, y: pd.Series) -> pd.Series:
     """ Encode categorical features with its percentage of positive samples"""
     X, y = make_series(X), make_series(y)
     pct_pos = y.groupby(X).mean()
     # save the mapping for transform()
     self.discrete_encoding[X.name] = pct_pos
     return X.map(pct_pos)
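
For reference, the same positive-ratio (target) encoding can be reproduced without the class. This is a minimal standalone sketch, with the column name, category values, and labels invented for illustration:

import pandas as pd

# Hypothetical data: one categorical column and a binary target.
X = pd.Series(["a", "b", "a", "c", "b", "a"], name="color")
y = pd.Series([1, 0, 1, 0, 1, 0])

# Fraction of positive samples per category, as in encode_with_label().
pct_pos = y.groupby(X).mean()   # a: 2/3, b: 1/2, c: 0.0
encoded = X.map(pct_pos)        # [0.67, 0.5, 0.67, 0.0, 0.5, 0.67]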
Example #2
def test_latest():
    times = 7
    max_lag = 4

    series_list = []
    for lag in range(max_lag + 1):
        values = np.ones((times - lag, )) * lag
        series_list.append(make_series(values, range(times - lag), lag))

    expected = make_series([4, 4, 4, 3, 2, 1, 0], range(times))

    assert np.allclose(expected, merge.latest(series_list))
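
Judging from this test, merge.latest appears to keep, at every timestamp, the value from the highest-lag series that covers it. The sketch below reproduces that merging rule with plain pandas; it is an assumption about the behaviour, not the library's implementation, and it relies on the list being ordered by increasing lag as in the test:

import numpy as np
import pandas as pd

def latest_sketch(series_list):
    # Later entries (higher lag) override earlier ones wherever they overlap.
    merged = series_list[0].copy()
    for s in series_list[1:]:
        merged.loc[s.index] = s
    return merged

lagged = [pd.Series(np.ones(7 - lag) * lag, index=range(7 - lag))
          for lag in range(5)]
print(latest_sketch(lagged).tolist())   # [4.0, 4.0, 4.0, 3.0, 2.0, 1.0, 0.0]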
Example #3
def test_earliest():
    times = 7
    max_lag = 4

    series_list = []
    for lag in range(max_lag + 1):
        values = np.ones((times - (max_lag - lag), )) * lag
        series_list.append(
            make_series(values, range(times - (max_lag - lag)), lag))

    expected = make_series([0, 0, 0, 1, 2, 3, 4], range(times))

    assert np.allclose(expected, merge.earliest(series_list))
Example #4
def test_diff_linear_noninc():
    expected_output_list = [
        make_series(np.zeros(TIMES), range(TIMES), 0),
        make_series(
            np.concatenate([
                np.linspace(0, TIMES - 3, TIMES - 2), [4.6666666666666661] * 2
            ]), range(TIMES), 1),
        make_series(
            np.concatenate([np.linspace(1, TIMES - 3, TIMES - 3), [5.0] * 3]),
            range(TIMES), 2),
    ]

    output = fill.diff_linear(INPUT_LIST)

    for o, eo in zip(output, expected_output_list):
        assert np.allclose(o, eo)
Example #5
 def is_monotonic(i, strict=True, ignore_na=True) -> bool:
     """ Check if an iterable is monotonic """
     i = make_series(i)
     diff = i.diff()[1:]
     if ignore_na:
         diff = diff[diff.notnull()]
     sign = diff > 0 if strict else diff >= 0
     if sign.sum() == 0 or (~sign).sum() == 0:
         return True
     return False
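
A quick standalone check of the same monotonicity logic, using plain pandas instead of make_series (the helper below is hypothetical and only mirrors the method above):

import pandas as pd

def is_monotonic_sketch(values, strict=True, ignore_na=True):
    s = pd.Series(values)
    diff = s.diff()[1:]
    if ignore_na:
        diff = diff[diff.notnull()]
    sign = diff > 0 if strict else diff >= 0
    # Monotonic when every consecutive difference points the same way.
    return bool(sign.all() or (~sign).all())

print(is_monotonic_sketch([1, 2, 3]))   # True  (strictly increasing)
print(is_monotonic_sketch([3, 2, 1]))   # True  (strictly decreasing)
print(is_monotonic_sketch([1, 3, 2]))   # False (direction changes)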
Example #6
def test_diff_mean_noninc():
    """
    Check that the non-incremental version works correctly
    """

    expected_output_list = [
        make_series(np.zeros(TIMES), range(TIMES), 0),
        make_series(
            np.concatenate([np.linspace(0, TIMES - 3, TIMES - 2), [3.5] * 2]),
            range(TIMES), 1),
        make_series(
            np.concatenate([np.linspace(1, TIMES - 3, TIMES - 3), [4.0] * 3]),
            range(TIMES), 2),
    ]

    output = fill.diff_mean(INPUT_LIST)

    for o, eo in zip(output, expected_output_list):
        assert np.allclose(o, eo)
Example #7
def test_diff_mean_inc():
    """
    Check that the incremental version works correctly
    """

    expected_output_list = [
        make_series(np.zeros(TIMES), range(TIMES), 0),
        make_series(
            np.concatenate([np.linspace(0, TIMES - 3, TIMES - 2), [3.5] * 2]),
            range(TIMES), 1),
        make_series(
            np.concatenate(
                [np.linspace(1, TIMES - 3, TIMES - 3), [8, 4.5, 4.5]]),
            range(TIMES), 2),
    ]

    output = fill.diff_mean(INPUT_LIST, inc=True)

    for o, eo in zip(output, expected_output_list):
        assert np.allclose(o, eo)
Example #8
def test_mixer():
    """
    Test for mixer
    """

    # noop defaults to uniform weights
    bias_fn = update.noop
    main_fn = partial(update.pick, index=2)

    mix_fn = update.create_mixer([main_fn, bias_fn], [0.3, 0.7])

    losses = [
        make_series(np.random.rand(10),
                    range(10),
                    extra_attrs={"model": model}) for model in MODELS
    ]
    weights = mix_fn(losses)

    expected_weights = make_weights([0.14, 0.14, 0.44, 0.14, 0.14], MODELS)

    assert np.allclose(expected_weights, weights)
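
The expected weights follow from simple arithmetic: update.pick(index=2) puts all of its weight on the third model, and update.noop (uniform weights, per the comment above) gives each of the five models 1/5, so model 2 receives 0.3 * 1 + 0.7 * 0.2 = 0.44 and every other model receives 0.7 * 0.2 = 0.14. A quick check with plain numpy, independent of the update module; the per-component contributions are assumptions based on the test:

import numpy as np

uniform = np.full(5, 1 / 5)    # assumed contribution of update.noop
pick_2 = np.eye(5)[2]          # assumed contribution of update.pick(index=2)
mixed = 0.3 * pick_2 + 0.7 * uniform
print(mixed)                   # [0.14 0.14 0.44 0.14 0.14]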
Example #9
    def _fit(self, X, y, **fit_params):
        """ Fit a single feature and return the cutoff points"""
        if not is_numeric_dtype(X) and X.name not in self.categorical_cols:
            raise ValueError(
                'Column {} is not numeric and not in categorical_cols.'.format(
                    X.name))

        y = force_zero_one(y)
        X, y = make_series(X), make_series(y)

        # if X is discrete, encode with positive ratio in y
        if X.name in self.categorical_cols:
            # the categorical columns will remain unchanged if
            # we turn off bin_cat_cols
            if not self.bin_cat_cols:
                return None
            X = self.encode_with_label(X, y)

        # the number of bins is the number of cutoff points minus 1
        n_bins = X.nunique() - 1

        # if the number of bins is already smaller than `max_bin`
        # then we'll leave this column as it is
        if n_bins < self.max_bin:
            return None

        # speed up the process with prebinning
        if self.prebin and n_bins > self.prebin:
            X, _ = equal_frequency_binning(X, n=self.prebin, encode=False)
            # X = make_series(X)

        # convert to mapping
        mapping = y.groupby(X).apply(list).to_dict()

        # set the overall expected ratio
        if len(mapping) == 0:
            return [-np.inf]

        self.expected_ratio = sum(sum(v) for v in mapping.values()) / sum(
            len(v) for v in mapping.values())
        # if the expected_ratio is 0 or 1 there should be only 1 group and
        # any non-null value will be encoded as 1
        if self.expected_ratio == 0 or self.expected_ratio == 1:
            return [-np.inf]

        n_bins = len(mapping) - 1
        # merge bins based on chi square
        while n_bins > self.max_bin:
            mapping = self.merge_chisquare(mapping)
            n_bins = len(mapping) - 1

        # merge bins to create mixed label in every bin
        if self.force_mix_label and n_bins > 1:
            is_pure = False
            while not is_pure:
                mapping, is_pure = self.merge_purity(mapping)

        # merge bins to keep bins to be monotonic
        if self.force_monotonic:
            while len(mapping) - 1 > 2 and not self.is_monotonic_post_bin(
                    mapping):
                mapping = self.merge_chisquare(mapping)

        # clean up the cache
        self._chisquare_cache = dict()
        return mapping.keys()
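
merge_chisquare itself is not shown here, but the usual ChiMerge criterion compares the observed positive/negative counts of two adjacent bins against the counts expected if both bins shared the same positive rate, and merges the pair with the smallest statistic. A hedged standalone sketch of that statistic (not this class's actual implementation):

import numpy as np

def chisquare_pair(labels_a, labels_b):
    """Chi-square statistic for merging two adjacent bins of binary labels."""
    obs = np.array([
        [sum(labels_a), len(labels_a) - sum(labels_a)],
        [sum(labels_b), len(labels_b) - sum(labels_b)],
    ], dtype=float)
    expected = obs.sum(axis=1, keepdims=True) * obs.sum(axis=0) / obs.sum()
    expected[expected == 0] = 1e-10   # avoid division by zero for empty classes
    return float((((obs - expected) ** 2) / expected).sum())

print(chisquare_pair([1, 0, 1, 0], [1, 0, 0, 1]))   # 0.0 -> good merge candidates
print(chisquare_pair([1, 1, 1, 1], [0, 0, 0, 0]))   # 8.0 -> keep bins separate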
Example #10
import xarray as xr
import numpy as np
from ledge import fill
from utils import make_series

TIMES = 10
INPUT_LIST = [
    make_series(np.zeros(TIMES), range(TIMES), 0),
    make_series(np.linspace(0, TIMES - 3, TIMES - 2), range(TIMES - 2), 1),
    make_series(np.linspace(1, TIMES - 3, TIMES - 3), range(TIMES - 3), 2)
]


def test_diff_mean_noninc():
    """
    Check that the non-incremental version works correctly
    """

    expected_output_list = [
        make_series(np.zeros(TIMES), range(TIMES), 0),
        make_series(
            np.concatenate([np.linspace(0, TIMES - 3, TIMES - 2), [3.5] * 2]),
            range(TIMES), 1),
        make_series(
            np.concatenate([np.linspace(1, TIMES - 3, TIMES - 3), [4.0] * 3]),
            range(TIMES), 2),
    ]

    output = fill.diff_mean(INPUT_LIST)

    for o, eo in zip(output, expected_output_list):
        assert np.allclose(o, eo)