Пример #1
0
    def test_result_is_reproduceable_within_sampler(self, config, raw_data,
                                                    seed):
        """
        GIVEN a single sampler created with a fixed random seed
        WHEN `generate_samples` is called twice on the same raw data
        THEN both calls return equal dataframes
        """
        sampler = BinnedUniformSampler(random_seed=seed, **config)

        # Run the very same instance twice, one call after the other.
        runs = [sampler.generate_samples(raw_data) for _ in range(2)]

        pd.testing.assert_frame_equal(runs[0], runs[1])
Пример #2
0
    def test_result_is_reproduceable_between_samplers(self, config, raw_data,
                                                      seed):
        """
        GIVEN two samplers built from the same random seed and the same config
        WHEN each sampler is applied to the same raw data
        THEN both samplers return equal dataframes
        """
        results = []
        # Build and run one sampler at a time so the two runs never share an
        # instance.
        for _ in range(2):
            sampler = BinnedUniformSampler(random_seed=seed, **config)
            results.append(sampler.generate_samples(raw_data))

        pd.testing.assert_frame_equal(results[0], results[1])
Пример #3
0
    def test_y_data_but_no_x_data(self):
        """
        GIVEN a sampler and dummy data for a customer with no purchases falling into the
            lookback period
        WHEN samples are generated from that customer's data
        THEN exactly one sample is produced, no order position is marked with
            `x_include` and at least one is marked with `y_include`.
        """
        lead_time = pd.to_timedelta("1d")
        lookback = pd.to_timedelta("2y")
        prediction_period = pd.to_timedelta("180d")
        max_date = pd.to_datetime("2020-01-01")

        # Start of the stretch made of lead time plus prediction period that
        # ends at `max_date`; the customer's orders begin here.
        horizon_start = max_date - prediction_period - lead_time
        # One lookback (plus one extra day) before that stretch.
        min_date = horizon_start - lookback - pd.to_timedelta("1d")

        # sampler with a max date equal to the customer's maximum order date;
        # the parameters are meant to yield exactly one sample for the customer
        sampler_params = dict(
            min_date=min_date,
            max_date=max_date,
            lead_time=lead_time,
            prediction_period=prediction_period,
            samples_per_lookback=1,
            lookback=lookback,
        )
        sampler = BinnedUniformSampler(**sampler_params)

        # purchases that cover one prediction period, but no lookback
        customer_data = self.generate_data_for_one_customer(
            1, horizon_start, max_date, n_orders=12)

        samples = sampler.generate_samples(customer_data)
        sample_ids = samples.index.get_level_values("sample_id")

        assert sample_ids.nunique() == 1
        assert samples.x_include.sum() == 0
        assert samples.y_include.sum() >= 1
Пример #4
0
    def test_acquisition_date_greater_max_date(self):
        """
        GIVEN a sampler and data for a customer whose acquisition date is later
            than the sampler's `max_date`
        WHEN the sampler is applied to the data
        THEN it returns an empty dataframe
        """
        sampler_params = {
            "min_date": "2016-01-01",
            "max_date": "2019-12-31",
            "lead_time": "28d",
            "prediction_period": "180d",
            "lookback": "180d",
            "samples_per_lookback": 1,
        }
        data_params = {
            # per the original author's note, the helper replaces this with
            # the acquisition date
            "min_date": "2016-01-01",
            "max_date": "2020-08-01",
            # one day after the sampler's `max_date`
            "acquisition_date": "2020-01-01",
            "n_orders": 12,
        }

        customer_data = self.generate_data_for_one_customer(1, **data_params)
        sampler = BinnedUniformSampler(**sampler_params)

        result = sampler.generate_samples(customer_data)

        assert result.empty
Пример #5
0
    def test_different_seeds_produce_different_outcomes(
            self, config, raw_data, seed):
        """
        GIVEN two samplers with the same parameters but different random seeds
        WHEN both samplers are applied to the same raw data
        THEN the results differ
        """
        first_sampler = BinnedUniformSampler(random_seed=seed, **config)
        second_sampler = BinnedUniformSampler(random_seed=seed + 1, **config)

        first_result = first_sampler.generate_samples(raw_data)
        second_result = second_sampler.generate_samples(raw_data)

        # `assert_frame_equal` raises AssertionError when the frames differ,
        # which is exactly the outcome this test is looking for.
        try:
            pd.testing.assert_frame_equal(first_result, second_result)
        except AssertionError:
            return

        # Reaching this point means the comparison succeeded, i.e. the seed
        # had no effect on the generated samples.
        raise AssertionError(
            f"samplers seeded with {seed} and {seed + 1} produced identical "
            "samples")
Пример #6
0
 def test_no_prediction_time_outside_min_and_max_date(
         self, sampler: BinnedUniformSampler, raw_data: pd.DataFrame):
     """
     GIVEN a sampler instance and some dummy purchase data for multiple customers
     WHEN samples are created from this data using the sampler
     THEN every prediction time lies strictly between `sampler.min_date` and
         `sampler.max_date`
     """
     prediction_times = sampler.generate_samples(raw_data)["prediction_time"]
     upper_bound, lower_bound = sampler.max_date, sampler.min_date

     # Both comparisons are strict: a prediction time equal to either bound
     # fails the test.
     assert (prediction_times > lower_bound).all()
     assert (prediction_times < upper_bound).all()
Пример #7
0
 def test_no_data_in_lead_time(self, sampler: BinnedUniformSampler,
                               raw_data: pd.DataFrame):
     """
     GIVEN a sampler with a lead time and some dummy purchase data to create samples from
     WHEN samples are generated from this purchase data
     THEN every order position marked in the column `y_include` has an order date
         later than its prediction time plus the lead time
     """
     sampled = sampler.generate_samples(raw_data)

     # Keep only the rows flagged as belonging to the prediction target.
     targets = sampled[sampled["y_include"]]

     # Earliest point after which a target order may occur, per row.
     earliest_allowed = sampler.lead_time + targets["prediction_time"]

     assert np.all(targets["order_date"] > earliest_allowed)
Пример #8
0
 def test_sampled_data_columns(
     self,
     sampler: BinnedUniformSampler,
     expected_column: str,
     raw_data: pd.DataFrame,
 ):
     """
     GIVEN a sampler and dummy purchase data for multiple customers
     WHEN samples are created from this dummy data using the sampler
     THEN `expected_column` is present in the output, either as a regular column
         or as an index level.
     """
     result = sampler.generate_samples(raw_data)

     # A mandatory field may live in the columns or in the (multi-)index.
     available_names = [*result.columns, *result.index.names]

     assert expected_column in available_names
Пример #9
0
    def test_samples_for_new_customers(
        self,
        min_date,
        max_date,
        lookback,
        prediction_period,
        lead_time,
        samples_per_lookback,
    ):
        """
        GIVEN a set of parameters for a sampler
        WHEN samples are generated from a customer's purchase history that does not cover a
            full lookback
        THEN the number of samples generated for this customer is still proportional to the
            time period covered by the customer's purchase history and there is at least one
            sample regardless of the available purchase history.
        """
        min_date = pd.to_datetime(min_date)
        max_date = pd.to_datetime(max_date)
        lookback = pd.to_timedelta(lookback)
        prediction_period = pd.to_timedelta(prediction_period)
        lead_time = pd.to_timedelta(lead_time)
        # `samples_per_lookback` is deliberately used exactly as passed in, so
        # that every value supplied to the test is actually exercised.

        # `max_date` minus prediction period and lead time; the expected
        # sample count below is measured back from this point.
        sampling_end = max_date - prediction_period - lead_time

        # Acquire the customer a random fraction of one lookback before
        # `sampling_end`, so the history covers less than a full lookback.
        acquisition_date = sampling_end - lookback * np.random.uniform()
        new_customer_data = self.generate_data_for_one_customer(
            1, min_date, max_date, acquisition_date)

        sampler = BinnedUniformSampler(
            min_date=min_date,
            max_date=max_date,
            lead_time=lead_time,
            prediction_period=prediction_period,
            lookback=lookback,
            samples_per_lookback=samples_per_lookback,
        )
        samples = sampler.generate_samples(new_customer_data)

        n_samples = samples.index.get_level_values("sample_id").nunique()

        # Fraction of a lookback covered by the history, scaled to a sample
        # count and floored; at least one sample is always expected.
        lookbacks_covered = (sampling_end - acquisition_date) / lookback
        expected_n_samples = max(
            int(np.floor(lookbacks_covered * samples_per_lookback)), 1)

        assert n_samples > 0, "expected at least one sample for a new customer"
        assert n_samples == expected_n_samples, (
            f"expected {expected_n_samples} samples, got {n_samples}")
Пример #10
0
    def test_samples_within_bins(
        self,
        min_date,
        max_date,
        lead_time,
        prediction_period,
        lookback,
        samples_per_lookback,
        random_seed,
    ):
        """
        GIVEN a set of parameters for a sampler
        WHEN samples are generated from one customer's data using a sampler instantiated
            with these parameters
        THEN the expected number of samples is generated and every sample falls into
            its own equi-spaced bin of the customer's purchase history. If the
            parameters leave no room for a sample, the result is expected to be empty.
        """
        np.random.seed(random_seed)

        # generate data for timespan
        customer_data = self.generate_data_for_one_customer(
            1, min_date, max_date)

        # initialize sampler with the raw (unconverted) parameters
        sampler = BinnedUniformSampler(
            min_date=min_date,
            max_date=max_date,
            lead_time=lead_time,
            prediction_period=prediction_period,
            lookback=lookback,
            samples_per_lookback=samples_per_lookback,
        )

        # convert the parameters for the date arithmetic below
        min_date = pd.to_datetime(min_date)
        max_date = pd.to_datetime(max_date)
        lead_time = pd.to_timedelta(lead_time)
        prediction_period = pd.to_timedelta(prediction_period)
        lookback = pd.to_timedelta(lookback)

        samples = sampler.generate_samples(customer_data)

        # calculate boundaries of the sampling range
        upper = max_date - prediction_period - lead_time
        lower = max(min_date, customer_data.acquisition_date.max())

        # calculate the number of samples: proportional to the lookbacks
        # covered, with at least one sample if the customer has enough data
        # for one prediction period, and none otherwise
        if upper > lower:
            lookbacks_covered = (upper - lower) / lookback
            n_samples_expected = max(
                int(np.floor(lookbacks_covered * samples_per_lookback)), 1)
        else:
            n_samples_expected = 0

        if n_samples_expected == 0:
            # Without any expected sample there are no bins to check, and the
            # bin size below would require a division by zero. Only verify
            # that nothing was generated.
            assert samples.empty
            return

        # check that we actually generated the expected number of samples
        n_samples = samples.index.get_level_values("sample_id").nunique()
        assert n_samples == n_samples_expected, (
            f"expected {n_samples_expected} samples, got {n_samples}")

        # full lookbacks for customers with enough data
        if lower < (upper - lookback):
            lower = lower + lookback
        lower = max(lower, min_date + lookback)

        # calculate the size of the individual bins for sampling
        bin_size = (upper - lower) / n_samples_expected

        # sort prediction times into bins
        prediction_times = samples.groupby("sample_id").prediction_time.first()
        bins = np.floor((prediction_times - lower) / bin_size)

        # check that every sample falls into its own bin
        assert bins.nunique() == n_samples_expected