def test_result_is_reproduceable_within_sampler(self, config, raw_data, seed):
    """
    GIVEN a sampler instantiated with a random seed
    WHEN this sampler is applied to the same data twice
    THEN the results of both applications are equal
    """
    sampler = BinnedUniformSampler(random_seed=seed, **config)

    first_result = sampler.generate_samples(raw_data)
    second_result = sampler.generate_samples(raw_data)

    pd.testing.assert_frame_equal(first_result, second_result)

def test_result_is_reproduceable_between_samplers(self, config, raw_data, seed):
    """
    GIVEN two samplers with the same random seed and the same other parameters
    WHEN both samplers are applied to the same raw data
    THEN the results are the same
    """
    first_sampler = BinnedUniformSampler(random_seed=seed, **config)
    first_result = first_sampler.generate_samples(raw_data)

    second_sampler = BinnedUniformSampler(random_seed=seed, **config)
    second_result = second_sampler.generate_samples(raw_data)

    pd.testing.assert_frame_equal(first_result, second_result)

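# The reproducibility tests above rely on three pytest fixtures defined outside this
# section: `seed` (an integer), `config` (the BinnedUniformSampler keyword arguments
# other than `random_seed`), and `raw_data` (a dummy purchase-history frame for
# multiple customers). A minimal, hypothetical sketch of such a config, reusing only
# parameter values that appear elsewhere in this class:
#
#     @pytest.fixture
#     def config():
#         return dict(
#             min_date="2016-01-01",
#             max_date="2019-12-31",
#             lead_time="28d",
#             prediction_period="180d",
#             lookback="180d",
#             samples_per_lookback=1,
#         )
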
def test_y_data_but_no_x_data(self):
    """
    GIVEN a sampler and dummy data for a customer with no purchases falling into the \
    lookback period
    WHEN samples are generated from that customer's data
    THEN there are no order positions marked with `x_include` in the result.
    """
    lead_time = pd.to_timedelta("1d")
    lookback = pd.to_timedelta("2y")
    prediction_period = pd.to_timedelta("180d")
    max_date = pd.to_datetime("2020-01-01")
    min_date = (max_date - prediction_period - lead_time - lookback
                - pd.to_timedelta("1d"))

    # a sampler with max date greater than the maximum order date and
    # params such that exactly one sample is created for the customer
    sampler = BinnedUniformSampler(
        min_date=min_date,
        max_date=max_date,
        lead_time=lead_time,
        prediction_period=prediction_period,
        samples_per_lookback=1,
        lookback=lookback,
    )

    # purchase history that covers one prediction period, but no lookback
    customer_data = self.generate_data_for_one_customer(
        1, max_date - prediction_period - lead_time, max_date, n_orders=12)

    samples = sampler.generate_samples(customer_data)

    assert samples.index.get_level_values("sample_id").nunique() == 1
    assert samples.x_include.sum() == 0
    assert samples.y_include.sum() >= 1

def test_acquisition_date_greater_max_date(self):
    """
    GIVEN a sampler and data for a customer whose acquisition date is greater \
    than the sampler's `max_date`
    WHEN the sampler is applied to the data
    THEN it returns an empty dataframe
    """
    customer_data = self.generate_data_for_one_customer(
        1,
        min_date="2016-01-01",  # will be overwritten by the acquisition date
        max_date="2020-08-01",
        acquisition_date="2020-01-01",
        n_orders=12,
    )

    sampler = BinnedUniformSampler(
        min_date="2016-01-01",
        max_date="2019-12-31",
        lead_time="28d",
        prediction_period="180d",
        lookback="180d",
        samples_per_lookback=1,
    )

    samples = sampler.generate_samples(customer_data)

    assert samples.empty

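# `generate_data_for_one_customer` is defined elsewhere in this test class; its exact
# implementation is not shown here. The method below is a hypothetical, minimal
# stand-in (note the underscore prefix; it is not used by the tests) illustrating the
# call signature used above and the columns the tests rely on: `customer_id`,
# `order_date`, and `acquisition_date`, with order dates restricted to the customer's
# active period. The even spacing of orders is an assumption for illustration only.
def _sketch_data_for_one_customer(self, customer_id, min_date, max_date,
                                  acquisition_date=None, n_orders=10):
    min_date = pd.to_datetime(min_date)
    max_date = pd.to_datetime(max_date)
    acquisition_date = (pd.to_datetime(acquisition_date)
                        if acquisition_date is not None else min_date)

    # no purchases before the acquisition date
    start = max(min_date, acquisition_date)

    # evenly spaced order positions; the real helper may randomize these
    order_dates = pd.date_range(start=start, end=max_date, periods=n_orders)

    return pd.DataFrame({
        "customer_id": customer_id,
        "order_date": order_dates,
        "acquisition_date": acquisition_date,
    })
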
def test_different_seeds_produce_different_outcomes(
        self, config, raw_data, seed):
    """
    GIVEN two samplers with the same parameters but different random seeds
    WHEN both samplers are applied to the same raw data
    THEN the results differ
    """
    first_sampler = BinnedUniformSampler(random_seed=seed, **config)
    second_sampler = BinnedUniformSampler(random_seed=seed + 1, **config)

    first_result = first_sampler.generate_samples(raw_data)
    second_result = second_sampler.generate_samples(raw_data)

    # the comparison is expected to fail; identical frames mean the seed had no effect
    try:
        pd.testing.assert_frame_equal(first_result, second_result)
    except AssertionError:
        pass
    else:
        raise AssertionError(
            "results are identical despite different random seeds")

def test_no_prediction_time_outside_min_and_max_date(
        self, sampler: BinnedUniformSampler, raw_data: pd.DataFrame):
    """
    GIVEN a sampler instance and some dummy purchase data for multiple customers
    WHEN samples are created from this data using the sampler
    THEN the prediction times of all samples lie strictly between `sampler.min_date` \
    and `sampler.max_date`
    """
    sampled = sampler.generate_samples(raw_data)

    max_date = sampler.max_date
    min_date = sampler.min_date

    assert np.all(sampled.prediction_time > min_date)
    assert np.all(sampled.prediction_time < max_date)

def test_no_data_in_lead_time(self, sampler: BinnedUniformSampler,
                              raw_data: pd.DataFrame):
    """
    GIVEN a sampler with a lead time and some dummy purchase data to create samples from
    WHEN samples are generated from this purchase data
    THEN no order position marked in the column `y_include` falls within the lead time \
    following the prediction time
    """
    sampled = sampler.generate_samples(raw_data)
    y_data = sampled[sampled.y_include]

    lead_time = sampler.lead_time
    lower_boundary = y_data.prediction_time + lead_time

    assert np.all(y_data.order_date > lower_boundary)

def test_sampled_data_columns(
    self,
    sampler: BinnedUniformSampler,
    expected_column: str,
    raw_data: pd.DataFrame,
):
    """
    GIVEN a sampler and dummy purchase data for multiple customers
    WHEN samples are created from this dummy data using the sampler
    THEN the output dataframe from the sampler contains all mandatory columns.
    """
    sampled = sampler.generate_samples(raw_data)

    columns = list(sampled.columns) + list(sampled.index.names)

    assert expected_column in columns

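# `expected_column` is supplied by a parametrization defined outside this section. A
# hypothetical parametrization, listing only columns that other tests in this class
# actually rely on, might look like:
#
#     @pytest.mark.parametrize("expected_column", [
#         "sample_id", "prediction_time", "order_date",
#         "x_include", "y_include",
#     ])
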
def test_samples_for_new_customers(
    self,
    min_date,
    max_date,
    lookback,
    prediction_period,
    lead_time,
    samples_per_lookback,
):
    """
    GIVEN a set of parameters for a sampler
    WHEN samples are generated from a customer's purchase history that does not cover a \
    full lookback
    THEN the number of samples generated for this customer is still proportional to the \
    time period covered by the customer's purchase history, and there is at least one \
    sample regardless of the available purchase history.
    """
    min_date = pd.to_datetime(min_date)
    max_date = pd.to_datetime(max_date)
    lookback = pd.to_timedelta(lookback)
    prediction_period = pd.to_timedelta(prediction_period)
    lead_time = pd.to_timedelta(lead_time)
    samples_per_lookback = 3

    acquisition_date = (max_date - (lookback * np.random.uniform())
                        - prediction_period - lead_time)
    new_customer_data = self.generate_data_for_one_customer(
        1, min_date, max_date, acquisition_date)

    sampler = BinnedUniformSampler(
        min_date=min_date,
        max_date=max_date,
        lead_time=lead_time,
        prediction_period=prediction_period,
        lookback=lookback,
        samples_per_lookback=samples_per_lookback,
    )
    samples = sampler.generate_samples(new_customer_data)

    n_samples = samples.index.get_level_values("sample_id").nunique()
    lookbacks_covered = ((max_date - prediction_period - lead_time)
                         - acquisition_date) / lookback
    expected_n_samples = np.max(
        [np.floor(lookbacks_covered * samples_per_lookback), 1])

    assert expected_n_samples == n_samples
    assert n_samples > 0

def test_samples_within_bins(
    self,
    min_date,
    max_date,
    lead_time,
    prediction_period,
    lookback,
    samples_per_lookback,
    random_seed,
):
    """
    GIVEN a set of parameters for a sampler
    WHEN samples are generated from one customer's data using a sampler instantiated \
    with these parameters
    THEN all samples for this customer fall into equi-spaced bins of the customer's \
    purchase history.
    """
    np.random.seed(random_seed)

    # generate data for the timespan
    customer_data = self.generate_data_for_one_customer(
        1, min_date, max_date)

    # initialize sampler
    sampler = BinnedUniformSampler(
        min_date=min_date,
        max_date=max_date,
        lead_time=lead_time,
        prediction_period=prediction_period,
        lookback=lookback,
        samples_per_lookback=samples_per_lookback,
    )

    min_date = pd.to_datetime(min_date)
    max_date = pd.to_datetime(max_date)
    lead_time = pd.to_timedelta(lead_time)
    prediction_period = pd.to_timedelta(prediction_period)
    lookback = pd.to_timedelta(lookback)

    samples = sampler.generate_samples(customer_data)

    # calculate boundaries of the sampling range
    upper = max_date - prediction_period - lead_time
    lower = max(min_date, customer_data.acquisition_date.max())

    # calculate the expected number of samples
    lookbacks_covered = (upper - lower) / lookback
    n_samples_expected = np.floor(lookbacks_covered * samples_per_lookback)
    n_samples_expected = n_samples_expected.astype(int)

    # at least one sample if the customer has enough data for one prediction period
    if upper > lower:
        n_samples_expected = max(n_samples_expected, 1)
    else:
        n_samples_expected = 0

    # full lookbacks for customers with enough data
    if lower < (upper - lookback):
        lower = lower + lookback
    lower = max(lower, min_date + lookback)

    # calculate the size of the individual bins for sampling
    bin_size = (upper - lower) / n_samples_expected

    # sort prediction times into bins
    prediction_times = samples.groupby("sample_id").prediction_time.first()
    bins = np.floor((prediction_times - lower) / bin_size)

    # check that we actually generated the expected number of samples
    assert (samples.index.get_level_values("sample_id").nunique()
            == n_samples_expected)
    # check that every sample falls into its own bin
    assert bins.nunique() == n_samples_expected

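# Hypothetical helper, not used by the tests above: it restates in one place the
# sample-counting rule that test_samples_for_new_customers and test_samples_within_bins
# assert against, i.e. floor(lookbacks_covered * samples_per_lookback) with a minimum
# of one sample whenever the customer has any history before
# `max_date - prediction_period - lead_time`. All inputs are assumed to be pandas
# Timestamps/Timedeltas.
def _expected_sample_count(self, min_date, max_date, acquisition_date, lookback,
                           prediction_period, lead_time, samples_per_lookback):
    # upper boundary of the sampling range: the last admissible prediction time
    upper = max_date - prediction_period - lead_time
    # lower boundary: the customer cannot be sampled before acquisition or min_date
    lower = max(min_date, acquisition_date)

    if upper <= lower:
        return 0

    lookbacks_covered = (upper - lower) / lookback
    return max(int(np.floor(lookbacks_covered * samples_per_lookback)), 1)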