def test_single_batch(self, use_tensor_extension):
    """Build a generator over 9 periods of a single id and inspect batch 0.

    Asserts that the generator reports exactly one batch, that the batch is
    a ``pd.DataFrame`` holding the id, datetime, target and ``seq_``-prefixed
    columns, and that each sequence column has ``seq_length`` entries along
    its second axis.
    """
    seq_length = 2
    frame = mock_fit_data(
        periods=9,
        ids=[0],
        use_tensor_extension=use_tensor_extension,
    )
    generator = SequenceForecastBatchGenerator(
        df=frame,
        batch_size=4,
        sequence_length=seq_length,
        id_column='id',
        sequence_columns=[ini.Columns.datetime, ini.Columns.target],
        sequence_prefix='seq_',
        last_step_columns=[],
        forecast_steps_min=1,
        forecast_steps_max=1,
    )
    assert len(generator) == 1

    first_batch = generator[0]
    assert isinstance(first_batch, pd.DataFrame)

    prefixed_names = [
        f'seq_{ini.Columns.datetime}',
        f'seq_{ini.Columns.target}',
    ]
    required_names = [
        'id',
        ini.Columns.datetime,
        ini.Columns.target,
        f'seq_{ini.Columns.datetime}',
        f'seq_{ini.Columns.target}',
    ]
    assert all(name in first_batch for name in required_names)

    # Every sequenced column must be seq_length wide along axis 1.
    assert all(
        first_batch[name].values.shape[1] == seq_length
        for name in prefixed_names
    )
def test_single_batch_with_last_step(self):
    """Inspect batch 0 when a last-step column is requested as well.

    On top of the column and sequence-width checks, asserts that the
    ``last_step_``-prefixed datetime column is present and that its values
    are one-dimensional.
    """
    seq_length = 2
    frame = mock_fit_data(periods=9, ids=[0])
    generator = SequenceForecastBatchGenerator(
        df=frame,
        batch_size=4,
        sequence_length=seq_length,
        id_column='id',
        sequence_columns=[ini.Columns.datetime, ini.Columns.target],
        sequence_prefix='seq_',
        last_step_columns=[ini.Columns.datetime],
        last_step_prefix='last_step_',
        forecast_steps_min=1,
        forecast_steps_max=1,
    )
    assert len(generator) == 1

    first_batch = generator[0]
    assert isinstance(first_batch, pd.DataFrame)

    required_names = [
        'id',
        ini.Columns.datetime,
        ini.Columns.target,
        f'seq_{ini.Columns.datetime}',
        f'seq_{ini.Columns.target}',
        f'last_step_{ini.Columns.datetime}',
    ]
    assert all(name in first_batch for name in required_names)

    # Sequence columns: second axis has seq_length entries.
    prefixed_names = [
        f'seq_{ini.Columns.datetime}',
        f'seq_{ini.Columns.target}',
    ]
    assert all(
        first_batch[name].values.shape[1] == seq_length
        for name in prefixed_names
    )

    # Last-step columns: a flat, one-dimensional array of values.
    final_step_names = [f'last_step_{ini.Columns.datetime}']
    assert all(
        len(first_batch[name].values.shape) == 1
        for name in final_step_names
    )
def test_random_offset(self, random):
    """Two reads of batch 0 must differ when ``batch_offset=True``.

    The comparison is expected to fail, i.e. ``assert_array_equal`` has to
    raise ``AssertionError`` for the two fetched batches.
    """
    frame = mock_fit_data(periods=101, ids=[0])
    generator = SequenceForecastBatchGenerator(
        df=frame,
        batch_offset=True,
        sequence_length=10,
    )
    with pytest.raises(AssertionError):
        # Both fetches stay inside the context so that any AssertionError
        # they raise is handled exactly as in a single-expression call.
        first_draw = generator[0]
        second_draw = generator[0]
        assert_array_equal(first_draw, second_draw)
def test_featurize_is_in_interval(featurizer, periods):
    """Check the ``is_in_interval`` attribute for the 17:00-19:00 window.

    The featurizer is configured on ``datetime_column`` and the summed flag
    column must equal ``4 * len(df) / 48``.
    """
    window = {
        'start_time': '17:00',
        'end_time': '19:00',
    }
    raw = mock_fit_data(
        start_date=datetime.datetime(2017, 1, 1, 0, 0),
        periods=periods,
    )
    featurizer.set_params(
        column=datetime_column,
        attributes='is_in_interval',
        kwargs=window,
    )
    featurized = featurizer.transform(raw)

    flagged = np.sum(featurized['is_in_interval'])
    # presumably 48 rows per day, 4 of which fall in the window - TODO confirm
    assert flagged == 4 * len(featurized) / 48
def test_get_time_is_in_interval_from_series(periods):
    """Exercise ``get_time_is_in_interval_from_series`` in both directions.

    The 17:00-19:00 result is only checked for length; the reversed
    19:00-17:00 result must sum to ``44 * len(df) / 48``.
    """
    frame = mock_fit_data(
        start_date=datetime.datetime(2017, 1, 1, 0, 0),
        periods=periods,
    )
    timestamps_a = frame[datetime_column]
    is_peak = get_time_is_in_interval_from_series(
        start_time='17:00',
        end_time='19:00',
        series=timestamps_a,
    )
    timestamps_b = frame[datetime_column]
    not_peak = get_time_is_in_interval_from_series(
        start_time='19:00',
        end_time='17:00',
        series=timestamps_b,
    )

    assert len(is_peak) == periods
    off_window_total = np.sum(not_peak)
    assert off_window_total == 44 * len(frame) / 48
def test_get_sequence_values(self):
    """``_get_sequence_values`` returns one row per start index.

    For three start indices and a sequence length of 2, the returned shape
    must be ``(3, 2)``.
    """
    sequence_length = 2
    n_points = 10
    frame = mock_fit_data(periods=n_points, ids=[0])
    sampler = SamplingForecastBatchGenerator(
        df=frame,
        sequence_length=sequence_length,
    )

    start_indices = np.array([0, 3, 1])
    expected_shape = (len(start_indices), sequence_length)

    result = sampler._get_sequence_values(ini.Columns.id, start_indices)
    assert result.shape == expected_shape
def test_random_offset_value_with_period(self, random, seq_len, period,
                                         expected_max_offset):
    """Sample ``random_offset_value`` 100 times and check its range.

    The smallest sample must be 0, the largest must equal
    ``expected_max_offset`` and every sample must be a multiple of
    ``period``.
    """
    frame = mock_fit_data(periods=101, ids=[0])
    generator = SequenceForecastBatchGenerator(
        df=frame,
        sequence_length=seq_len,
        batch_offset=True,
        batch_offset_period=period,
    )

    samples = []
    for _ in range(100):
        samples.append(generator.random_offset_value)

    assert min(samples) == 0
    assert max(samples) == expected_max_offset
    assert all(sample % period == 0 for sample in samples)
def test_subgen_lengths(self, n_customers, batch_size, exp_sg_len):
    """Every entry of ``subgen_lengths`` must equal ``exp_sg_len``.

    The frame holds 3 periods for ids ``0..n_customers-1``; the generator
    uses a sequence length of 1 and a maximum forecast step of 1.
    """
    # NOTE(review): the incoming n_customers argument is overwritten here,
    # so the value passed by the caller is ignored - confirm this is intended.
    n_customers = 3
    id_values = np.arange(n_customers)
    frame = mock_fit_data(periods=3, ids=id_values)
    generator = SequenceForecastBatchGenerator(
        df=frame,
        sequence_length=1,
        forecast_steps_max=1,
        batch_size=batch_size,
    )
    assert all(
        length == exp_sg_len for length in generator.subgen_lengths
    )
def test_n_batches(self, n_points, seq_length, fc_max, batch_size,
                   n_batches_expected):
    """``len(generator)`` must equal ``n_batches_expected``.

    The generator is built over ``n_points`` periods of a single id with
    forecast steps ranging from 1 to ``fc_max``.
    """
    frame = mock_fit_data(periods=n_points, ids=[0])
    settings = dict(
        batch_size=batch_size,
        sequence_length=seq_length,
        forecast_steps_min=1,
        forecast_steps_max=fc_max,
    )
    generator = SequenceForecastBatchGenerator(df=frame, **settings)

    batch_count = len(generator)
    assert batch_count == n_batches_expected
def test_n_subgens(self, n_customers):
    """Per-id bookkeeping must scale with the number of ids.

    ``chunks`` and ``subgen_lengths`` need ``n_customers`` entries, while
    ``subgen_index_bounds`` needs one more than that.
    """
    id_values = np.arange(n_customers)
    frame = mock_fit_data(periods=4, ids=id_values)
    generator = SequenceForecastBatchGenerator(
        df=frame,
        sequence_length=2,
        forecast_steps_max=1,
        batch_size=2 ** 10,
    )

    n_chunks = len(generator.chunks)
    n_lengths = len(generator.subgen_lengths)
    n_bounds = len(generator.subgen_index_bounds)

    assert n_chunks == n_customers
    assert n_lengths == n_customers
    assert n_bounds == n_customers + 1
def test_invalid_start_time(self):
    """Fetching a batch with an off-grid ``start_time`` raises ValueError.

    The start time is the datetime stored under label 0 shifted by one
    minute.
    """
    frame = mock_fit_data(periods=1344, ids=[0])
    frame = frame.sort_values(by=[ini.Columns.datetime])

    one_minute = pd.Timedelta(1, unit='m')
    shifted = frame[ini.Columns.datetime][0] + one_minute
    start_time = shifted.time()

    generator = SequenceForecastBatchGenerator(
        df=frame,
        sequence_length=48,
        sequence_columns=[ini.Columns.datetime],
        start_time=start_time,
    )
    with pytest.raises(ValueError):
        generator[0]
def test_aggregate_ids(self):
    """With ``batch_aggregator=2`` two ids yield one batch of length 2."""
    n_customers = 2
    id_values = np.arange(n_customers)
    frame = mock_fit_data(periods=3, ids=id_values)
    generator = SequenceForecastBatchGenerator(
        df=frame,
        sequence_length=2,
        forecast_steps_max=1,
        batch_size=2,
        batch_aggregator=2,
    )
    assert len(generator) == 1

    only_batch = generator[0]
    assert len(only_batch) == 2
def test_get_is_morning_peak_from_series(featurizer, periods):
    """Check the ``is_peak``, ``is_daytime`` and ``is_morningpeak`` flags.

    All three columns must exist after the transform, and their sums must
    equal 6, 34 and 10 forty-eighths of the frame length respectively.
    """
    raw = mock_fit_data(
        start_date=datetime.datetime(2017, 1, 1, 0, 0),
        periods=periods,
    )
    featurizer.set_params(
        column=datetime_column,
        attributes=['is_peak', 'is_daytime', 'is_morningpeak'],
    )
    featurized = featurizer.transform(raw)

    # Expected number of flagged rows per 48 rows, keyed by column name.
    flagged_per_48 = {
        'is_peak': 6,
        'is_daytime': 34,
        'is_morningpeak': 10,
    }

    # Presence is verified for every column before any sum is compared.
    for name in flagged_per_48:
        assert name in featurized
    for name, share in flagged_per_48.items():
        assert np.sum(featurized[name]) == share * len(featurized) / 48
def test_find_batch_raises_outside_subgens(self):
    """``find_subbatch_in_subgens`` raises IndexError for index ``2**10``."""
    n_customers = 3
    id_values = np.arange(n_customers)
    frame = mock_fit_data(periods=3, ids=id_values)
    generator = SequenceForecastBatchGenerator(
        df=frame,
        sequence_length=1,
        forecast_steps_max=1,
        batch_size=2 ** 10,
    )

    out_of_range_idx = 2 ** 10
    with pytest.raises(IndexError):
        generator.find_subbatch_in_subgens(out_of_range_idx)
def test_num_examples(self, n_points, seq_length, fc_max,
                      n_sequences_expected):
    """Check ``num_examples`` with and without batch offset.

    Without offset it must equal ``n_sequences_expected``; after setting
    ``batch_offset = True`` it must be one less, floored at zero.
    """
    frame = mock_fit_data(periods=n_points, ids=[0])
    generator = SequenceForecastBatchGenerator(
        df=frame,
        sequence_length=seq_length,
        forecast_steps_min=1,
        forecast_steps_max=fc_max,
    )
    assert generator.num_examples == n_sequences_expected

    # Toggle the offset on the already-built generator and re-read the count.
    generator.batch_offset = True
    expected_with_offset = max(0, n_sequences_expected - 1)
    assert generator.num_examples == expected_with_offset
def test_find_batch_in_subgens(self, batch_size, batch_idx, exp_subgen_idx,
                               exp_idx_in_subgen):
    """``find_subbatch_in_subgens`` maps a batch index to its location.

    The returned pair must equal ``(exp_subgen_idx, exp_idx_in_subgen)``.
    """
    n_customers = 3
    id_values = np.arange(n_customers)
    frame = mock_fit_data(periods=3, ids=id_values)
    generator = SequenceForecastBatchGenerator(
        df=frame,
        sequence_length=1,
        forecast_steps_max=1,
        batch_size=batch_size,
    )

    location = generator.find_subbatch_in_subgens(batch_idx)
    subgen_idx, idx_in_subgen = location

    assert subgen_idx == exp_subgen_idx
    assert idx_in_subgen == exp_idx_in_subgen
def test_batch_size(self, n_points, seq_length, fc_max, batch_size,
                    expected_last_batch_size):
    """All batches but the last are full; the last has the expected size."""
    frame = mock_fit_data(periods=n_points, ids=[0])
    generator = SequenceForecastBatchGenerator(
        df=frame,
        batch_size=batch_size,
        sequence_columns=[ini.Columns.target],
        last_step_columns=[],
        sequence_length=seq_length,
        forecast_steps_min=1,
        forecast_steps_max=fc_max,
    )

    # Batches 0 .. len-2 must each contain exactly batch_size rows.
    n_leading = len(generator) - 1
    assert all(
        len(generator[position]) == batch_size
        for position in range(n_leading)
    )

    final_batch = generator[-1]
    assert len(final_batch) == expected_last_batch_size
def test_n_batches_with_offset(self, n_points, seq_length, fc_max,
                               batch_size, n_batches_expected,
                               use_tensor_extension):
    """``len(generator)`` must equal ``n_batches_expected`` with offset on."""
    frame = mock_fit_data(
        periods=n_points,
        ids=[0],
        use_tensor_extension=use_tensor_extension,
    )
    settings = dict(
        batch_size=batch_size,
        sequence_length=seq_length,
        forecast_steps_min=1,
        forecast_steps_max=fc_max,
        batch_offset=True,
    )
    generator = SequenceForecastBatchGenerator(df=frame, **settings)

    batch_count = len(generator)
    assert batch_count == n_batches_expected
def test_start_time(self, start_time_idx, expected_start_time_idx):
    """The first sequence of batch 0 starts at the expected time of day.

    ``start_time`` is taken from the datetime at ``start_time_idx`` (or left
    as ``None``), and the first datetime of the first sequence must carry
    the time found at ``expected_start_time_idx``.
    """
    frame = mock_fit_data(periods=1344, ids=[0])
    frame = frame.sort_values(by=[ini.Columns.datetime])

    start_time = (
        None
        if start_time_idx is None
        else frame[ini.Columns.datetime][start_time_idx].time()
    )
    expected_stamp = frame[ini.Columns.datetime][expected_start_time_idx]
    expected_start_time = expected_stamp.time()

    generator = SequenceForecastBatchGenerator(
        df=frame,
        sequence_length=48,
        sequence_columns=[ini.Columns.datetime],
        batch_offset=False,
        start_time=start_time,
    )
    first_batch = generator[0]

    # First row, first step of the sequenced datetime column.
    leading_stamp = first_batch[f'seq_{ini.Columns.datetime}'][0][0]
    assert leading_stamp.time() == expected_start_time
def indexed_df():
    """Return the result of ``mock_fit_data`` called with ``index=True``."""
    frame = mock_fit_data(index=True)
    return frame
def df():
    """Return mock fit data with ``N_TIMES`` periods for ids ``0..N_IDS-1``."""
    settings = {
        'periods': N_TIMES,
        'ids': np.arange(N_IDS),
    }
    return mock_fit_data(**settings)
def df(use_tensor_extension):
    """Return mock fit data, forwarding the ``use_tensor_extension`` flag."""
    frame = mock_fit_data(use_tensor_extension=use_tensor_extension)
    return frame
def df():
    """Return mock fit data whose start date is 2017-01-01 01:00."""
    first_stamp = datetime.datetime(2017, 1, 1, 1, 0)
    return mock_fit_data(start_date=first_stamp)
def validation_df():
    """Return mock fit data built with ``periods=13``."""
    frame = mock_fit_data(periods=13)
    return frame
def df():
    """Return mock fit data built with ``periods=13``."""
    frame = mock_fit_data(periods=13)
    return frame
def df():
    """Return mock fit data built with the helper's default arguments."""
    frame = mock_fit_data()
    return frame