def create_transformation(self) -> Transformation:
    """Assemble the transformation chain applied to each dataset entry.

    The steps are constructed in the order they appear in the ``Chain``:
    ``AsNumpyArray`` on the target field, ``AddTimeFeatures`` configured
    from ``self.freq`` and ``self.prediction_length``, a ``[0.0]`` fallback
    for the static categorical field followed by ``AsNumpyArray`` on that
    field, and finally an ``InstanceSplitter`` that uses
    ``TestSplitSampler`` with ``self.context_length`` /
    ``self.prediction_length`` as past / future lengths.

    Returns:
        The ``Chain`` wrapping the transformations listed above.
    """
    target_to_array = AsNumpyArray(field=FieldName.TARGET, expected_ndim=1)

    add_time_features = AddTimeFeatures(
        start_field=FieldName.START,
        target_field=FieldName.TARGET,
        output_field=FieldName.FEAT_TIME,
        time_features=time_features_from_frequency_str(self.freq),
        pred_length=self.prediction_length,
    )

    # Provide a default static categorical feature when the entry has none.
    default_static_cat = SetFieldIfNotPresent(
        field=FieldName.FEAT_STATIC_CAT, value=[0.0]
    )
    static_cat_to_array = AsNumpyArray(
        field=FieldName.FEAT_STATIC_CAT, expected_ndim=1
    )

    splitter = transform.InstanceSplitter(
        target_field=transform.FieldName.TARGET,
        is_pad_field=transform.FieldName.IS_PAD,
        start_field=transform.FieldName.START,
        forecast_start_field=transform.FieldName.FORECAST_START,
        train_sampler=TestSplitSampler(),
        time_series_fields=[FieldName.FEAT_TIME],
        past_length=self.context_length,
        future_length=self.prediction_length,
    )

    steps = [
        target_to_array,
        add_time_features,
        default_static_cat,
        static_cat_to_array,
        splitter,
    ]
    return Chain(trans=steps)
def test_ExpectedNumInstanceSampler():
    """Check the scale histogram obtained with ``ExpectedNumInstanceSampler``.

    A dataset from ``make_dataset(N, train_length)`` is split repeatedly in
    training mode; the non-zero ``past_target`` values are accumulated in a
    ``ScaleHistogram`` whose bin counts must equal ``2**i * repetition`` for
    every ``i`` in ``1..N-1``.
    """
    N = 6
    train_length = 2
    pred_length = 1
    repetition = 2

    ds = make_dataset(N, train_length)

    sampler = transform.ExpectedNumInstanceSampler(
        num_instances=4, min_future=pred_length
    )
    splitter = transform.InstanceSplitter(
        target_field=FieldName.TARGET,
        is_pad_field=FieldName.IS_PAD,
        start_field=FieldName.START,
        forecast_start_field=FieldName.FORECAST_START,
        instance_sampler=sampler,
        past_length=train_length,
        future_length=pred_length,
    )
    chain = transform.Chain(trans=[splitter])

    assert_serializable(chain)

    scale_hist = ScaleHistogram()
    for _ in range(repetition):
        for entry in chain(iter(ds), is_train=True):
            past_target = entry["past_target"]
            # for simplicity, discard values that are zeros to avoid
            # confusion with padding
            non_zero = past_target[past_target > 0]
            scale_hist.add(non_zero)

    expected_values = {}
    for exponent in range(1, N):
        expected_values[exponent] = 2**exponent * repetition

    assert expected_values == scale_hist.bin_counts
def create_transformation(self) -> transform.Transformation:
    """Assemble the transformation chain applied to each dataset entry.

    The chain consists of, in order: ``AsNumpyArray`` on the target,
    ``AddTimeFeatures`` configured from ``self.freq`` and
    ``self.prediction_length``, ``VstackFeatures`` writing the time features
    into ``FieldName.FEAT_DYNAMIC_REAL``, a ``[0.0]`` fallback for the
    static categorical field followed by ``AsNumpyArray`` on it, and an
    ``InstanceSplitter`` using ``ExpectedNumInstanceSampler(num_instances=1)``.

    Returns:
        The ``transform.Chain`` wrapping the transformations listed above.
    """
    target_to_array = transform.AsNumpyArray(
        field=FieldName.TARGET, expected_ndim=1
    )

    add_time_features = transform.AddTimeFeatures(
        start_field=FieldName.START,
        target_field=FieldName.TARGET,
        output_field=FieldName.FEAT_TIME,
        time_features=time_features_from_frequency_str(self.freq),
        pred_length=self.prediction_length,
    )

    # The splitter below slices FEAT_DYNAMIC_REAL, so the time features are
    # stacked into that field first.
    stack_dynamic = transform.VstackFeatures(
        output_field=FieldName.FEAT_DYNAMIC_REAL,
        input_fields=[FieldName.FEAT_TIME],
    )

    default_static_cat = transform.SetFieldIfNotPresent(
        field=FieldName.FEAT_STATIC_CAT, value=[0.0]
    )
    static_cat_to_array = transform.AsNumpyArray(
        field=FieldName.FEAT_STATIC_CAT, expected_ndim=1
    )

    splitter = transform.InstanceSplitter(
        target_field=FieldName.TARGET,
        is_pad_field=FieldName.IS_PAD,
        start_field=FieldName.START,
        forecast_start_field=FieldName.FORECAST_START,
        train_sampler=ExpectedNumInstanceSampler(num_instances=1),
        past_length=self.context_length,
        future_length=self.prediction_length,
        time_series_fields=[FieldName.FEAT_DYNAMIC_REAL],
    )

    steps = [
        target_to_array,
        add_time_features,
        stack_dynamic,
        default_static_cat,
        static_cat_to_array,
        splitter,
    ]
    return transform.Chain(trans=steps)
def test_InstanceSplitter(start, target, lead_time: int, is_train: bool, pick_incomplete: bool):
    """Exercise ``InstanceSplitter`` with ``UniformSplitSampler(p=1.0)``.

    Verifies the number of emitted instances and the lengths of the past /
    future slices for the given combination of ``lead_time``, ``is_train``
    and ``pick_incomplete``. When predicting without ``pick_incomplete`` on
    a target shorter than the past length, an ``AssertionError`` is expected.
    """
    train_length = 100
    pred_length = 13

    splitter = transform.InstanceSplitter(
        target_field=FieldName.TARGET,
        is_pad_field=FieldName.IS_PAD,
        start_field=FieldName.START,
        forecast_start_field=FieldName.FORECAST_START,
        train_sampler=transform.UniformSplitSampler(p=1.0),
        past_length=train_length,
        future_length=pred_length,
        lead_time=lead_time,
        time_series_fields=["some_time_feature"],
        pick_incomplete=pick_incomplete,
    )

    assert_serializable(splitter)

    # The extra feature is longer than the target by 100 entries.
    other_feat = np.arange(len(target) + 100)
    data = {
        "start": start,
        "target": target,
        "some_time_feature": other_feat,
        "some_other_col": "ABC",
    }

    # Guard clause: too-short series in prediction mode must be rejected.
    if not (is_train or pick_incomplete) and len(target) < train_length:
        with pytest.raises(AssertionError):
            list(splitter.flatmap_transform(data, is_train=is_train))
        return

    out = list(splitter.flatmap_transform(data, is_train=is_train))

    if is_train:
        required_past = 0 if pick_incomplete else train_length
        expected_count = max(
            0,
            len(target) - pred_length - lead_time + 1 - required_past,
        )
    else:
        expected_count = 1
    assert len(out) == expected_count

    # In prediction mode the future target is empty, but the future feature
    # slice still has the full prediction length.
    expected_future_target_len = pred_length if is_train else 0
    for instance in out:
        assert "target" not in instance
        assert "some_time_feature" not in instance
        assert "some_other_col" in instance

        assert len(instance["past_some_time_feature"]) == train_length
        assert len(instance["past_target"]) == train_length

        assert len(instance["future_target"]) == expected_future_target_len
        assert len(instance["future_some_time_feature"]) == pred_length
def test_Transformation():
    """Run a full feature + splitting chain over a one-entry daily dataset.

    The chain must be serializable; every training instance it yields is
    printed.
    """
    train_length = 100
    entry = {"start": "2012-01-01", "target": [0.2] * train_length}
    ds = gluonts.dataset.common.ListDataset([entry], freq="1D")

    pred_length = 10

    add_time_features = transform.AddTimeFeatures(
        start_field=transform.FieldName.START,
        target_field=transform.FieldName.TARGET,
        output_field="time_feat",
        time_features=[
            time_feature.DayOfWeek(),
            time_feature.DayOfMonth(),
            time_feature.MonthOfYear(),
        ],
        pred_length=pred_length,
    )
    add_age = transform.AddAgeFeature(
        target_field=transform.FieldName.TARGET,
        output_field="age",
        pred_length=pred_length,
        log_scale=True,
    )
    add_observed = transform.AddObservedValuesIndicator(
        target_field=transform.FieldName.TARGET,
        output_field="observed_values",
    )
    # Merge the age and time features into a single "dynamic_feat" field.
    stack_features = transform.VstackFeatures(
        output_field="dynamic_feat",
        input_fields=["age", "time_feat"],
        drop_inputs=True,
    )
    splitter = transform.InstanceSplitter(
        target_field=transform.FieldName.TARGET,
        is_pad_field=transform.FieldName.IS_PAD,
        start_field=transform.FieldName.START,
        forecast_start_field=transform.FieldName.FORECAST_START,
        train_sampler=transform.ExpectedNumInstanceSampler(num_instances=4),
        past_length=train_length,
        future_length=pred_length,
        time_series_fields=["dynamic_feat", "observed_values"],
    )

    chain = transform.Chain(
        trans=[
            add_time_features,
            add_age,
            add_observed,
            stack_features,
            splitter,
        ]
    )

    assert_serializable(chain)

    for instance in chain(iter(ds), is_train=True):
        print(instance)
def _create_instance_splitter(self, mode: str):
    """Build the ``InstanceSplitter`` used by this object.

    Args:
        mode: Accepted for interface compatibility; this implementation
            does not read it and always uses ``TestSplitSampler``.

    Returns:
        A ``transform.InstanceSplitter`` slicing ``FieldName.FEAT_TIME``
        with ``self.context_length`` past steps and
        ``self.prediction_length`` future steps.
    """
    sampler = TestSplitSampler()
    sliced_fields = [FieldName.FEAT_TIME]

    splitter = transform.InstanceSplitter(
        target_field=FieldName.TARGET,
        is_pad_field=FieldName.IS_PAD,
        start_field=FieldName.START,
        forecast_start_field=FieldName.FORECAST_START,
        instance_sampler=sampler,
        time_series_fields=sliced_fields,
        past_length=self.context_length,
        future_length=self.prediction_length,
    )
    return splitter
def test_InstanceSplitter(start, target, is_train):
    """Exercise ``InstanceSplitter`` with ``pick_incomplete=True``.

    Verifies the number of emitted instances and the lengths of the past /
    future slices in both training and prediction mode.
    """
    train_length = 100
    pred_length = 13

    splitter = transform.InstanceSplitter(
        target_field=transform.FieldName.TARGET,
        is_pad_field=transform.FieldName.IS_PAD,
        start_field=transform.FieldName.START,
        forecast_start_field=transform.FieldName.FORECAST_START,
        train_sampler=transform.UniformSplitSampler(p=1.0),
        past_length=train_length,
        future_length=pred_length,
        time_series_fields=["some_time_feature"],
        pick_incomplete=True,
    )

    assert_serializable(splitter)

    # The extra feature is longer than the target by 100 entries.
    other_feat = np.arange(len(target) + 100)
    data = {
        "start": start,
        "target": target,
        "some_time_feature": other_feat,
        "some_other_col": "ABC",
    }

    out = list(splitter.flatmap_transform(data, is_train=is_train))

    if is_train:
        expected_count = max(0, len(target) - pred_length + 1)
    else:
        expected_count = 1
    assert len(out) == expected_count

    # In prediction mode the future target is empty, but the future feature
    # slice still has the full prediction length.
    expected_future_target_len = pred_length if is_train else 0
    for instance in out:
        assert "target" not in instance
        assert "some_time_feature" not in instance
        assert "some_other_col" in instance

        assert len(instance["past_some_time_feature"]) == train_length
        assert len(instance["past_target"]) == train_length

        assert len(instance["future_target"]) == expected_future_target_len
        assert len(instance["future_some_time_feature"]) == pred_length
def test_BucketInstanceSampler():
    """Check that ``BucketInstanceSampler`` fills the scale bins evenly.

    A dataset from ``make_dataset(N, train_length)`` is split ``repetition``
    times in training mode using a ``BucketInstanceSampler`` built from the
    dataset's own scale histogram. The non-zero ``past_target`` values are
    accumulated in a fresh ``ScaleHistogram``; each bin ``i`` in ``1..N-1``
    must hold ``repetition`` values, within a 30% relative tolerance.
    """
    N = 6
    train_length = 2
    pred_length = 1
    ds = make_dataset(N, train_length)

    dataset_stats = calculate_dataset_statistics(ds)

    t = transform.Chain(
        trans=[
            transform.InstanceSplitter(
                target_field=transform.FieldName.TARGET,
                is_pad_field=transform.FieldName.IS_PAD,
                start_field=transform.FieldName.START,
                forecast_start_field=transform.FieldName.FORECAST_START,
                train_sampler=transform.BucketInstanceSampler(
                    dataset_stats.scale_histogram
                ),
                past_length=train_length,
                future_length=pred_length,
                pick_incomplete=True,
            )
        ]
    )

    assert_serializable(t)

    scale_hist = ScaleHistogram()

    repetition = 200
    for _ in range(repetition):
        for data in t(iter(ds), is_train=True):
            target_values = data["past_target"]
            # for simplicity, discard values that are zeros to avoid
            # confusion with padding
            target_values = target_values[target_values > 0]
            scale_hist.add(target_values)

    expected_values = {i: repetition for i in range(1, N)}
    found_values = scale_hist.bin_counts

    for i in range(1, N):
        # abs() must wrap only the difference. Wrapping the whole comparison
        # takes abs() of a bool, which makes the check one-sided: a bin with
        # far too many samples would never fail.
        deviation = abs(expected_values[i] - found_values[i])
        assert deviation < expected_values[i] * 0.3
def _create_instance_splitter(self, mode: str):
    """Build the ``InstanceSplitter`` for the given mode.

    Args:
        mode: One of ``"training"``, ``"validation"`` or ``"test"``. It
            selects ``self.train_sampler``, ``self.validation_sampler`` or
            a new ``TestSplitSampler`` respectively.

    Returns:
        A ``transform.InstanceSplitter`` slicing
        ``FieldName.FEAT_DYNAMIC_REAL`` with ``self.context_length`` past
        steps and ``self.prediction_length`` future steps.

    Raises:
        AssertionError: If ``mode`` is not one of the three accepted values.
    """
    assert mode in ["training", "validation", "test"]

    # All three candidates are materialised before the lookup.
    samplers_by_mode = {
        "training": self.train_sampler,
        "validation": self.validation_sampler,
        "test": TestSplitSampler(),
    }
    instance_sampler = samplers_by_mode[mode]

    splitter = transform.InstanceSplitter(
        target_field=FieldName.TARGET,
        is_pad_field=FieldName.IS_PAD,
        start_field=FieldName.START,
        forecast_start_field=FieldName.FORECAST_START,
        instance_sampler=instance_sampler,
        past_length=self.context_length,
        future_length=self.prediction_length,
        time_series_fields=[FieldName.FEAT_DYNAMIC_REAL],
    )
    return splitter
def test_instance_splitter():
    """Check ``clone`` / ``equals`` on an ``InstanceSplitter``.

    A clone without overrides must compare equal to the original, while a
    clone whose ``instance_sampler`` is replaced must not.
    """
    original_sampler = transform.ExpectedNumInstanceSampler(num_instances=4)
    splitter = transform.InstanceSplitter(
        target_field=FieldName.TARGET,
        is_pad_field=FieldName.IS_PAD,
        start_field=FieldName.START,
        forecast_start_field=FieldName.FORECAST_START,
        instance_sampler=original_sampler,
        past_length=100,
        future_length=10,
        time_series_fields=["dynamic_feat", "observed_values"],
    )

    # Same splitter, but sampling a different expected number of instances.
    overrides = {
        "instance_sampler": transform.ExpectedNumInstanceSampler(num_instances=5)
    }
    splitter2 = clone(splitter, overrides)

    plain_copy = clone(splitter)
    assert equals(splitter, plain_copy)
    assert not equals(splitter, splitter2)
def test_multi_dim_transformation(is_train):
    """Run the feature + splitting chain on a two-row target containing NaNs.

    The dataset holds a single entry whose target has two rows of ten
    values, with one ``"NaN"`` at the end of the first row and one at the
    start of the second. The chain must be serializable, and every emitted
    instance must have the expected shapes and padded contents. The future
    target has shape ``(2, 2)`` in training mode and ``(2, 0)`` otherwise.
    """
    train_length = 10

    first_dim = np.arange(1, 11, 1).tolist()
    first_dim[-1] = "NaN"

    second_dim = np.arange(11, 21, 1).tolist()
    second_dim[0] = "NaN"

    ds = gluonts.dataset.common.ListDataset(
        data_iter=[{"start": "2012-01-01", "target": [first_dim, second_dim]}],
        freq="1D",
        one_dim_target=False,
    )
    pred_length = 2

    # Looks weird - but this is necessary to assert the nan entries correctly.
    first_dim[-1] = np.nan
    second_dim[0] = np.nan

    add_time_features = transform.AddTimeFeatures(
        start_field=transform.FieldName.START,
        target_field=transform.FieldName.TARGET,
        output_field="time_feat",
        time_features=[
            time_feature.DayOfWeek(),
            time_feature.DayOfMonth(),
            time_feature.MonthOfYear(),
        ],
        pred_length=pred_length,
    )
    add_age = transform.AddAgeFeature(
        target_field=transform.FieldName.TARGET,
        output_field="age",
        pred_length=pred_length,
        log_scale=True,
    )
    add_observed = transform.AddObservedValuesIndicator(
        target_field=transform.FieldName.TARGET,
        output_field="observed_values",
        convert_nans=False,
    )
    stack_features = transform.VstackFeatures(
        output_field="dynamic_feat",
        input_fields=["age", "time_feat"],
        drop_inputs=True,
    )
    splitter = transform.InstanceSplitter(
        target_field=transform.FieldName.TARGET,
        is_pad_field=transform.FieldName.IS_PAD,
        start_field=transform.FieldName.START,
        forecast_start_field=transform.FieldName.FORECAST_START,
        train_sampler=transform.ExpectedNumInstanceSampler(num_instances=4),
        past_length=train_length,
        future_length=pred_length,
        time_series_fields=["dynamic_feat", "observed_values"],
        output_NTC=False,
    )

    chain = transform.Chain(
        trans=[
            add_time_features,
            add_age,
            add_observed,
            stack_features,
            splitter,
        ]
    )

    assert_serializable(chain)

    # Both modes share every check except the future target's shape.
    train_flag = bool(is_train)
    expected_future_shape = (2, 2) if train_flag else (2, 0)

    for instance in chain(iter(ds), is_train=train_flag):
        assert_shape(instance["past_target"], (2, 10))
        assert_shape(instance["past_dynamic_feat"], (4, 10))
        assert_shape(instance["past_observed_values"], (2, 10))
        assert_shape(instance["future_target"], expected_future_shape)

        # The observed indicator is 0.0 exactly where the NaNs were placed.
        assert_padded_array(
            instance["past_observed_values"],
            np.array([[1.0] * 9 + [0.0], [0.0] + [1.0] * 9]),
            instance["past_is_pad"],
        )
        assert_padded_array(
            instance["past_target"],
            np.array([first_dim, second_dim]),
            instance["past_is_pad"],
        )