Пример #1
0
    def test_tsdataset_split(self):
        """Check ids and row counts of the three datasets returned with ``with_split=True``.

        The expected lengths assume the fixture provides 50 rows for each of
        the two ids -- TODO confirm against ``get_multi_id_ts_df``.
        """
        look_back, horizon = 5, 2
        df = get_multi_id_ts_df()
        splits = TSDataset.from_pandas(df,
                                       dt_col="datetime",
                                       target_col="value",
                                       extra_feature_col=["extra feature"],
                                       id_col="id",
                                       with_split=True,
                                       val_ratio=0.1,
                                       test_ratio=0.1,
                                       largest_look_back=look_back,
                                       largest_horizon=horizon)
        tsdata_train, tsdata_valid, tsdata_test = splits

        # Every split is expected to still contain both ids.
        for part in (tsdata_train, tsdata_valid, tsdata_test):
            assert set(np.unique(part.to_pandas()["id"])) == {"00", "01"}

        # The validation and test lengths asserted below include an extra
        # ``look_back + horizon - 1`` rows per id on top of the 10% ratio.
        rows_per_id = 50
        extended_rows = rows_per_id * 0.1 + look_back + horizon - 1
        assert len(tsdata_train.to_pandas()) == (rows_per_id * 0.8) * 2
        assert len(tsdata_valid.to_pandas()) == extended_rows * 2
        assert len(tsdata_test.to_pandas()) == extended_rows * 2
Пример #2
0
    def test_tsdataset_rolling_feature_multiple(self):
        """Generate rolling features on the multi-id frame, then roll and check shapes.

        Rolling-feature generation is run twice on freshly built datasets:
        once with the default ``n_jobs`` and once with ``n_jobs=2``.  The
        shape assertions assume 50 rows per id and two ids -- TODO confirm
        against ``get_multi_id_ts_df``.
        """
        df = get_multi_id_ts_df()
        horizon = random.randint(2, 10)
        lookback = random.randint(2, 20)

        def build_dataset():
            # A fresh feature list is passed on every call, as in two
            # independent ``from_pandas`` invocations.
            return TSDataset.from_pandas(df,
                                         dt_col="datetime",
                                         target_col="value",
                                         extra_feature_col=["extra feature"],
                                         id_col="id")

        # default n_jobs
        tsdata = build_dataset()
        tsdata.gen_rolling_feature(settings="minimal", window_size=lookback)
        tsdata._check_basic_invariants()

        # explicit n_jobs on a rebuilt dataset
        tsdata = build_dataset()
        tsdata.gen_rolling_feature(settings="minimal",
                                   window_size=lookback,
                                   n_jobs=2)
        tsdata._check_basic_invariants()

        windows_per_id = 50 - lookback - horizon + 1

        # roll train
        tsdata.roll(lookback=lookback, horizon=horizon)
        x, y = tsdata.to_numpy()
        feature_num = len(tsdata.feature_col) + len(tsdata.target_col)
        assert x.shape == (windows_per_id * 2, lookback, feature_num)
        assert y.shape == (windows_per_id * 2, horizon, 1)

        # id_sensitive roll: the asserted shapes have one sample axis shared
        # by both ids and a doubled last axis.
        tsdata.roll(lookback=lookback, horizon=horizon, id_sensitive=True)
        x, y = tsdata.to_numpy()
        assert x.shape == (windows_per_id, lookback, feature_num * 2)
        assert y.shape == (windows_per_id, horizon, 2)

        tsdata._check_basic_invariants()
Пример #3
0
    def test_dt_sorted(self):
        """Expect the strict invariant check to raise ``RuntimeError``.

        The frame's datetime strings are not in ascending order and no
        ``id_col`` is passed to ``from_pandas``.
        """
        timestamps = np.array(['20000101', '20000102', '20000102', '20000101'])
        values = np.array([1.9, 2.3, 2.4, 2.6])
        ids = np.array(['00', '01', '00', '01'])
        df = pd.DataFrame({"datetime": timestamps, "value": values, "id": ids})

        tsdata = TSDataset.from_pandas(df, target_col='value', dt_col='datetime')
        with pytest.raises(RuntimeError):
            tsdata._check_basic_invariants(strict_check=True)
Пример #4
0
    def test_non_pd_datetime(self):
        """Expect three operations to raise ``AssertionError`` on the ``get_non_dt`` frame.

        Presumably the fixture's datetime column is not a pandas datetime
        type (suggested by the test name) -- verify against ``get_non_dt``.
        """
        df = get_non_dt()
        tsdata = TSDataset.from_pandas(df,
                                       dt_col="datetime",
                                       target_col="value",
                                       extra_feature_col="extra feature",
                                       id_col="id")

        rejected_operations = (
            lambda: tsdata.resample('2D'),
            lambda: tsdata.gen_dt_feature(),
            lambda: tsdata.gen_rolling_feature(settings="minimal", window_size=1000),
        )
        for operation in rejected_operations:
            with pytest.raises(AssertionError):
                operation()

        # The dataset must still satisfy its invariants after the rejected calls.
        tsdata._check_basic_invariants()
Пример #5
0
    def test_tsdataset_initialization(self):
        """Exercise ``TSDataset.from_pandas`` with accepted and rejected argument forms.

        Accepted forms: string or list for ``target_col`` /
        ``extra_feature_col``, and a frame without an id column.  Rejected
        forms are each expected to raise ``AssertionError``.
        """
        df = get_ts_df()

        def check_schema(dataset, expected_ids):
            assert dataset._id_list == expected_ids
            assert dataset.feature_col == ["extra feature"]
            assert dataset.target_col == ["value"]
            assert dataset.dt_col == "datetime"
            assert dataset._is_pd_datetime

        # legal input: string target, list feature
        tsdata = TSDataset.from_pandas(df,
                                       dt_col="datetime",
                                       target_col="value",
                                       extra_feature_col=["extra feature"],
                                       id_col="id")
        check_schema(tsdata, ['00'])

        # legal input: list target, string feature
        tsdata = TSDataset.from_pandas(df,
                                       dt_col="datetime",
                                       target_col=["value"],
                                       extra_feature_col="extra feature",
                                       id_col="id")
        check_schema(tsdata, ['00'])

        # legal input: no id column; the asserted id list is then ['0']
        tsdata = TSDataset.from_pandas(df.drop(columns=["id"]),
                                       dt_col="datetime",
                                       target_col=["value"],
                                       extra_feature_col="extra feature")
        check_schema(tsdata, ['0'])

        # illegal input
        def valid_kwargs():
            # Built anew for every call so no argument object is shared.
            return {
                "dt_col": "datetime",
                "target_col": ["value"],
                "extra_feature_col": "extra feature",
                "id_col": "id",
            }

        illegal_calls = [
            (df, {"id_col": 0}),
            (df, {"dt_col": 0}),
            (df, {"target_col": 0}),
            (0, {}),
            (df, {"target_col": ["value1"]}),
        ]
        for data, override in illegal_calls:
            kwargs = valid_kwargs()
            kwargs.update(override)
            with pytest.raises(AssertionError):
                TSDataset.from_pandas(data, **kwargs)
Пример #6
0
    def test_tsdataset_roll_single_id(self):
        """Roll the ``get_ts_df`` frame with several column selections and check shapes.

        Also checks that ``to_numpy`` raises ``RuntimeError`` before any
        ``roll`` call, and that ``y`` is ``None`` when ``horizon`` is 0.
        """
        df = get_ts_df()
        horizon = random.randint(1, 10)
        lookback = random.randint(1, 20)

        def sample_count():
            # Reads the current ``lookback`` / ``horizon`` at call time, so it
            # stays correct after they are reassigned further down.
            return len(df) - lookback - horizon + 1

        tsdata = TSDataset.from_pandas(df,
                                       dt_col="datetime",
                                       target_col="value",
                                       extra_feature_col=["extra feature"],
                                       id_col="id")

        with pytest.raises(RuntimeError):
            tsdata.to_numpy()

        # roll train
        tsdata.roll(lookback=lookback, horizon=horizon)
        x, y = tsdata.to_numpy()
        assert x.shape == (sample_count(), lookback, 2)
        assert y.shape == (sample_count(), horizon, 1)

        # explicit column selection: with and without the extra feature
        for feature_col, x_width in ((["extra feature"], 2), ([], 1)):
            tsdata.roll(lookback=lookback,
                        horizon=horizon,
                        feature_col=feature_col,
                        target_col="value")
            x, y = tsdata.to_numpy()
            assert x.shape == (sample_count(), lookback, x_width)
            assert y.shape == (sample_count(), horizon, 1)

        # roll test
        horizon = 0
        lookback = random.randint(1, 20)

        tsdata.roll(lookback=lookback, horizon=horizon)
        x, y = tsdata.to_numpy()
        assert x.shape == (sample_count(), lookback, 2)
        assert y is None
        tsdata._check_basic_invariants()
Пример #7
0
 def test_check_scale_sequence(self):
     """Expect ``scale(..., fit=False)`` to raise ``AssertionError`` for a scaler never fitted here."""
     df = get_multi_id_ts_df()
     # with split is True.
     splits = TSDataset.from_pandas(df,
                                    dt_col="datetime",
                                    target_col="value",
                                    extra_feature_col=["extra feature"],
                                    id_col="id",
                                    with_split=True,
                                    val_ratio=0.1,
                                    test_ratio=0.1)
     td_train, td_valid, td_test = splits
     from sklearn.preprocessing import StandardScaler
     scaler = StandardScaler()
     with pytest.raises(AssertionError):
         for tsdata in (td_train, td_valid, td_test):
             tsdata.scale(scaler, fit=False)
         # NOTE(review): this line only runs when none of the scale() calls
         # above raised; confirm whether it was meant to sit outside the
         # ``with`` block.
         tsdata._check_basic_invariants()
    def test_select_feature(self):
        """Fit an ``AutoTSEstimator`` with ``selected_features="auto"`` and check the best config.

        A single-id frame of random length (100-199 rows) is split into
        train/validation datasets, the estimator is fitted for one epoch
        with one sample, and the best config must keep the fixed
        ``past_seq_len``.
        """
        sample_num = np.random.randint(100, 200)
        timestamps = pd.date_range('1/1/2019', periods=sample_num)
        values = np.random.randn(sample_num)
        ids = np.array(['00'] * sample_num)
        df = pd.DataFrame({"datetime": timestamps, "value": values, "id": ids})

        # The third returned dataset is not used by this test.
        train_ts, val_ts, _ = TSDataset.from_pandas(df,
                                                    target_col=['value'],
                                                    dt_col='datetime',
                                                    id_col='id',
                                                    with_split=True,
                                                    val_ratio=0.1)

        search_space = {
            'hidden_dim': hp.grid_search([32, 64]),
            'layer_num': hp.randint(1, 3),
            'lr': hp.choice([0.001, 0.003, 0.01]),
            'dropout': hp.uniform(0.1, 0.2),
        }

        past_seq_len = 6
        feature_dim_in, target_dim_out = 1, 1
        estimator_kwargs = {
            "model": 'lstm',
            "search_space": search_space,
            "past_seq_len": past_seq_len,
            "future_seq_len": 1,
            "input_feature_num": feature_dim_in,
            "output_target_num": target_dim_out,
            "selected_features": "auto",
            "metric": "mse",
            "loss": torch.nn.MSELoss(),
            "cpus_per_trial": 2,
            "name": "auto_trainer",
        }
        auto_estimator = AutoTSEstimator(**estimator_kwargs)

        auto_estimator.fit(data=train_ts,
                           epochs=1,
                           batch_size=hp.choice([32, 64]),
                           validation_data=val_ts,
                           n_sampling=1)
        best_config = auto_estimator.get_best_config()
        assert best_config['past_seq_len'] == past_seq_len
Пример #9
0
    def test_tsdataset_roll_multi_id(self):
        """Roll the multi-id frame with and without ``id_sensitive`` and check shapes.

        The expected shapes assume 50 rows per id and two ids -- TODO
        confirm against ``get_multi_id_ts_df``.
        """
        df = get_multi_id_ts_df()
        horizon = random.randint(1, 10)
        lookback = random.randint(1, 20)
        windows_per_id = 50 - lookback - horizon + 1

        tsdata = TSDataset.from_pandas(df,
                                       dt_col="datetime",
                                       target_col="value",
                                       extra_feature_col=["extra feature"],
                                       id_col="id")

        # test train, id_sensitive: one sample axis, doubled last axis
        tsdata.roll(lookback=lookback, horizon=horizon, id_sensitive=True)
        x, y = tsdata.to_numpy()
        assert x.shape == (windows_per_id, lookback, 4)
        assert y.shape == (windows_per_id, horizon, 2)

        # default roll: twice the samples, single-id widths
        tsdata.roll(lookback=lookback, horizon=horizon)
        x, y = tsdata.to_numpy()
        assert x.shape == (windows_per_id * 2, lookback, 2)
        assert y.shape == (windows_per_id * 2, horizon, 1)
        tsdata._check_basic_invariants()
Пример #10
0
    def test_tsdataset_to_torch_loader_roll(self):
        """Check first-batch shapes from ``to_torch_data_loader(roll=True, ...)``.

        Runs on both the single-id and the multi-id fixture frames and
        covers three cases: a train loader (``horizon > 0``), a test loader
        (``horizon=0``, which yields only x) and a loader with an empty
        ``feature_col``.  Only the first batch of each loader is inspected.
        """
        df_single_id = get_ts_df()
        df_multi_id = get_multi_id_ts_df()
        for df in [df_single_id, df_multi_id]:
            horizon = random.randint(1, 10)
            lookback = random.randint(1, 20)
            # Single source of truth for the batch size: it is both passed
            # to the loader and used in the shape assertions below.
            batch_size = 32

            tsdata = TSDataset.from_pandas(df,
                                           dt_col="datetime",
                                           target_col="value",
                                           extra_feature_col=["extra feature"],
                                           id_col="id")

            # train
            torch_loader = tsdata.to_torch_data_loader(batch_size=batch_size,
                                                       roll=True,
                                                       lookback=lookback,
                                                       horizon=horizon)
            for x_batch, y_batch in torch_loader:
                assert tuple(x_batch.size()) == (batch_size, lookback, 2)
                assert tuple(y_batch.size()) == (batch_size, horizon, 1)
                break

            # test
            torch_loader = tsdata.to_torch_data_loader(batch_size=batch_size,
                                                       roll=True,
                                                       lookback=lookback,
                                                       horizon=0)
            for x_batch in torch_loader:
                assert tuple(x_batch.size()) == (batch_size, lookback, 2)
                break

            # specify feature_col
            torch_loader = tsdata.to_torch_data_loader(batch_size=batch_size,
                                                       roll=True,
                                                       lookback=lookback,
                                                       horizon=horizon,
                                                       feature_col=[])
            for x_batch, y_batch in torch_loader:
                assert tuple(x_batch.size()) == (batch_size, lookback, 1)
                assert tuple(y_batch.size()) == (batch_size, horizon, 1)
                break
Пример #11
0
    def assert_equal_with_tsdataset(
        df,
        horizon,
        lookback,
        feature_num=1,
    ):
        """Assert that ``RollDataset`` yields the same samples as ``TSDataset.roll``.

        :param df: frame with "datetime", "value" and "id" columns (and
               "extra feature" unless ``feature_num`` is 0).
        :param horizon: horizon passed to both rollers; 0 means no y is compared.
        :param lookback: lookback passed to both rollers.
        :param feature_num: 0 to build the dataset without extra features,
               anything else to use "extra feature".
        """
        # get results rolled by tsdata.roll
        if feature_num == 0:
            extra_feature_col = None
        else:
            extra_feature_col = ["extra feature"]
        tsdata = TSDataset.from_pandas(df,
                                       dt_col="datetime",
                                       target_col="value",
                                       extra_feature_col=extra_feature_col,
                                       id_col="id")
        tsdata.roll(lookback=lookback, horizon=horizon)
        x, y = tsdata.to_numpy()

        # get results rolled by RollDataset
        roll_dataset = RollDataset(df=df,
                                   lookback=lookback,
                                   horizon=horizon,
                                   feature_col=tsdata.feature_col,
                                   target_col=tsdata.target_col,
                                   id_col=tsdata.id_col)

        assert len(roll_dataset) == len(x)
        for idx, expected_x in enumerate(x):
            if horizon != 0:
                # for train and y is not None.
                expected_y = y[idx]
                actual_x, actual_y = roll_dataset[idx]
                np.testing.assert_array_almost_equal(
                    expected_x, actual_x.detach().numpy())
                np.testing.assert_array_almost_equal(
                    expected_y, actual_y.detach().numpy())
            else:
                # for test, y is None.
                actual_x = roll_dataset[idx]
                np.testing.assert_array_almost_equal(
                    expected_x, actual_x.detach().numpy())
Пример #12
0
 def test_tsdataset_datetime_feature(self):
     """Check the columns and ``feature_col`` entries present after ``gen_dt_feature()``."""
     generated_features = {
         'IS_AWAKE(datetime)',
         'IS_BUSY_HOURS(datetime)',
         'HOUR(datetime)',
         'DAY(datetime)',
         'IS_WEEKEND(datetime)',
         'WEEKDAY(datetime)',
         'MONTH(datetime)',
         'DAYOFYEAR(datetime)',
         'WEEKOFYEAR(datetime)',
         'MINUTE(datetime)',
     }
     original_columns = {'extra feature', 'value', 'datetime', 'id'}

     df = get_multi_id_ts_df()
     tsdata = TSDataset.from_pandas(df,
                                    dt_col="datetime",
                                    target_col="value",
                                    extra_feature_col=["extra feature"],
                                    id_col="id")
     tsdata.gen_dt_feature()

     # The frame holds the original columns plus every generated feature.
     assert set(tsdata.to_pandas().columns) == generated_features | original_columns
     # Only the generated features and the original extra feature are features.
     assert set(tsdata.feature_col) == generated_features | {'extra feature'}
Пример #13
0
    def test_tsdataset_roll_order(self):
        """Pin the exact values and column order produced by ``roll`` on a tiny two-id frame."""
        df = pd.DataFrame({
            "datetime": np.array(['1/1/2019', '1/1/2019', '1/2/2019', '1/2/2019']),
            "value": np.array([1.9, 2.3, 2.4, 2.6]),
            "id": np.array(['00', '01', '00', '01']),
            "extra feature1": np.array([1, 0, 3, 0]),
            "extra feature2": np.array([2, 9, 4, 2]),
        })
        tsdata = TSDataset.from_pandas(df,
                                       dt_col="datetime",
                                       target_col="value",
                                       extra_feature_col=["extra feature1", "extra feature2"],
                                       id_col="id")

        # One sample per id; each x row is (value, extra feature1, extra feature2).
        rolled = tsdata.roll(lookback=1, horizon=1, id_sensitive=False)
        x, y = rolled.to_numpy()
        expected_x = np.array([[[1.9, 1, 2]], [[2.3, 0, 9]]])
        expected_y = np.array([[[2.4]], [[2.6]]])
        assert x.shape == (2, 1, 3)
        assert y.shape == (2, 1, 1)
        assert np.array_equal(x, expected_x)
        assert np.array_equal(y, expected_y)

        # A single sample: both targets first, then each id's extra features.
        rolled = tsdata.roll(lookback=1, horizon=1, id_sensitive=True)
        x, y = rolled.to_numpy()
        expected_x = np.array([[[1.9, 2.3, 1, 2, 0, 9]]])
        expected_y = np.array([[[2.4, 2.6]]])
        assert x.shape == (1, 1, 6)
        assert y.shape == (1, 1, 2)
        assert np.array_equal(x, expected_x)
        assert np.array_equal(y, expected_y)
Пример #14
0
    def test_tsdataset_to_torch_loader(self):
        """Check ``to_torch_data_loader`` before and after an explicit ``roll``.

        Calling it with no arguments on a dataset that has not been rolled
        is expected to raise ``RuntimeError``; after ``roll`` the first
        batch must have the expected x and y shapes.
        """
        df = get_ts_df()
        horizon = random.randint(1, 10)
        lookback = random.randint(1, 20)
        batch_size = random.randint(16, 32)

        tsdata = TSDataset.from_pandas(df,
                                       dt_col="datetime",
                                       target_col="value",
                                       extra_feature_col=["extra feature"],
                                       id_col="id")

        with pytest.raises(RuntimeError):
            tsdata.to_torch_data_loader()

        tsdata.roll(lookback=lookback, horizon=horizon)
        loader = tsdata.to_torch_data_loader(batch_size=batch_size,
                                             lookback=lookback,
                                             horizon=horizon)

        expected_x_shape = (batch_size, lookback, 2)
        expected_y_shape = (batch_size, horizon, 1)
        # Only the first batch is inspected.
        for x_batch, y_batch in loader:
            assert tuple(x_batch.size()) == expected_x_shape
            assert tuple(y_batch.size()) == expected_y_shape
            break
Пример #15
0
    def test_tsdataset_to_torch_loader_roll(self):
        """Check first-batch shapes from ``to_torch_data_loader(roll=True, ...)``.

        Runs on both the single-id and the multi-id fixture frames and
        covers: a train loader, a test loader (``horizon=0``), an empty
        ``feature_col``, a rejected ``target_col`` (``ValueError``), a list
        of horizons, and a dataset built with two target columns.
        """
        def check_first_batch(loader, x_shape, y_shape=None):
            # Inspect only the first batch.  ``y_shape=None`` means each
            # item yielded by the loader is x alone.
            for batch in loader:
                if y_shape is None:
                    assert tuple(batch.size()) == x_shape
                else:
                    x_batch, y_batch = batch
                    assert tuple(x_batch.size()) == x_shape
                    assert tuple(y_batch.size()) == y_shape
                break

        for df in (get_ts_df(), get_multi_id_ts_df()):
            horizon = random.randint(1, 10)
            lookback = random.randint(1, 20)
            batch_size = random.randint(16, 32)
            common = {"batch_size": batch_size, "roll": True, "lookback": lookback}

            tsdata = TSDataset.from_pandas(df,
                                           dt_col="datetime",
                                           target_col="value",
                                           extra_feature_col=["extra feature"],
                                           id_col="id")

            # train
            torch_loader = tsdata.to_torch_data_loader(horizon=horizon, **common)
            check_first_batch(torch_loader,
                              (batch_size, lookback, 2),
                              (batch_size, horizon, 1))

            # test
            torch_loader = tsdata.to_torch_data_loader(horizon=0, **common)
            check_first_batch(torch_loader, (batch_size, lookback, 2))

            # specify feature_col
            torch_loader = tsdata.to_torch_data_loader(horizon=horizon,
                                                       feature_col=[],
                                                       **common)
            check_first_batch(torch_loader,
                              (batch_size, lookback, 1),
                              (batch_size, horizon, 1))

            # Non-subset relationship
            with pytest.raises(ValueError):
                tsdata.to_torch_data_loader(horizon=horizon,
                                            target_col=['value', 'extra feature'],
                                            **common)

            # specify horizon_list
            horizon_list = [1, 3, 5]
            torch_loader = tsdata.to_torch_data_loader(horizon=horizon_list, **common)
            check_first_batch(torch_loader,
                              (batch_size, lookback, 2),
                              (batch_size, len(horizon_list), 1))

            # multi target_col
            tsdata = TSDataset.from_pandas(df,
                                           dt_col="datetime",
                                           target_col=["value", "extra feature"],
                                           id_col="id")
            torch_loader = tsdata.to_torch_data_loader(horizon=horizon, **common)
            check_first_batch(torch_loader,
                              (batch_size, lookback, 2),
                              (batch_size, horizon, 2))
Пример #16
0
 def test_tsdataset_global_feature_multiple(self):
     """Run ``gen_global_feature(settings="minimal")`` on the multi-id frame and re-check invariants."""
     tsdata = TSDataset.from_pandas(
         get_multi_id_ts_df(),
         dt_col="datetime",
         target_col="value",
         extra_feature_col=["extra feature"],
         id_col="id",
     )
     tsdata.gen_global_feature(settings="minimal")
     tsdata._check_basic_invariants()