Example #1
    def test_static_inputs(self, tmp_path):
        ds = _make_dataset().isel(lat=slice(0, 2), lon=slice(0, 1))
        ds_static = ds.mean(dim="time")

        cfg = Config(Path("tests/testconfigs/test_config.yml"))
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)
        # TODO: test body not yet implemented
        assert False
Example #2
    def test_kenya_data(self, tmp_path):
        if TEST_REAL_DATA:
            ds = pickle.load(Path("data/kenya.pkl").open("rb")).isel(
                lat=slice(0, 5), lon=slice(0, 5)
            )
            cfg = Config(Path("tests/testconfigs/config.yml"))
            create_and_assign_temp_run_path_to_config(cfg, tmp_path)

            dl = PixelDataLoader(
                ds, cfg=cfg, num_workers=1, mode="train", batch_size=cfg.batch_size
            )

            data = next(iter(dl))
            x, _ = data["x_d"], data["y"]

            batch_size = 256
            seq_length = cfg.seq_length
            input_variables = ["precip", "t2m", "SMsurf"]
            autoregressive = True
            n_inputs = (
                len(input_variables) + 1 if autoregressive else len(input_variables)
            )

            assert cfg.batch_size == batch_size
            assert cfg.autoregressive == autoregressive
            assert x.shape == (
                batch_size,
                seq_length,
                n_inputs,
            ), f"X Data Mismatch! Expected: {(batch_size, seq_length, n_inputs)} Got: {x.shape}"
        else:
            pass  # skipped when TEST_REAL_DATA is disabled
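The `n_inputs` arithmetic above recurs throughout these examples: when the loader runs in autoregressive mode, the lagged target is appended as an extra dynamic input, so the feature dimension grows by one. A minimal standalone sketch of that rule (the helper name is hypothetical):

from typing import List


def expected_n_inputs(input_variables: List[str], autoregressive: bool) -> int:
    # feature count of x_d: the dynamic inputs, plus the lagged target if autoregressive
    return len(input_variables) + (1 if autoregressive else 0)


assert expected_n_inputs(["precip", "t2m", "SMsurf"], autoregressive=True) == 4
assert expected_n_inputs(["precip", "t2m", "SMsurf"], autoregressive=False) == 3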
Example #3
    def test_runoff_data(self, tmp_path):
        if TEST_REAL_DATA:
            ds = xr.open_dataset("data/ALL_dynamic_ds.nc").isel(station_id=slice(0, 5))
            cfg = Config(Path("tests/testconfigs/config_runoff.yml"))
            create_and_assign_temp_run_path_to_config(cfg, tmp_path)

            # train period
            input_variables = [] if cfg.input_variables is None else cfg.input_variables
            train_ds = ds[input_variables + [cfg.target_variable]].sel(
                time=slice(cfg.train_start_date, cfg.train_end_date)
            )
            train_dl = PixelDataLoader(
                train_ds,
                cfg=cfg,
                mode="train",
                num_workers=4,
                batch_size=cfg.batch_size,
            )

            #  check data is loaded properly
            data = next(iter(train_dl))
            x, y = data["x_d"], data["y"]

            n_in_vars = (
                len(input_variables) + 1
                if cfg.autoregressive
                else len(input_variables)
            )
            assert x.shape == (cfg.batch_size, cfg.seq_length, n_in_vars)
            assert y.shape == (cfg.batch_size, 1, 1)
        else:
            pass  # skipped when TEST_REAL_DATA is disabled
Example #4
    def test_dataloader(self, tmp_path):
        ds = _make_dataset()
        cfg = Config(Path("tests/testconfigs/test_config.yml"))
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)
        static = create_static(cfg=cfg, ds=ds)
        dl = PixelDataLoader(
            ds,
            cfg=cfg,
            num_workers=1,
            mode="train",
            batch_size=cfg.batch_size,
            static_data=static,
        )

        assert dl.batch_size == cfg.batch_size

        seq_length = cfg.seq_length
        autoregressive = cfg.autoregressive
        data = next(iter(dl))
        x, y = data["x_d"], data["y"]
        input_variables = ["feature"]  # _make_dataset provides a single dynamic input
        n_inputs = (
            len(input_variables) + 1 if autoregressive else len(input_variables)
        )

        n_expected = n_inputs + 2 if cfg.encode_doys else n_inputs
        assert x.shape == (
            cfg.batch_size,
            seq_length,
            n_expected,
        ), f"Size Mismatch! Expected: {(cfg.batch_size, seq_length, n_expected)} Got: {x.shape}"
Example #5
    def test_runoff_example(self, tmp_path):
        cfg = Config(Path("tests/testconfigs/config_runoff.yml"))
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)

        cfg._cfg["data_path"] = Path("data/ALL_dynamic_ds.nc")
        cfg._cfg["static_data_path"] = Path("data/camels_static.nc")
        cfg._cfg["static_inputs"] = ["p_mean", "pet_mean", "area", "gauge_elev"]
        cfg._cfg["n_epochs"] = 3

        ds, static = load_data(cfg)

        #  select subset of 3 basins
        basins = [1001, 2001, 2002]
        ds = ds.sel(station_id=basins)
        static = static.sel(station_id=basins)

        trainer = Trainer(cfg, ds, static_data=static)
        self.check_loaded_data(
            cfg,
            trainer,
            data=ds.sel(time=slice(cfg.train_start_date, cfg.train_end_date)),
        )

        losses = trainer.train_and_validate()

        tester = Tester(cfg, ds, static_data=static)
        preds = tester.run_test()

        return losses, preds
Example #6
    def test_dataset(self, tmp_path):
        target_variable = "target"
        input_variables = ["feature"]
        for path in [
            Path("tests/testconfigs/test_config_simulate.yml"),
            Path("tests/testconfigs/test_config.yml"),
        ]:
            cfg = Config(path)
            cfg._cfg["forecast_variables"] = cfg.input_variables

            create_and_assign_temp_run_path_to_config(cfg, tmp_path)
            raw_ds = _make_dataset().isel(lat=slice(0, 2), lon=slice(0, 1))
            static = create_static(cfg=cfg, ds=raw_ds)
            ds = XarrayDataset(
                raw_ds, cfg=cfg, mode="train", DEBUG=True, static_data=static
            )

            assert ds.target == target_variable
            expected_inputs = (
                input_variables + ["autoregressive"]
                if cfg.autoregressive
                else input_variables
            )
            assert ds.inputs == expected_inputs

            x_features = (
                len(input_variables) + 1 if cfg.autoregressive else len(input_variables)
            )
            seq_length = cfg.seq_length
            for i in range(10):
                data = ds[i]
                x, y = data["x_d"], data["y"]

                assert y.shape == (1, 1)
                n_expected = x_features + 2 if cfg.encode_doys else x_features
                assert x.shape == (
                    seq_length,
                    n_expected,
                ), f"Shape Mismatch! Expected: {(seq_length, n_expected)} Got: {x.shape}"

                meta = data["meta"]
                times = (
                    meta["target_time"]
                    .detach()
                    .numpy()
                    .astype("datetime64[ns]")
                    .flatten()
                )
                pixel, _ = ds.lookup_table[int(meta["index"])]
                latlon = tuple([float(l) for l in str(pixel).split("_")])

                y_unnorm = (
                    ds.normalizer.individual_inverse(y, pixel, variable="target")
                    .detach()
                    .numpy()
                )

                #  extract from the original xr.Dataset
                y_exp = raw_ds.sel(
                    lat=latlon[0], lon=latlon[1], time=times, method="nearest"
                )[cfg.target_variable].values
                assert np.isclose(y_unnorm.reshape(y_exp.shape), y_exp, atol=1e-5)
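The final check above recovers the raw target through `normalizer.individual_inverse`, i.e. each pixel is standardized with its own statistics and inverted on demand. A minimal sketch of that round trip; this `PixelNormalizer` is a stand-in with a simplified signature, not the repo's class:

import numpy as np


class PixelNormalizer:
    # standardize each pixel's series with its own mean/std and invert on demand
    def __init__(self, data: dict):
        self.stats = {pixel: (v.mean(), v.std()) for pixel, v in data.items()}

    def transform(self, values: np.ndarray, pixel: str) -> np.ndarray:
        mean, std = self.stats[pixel]
        return (values - mean) / std

    def individual_inverse(self, values: np.ndarray, pixel: str) -> np.ndarray:
        mean, std = self.stats[pixel]
        return values * std + mean


raw = {"1.5_35.5": np.random.rand(100)}
normalizer = PixelNormalizer(raw)
z = normalizer.transform(raw["1.5_35.5"], "1.5_35.5")
assert np.allclose(normalizer.individual_inverse(z, "1.5_35.5"), raw["1.5_35.5"])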
Example #7
    def test_dataset_beijing(self, tmp_path):
        if is_connected():
            path = Path("tests/testconfigs/pollution.yml")
            cfg = Config(path)
            create_and_assign_temp_run_path_to_config(cfg, tmp_path)
            raw_ds = get_pollution_data_beijing().to_xarray().isel(time=slice(0, 1000))
            ds = XarrayDataset(raw_ds, cfg=cfg, mode="train", DEBUG=True)

            assert ds.y != {}
Example #8
    def test_forecast_inputs(self, tmp_path):
        ds = _make_dataset().isel(lat=slice(0, 2), lon=slice(0, 1))
        ds_forecast = (
            ds.shift(time=1).rename({"feature": "feature_fcast1"}).drop("target")
        )
        ds = xr.merge([ds, ds_forecast])

        cfg = Config(Path("tests/testconfigs/test_config.yml"))
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)
        # TODO: test body not yet implemented
        assert False
Example #9
    def test_longer_horizon_fcast(self, tmp_path):
        cfg = Config(Path("tests/testconfigs/test_1d_config_horizon.yml"))
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)
        ds = load_test_jena_data_as_dataset()

        dl = PixelDataLoader(
            ds, cfg=cfg, num_workers=1, mode="train", batch_size=cfg.batch_size
        )
        data = next(iter(dl))
        _, y = data["x_d"], data["y"]

        assert y.shape == (cfg.batch_size, 1, 1)
Example #10
    def test_1D_data(self, tmp_path):
        # convert pandas to xarray object
        ds = load_test_jena_data_as_dataset()
        cfg = Config(Path("tests/testconfigs/test_1d_config_horizon.yml"))
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)

        dl = PixelDataLoader(
            ds, cfg=cfg, num_workers=1, mode="train", batch_size=cfg.batch_size
        )

        data = next(iter(dl))
        x, y = data["x_d"], data["y"]

        assert x.shape == (cfg.batch_size, cfg.seq_length, len(cfg.input_variables))
        assert y.shape == (cfg.batch_size, 1, 1)
Example #11
    def test_lstm_forward_pass(self, tmp_path):
        ds = pickle.load(Path("data/kenya.pkl").open("rb"))
        cfg = Config(Path("tests/testconfigs/config.yml"))
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)
        dl = PixelDataLoader(ds, cfg=cfg, mode="train")

        model = LSTM(
            input_size=dl.input_size + dl.static_input_size + dl.forecast_input_size,
            hidden_size=cfg.hidden_size,
            output_size=dl.output_size,
            forecast_horizon=dl.horizon,
        )
        data = next(iter(dl))
        y_hat = model(data)

        assert all(k in y_hat for k in ("h_n", "c_n", "y_hat"))
Example #12
    def test_linear_regression_forward_pass(self, tmp_path):
        ds = _make_dataset()
        cfg = Config(Path("tests/testconfigs/test_config.yml"))
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)
        dl = PixelDataLoader(ds, cfg=cfg, mode="train", DEBUG=True)

        model = LinearRegression(
            input_size=(dl.input_size + dl.static_input_size + dl.forecast_input_size)
            * cfg.seq_length,
            output_size=dl.output_size,
            forecast_horizon=dl.horizon,
        )
        data = next(iter(dl))
        y_hat = model(data)

        assert isinstance(y_hat, dict)
        assert y_hat["y_hat"].shape == (1, 1)
Example #13
    def test_kenya_vci_example(self, tmp_path):
        cfg = Config(Path("tests/testconfigs/config.yml"))
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)

        cfg._cfg["data_path"] = Path("data/kenya.nc")
        cfg._cfg["n_epochs"] = 3

        ds, static = load_data(cfg)

        trainer = Trainer(cfg, ds, static_data=static)
        self.check_loaded_data(
            cfg,
            trainer,
            data=ds.sel(time=slice(cfg.train_start_date, cfg.train_end_date)),
        )

        losses = trainer.train_and_validate()

        tester = Tester(cfg, ds, static_data=static)
        preds = tester.run_test()

        return losses, preds
Example #14
    def test_correct_data_returned(self, tmp_path):
        #  create dummy config path
        cfg = Config(Path("tests/testconfigs/test_config.yml"))
        cfg._cfg["encode_doys"] = True
        cfg._cfg["static_inputs"] = "embedding"
        cfg._cfg["forecast_variables"] = cfg.input_variables
        #  create temporary run directory
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)
        #  create dummy dataset
        ds = _make_dataset().isel(lat=slice(0, 2), lon=slice(0, 1))

        #  initialise the dataloader
        dl = PixelDataLoader(ds, cfg=cfg, mode="train", DEBUG=True)
        #  one sample from the dataloader
        data = next(iter(dl))
        x, y = data["x_d"], data["y"]

        #  recreate the stacked dataset
        stacked_ds = dl.dataset.ds

        if cfg.encode_doys:
            stacked_ds, _ = add_doy_encoding_as_feature_to_dataset(
                stacked_ds, inputs=cfg.input_variables, target=cfg.target_variable
            )

        #  get the current_time_index and pixel from the __getitem__() call
        getitem_call = int(data["meta"]["index"])
        pixel, current_time_index = dl.dataset.lookup_table[getitem_call]

        # check that the returned data is valid
        #  TODO: wrap into function for getting the valid times!
        est_target_time = pd.to_datetime(
            np.array(data["meta"]["target_time"]).astype("datetime64[ns]")
        )[0]

        #  rounding error because of storing as float
        input_data_times = pd.to_datetime(stacked_ds.time.values)
        true_target_index = input_data_times.get_indexer(
            [est_target_time], method="nearest"
        )[0]
        true_target_time = input_data_times[true_target_index]

        assert current_time_index + cfg.horizon == true_target_index

        # :: RECREATE TARGET DATA ::
        all_expected_y = stacked_ds.sel(sample=pixel)["target"].values

        expected_y = stacked_ds.sel(sample=pixel, time=true_target_time)[
            cfg.target_variable
        ].values
        expected_y_index = (
            stacked_ds.sel(sample=pixel)
            .isel(time=true_target_index)[cfg.target_variable]
            .values
        )
        assert expected_y == expected_y_index
        assert np.isclose(y.flatten()[-1], expected_y)

        ## :: RECREATE INPUT DATA ::
        # max_input_ix should be the CURRENT TIME (+ 1 because of exclusive upper indexing)
        max_input_ix = int(true_target_index - cfg.horizon)
        assert max_input_ix == current_time_index
        max_input_time = input_data_times[max_input_ix]

        #  min_input_ix = the first input time
        min_input_ix = int(max_input_ix - cfg.seq_length) + 1
        min_input_time = input_data_times[min_input_ix]

        input_vars = (
            cfg.input_variables + ["autoregressive"]
            if cfg.autoregressive
            else cfg.input_variables
        )
        input_vars = (
            input_vars + ["sin_doy", "cos_doy"] if cfg.encode_doys else input_vars
        )

        # has x been drawn from the actual underlying data?
        all_expected_x = stacked_ds.sel(sample=pixel)["feature"].values
        _expected_x = all_expected_x[min_input_ix:max_input_ix]
        # assert x == _expected_x

        # assert all(
        #     np.isin(
        #         np.round(x.numpy().flatten(), 3).astype("float64"),
        #         np.round(all_expected_x.flatten(), 3).astype("float64"),
        #     )
        # )

        # get the exact expected input vector
        # NOTE: xarray's .sel(time=slice(...)) is NOT EXCLUSIVE UPPER, hence the +1
        #  shift on min_input_ix above so the window spans exactly seq_length steps
        expected_x_feature = (
            stacked_ds.sel(sample=pixel, time=slice(min_input_time, max_input_time))[
                input_vars
            ]
            .to_array()
            .values.T
        )

        x_feature = np.array(x)
        x_feature = x_feature.reshape(expected_x_feature.shape)

        assert np.allclose(x_feature, expected_x_feature)
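The index arithmetic in this example is the core of the windowing logic: the current timestep sits `horizon` steps before the target, and the input window covers `seq_length` steps ending at (and including) the current timestep. A standalone check of that arithmetic with arbitrary numbers:

seq_length, horizon = 4, 2
true_target_index = 10

# the "current" timestep is horizon steps before the target
current_time_index = true_target_index - horizon  # 8
# inclusive window of seq_length steps ending at the current timestep
min_input_ix = current_time_index - seq_length + 1  # 5
input_indices = list(range(min_input_ix, current_time_index + 1))

assert len(input_indices) == seq_length
assert input_indices == [5, 6, 7, 8]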
Example #15
    def test_linear_example(self, tmp_path):
        """Test the linear dataset.

        Args:
            tmp_path ([type]): [description]
        """
        cfg = Config(Path("tests/testconfigs/test_config.yml"))
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)

        #  Create linear dataset
        alpha = 0
        beta = 2
        epsilon_sigma = 0

        ds = create_linear_ds(
            horizon=cfg.horizon, alpha=alpha, beta=beta, epsilon_sigma=epsilon_sigma
        ).isel(lat=slice(0, 2), lon=slice(0, 2))
        static = create_static(cfg=cfg, ds=ds)
        dl = PixelDataLoader(
            ds,
            cfg=cfg,
            num_workers=1,
            mode="train",
            batch_size=cfg.batch_size,
            DEBUG=True,
            static_data=static,
        )

        #  load all of the data into memory
        data = load_all_data_from_dl_into_memory(dl)
        x = data["x_d"]

        # x has shape (n_samples, n_features, seq_length)
        n_features = (
            len(cfg.input_variables) + 2
            if cfg.encode_doys
            else len(cfg.input_variables)
        )
        assert x.shape[1:] == (n_features, cfg.seq_length)
        assert x.shape[-1] == cfg.seq_length
        y = data["y"]
        times = pd.to_datetime(data["time"].astype("datetime64[ns]").flatten())

        # matching batch dims (n_samples) for all samples
        assert x.shape[0] == y.shape[0]

        #  test ONE SINGLE (x, y) sample
        SAMPLE = 1

        # get metadata for sample
        idx = int(data["index"][SAMPLE])
        pixel, valid_current_time_index = dl.dataset.lookup_table[idx]
        latlon = tuple([float(l) for l in str(pixel).split("_")])
        target_time = times[SAMPLE]
        # current_time = times[valid_current_time_index][0]

        #  get the correct times (weird indexing because of imperfect translation of float -> datetime64[ns])
        max_time = target_time - DateOffset(months=cfg.horizon) + DateOffset(days=2)
        min_time = max_time - DateOffset(months=cfg.seq_length)
        input_times = pd.date_range(min_time, max_time, freq="M")[-cfg.seq_length :]

        #  recreate the data that should be loaded from the raw xr.Dataset
        stacked, _ = _stack_xarray(ds, spatial_coords=cfg.pixel_dims)
        normalizer = dl.normalizer
        norm_stacked = normalizer.transform(stacked)

        all_y = norm_stacked["target"].sel(sample=pixel)
        _y = all_y.sel(time=target_time, method="nearest")
        all_x = norm_stacked["feature"].sel(sample=pixel)
        _x_d = all_x.sel(time=input_times, method="nearest")

        #  check that the dataloader saves & returns the correct values
        assert np.allclose(
            dl.dataset.y[pixel], (all_y.values)
        ), "The DataLoader saves incorrect y values to memory"
        assert np.isclose(
            _y.values, y[SAMPLE]
        ), "The DataLoader returns an incorrect value from the Dataset"

        #  input (X) data
        dataset_loaded = dl.dataset.x_d[pixel]
        # assert dataset_loaded.shape == (, cfg.seq_length)

        expected = all_x.values.reshape(dataset_loaded.shape)
        mask = np.isnan(expected)
        expected = expected[~mask]
        dataset_loaded = dataset_loaded[~mask]

        assert np.allclose(
            dataset_loaded, expected
        ), f"The dataloader is saving the wrong data to the lookup table. {dataset_loaded[:10]} {expected[:10]}"

        #  get input X data from INDEX (not times)
        max_input_ix = int(valid_current_time_index)
        min_input_ix = int(max_input_ix - cfg.seq_length) + 1
        _x_d_index_values = all_x.values[min_input_ix : max_input_ix + 1]

        assert np.allclose(_x_d_index_values, _x_d.values)

        # compare the index-derived window with the sample returned by the dataloader
        # (NOTE: _x_d_index_values is already a np.ndarray, so no further .values access)
        assert np.allclose(
            _x_d_index_values, x[SAMPLE]
        ), "The dynamic data is not the data we expect"

        #  check that the raw data is the linear combination we expect
        # "target" should be linear combination of previous timestep "feature"
        # (y = x @ [0, 2])
        zeros = np.zeros((cfg.seq_length - 1, 1))
        betas = np.append(zeros, beta).reshape(-1, 1)
        unnorm_x = dl.dataset.normalizer.individual_inverse(
            x[SAMPLE], pixel_id=pixel, variable=cfg.input_variables[0]
        )
        unnorm_y = dl.dataset.normalizer.individual_inverse(
            y[SAMPLE], pixel_id=pixel, variable=cfg.target_variable
        )

        #  sanity check: select the pixel from the raw dataset (time=target_time disabled)
        ds.sel(lat=latlon[0], lon=latlon[1], method="nearest")[cfg.target_variable]
        assert np.isclose(unnorm_x @ betas, unnorm_y)
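The closing assertion works because `create_linear_ds` appears to make the target a lagged linear function of the feature: with `alpha = 0` and `epsilon_sigma = 0`, y_t = beta * x_{t-horizon}, so a coefficient vector that is zero everywhere except the final input timestep reproduces the target exactly. A numpy sketch of that identity, assuming the generating process just described:

import numpy as np

seq_length, beta, horizon = 4, 2.0, 1
rng = np.random.default_rng(0)
x_series = rng.normal(size=100)
y_series = beta * np.roll(x_series, horizon)  # y_t = beta * x_{t - horizon}

# one sample: inputs end at the current timestep, the target sits horizon steps later
t = 50
x_window = x_series[t - seq_length + 1 : t + 1].reshape(-1, 1)
betas = np.append(np.zeros(seq_length - 1), beta).reshape(-1, 1)

assert np.isclose((x_window.T @ betas).item(), y_series[t + horizon])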
Example #16
    def test_single_train_step(self, tmp_path):
        torch.manual_seed(1)
        np.random.seed(1)

        hidden_size = 64
        ds = pickle.load(Path("data/kenya.pkl").open("rb")).isel(
            lat=slice(0, 2), lon=slice(0, 4)
        )

        paths = [
            Path("tests/testconfigs/config.yml"),
            Path("tests/testconfigs/config_multi_horizon.yml"),
        ]
        for path in paths:
            cfg = Config(path)
            cfg._cfg["static_inputs"] = "embedding"
            create_and_assign_temp_run_path_to_config(cfg, tmp_path)

            dl = PixelDataLoader(
                ds, mode="train", cfg=cfg, num_workers=1, batch_size=cfg.batch_size,
            )

            data1 = dl.dataset[0]
            assert "x_s" in data1  # static (embedding) inputs are returned

            data = next(iter(dl))
            x, y = data["x_d"], data["y"]

            # are we working with batches or individual predictions?
            x = x.unsqueeze(0) if x.ndim == 2 else x

            model = (
                LSTM(
                    input_size=dl.input_size
                    + dl.static_input_size
                    + dl.forecast_input_size,
                    hidden_size=hidden_size,
                    output_size=dl.output_size,
                    forecast_horizon=dl.horizon,
                )
                .float()
                .to(cfg.device)
            )

            optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
            loss_obj = F.mse_loss
            before = model.forward(data)
            for data in tqdm(dl):
                target = data["y"]
                optimizer.zero_grad()
                yhat = model.forward(data)
                #  shape = [batch_size, 1, forecast_horizon]
                assert yhat["y_hat"].shape == (cfg.batch_size, 1, 1)

                # get the final predictions to calculate loss
                loss = loss_obj(yhat["y_hat"], target)
                loss.backward()
                optimizer.step()
                break

            after = model.forward(data)

            loss_bf = loss_obj(before["y_hat"], y)
            loss_af = loss_obj(after["y_hat"], y)

            # NOTE: the LSTM only returns the final hidden and cell state layer NOT each timestep
            # TODO: why is the LSTM returning a hidden array of shape (seq_length, 1, hs)
            assert before["h_n"].shape == (1, cfg.batch_size, hidden_size)
            assert before["y_hat"].shape == (cfg.batch_size, 1, 1)

            if cfg.horizon == 1:
                assert (
                    loss_af < loss_bf
                ), "The model did not learn anything after one epoch of training"