Exemplo n.º 1
0
def regression(
    experiment="one_month_forecast",
    include_pred_month=True,
    surrounding_pixels=None,
    explain=False,
    static="features",
    ignore_vars=None,
    predict_delta=False,
    spatial_mask=None,
    include_latlons=False,
):
    predictor = LinearRegression(
        get_data_path(),
        experiment=experiment,
        include_pred_month=include_pred_month,
        surrounding_pixels=surrounding_pixels,
        static=static,
        ignore_vars=ignore_vars,
        predict_delta=predict_delta,
        spatial_mask=spatial_mask,
        include_latlons=include_latlons,
    )
    predictor.train()
    predictor.evaluate(save_preds=True)

    # mostly to test it works
    if explain:
        predictor.explain(save_shap_values=True)
    def test_big_mean(self, tmp_path, monkeypatch):
        def mockiter(self):
            class MockIterator:
                def __init__(self):
                    self.idx = 0
                    self.max_idx = 10

                def __iter__(self):
                    return self

                def __next__(self):
                    if self.idx < self.max_idx:
                        # batch_size = 10, timesteps = 2, num_features = 1
                        self.idx += 1
                        return (np.ones(
                            (10, 2, 1)), np.ones(
                                (10, ), dtype=np.int8), np.ones(
                                    (10, 2)), np.ones((10, 2)), np.ones(
                                        (10, 2)), np.ones((10, 2))), None
                    else:
                        raise StopIteration()

            return MockIterator()

        def do_nothing(self, data_path, batch_file_size, shuffle_data, mode,
                       pred_months, surrounding_pixels, ignore_vars):

            pass

        monkeypatch.setattr(DataLoader, '__iter__', mockiter)
        monkeypatch.setattr(DataLoader, '__init__', do_nothing)

        model = LinearRegression(tmp_path)
        calculated_mean = model._calculate_big_mean()

        # 1 for the 2 features and for the first month, 0 for the rest
        expected_mean = np.array([
            1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1.,
            1.
        ])

        # np.isclose because of rounding
        assert np.isclose(calculated_mean, expected_mean).all()
Exemplo n.º 3
0
def run_models(data_path, experiment):
    # NOTE: this model is the same for all experiments
    print("Running persistence model")
    predictor = Persistence(data_path, experiment=experiment)
    predictor.evaluate(save_preds=True)

    # linear regression
    print(f"Running Linear Regression model: {experiment}")
    predictor = LinearRegression(data_path,
                                 experiment=experiment,
                                 include_pred_month=True,
                                 surrounding_pixels=1)
    predictor.train(num_epochs=10, early_stopping=3)

    # linear network
    print(f"Running Linear Neural Network model: {experiment}")
    predictor = LinearNetwork(
        data_folder=data_path,
        experiment=experiment,
        layer_sizes=[100],
        include_pred_month=True,
        surrounding_pixels=1,
    )
    predictor.train(num_epochs=10, early_stopping=3)
    predictor.evaluate(save_preds=True)
    predictor.save_model()

    # recurrent network
    print(f"Running RNN (LSTM) model: {experiment}")
    predictor = RecurrentNetwork(
        data_folder=data_path,
        hidden_size=128,
        experiment=experiment,
        include_pred_month=True,
        surrounding_pixels=1,
    )
    predictor.train(num_epochs=10, early_stopping=3)
    predictor.evaluate(save_preds=True)
    predictor.save_model()

    # EA LSTM
    print(f"Running Entity Aware LSTM model: {experiment}")
    predictor = EARecurrentNetwork(
        data_folder=data_path,
        hidden_size=128,
        experiment=experiment,
        include_pred_month=True,
        surrounding_pixels=1,
    )
    predictor.train(num_epochs=10, early_stopping=3)
    predictor.evaluate(save_preds=True)
    predictor.save_model()
Exemplo n.º 4
0
def regression(experiment='one_month_forecast',
               include_pred_month=True,
               surrounding_pixels=1):
    # if the working directory is alread ml_drought don't need ../data
    if Path('.').absolute().as_posix().split('/')[-1] == 'ml_drought':
        data_path = Path('data')
    else:
        data_path = Path('../data')

    predictor = LinearRegression(data_path,
                                 experiment=experiment,
                                 include_pred_month=include_pred_month,
                                 surrounding_pixels=surrounding_pixels)
    predictor.train()
    predictor.evaluate(save_preds=True)

    # mostly to test it works
    predictor.explain(save_shap_values=True)
Exemplo n.º 5
0
def regression(
    experiment="one_month_forecast",
    include_pred_month=True,
    surrounding_pixels=None,
    ignore_vars=None,
):

    data_path = get_data_path()
    predictor = LinearRegression(
        data_path,
        experiment=experiment,
        include_pred_month=include_pred_month,
        surrounding_pixels=surrounding_pixels,
        ignore_vars=ignore_vars,
        static="embeddings",
        spatial_mask=data_path /
        "interim/boundaries_preprocessed/kenya_asal_mask.nc",
    )
    predictor.train()
    predictor.evaluate(save_preds=True)

    # mostly to test it works
    predictor.explain(save_shap_values=True)
Exemplo n.º 6
0
    def test_save(self, tmp_path, monkeypatch):

        coef_array = np.array([1, 1, 1, 1, 1])
        intercept_array = np.array([2])

        def mocktrain(self):
            class MockModel:
                @property
                def coef_(self):
                    return coef_array

                @property
                def intercept_(self):
                    return intercept_array

            self.model = MockModel()

        monkeypatch.setattr(LinearRegression, "train", mocktrain)

        model = LinearRegression(
            tmp_path, experiment="one_month_forecast", normalize_y=False
        )
        model.train()
        model.save_model()

        assert (
            tmp_path / "models/one_month_forecast/linear_regression/model.pkl"
        ).exists(), f"Model not saved!"

        with (tmp_path / "models/one_month_forecast/linear_regression/model.pkl").open(
            "rb"
        ) as f:
            model_dict = pickle.load(f)
        assert np.array_equal(
            coef_array, model_dict["model"]["coef"]
        ), "Different coef array saved!"
        assert np.array_equal(
            intercept_array, model_dict["model"]["intercept"]
        ), "Different intercept array saved!"
        assert (
            model_dict["experiment"] == "one_month_forecast"
        ), "Different experiment saved!"
Exemplo n.º 7
0
def regression(
    experiment="one_month_forecast",
    include_pred_month=True,
    surrounding_pixels=None,
    ignore_vars=None,
    include_static=True,
):
    # if the working directory is alread ml_drought don't need ../data
    if Path(".").absolute().as_posix().split("/")[-1] == "ml_drought":
        data_path = Path("data")
    else:
        data_path = Path("../data")

    predictor = LinearRegression(
        data_path,
        experiment=experiment,
        include_pred_month=include_pred_month,
        surrounding_pixels=surrounding_pixels,
        ignore_vars=ignore_vars,
        include_static=include_static,
    )
    predictor.train(early_stopping=5)
    predictor.evaluate(save_preds=True)
    def test_save(self, tmp_path, monkeypatch):

        coef_array = np.array([1, 1, 1, 1, 1])
        intercept_array = np.array([2])

        def mocktrain(self):
            class MockModel:
                @property
                def coef_(self):
                    return coef_array

                @property
                def intercept_(self):
                    return intercept_array

            self.model = MockModel()

        monkeypatch.setattr(LinearRegression, 'train', mocktrain)

        model = LinearRegression(tmp_path, experiment='one_month_forecast')
        model.train()
        model.save_model()

        assert (tmp_path /
                'models/one_month_forecast/linear_regression/model.pkl'
                ).exists(), f'Model not saved!'

        with (tmp_path /
              'models/one_month_forecast/linear_regression/model.pkl'
              ).open('rb') as f:
            model_dict = pickle.load(f)
        assert np.array_equal(coef_array, model_dict['model']['coef']), \
            'Different coef array saved!'
        assert np.array_equal(intercept_array, model_dict['model']['intercept']), \
            'Different intercept array saved!'
        assert model_dict[
            'experiment'] == 'one_month_forecast', 'Different experiment saved!'
Exemplo n.º 9
0
    def test_train(
        self, tmp_path, capsys, use_pred_months, experiment, monthly_agg, predict_delta
    ):
        x, _, _ = _make_dataset(size=(5, 5), const=True)
        x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
        y = x.isel(time=[-1])

        x_add1, _, _ = _make_dataset(size=(5, 5), const=True, variable_name="precip")
        x_add1 = x_add1 * 2
        x_add2, _, _ = _make_dataset(size=(5, 5), const=True, variable_name="temp")
        x_add2 = x_add2 * 3
        x = xr.merge([x, x_add1, x_add2])

        norm_dict = {
            "VHI": {"mean": 0, "std": 1},
            "precip": {"mean": 0, "std": 1},
            "temp": {"mean": 0, "std": 1},
        }

        static_norm_dict = {"VHI": {"mean": 0.0, "std": 1.0}}

        test_features = tmp_path / f"features/{experiment}/train/2001_12"
        test_features.mkdir(parents=True)
        pred_features = tmp_path / f"features/{experiment}/test/2001_12"
        pred_features.mkdir(parents=True)
        static_features = tmp_path / f"features/static"
        static_features.mkdir(parents=True)

        with (tmp_path / f"features/{experiment}/normalizing_dict.pkl").open("wb") as f:
            pickle.dump(norm_dict, f)

        with (tmp_path / f"features/static/normalizing_dict.pkl").open("wb") as f:
            pickle.dump(static_norm_dict, f)

        x.to_netcdf(test_features / "x.nc")
        x.to_netcdf(pred_features / "x.nc")
        y.to_netcdf(test_features / "y.nc")
        y.to_netcdf(pred_features / "y.nc")
        x_static.to_netcdf(static_features / "data.nc")

        model = LinearRegression(
            tmp_path,
            include_pred_month=use_pred_months,
            experiment=experiment,
            include_monthly_aggs=monthly_agg,
            predict_delta=predict_delta,
            normalize_y=True,
        )
        model.train()

        captured = capsys.readouterr()
        expected_stdout = "Epoch 1, train RMSE: "
        assert (
            expected_stdout in captured.out
        ), f"Expected stdout to be {expected_stdout}, got {captured.out}"

        assert (
            type(model.model) == linear_model.SGDRegressor
        ), f"Model attribute not a linear regression!"

        if experiment == "nowcast":
            coef_size = (3 * 35) + 2
        elif experiment == "one_month_forecast":
            coef_size = 3 * 36
        if monthly_agg:
            # doubled including the mean, tripled including the std
            coef_size *= 2
        if use_pred_months:
            coef_size += 12

        coef_size += 3  # for the yearly aggs
        coef_size += 1  # for the static variable
        coef_size += 1  # for the prev_y_var

        assert model.model.coef_.size == coef_size, f"Got unexpected coef size"

        test_arrays_dict, preds_dict = model.predict()
        assert (
            test_arrays_dict["2001_12"]["y"].size == preds_dict["2001_12"].shape[0]
        ), "Expected length of test arrays to be the same as the predictions"

        # test saving the model outputs
        model.evaluate(save_preds=True)

        save_path = model.data_path / "models" / experiment / "linear_regression"
        assert (save_path / "preds_2001_12.nc").exists()
        assert (save_path / "results.json").exists()

        pred_ds = xr.open_dataset(save_path / "preds_2001_12.nc")
        assert np.isin(["lat", "lon", "time"], [c for c in pred_ds.coords]).all()
        assert y.time == pred_ds.time
Exemplo n.º 10
0
    def test_big_mean(self, tmp_path, monkeypatch):
        def mockiter(self):
            class MockIterator:
                def __init__(self):
                    self.idx = 0
                    self.max_idx = 10

                def __iter__(self):
                    return self

                def __next__(self):
                    if self.idx < self.max_idx:
                        # batch_size = 10, timesteps = 2, num_features = 1
                        self.idx += 1
                        return (
                            (
                                np.ones((10, 2, 1)),
                                np.ones((10,), dtype=np.int8),
                                np.ones((10, 2)),
                                np.ones((10, 2)),
                                np.ones((10, 2)),
                                np.ones((10, 2)),
                                np.ones((10, 1)),
                            ),
                            None,
                        )
                    else:
                        raise StopIteration()

            return MockIterator()

        def do_nothing(
            self,
            data_path,
            batch_file_size,
            mode,
            shuffle_data,
            clear_nans,
            normalize,
            experiment,
            mask,
            pred_months,
            to_tensor,
            surrounding_pixels,
            ignore_vars,
            monthly_aggs,
            static,
            device,
            predict_delta,
            spatial_mask,
            normalize_y,
            incl_yearly_aggs,
        ):

            pass

        monkeypatch.setattr(DataLoader, "__iter__", mockiter)
        monkeypatch.setattr(DataLoader, "__init__", do_nothing)

        model = LinearRegression(tmp_path, normalize_y=False)
        calculated_mean = model._calculate_big_mean()

        # 1 for the 2 features and for the first month, 0 for the rest
        expected_mean = np.array(
            [
                1.0,
                1.0,
                1.0,
                0.0,
                0.0,
                0.0,
                0.0,
                0.0,
                0.0,
                0.0,
                0.0,
                0.0,
                0.0,
                0.0,
                1.0,
                1.0,
                1.0,
                1.0,
                1.0,
            ]
        )

        # np.isclose because of rounding
        assert np.isclose(calculated_mean, expected_mean).all()
Exemplo n.º 11
0
# train models
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from src.models import LinearRegression, LinearNetwork, Persistence
from src.models.data import DataLoader

data_path = Path("data")
l = LinearRegression(data_path)
l.train()

ln = LinearNetwork(layer_sizes=[100], data_folder=data_path)
ln.train(num_epochs=10)

# ------------------------------------------------------------------------------
# try and explain the LinearRegression model
# ------------------------------------------------------------------------------
test_arrays_loader = DataLoader(
    data_path=data_path, batch_file_size=1, shuffle_data=False, mode="test"
)
key, val = list(next(iter(test_arrays_loader)).items())[0]
explanations = l.explain(val.x)

# plot the SHAP explanations

# 1. mean spatial and temporal response
mean_expl = explanations.mean(axis=0).mean(axis=0)
x_vars = val.x_vars
Exemplo n.º 12
0
import xarray as xr

from pathlib import Path
from src.models import Persistence, LinearRegression

data_path = Path("data")

# high level api
predictor = Persistence(data_path)
predictor.evaluate(save_preds=True)

predictor = LinearRegression(data_path)
predictor.evaluate(save_preds=True)

# 1 LOAD_TRAIN_DATA get into the guts
x, y = predictor.load_train_arrays()

# 2 ds_folder_to_np (load x/y from the train data)
train_data_path = predictor.data_path / "features/train"
out_x, out_y = [], []
for subtrain in train_data_path.iterdir():
    if (subtrain / "x.nc").exists() and (subtrain / "y.nc").exists():
        arrays = predictor.ds_folder_to_np(subtrain,
                                           clear_nans=True,
                                           return_latlons=False)
        out_x.append(arrays.x)
        out_y.append(arrays.y)

# 3
x, y = xr.open_dataset(folder / "x.nc"), xr.open_dataset(folder / "y.nc")
x_np, y_np = x.to_array().values, y.to_array().values
Exemplo n.º 13
0
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import pickle

from src.analysis import plot_shap_values
from src.models import Persistence, LinearRegression, LinearNetwork
from src.models.data import DataLoader

%load_ext autoreload
%autoreload 2

data_dir = Path('/Volumes/Lees_Extend/data/ecmwf_sowc/data')
predictor = LinearRegression(data_folder=data_dir, experiment='nowcast')
predictor.train()
Exemplo n.º 14
0
    def test_train(self, tmp_path, capsys, use_pred_months, experiment,
                   monthly_agg):
        x, _, _ = _make_dataset(size=(5, 5), const=True)
        x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
        y = x.isel(time=[-1])

        x_add1, _, _ = _make_dataset(size=(5, 5),
                                     const=True,
                                     variable_name='precip')
        x_add2, _, _ = _make_dataset(size=(5, 5),
                                     const=True,
                                     variable_name='temp')
        x = xr.merge([x, x_add1, x_add2])

        norm_dict = {
            'VHI': {
                'mean': 0,
                'std': 1
            },
            'precip': {
                'mean': 0,
                'std': 1
            },
            'temp': {
                'mean': 0,
                'std': 1
            }
        }

        static_norm_dict = {'VHI': {'mean': 0.0, 'std': 1.0}}

        test_features = tmp_path / f'features/{experiment}/train/hello'
        test_features.mkdir(parents=True)
        pred_features = tmp_path / f'features/{experiment}/test/hello'
        pred_features.mkdir(parents=True)
        static_features = tmp_path / f'features/static'
        static_features.mkdir(parents=True)

        with (tmp_path /
              f'features/{experiment}/normalizing_dict.pkl').open('wb') as f:
            pickle.dump(norm_dict, f)

        with (tmp_path /
              f'features/static/normalizing_dict.pkl').open('wb') as f:
            pickle.dump(static_norm_dict, f)

        x.to_netcdf(test_features / 'x.nc')
        x.to_netcdf(pred_features / 'x.nc')
        y.to_netcdf(test_features / 'y.nc')
        y.to_netcdf(pred_features / 'y.nc')
        x_static.to_netcdf(static_features / 'data.nc')

        model = LinearRegression(tmp_path,
                                 include_pred_month=use_pred_months,
                                 experiment=experiment,
                                 include_monthly_aggs=monthly_agg)
        model.train()

        captured = capsys.readouterr()
        expected_stdout = 'Epoch 1, train RMSE: '
        assert expected_stdout in captured.out, \
            f'Expected stdout to be {expected_stdout}, got {captured.out}'

        assert type(model.model) == linear_model.SGDRegressor, \
            f'Model attribute not a linear regression!'

        if experiment == 'nowcast':
            coef_size = (3 * 35) + 2
        elif experiment == 'one_month_forecast':
            coef_size = (3 * 36)
        if monthly_agg:
            # doubled including the mean, tripled including the std
            coef_size *= 2
        if use_pred_months:
            coef_size += 12

        coef_size += 3  # for the yearly aggs
        coef_size += 1  # for the static variable

        assert model.model.coef_.size == coef_size, f'Got unexpected coef size'

        test_arrays_dict, preds_dict = model.predict()
        assert (
            test_arrays_dict['hello']['y'].size == preds_dict['hello'].shape[0]
        ), 'Expected length of test arrays to be the same as the predictions'

        # test saving the model outputs
        model.evaluate(save_preds=True)

        save_path = model.data_path / 'models' / experiment / 'linear_regression'
        assert (save_path / 'preds_hello.nc').exists()
        assert (save_path / 'results.json').exists()

        pred_ds = xr.open_dataset(save_path / 'preds_hello.nc')
        assert np.isin(['lat', 'lon', 'time'],
                       [c for c in pred_ds.coords]).all()
        assert y.time == pred_ds.time