def regression(
    experiment="one_month_forecast",
    include_pred_month=True,
    surrounding_pixels=None,
    explain=False,
    static="features",
    ignore_vars=None,
    predict_delta=False,
    spatial_mask=None,
    include_latlons=False,
):
    """Train and evaluate a ``LinearRegression`` model, optionally explaining it.

    Every argument except ``explain`` is forwarded unchanged, by keyword, to
    the ``LinearRegression`` constructor; the data directory comes from
    ``get_data_path()``.

    Args:
        experiment: forwarded to ``LinearRegression``.
        include_pred_month: forwarded to ``LinearRegression``.
        surrounding_pixels: forwarded to ``LinearRegression``.
        explain: when truthy, ``explain(save_shap_values=True)`` is called on
            the model after evaluation.
        static: forwarded to ``LinearRegression``.
        ignore_vars: forwarded to ``LinearRegression``.
        predict_delta: forwarded to ``LinearRegression``.
        spatial_mask: forwarded to ``LinearRegression``.
        include_latlons: forwarded to ``LinearRegression``.
    """
    model_options = {
        "experiment": experiment,
        "include_pred_month": include_pred_month,
        "surrounding_pixels": surrounding_pixels,
        "static": static,
        "ignore_vars": ignore_vars,
        "predict_delta": predict_delta,
        "spatial_mask": spatial_mask,
        "include_latlons": include_latlons,
    }
    model = LinearRegression(get_data_path(), **model_options)

    model.train()
    model.evaluate(save_preds=True)

    if not explain:
        return

    # mostly to test it works
    model.explain(save_shap_values=True)
def regression(experiment='one_month_forecast',
               include_pred_month=True,
               surrounding_pixels=1):
    """Train, evaluate and explain a ``LinearRegression`` model.

    The data directory is chosen from the current working directory: ``data``
    when the working directory is named ``ml_drought``, ``../data`` otherwise.

    Args:
        experiment: forwarded to ``LinearRegression``.
        include_pred_month: forwarded to ``LinearRegression``.
        surrounding_pixels: forwarded to ``LinearRegression``.
    """
    # if the working directory is already ml_drought we don't need ../data
    if Path.cwd().name == 'ml_drought':
        data_path = Path('data')
    else:
        data_path = Path('../data')

    predictor = LinearRegression(data_path, experiment=experiment,
                                 include_pred_month=include_pred_month,
                                 surrounding_pixels=surrounding_pixels)
    predictor.train()
    predictor.evaluate(save_preds=True)

    # mostly to test it works
    predictor.explain(save_shap_values=True)
def test_save(self, tmp_path, monkeypatch):
    """Check that ``save_model`` writes the expected pickle.

    ``LinearRegression.train`` is replaced by a stub that installs a fake
    model with known ``coef_`` / ``intercept_`` values, so the test only
    exercises saving. The saved file is then read back and compared against
    those known values and the experiment name.
    """
    coef_array = np.array([1, 1, 1, 1, 1])
    intercept_array = np.array([2])

    def mocktrain(self):
        # Stand-in for training: just attach an object exposing the two
        # attributes this test later expects to find in the saved file.
        class MockModel:
            @property
            def coef_(self):
                return coef_array

            @property
            def intercept_(self):
                return intercept_array

        self.model = MockModel()

    monkeypatch.setattr(LinearRegression, "train", mocktrain)

    model = LinearRegression(
        tmp_path, experiment="one_month_forecast", normalize_y=False
    )
    model.train()
    model.save_model()

    # Single definition of the expected output location, used for both the
    # existence check and the read-back.
    model_file = tmp_path / "models/one_month_forecast/linear_regression/model.pkl"
    assert model_file.exists(), "Model not saved!"

    # The pickle was written by this test into tmp_path, so loading it is safe.
    with model_file.open("rb") as f:
        model_dict = pickle.load(f)

    assert np.array_equal(
        coef_array, model_dict["model"]["coef"]
    ), "Different coef array saved!"
    assert np.array_equal(
        intercept_array, model_dict["model"]["intercept"]
    ), "Different intercept array saved!"
    assert (
        model_dict["experiment"] == "one_month_forecast"
    ), "Different experiment saved!"
def regression(
    experiment="one_month_forecast",
    include_pred_month=True,
    surrounding_pixels=None,
    ignore_vars=None,
):
    """Train and evaluate a ``LinearRegression`` model with ``static="embeddings"``.

    The data directory comes from ``get_data_path()``. No spatial mask is
    passed to the model (``spatial_mask=None``).

    Args:
        experiment: forwarded to ``LinearRegression``.
        include_pred_month: forwarded to ``LinearRegression``.
        surrounding_pixels: forwarded to ``LinearRegression``.
        ignore_vars: forwarded to ``LinearRegression``.
    """
    data_path = get_data_path()

    # No mask is applied. To use the preprocessed Kenya ASAL mask file, set
    # this to: data_path / "interim/boundaries_preprocessed/kenya_asal_mask.nc"
    spatial_mask = None

    predictor = LinearRegression(
        data_path,
        experiment=experiment,
        include_pred_month=include_pred_month,
        surrounding_pixels=surrounding_pixels,
        ignore_vars=ignore_vars,
        static="embeddings",
        spatial_mask=spatial_mask,
    )
    predictor.train()
    predictor.evaluate(save_preds=True)
def regression(
    experiment="one_month_forecast",
    include_pred_month=True,
    surrounding_pixels=None,
    ignore_vars=None,
    include_static=True,
):
    """Train (with ``early_stopping=5``) and evaluate a ``LinearRegression`` model.

    The data directory is chosen from the current working directory: ``data``
    when the working directory is named ``ml_drought``, ``../data`` otherwise.

    Args:
        experiment: forwarded to ``LinearRegression``.
        include_pred_month: forwarded to ``LinearRegression``.
        surrounding_pixels: forwarded to ``LinearRegression``.
        ignore_vars: forwarded to ``LinearRegression``.
        include_static: forwarded to ``LinearRegression``.
    """
    # if the working directory is already ml_drought we don't need ../data
    if Path.cwd().name == "ml_drought":
        data_path = Path("data")
    else:
        data_path = Path("../data")

    predictor = LinearRegression(
        data_path,
        experiment=experiment,
        include_pred_month=include_pred_month,
        surrounding_pixels=surrounding_pixels,
        ignore_vars=ignore_vars,
        include_static=include_static,
    )
    predictor.train(early_stopping=5)
    predictor.evaluate(save_preds=True)
def test_save(self, tmp_path, monkeypatch):
    """Check that ``save_model`` writes the expected pickle.

    ``LinearRegression.train`` is replaced by a stub that installs a fake
    model with known ``coef_`` / ``intercept_`` values, so the test only
    exercises saving. The saved file is then read back and compared against
    those known values and the experiment name.
    """
    coef_array = np.array([1, 1, 1, 1, 1])
    intercept_array = np.array([2])

    def mocktrain(self):
        # Stand-in for training: just attach an object exposing the two
        # attributes this test later expects to find in the saved file.
        class MockModel:
            @property
            def coef_(self):
                return coef_array

            @property
            def intercept_(self):
                return intercept_array

        self.model = MockModel()

    monkeypatch.setattr(LinearRegression, 'train', mocktrain)

    model = LinearRegression(tmp_path, experiment='one_month_forecast')
    model.train()
    model.save_model()

    # Single definition of the expected output location, used for both the
    # existence check and the read-back.
    model_file = tmp_path / 'models/one_month_forecast/linear_regression/model.pkl'
    assert model_file.exists(), 'Model not saved!'

    # The pickle was written by this test into tmp_path, so loading it is safe.
    with model_file.open('rb') as f:
        model_dict = pickle.load(f)

    assert np.array_equal(coef_array, model_dict['model']['coef']), \
        'Different coef array saved!'
    assert np.array_equal(intercept_array, model_dict['model']['intercept']), \
        'Different intercept array saved!'
    assert model_dict['experiment'] == 'one_month_forecast', \
        'Different experiment saved!'
def test_train(
    self, tmp_path, capsys, use_pred_months, experiment, monthly_agg, predict_delta
):
    """Train a ``LinearRegression`` on synthetic data and check its outputs.

    Builds small datasets with ``_make_dataset``, writes them (plus the
    normalizing dictionaries) under ``tmp_path`` in the ``features/...``
    layout, then trains the model and checks: the training message on stdout,
    the fitted model type, the number of coefficients, the prediction length,
    and the files written by ``evaluate(save_preds=True)``.
    """
    x, _, _ = _make_dataset(size=(5, 5), const=True)
    x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
    # The target is the last timestep of the first dataset (taken before the
    # extra variables are merged in below).
    y = x.isel(time=[-1])

    x_add1, _, _ = _make_dataset(size=(5, 5), const=True, variable_name="precip")
    x_add1 = x_add1 * 2
    x_add2, _, _ = _make_dataset(size=(5, 5), const=True, variable_name="temp")
    x_add2 = x_add2 * 3
    x = xr.merge([x, x_add1, x_add2])

    norm_dict = {
        "VHI": {"mean": 0, "std": 1},
        "precip": {"mean": 0, "std": 1},
        "temp": {"mean": 0, "std": 1},
    }
    static_norm_dict = {"VHI": {"mean": 0.0, "std": 1.0}}

    test_features = tmp_path / f"features/{experiment}/train/2001_12"
    test_features.mkdir(parents=True)
    pred_features = tmp_path / f"features/{experiment}/test/2001_12"
    pred_features.mkdir(parents=True)
    static_features = tmp_path / "features/static"
    static_features.mkdir(parents=True)

    with (tmp_path / f"features/{experiment}/normalizing_dict.pkl").open("wb") as f:
        pickle.dump(norm_dict, f)
    with (tmp_path / "features/static/normalizing_dict.pkl").open("wb") as f:
        pickle.dump(static_norm_dict, f)

    # The same x / y pair is used for both the train and the test split.
    x.to_netcdf(test_features / "x.nc")
    x.to_netcdf(pred_features / "x.nc")
    y.to_netcdf(test_features / "y.nc")
    y.to_netcdf(pred_features / "y.nc")
    x_static.to_netcdf(static_features / "data.nc")

    model = LinearRegression(
        tmp_path,
        include_pred_month=use_pred_months,
        experiment=experiment,
        include_monthly_aggs=monthly_agg,
        predict_delta=predict_delta,
        normalize_y=True,
    )
    model.train()

    captured = capsys.readouterr()
    expected_stdout = "Epoch 1, train RMSE: "
    assert (
        expected_stdout in captured.out
    ), f"Expected stdout to be {expected_stdout}, got {captured.out}"

    assert isinstance(
        model.model, linear_model.SGDRegressor
    ), "Model attribute not a linear regression!"

    if experiment == "nowcast":
        coef_size = (3 * 35) + 2
    elif experiment == "one_month_forecast":
        coef_size = 3 * 36

    if monthly_agg:
        # doubled including the mean, tripled including the std
        # NOTE(review): only the doubling is applied here - confirm the
        # "tripled" case is not expected by the model.
        coef_size *= 2
    if use_pred_months:
        coef_size += 12

    coef_size += 3  # for the yearly aggs
    coef_size += 1  # for the static variable
    coef_size += 1  # for the prev_y_var

    assert model.model.coef_.size == coef_size, "Got unexpected coef size"

    test_arrays_dict, preds_dict = model.predict()
    assert (
        test_arrays_dict["2001_12"]["y"].size == preds_dict["2001_12"].shape[0]
    ), "Expected length of test arrays to be the same as the predictions"

    # test saving the model outputs
    model.evaluate(save_preds=True)

    save_path = model.data_path / "models" / experiment / "linear_regression"
    preds_file = save_path / "preds_2001_12.nc"
    assert preds_file.exists()
    assert (save_path / "results.json").exists()

    # Open the predictions in a context manager so the file handle is
    # released even when an assertion fails.
    with xr.open_dataset(preds_file) as pred_ds:
        assert np.isin(["lat", "lon", "time"], list(pred_ds.coords)).all()
        assert y.time == pred_ds.time
# train models from pathlib import Path import matplotlib.pyplot as plt import numpy as np import pandas as pd import seaborn as sns from src.models import LinearRegression, LinearNetwork, Persistence from src.models.data import DataLoader data_path = Path("data") l = LinearRegression(data_path) l.train() ln = LinearNetwork(layer_sizes=[100], data_folder=data_path) ln.train(num_epochs=10) # ------------------------------------------------------------------------------ # try and explain the LinearRegression model # ------------------------------------------------------------------------------ test_arrays_loader = DataLoader( data_path=data_path, batch_file_size=1, shuffle_data=False, mode="test" ) key, val = list(next(iter(test_arrays_loader)).items())[0] explanations = l.explain(val.x) # plot the SHAP explanations # 1. mean spatial and temporal response mean_expl = explanations.mean(axis=0).mean(axis=0) x_vars = val.x_vars
# Notebook cell (uses IPython ``%`` magics, so it is not runnable as a plain
# Python module): trains a LinearRegression for the 'nowcast' experiment.
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import pickle

from src.analysis import plot_shap_values
from src.models import Persistence, LinearRegression, LinearNetwork
from src.models.data import DataLoader

# Reload edited project modules automatically before each cell execution.
%load_ext autoreload
%autoreload 2

# NOTE(review): hard-coded path to an external volume on the author's
# machine - change it to your local data directory before running.
data_dir = Path('/Volumes/Lees_Extend/data/ecmwf_sowc/data')

predictor = LinearRegression(data_folder=data_dir, experiment='nowcast')
predictor.train()
def test_train(self, tmp_path, capsys, use_pred_months, experiment, monthly_agg):
    """Train a ``LinearRegression`` on synthetic data and check its outputs.

    Builds small datasets with ``_make_dataset``, writes them (plus the
    normalizing dictionaries) under ``tmp_path`` in the ``features/...``
    layout, then trains the model and checks: the training message on stdout,
    the fitted model type, the number of coefficients, the prediction length,
    and the files written by ``evaluate(save_preds=True)``.
    """
    x, _, _ = _make_dataset(size=(5, 5), const=True)
    x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
    # The target is the last timestep of the first dataset (taken before the
    # extra variables are merged in below).
    y = x.isel(time=[-1])

    x_add1, _, _ = _make_dataset(size=(5, 5), const=True, variable_name='precip')
    x_add2, _, _ = _make_dataset(size=(5, 5), const=True, variable_name='temp')
    x = xr.merge([x, x_add1, x_add2])

    norm_dict = {
        'VHI': {'mean': 0, 'std': 1},
        'precip': {'mean': 0, 'std': 1},
        'temp': {'mean': 0, 'std': 1},
    }
    static_norm_dict = {'VHI': {'mean': 0.0, 'std': 1.0}}

    test_features = tmp_path / f'features/{experiment}/train/hello'
    test_features.mkdir(parents=True)
    pred_features = tmp_path / f'features/{experiment}/test/hello'
    pred_features.mkdir(parents=True)
    static_features = tmp_path / 'features/static'
    static_features.mkdir(parents=True)

    with (tmp_path / f'features/{experiment}/normalizing_dict.pkl').open('wb') as f:
        pickle.dump(norm_dict, f)
    with (tmp_path / 'features/static/normalizing_dict.pkl').open('wb') as f:
        pickle.dump(static_norm_dict, f)

    # The same x / y pair is used for both the train and the test split.
    x.to_netcdf(test_features / 'x.nc')
    x.to_netcdf(pred_features / 'x.nc')
    y.to_netcdf(test_features / 'y.nc')
    y.to_netcdf(pred_features / 'y.nc')
    x_static.to_netcdf(static_features / 'data.nc')

    model = LinearRegression(tmp_path, include_pred_month=use_pred_months,
                             experiment=experiment,
                             include_monthly_aggs=monthly_agg)
    model.train()

    captured = capsys.readouterr()
    expected_stdout = 'Epoch 1, train RMSE: '
    assert expected_stdout in captured.out, \
        f'Expected stdout to be {expected_stdout}, got {captured.out}'

    assert isinstance(model.model, linear_model.SGDRegressor), \
        'Model attribute not a linear regression!'

    if experiment == 'nowcast':
        coef_size = (3 * 35) + 2
    elif experiment == 'one_month_forecast':
        coef_size = (3 * 36)

    if monthly_agg:
        # doubled including the mean, tripled including the std
        # NOTE(review): only the doubling is applied here - confirm the
        # "tripled" case is not expected by the model.
        coef_size *= 2
    if use_pred_months:
        coef_size += 12

    coef_size += 3  # for the yearly aggs
    coef_size += 1  # for the static variable

    assert model.model.coef_.size == coef_size, 'Got unexpected coef size'

    test_arrays_dict, preds_dict = model.predict()
    assert (
        test_arrays_dict['hello']['y'].size == preds_dict['hello'].shape[0]
    ), 'Expected length of test arrays to be the same as the predictions'

    # test saving the model outputs
    model.evaluate(save_preds=True)

    save_path = model.data_path / 'models' / experiment / 'linear_regression'
    preds_file = save_path / 'preds_hello.nc'
    assert preds_file.exists()
    assert (save_path / 'results.json').exists()

    # Open the predictions in a context manager so the file handle is
    # released even when an assertion fails.
    with xr.open_dataset(preds_file) as pred_ds:
        assert np.isin(['lat', 'lon', 'time'], list(pred_ds.coords)).all()
        assert y.time == pred_ds.time