def engineer(data_path, experiment='one_month_forecast', process_static=True, pred_months=12, expected_length=12):
    """Run the Engineer for `experiment`, targeting VHI with test year 2018.

    Parameters
    ----------
    data_path : the root data folder passed to the Engineer
    experiment : experiment name (default 'one_month_forecast')
    process_static : whether the Engineer also processes static data
    pred_months : number of months of predictor data per sample
    expected_length : required length of the predictor timeseries

    BUG FIX: `expected_length` was accepted but silently ignored — the call
    hard-wired `expected_length=pred_months`. It is now forwarded, and the
    default (12) keeps the previous behaviour for existing callers.
    """
    engineer = Engineer(data_path, experiment=experiment, process_static=process_static)
    engineer.engineer(
        test_year=2018,
        target_variable='VHI',
        pred_months=pred_months,
        expected_length=expected_length,
    )
def engineer_static():
    """Engineer only the static (time-invariant) data.

    The data directory is resolved relative to the current working directory:
    ./data when run from the repository root ('ml_drought'), ../data otherwise.
    """
    cwd_name = Path('.').absolute().as_posix().split('/')[-1]
    data_path = Path('data') if cwd_name == 'ml_drought' else Path('../data')
    Engineer.engineer_static_only(data_path)
def engineer(pred_months=3, target_var="VCI1M"):
    """Engineer one-month-forecast features for `target_var`.

    Test years are 2016-2018; `expected_length` mirrors `pred_months`.
    Static data is not processed here.
    """
    eng = Engineer(get_data_path(), experiment="one_month_forecast", process_static=False)
    test_years = list(range(2016, 2019))
    eng.engineer(
        test_year=test_years,
        target_variable=target_var,
        pred_months=pred_months,
        expected_length=pred_months,
    )
def eng_strato():
    """Run the Engineer for the 'strato' experiment (target 'u', test year 2018)."""
    # Resolve the data dir: ./data when run from the repo root, ../data otherwise.
    in_repo_root = Path('.').absolute().as_posix().split('/')[-1] == 'ml_drought'
    if in_repo_root:
        data_path = Path('data')
    else:
        data_path = Path('../data')

    engineer = Engineer(data_path, experiment='strato')
    engineer.engineer(test_year=2018, target_variable='u')
def test_init(self, tmp_path): with pytest.raises(AssertionError) as e: Engineer(tmp_path) assert "does not exist. Has the preprocesser been run?" in str(e) (tmp_path / "interim").mkdir() Engineer(tmp_path) assert (tmp_path / "features").exists(), "Features directory not made!" assert (tmp_path / "features" / "one_month_forecast").exists(), "\
def engineer(experiment="one_month_forecast", process_static=True, pred_months=12):
    """Engineer features for `experiment`, targeting VCI over test years 2011-2018.

    `expected_length` mirrors `pred_months`.
    """
    eng = Engineer(get_data_path(), experiment=experiment, process_static=process_static)
    test_years = list(range(2011, 2019))
    eng.engineer(
        test_year=test_years,
        target_variable="VCI",
        pred_months=pred_months,
        expected_length=pred_months,
    )
def engineer(experiment='one_month_forecast', process_static=True, pred_months=12):
    """Engineer features for `experiment`, targeting VCI with test year 2018.

    The data directory is resolved relative to the current working directory:
    ./data when run from the repository root ('ml_drought'), ../data otherwise.
    """
    cwd_name = Path('.').absolute().as_posix().split('/')[-1]
    data_path = Path('data') if cwd_name == 'ml_drought' else Path('../data')

    eng = Engineer(data_path, experiment=experiment, process_static=process_static)
    eng.engineer(
        test_year=2018,
        target_variable='VCI',
        pred_months=pred_months,
        expected_length=pred_months,
    )
def test_engineer(self, tmp_path):
    """End-to-end check of Engineer.engineer for the one_month_forecast experiment.

    Verifies: test/train folder contents (x.nc / y.nc variables and lengths),
    that no test-year data leaks into the training set, and that the
    normalizing dict is saved with the expected statistics.

    Fixes: the normalizing-dict loop unpacked `val` but then re-indexed
    `norm_dict[key]` (unused variable, double lookup) — now uses `val`;
    removed redundant `f`-prefixes from placeholder-free assert messages.
    """
    _setup(tmp_path)
    pred_months = expected_length = 11
    engineer = Engineer(tmp_path)
    engineer.engineer(
        test_year=2001,
        target_variable="a",
        pred_months=pred_months,
        expected_length=expected_length,
    )

    def check_folder(folder_path):
        # y must hold only the target variable; x must hold all inputs.
        y = xr.open_dataset(folder_path / "y.nc")
        assert "b" not in set(y.variables), "Got unexpected variables in test set"

        x = xr.open_dataset(folder_path / "x.nc")
        for expected_var in {"a", "b"}:
            assert expected_var in set(x.variables), "Missing variables in testing input dataset"
        assert len(x.time.values) == expected_length, "Wrong number of months in the test x dataset"
        assert len(y.time.values) == 1, "Wrong number of months in test y dataset"

    for month in range(1, 13):
        check_folder(tmp_path / f"features/one_month_forecast/test/2001_{month}")
        check_folder(tmp_path / f"features/one_month_forecast/train/2000_{month}")

    # no 2001_* folders may appear in the training split
    assert (
        len(list((tmp_path / "features/one_month_forecast/train").glob("2001_*"))) == 0
    ), "Test data in the training data!"

    norm_path = tmp_path / "features/one_month_forecast/normalizing_dict.pkl"
    assert norm_path.exists(), "Normalizing dict not saved!"
    with norm_path.open("rb") as f:
        norm_dict = pickle.load(f)

    for key, val in norm_dict.items():
        assert key in {"a", "b"}, "Unexpected key!"
        assert val["mean"] == 1, "Mean incorrectly calculated!"
        assert val["std"] == 0, "Std incorrectly calculated!"
def test_get_preprocessed(self, tmp_path, monkeypatch):
    """_get_preprocessed_files must return exactly the files created by _setup."""
    expected_files, expected_vars = _setup(tmp_path)

    def mock_init(self, data_folder):
        # Bypass the real __init__ (directory assertions / folder creation);
        # only the attributes _get_preprocessed_files reads are set up.
        self.name = "dummy"
        self.interim_folder = data_folder / "interim"

    monkeypatch.setattr(Engineer, "__init__", mock_init)

    engineer = Engineer(tmp_path)
    retrieved = engineer._get_preprocessed_files(static=False)
    assert set(expected_files) == set(retrieved), f"Did not retrieve expected files!"
def test_static(self, tmp_path):
    """process_static should write features/static/data.nc with every expected variable."""
    _, expected_vars = _setup(tmp_path, add_times=False, static=True)
    engineer = Engineer(tmp_path, process_static=True)

    # the static output folder presumably gets created by Engineer.__init__
    assert (tmp_path / "features/static").exists(), "Static output folder does not exist!"

    engineer.process_static()
    output_file = tmp_path / "features/static/data.nc"
    # FIX: this assertion checks the output *file*, not the folder — the
    # failure message previously (incorrectly) said "folder".
    assert output_file.exists(), "Static output file does not exist!"

    static_data = xr.open_dataset(output_file)
    for var in expected_vars:
        assert var in static_data.data_vars
def engineer(
    pred_months=3,
    target_var="boku_VCI",
    process_static=False,
    global_means: bool = True,
    log_vars: Optional[List[str]] = None,
):
    """Engineer one-month-forecast features for `target_var` (test years 2016-2018).

    NOTE(review): `log_vars` is accepted but never forwarded to the Engineer —
    confirm whether it should be passed to `engineer.engineer(...)`.
    """
    eng = Engineer(get_data_path(), experiment="one_month_forecast", process_static=process_static)
    test_years = list(range(2016, 2019))
    eng.engineer(
        test_year=test_years,
        target_variable=target_var,
        pred_months=pred_months,
        expected_length=pred_months,
        global_means=global_means,
    )
def test_yearsplit(self, tmp_path):
    """All training timestamps must fall strictly before the test year (2001)."""
    _setup(tmp_path)
    dataset, _, _ = _make_dataset(size=(2, 2))
    engineer = Engineer(tmp_path)

    train = engineer._train_test_split(
        dataset,
        years=[2001],
        target_variable="VHI",
        pred_months=11,
        expected_length=11,
    )

    cutoff = np.datetime64("2001-01-01")
    assert (train.time.values < cutoff).all(), "Got years greater than the test year in the training set!"
def test_join(self, tmp_path, monkeypatch):
    """_make_dataset should merge all preprocessed files, keeping every variable."""
    expected_files, expected_vars = _setup(tmp_path)

    def mock_init(self, data_folder):
        # Bypass the real __init__; set only what _make_dataset reads.
        self.name = "dummy"
        self.interim_folder = data_folder / "interim"

    monkeypatch.setattr(Engineer, "__init__", mock_init)

    engineer = Engineer(tmp_path)
    joined_ds = engineer._make_dataset(static=False)

    dims = ["lon", "lat", "time"]
    output_vars = [var for var in joined_ds.variables if var not in dims]
    assert set(output_vars) == set(expected_vars), f"Did not retrieve all the expected variables!"
def test_stratify(self, tmp_path): _setup(tmp_path) engineer = Engineer(tmp_path) ds_target, _, _ = _make_dataset(size=(20, 20)) ds_predictor, _, _ = _make_dataset(size=(20, 20)) ds_predictor = ds_predictor.rename({"VHI": "predictor"}) ds = ds_predictor.merge(ds_target) xy_dict, max_train_date = engineer._stratify_xy( ds=ds, year=2001, target_variable="VHI", target_month=1, pred_months=4, expected_length=4, ) assert ( xy_dict["x"].time.size == 4), f'OneMonthForecast experiment `x`\ should have 4 times Got: {xy_dict["x"].time.size}' assert (max_train_date == dt.datetime(2000, 12, 31).date()), f"\
def engineer_static():
    """Engineer only the static (time-invariant) data."""
    data_dir = get_data_path()
    Engineer.engineer_static_only(data_dir)
def engineer(self, engineer_args: Dict) -> None:
    """Run the engineer on the data.

    Expects `engineer_args` to hold two sub-dicts: "init_args" (Engineer
    constructor kwargs) and "run_args" (Engineer.engineer kwargs).

    NOTE: mutates `engineer_args` in place — "init_args"["data_folder"] is
    overwritten with `self.data` before the Engineer is constructed.
    """
    init_args = engineer_args["init_args"]
    init_args["data_folder"] = self.data
    engineer = Engineer(**init_args)
    engineer.engineer(**engineer_args["run_args"])
def run_training_period_experiments(pred_months: int = 3):
    """Run the full sweep of training-period experiments.

    For every combination of train_hilo x test_hilo x train_length, engineer
    the train/test splits, run the models, dump an experiment.json summary,
    and rename the features directory so the next experiment starts fresh.

    Parameters
    ----------
    pred_months : number of predictor months per sample (expected_length
        mirrors this value).

    BUG FIX: the experiment.json file was opened in binary mode ("wb"), but
    `json.dump` writes `str` — that raised TypeError at runtime. Opened in
    text mode ("w") instead.
    """
    expected_length = pred_months

    # Read the target data
    print("** Reading the target data **")
    data_dir = get_data_path()
    target_data = xr.open_dataset(
        data_dir / "interim" / "VCI_preprocessed" / "data_kenya.nc"
    )

    # sort by the annual median (across pixels/time)
    print("** Sorting the target data **")
    sorted_years, _ = sort_by_median_target_variable(target_data)
    print(f"** sorted_years: {sorted_years} **")
    print(f"** min_year: {min(sorted_years)} max_year: {max(sorted_years)} **")

    # create all experiments
    # train_hilo(9), test_hilo(3), train_length(1)
    print("** Creating all experiments **")
    hilos = ["high", "med", "low"]
    train_lengths = [5, 10, 20]
    experiments = [
        Experiment(train_length=train_length, train_hilo=train_hilo, test_hilo=test_hilo)
        for train_hilo, test_hilo, train_length in itertools.product(
            hilos, hilos, train_lengths
        )
    ]

    print("** Running all experiments **")
    # NOTE(review): iteration starts at index 7 — looks like this resumes a
    # partially-completed sweep; confirm the offset is still intended.
    for experiment in experiments[7:]:
        test_years, train_years = get_experiment_years(
            sorted_years,
            experiment.train_length,
            experiment.test_hilo,
            experiment.train_hilo,
            test_length=3,
        )
        debug = True
        if debug:
            print(
                "\n" + "-" * 10 + "\n",
                "train_length: " + str(experiment.train_length),
                "test_hilo: " + experiment.test_hilo,
                "train_hilo: " + experiment.train_hilo,
                "\ntrain_years:\n",
                train_years,
                "\n",
                "test_years:\n",
                test_years,
                "\n" + "-" * 10 + "\n",
            )

        # have to recreate each engineer for the experiment
        # TODO: definite inefficiency should this be in DataLoader?
        engineer = Engineer(
            get_data_path(),
            experiment="one_month_forecast",
            process_static=True,
            different_training_periods=True,
        )
        engineer.engineer_class.engineer(
            test_year=test_years,  # defined by experiment
            train_years=train_years,  # defined by experiment
            pred_months=pred_months,  # 3 by default
            expected_length=expected_length,  # == pred_month by default
            target_variable="VCI",
        )
        # TODO: add extra years if selected the first year in timeseries
        # (often not 12 months) e.g. 1981_11 is the first valid month in our dataset

        # Run the models
        always_ignore_vars = ["ndvi", "p84.162", "sp", "tp", "Eb"]
        ignore_vars = always_ignore_vars
        run_experiments(
            train_hilo=experiment.train_hilo,
            test_hilo=experiment.test_hilo,
            train_length=len(train_years),
            ignore_vars=ignore_vars,
            run_regression=False,
            all_models=False,
            static=True,
        )

        # save some key facts about the experiment to an experiment.json file
        expt_dict = dict(
            train_hilo=experiment.train_hilo,
            test_hilo=experiment.test_hilo,
            train_length=len(train_years),
            ignore_vars=ignore_vars,
            train_years=train_years,
            test_years=test_years,
        )
        # FIX: json.dump emits str, so the file must be opened in text mode.
        with open(data_dir / "models/one_month_forecast/experiment.json", "w") as fp:
            json.dump(expt_dict, fp, sort_keys=True, indent=4)

        # rename the features/one_month_forecast directory
        rename_experiment_dir(
            data_dir,
            train_hilo=experiment.train_hilo,
            test_hilo=experiment.test_hilo,
            train_length=len(train_years),
            dir_="features",
        )
import numpy as np
from collections import defaultdict
import calendar
from datetime import datetime, date
from pathlib import Path
import xarray as xr
from typing import cast, Dict, List, Optional, Union, Tuple
from typing import DefaultDict as DDict

from src.engineer import Engineer
from src.preprocess.base import BasePreProcessor

# Exploratory/debugging script: run the Engineer against a local external
# drive and inspect the intermediate preprocessed files one by one.
# NOTE(review): the hard-coded /Volumes path is machine-specific.
data_path = Path("/Volumes/Lees_Extend/data/ecmwf_sowc/data")

engineer = Engineer(data_path)
engineer.engineer(test_year=1990, target_variable="VHI", pred_months=3, expected_length=3)

# wrong shapes!
# Open every preprocessed file and pull out, per dataset:
#   - its dimension names,
#   - the single non-dimension variable name,
#   - the corresponding DataArray.
datasets = engineer._get_preprocessed_files()
ds_list = [xr.open_dataset(ds) for ds in datasets]
dims_list = [[dim for dim in ds.dims] for ds in ds_list]
variable_list = [[var for var in ds.variables if var not in dims_list[i]][0] for i, ds in enumerate(ds_list)]
da_list = [ds[variable_list[i]] for i, ds in enumerate(ds_list)]

# ds = engineer._make_dataset()
years = [1990]
engineer._stratify_training_data
engineer._train_test_split
engineer.stratify_xy
engineer.get_datetime
engineer._save
"""
import xarray as xr
import numpy as np
from pathlib import Path

from src.preprocess.base import BasePreProcessor
from src.engineer import Engineer

# Exploratory/debugging script: open the preprocessed files that the Engineer
# would consume and bind each one to a named variable for inspection.
# NOTE(review): the hard-coded /Volumes path is machine-specific.
data_path = Path("/Volumes/Lees_Extend/data/ecmwf_sowc/data")
engineer = Engineer(data_path)

# engineer.engineer(test_year=1994, target_variable='VHI')
# wrong shapes!
# Per dataset: dimension names, the single non-dimension variable name,
# and the corresponding DataArray.
datasets = engineer._get_preprocessed_files()
ds_list = [xr.open_dataset(ds) for ds in datasets]
dims_list = [[dim for dim in ds.dims] for ds in ds_list]
variable_list = [[var for var in ds.variables if var not in dims_list[i]][0] for i, ds in enumerate(ds_list)]
da_list = [ds[variable_list[i]] for i, ds in enumerate(ds_list)]

pp = BasePreProcessor(data_path)

# assumes the preprocessed files arrive in a stable order: chirps / era5 / vhi?
# — TODO confirm which dataset each index holds.
c_ds = ds_list[0]
e_ds = ds_list[1]
v_ds = ds_list[2]