import pickle
import random
from pathlib import Path
from typing import Dict, List, Optional

import h5py
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import tqdm
from torch.utils.data import DataLoader

# project-specific helpers (RunoffEngineer, CamelsH5, CAMELSCSV, get_basins,
# Model, NSELoss, train_epoch, etc.) are imported from the package under test


def test(self, tmp_path):
    self._initialise_data(tmp_path)

    # SETTINGS
    train_dates = [2000, 2010]
    target_var = "discharge_spec"
    x_variables = ["precipitation", "peti"]
    static_variables = ["pet_mean", "aridity", "p_seasonality"]
    seq_length = 10
    with_static = True
    concat_static = False
    basins = get_basins(tmp_path)
    with_basin_str = True

    # INITIALIZE
    engineer = RunoffEngineer(
        data_dir=tmp_path,
        basins=basins,
        train_dates=train_dates,
        with_basin_str=with_basin_str,
        target_var=target_var,
        x_variables=x_variables,
        static_variables=static_variables,
        ignore_static_vars=None,
        seq_length=seq_length,
        with_static=with_static,
        concat_static=concat_static,
    )
    engineer.create_training_data()

    h5_file = engineer.out_file
    assert h5_file.exists()

    with h5py.File(h5_file, "r") as f:
        x = f["input_data"][:]
        y = f["target_data"][:]
        str_arr = f["sample_2_basin"][:]
        str_arr = [s.decode("ascii") for s in str_arr]
        q_stds = f["q_stds"][:]

    assert isinstance(x, np.ndarray)
    assert isinstance(y, np.ndarray)
    assert isinstance(str_arr, list)
    assert isinstance(q_stds, np.ndarray)

    # one discharge std and one basin id per basin (two basins in the fixture)
    assert len(np.unique(q_stds)) == 2
    assert len(np.unique(str_arr)) == 2
    assert x[0].shape == (seq_length, len(x_variables))
    assert len(x) == len(y)
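# A minimal sketch of how the flat sample arrays in features.h5 map back to
# individual basins: `sample_2_basin` carries one basin id per sample, so a
# boolean mask recovers a single basin's block of samples. `samples_for_basin`
# is a hypothetical helper for illustration, not part of the repo.
def samples_for_basin(features_path: Path, basin: str) -> np.ndarray:
    with h5py.File(features_path, "r") as f:
        basin_ids = np.array([b.decode("ascii") for b in f["sample_2_basin"][:]])
        mask = basin_ids == basin
        # load the inputs and keep only this basin's samples
        return f["input_data"][:][mask]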
def train(
    data_dir: Path,
    basins: List[str],
    train_dates: List[int],
    with_basin_str: bool = True,
    target_var: str = "discharge_spec",
    x_variables: Optional[List[str]] = None,
    static_variables: Optional[List[str]] = None,
    ignore_static_vars: Optional[List[str]] = None,
    seq_length: int = 365,
    with_static: bool = True,
    concat_static: bool = False,
    seed: int = 10101,
    cache: bool = True,
    batch_size: int = 32,
    num_workers: int = 1,
    hidden_size: int = 256,
    initial_forget_gate_bias: int = 5,
    dropout: float = 0.4,
    use_mse: bool = True,
    learning_rate: float = 1e-3,
    epochs: int = 10,
):
    # avoid a mutable default argument
    if x_variables is None:
        x_variables = ["precipitation", "peti"]

    # set seeds for reproducibility
    random.seed(seed)
    np.random.seed(seed)
    torch.cuda.manual_seed(seed)
    torch.manual_seed(seed)

    # engineer the data for this training run
    _prepare_data(
        data_dir=data_dir,
        basins=basins,
        train_dates=train_dates,
        with_basin_str=with_basin_str,
        target_var=target_var,
        x_variables=x_variables,
        static_variables=static_variables,
        ignore_static_vars=ignore_static_vars,
        seq_length=seq_length,
        with_static=with_static,
        concat_static=concat_static,
    )

    # create dataloader
    data = CamelsH5(
        data_dir=data_dir,
        basins=basins,
        concat_static=concat_static,
        cache=cache,
        with_static=with_static,
        train_dates=train_dates,
    )

    # initialise key parameters of the Model
    input_size_stat = len(data.static_df.columns) if with_static else 0
    dynamic_size = len(data.x_variables)
    if with_static:
        input_size_dyn = dynamic_size + input_size_stat if concat_static else dynamic_size
    else:
        input_size_dyn = dynamic_size

    loader = DataLoader(data, batch_size=batch_size, shuffle=True, num_workers=num_workers)

    model = Model(
        input_size_dyn=input_size_dyn,
        input_size_stat=input_size_stat,
        hidden_size=hidden_size,
        initial_forget_bias=initial_forget_gate_bias,
        dropout=dropout,
        concat_static=concat_static,
        no_static=not with_static,  # inverse of with_static
    ).to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # define loss function
    if use_mse:
        loss_func = nn.MSELoss()
    else:
        loss_func = NSELoss()  # type: ignore

    # reduce the learning rate after epochs 10 and 20
    learning_rates = {11: 5e-4, 21: 1e-4}

    for epoch in range(1, epochs + 1):
        # set new learning rate
        if epoch in learning_rates.keys():
            for param_group in optimizer.param_groups:  # type: ignore
                param_group["lr"] = learning_rates[epoch]

        train_epoch(model, optimizer, loss_func, loader, epoch, use_mse)

        # save the model after each epoch
        model_str = _get_model_str(with_static=with_static, concat_static=concat_static)
        model_path = data_dir / f"models/model_{model_str}_epoch{epoch}.pt"
        model_path.parent.mkdir(exist_ok=True, parents=True)
        model.model_path = model_path
        torch.save(model.state_dict(), str(model_path))

    return model
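# `train` switches between `nn.MSELoss` and `NSELoss`. A minimal sketch of a
# basin-wise NSE loss in the spirit of the EA-LSTM papers (Kratzert et al.),
# where each sample's squared error is scaled by the variance of its basin's
# discharge -- the `q_stds` that the engineer stores alongside every sample.
# This is an assumption about what the repo's `NSELoss` computes, not a copy
# of its implementation.
class NSELossSketch(nn.Module):
    def __init__(self, eps: float = 0.1):
        super().__init__()
        self.eps = eps  # stabilises basins with near-zero discharge variance

    def forward(
        self, y_hat: torch.Tensor, y: torch.Tensor, q_stds: torch.Tensor
    ) -> torch.Tensor:
        squared_error = (y_hat - y) ** 2
        # weight each sample by 1 / (q_std + eps)^2 so basins with small
        # discharge variability contribute as much as high-flow basins
        weights = 1.0 / (q_stds + self.eps) ** 2
        return torch.mean(weights * squared_error)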
def evaluate(
    data_dir: Path,
    model_path: Path,
    input_size_dyn: int,
    input_size_stat: int,
    val_dates: List[int],
    with_static: bool = True,
    static_variables: Optional[List[str]] = None,
    dropout: float = 0.4,
    concat_static: bool = False,
    hidden_size: int = 256,
    target_var: str = "discharge_spec",
    x_variables: Optional[List[str]] = None,
    seq_length: int = 365,
):
    """Evaluate a trained model on the validation years.

    Runs the model over every basin in ``data_dir``, collects observed and
    simulated discharge into one DataFrame per basin, and saves the results
    with ``save_eval_results``.
    """
    # avoid a mutable default argument
    if x_variables is None:
        x_variables = ["precipitation", "peti"]

    basins = get_basins(data_dir)

    # get static data (attributes) means/stds
    static_df = load_static_data(
        data_dir=data_dir,
        basins=basins,
        drop_lat_lon=True,
        static_variables=static_variables,
    )
    means = static_df.mean()
    stds = static_df.std()

    # create model
    model = Model(
        input_size_dyn=input_size_dyn,
        input_size_stat=input_size_stat,
        hidden_size=hidden_size,
        dropout=dropout,
        concat_static=concat_static,
        no_static=not with_static,
    ).to(DEVICE)

    # load trained weights
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))

    val_dates = np.sort(val_dates)
    date_range = pd.date_range(
        start=f"{val_dates[0]}-01-01", end=f"{val_dates[-1]}-12-31", freq="D"
    )

    results: Dict[str, pd.DataFrame] = {}
    with open(data_dir / "features/normalization_dict.pkl", "rb") as fp:
        normalization_dict = pickle.load(fp)

    for basin in tqdm.tqdm(basins):
        # reuse the training data pipeline over the validation period
        ds_test = CAMELSCSV(
            data_dir=data_dir,
            basin=basin,
            train_dates=val_dates,
            normalization_dict=normalization_dict,
            is_train=True,
            target_var=target_var,
            x_variables=x_variables,
            static_variables=static_variables,
            seq_length=seq_length,
            with_static=with_static,
            concat_static=concat_static,
        )
        loader = DataLoader(ds_test, batch_size=1024, shuffle=False, num_workers=1)

        preds, obs = evaluate_basin(model, loader, normalization_dict=normalization_dict)

        df = pd.DataFrame(
            data={"qobs": obs.flatten(), "qsim": preds.flatten()}, index=date_range
        )
        results[basin] = df

    save_eval_results(
        data_dir=data_dir,
        results=results,
        with_static=with_static,
        concat_static=concat_static,
    )
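# The per-basin DataFrames above hold observed ("qobs") and simulated ("qsim")
# discharge on a shared daily index, which is all that is needed to score a
# basin with Nash-Sutcliffe efficiency. A minimal sketch (hypothetical helper,
# not part of the repo):
def nse(df: pd.DataFrame) -> float:
    obs, sim = df["qobs"].values, df["qsim"].values
    # NSE = 1 - sum((sim - obs)^2) / sum((obs - mean(obs))^2)
    denominator = np.sum((obs - obs.mean()) ** 2)
    return 1.0 - np.sum((sim - obs) ** 2) / denominator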
def test_(self, tmp_path):
    self._initialise_data(tmp_path)

    # SETTINGS
    with_basin_str = True
    train_dates = [2000]
    target_var = "discharge_spec"
    x_variables = ["precipitation", "peti"]
    static_variables = ["pet_mean", "aridity", "p_seasonality"]
    seq_length = 10
    with_static = True
    concat_static = False
    basins = get_basins(tmp_path)
    dropout = 0.4
    hidden_size = 256
    seed = 10101
    cache = True
    use_mse = True
    batch_size = 50
    num_workers = 1
    initial_forget_gate_bias = 5
    learning_rate = 1e-3
    epochs = 1

    model = train_model(
        data_dir=tmp_path,
        basins=basins,
        train_dates=train_dates,
        with_basin_str=with_basin_str,
        target_var=target_var,
        x_variables=x_variables,
        static_variables=static_variables,
        ignore_static_vars=None,
        seq_length=seq_length,
        with_static=with_static,
        concat_static=concat_static,
        dropout=dropout,
        hidden_size=hidden_size,
        seed=seed,
        cache=cache,
        use_mse=use_mse,
        batch_size=batch_size,
        num_workers=num_workers,
        initial_forget_gate_bias=initial_forget_gate_bias,
        learning_rate=learning_rate,
        epochs=epochs,
    )

    input_size_dyn = model.input_size_dyn
    input_size_stat = model.input_size_stat
    model_path = model.model_path

    evaluate_model(
        data_dir=tmp_path,
        model_path=model_path,
        input_size_dyn=input_size_dyn,
        input_size_stat=input_size_stat,
        val_dates=train_dates,
        with_static=with_static,
        static_variables=static_variables,
        dropout=dropout,
        concat_static=concat_static,
        hidden_size=hidden_size,
        target_var=target_var,
        x_variables=x_variables,
        seq_length=seq_length,
    )

    # is the data directory correctly formatted?
    dirs = ["features", "models", "interim", "raw"]
    assert all(np.isin(dirs, [d.name for d in tmp_path.iterdir()]))

    # are the models / predictions saved properly?
    results_pkl = list((tmp_path / "models").glob("*.pkl"))[0]
    assert "ealstm_results.pkl" in results_pkl.name
    assert "ealstm" in [f.name for f in (tmp_path / "models").glob("*.pt")][0]

    # check that all basins are found as keys in the results dict
    with open(results_pkl, "rb") as fp:
        results = pickle.load(fp)
    assert all(np.isin(basins, list(results.keys())))
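# Outside the test suite, the saved results can be reloaded the same way the
# assertions above do: one pickle holding a {basin: DataFrame} mapping. A
# minimal usage sketch, assuming a `data_dir` laid out by the pipeline above
# and reusing the `nse` sketch from earlier (hypothetical helper, not part of
# the repo):
def summarise_results(data_dir: Path) -> None:
    results_pkl = list((data_dir / "models").glob("*.pkl"))[0]
    with open(results_pkl, "rb") as fp:
        results = pickle.load(fp)
    for basin, df in results.items():
        print(f"{basin}: NSE = {nse(df):.3f}")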
def test(self, tmp_path):
    _copy_runoff_data_to_tmp_path(tmp_path)
    processor = CAMELSGBPreprocessor(tmp_path, open_shapefile=False)
    processor.preprocess()

    # SETTINGS
    train_dates = [2000, 2010]
    target_var = "discharge_spec"
    x_variables = ["precipitation", "peti"]
    static_variables = ["pet_mean", "aridity", "p_seasonality"]
    seq_length = 10
    with_static = True
    is_train = True
    concat_static = False

    # DERIVED values
    n_times = len(
        pd.date_range(f"{train_dates[0]}-01-01", f"{train_dates[-1]}-12-31", freq="D")
    )
    n_features = len(x_variables)
    n_stations = 2
    n_static_features = len(static_variables)

    normalization_dict = CalculateNormalizationParams(
        data_dir=tmp_path,
        train_dates=train_dates,
        target_var=target_var,
        x_variables=x_variables,
        static_variables=static_variables,
    ).normalization_dict

    assert len(list(get_basins(tmp_path))) == n_stations

    for basin in get_basins(tmp_path):
        dataset = CAMELSCSV(
            data_dir=tmp_path,
            basin=basin,
            train_dates=train_dates,
            normalization_dict=normalization_dict,
            is_train=is_train,
            target_var=target_var,
            x_variables=x_variables,
            static_variables=static_variables,
            seq_length=seq_length,
            with_static=with_static,
            concat_static=concat_static,
        )
        x = dataset.x
        y = dataset.y
        static = dataset.attributes
        scaler = dataset.normalization_dict

        assert x.shape == (n_times, seq_length, n_features)
        assert y.shape == (n_times, 1)
        assert static.shape == (1, n_static_features)

        expected = [
            "static_means",
            "static_stds",
            "target_mean",
            "target_std",
            "dynamic_stds",
            "dynamic_means",
            "x_variables",
            "target_var",
            "static_variables",
        ]
        assert all(
            np.isin(list(scaler.keys()), expected)
        ), f"Expected: {expected} Got: {list(scaler.keys())}"
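# The normalization_dict keys checked above are enough to standardise the
# dynamic inputs and to map model output back to physical units. A minimal
# sketch, assuming the stored means/stds are arrays aligned with `x_variables`
# and scalars for `target_var` (hypothetical helpers, not part of the repo):
def normalize_dynamic(x: np.ndarray, norm: dict) -> np.ndarray:
    # broadcast over the trailing feature dimension
    return (x - norm["dynamic_means"]) / norm["dynamic_stds"]


def denormalize_target(y_hat: np.ndarray, norm: dict) -> np.ndarray:
    # invert the target standardisation applied during training
    return y_hat * norm["target_std"] + norm["target_mean"]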
def test(self, tmp_path, with_static, concat_static):
    self._initialise_data(tmp_path)

    # SETTINGS
    with_basin_str = True
    train_dates = [2000, 2002]
    target_var = "discharge_spec"
    x_variables = ["precipitation", "peti"]
    static_variables = ["pet_mean", "aridity", "p_seasonality"]
    seq_length = 5
    basins = get_basins(tmp_path)

    # EXPECTED
    out_file = tmp_path / "features/features.h5"
    static_data_path = tmp_path / "interim/static/data.nc"
    n_variables = len(x_variables)
    n_static_features = len(static_variables)

    # INITIALIZE
    engineer = RunoffEngineer(
        data_dir=tmp_path,
        basins=basins,
        train_dates=train_dates,
        with_basin_str=with_basin_str,
        target_var=target_var,
        x_variables=x_variables,
        static_variables=static_variables,
        ignore_static_vars=None,
        seq_length=seq_length,
        with_static=with_static,
        concat_static=concat_static,
    )
    engineer.create_training_data()

    data = CamelsH5(
        data_dir=tmp_path,
        basins=basins,
        concat_static=concat_static,
        cache=True,
        with_static=with_static,
        train_dates=train_dates,
    )

    iterate = [d for d in data]
    assert (
        len(iterate) == ((3 * 365) + 1) * 2
    ), "Should be 3 years (365 days) + 1 leap day, for two basins"
    assert data.h5_file == out_file
    assert data.static_data_path == static_data_path

    for index in [0, -1]:
        x = data[index][0]
        q_stds = data[index][-2]
        y = data[index][-1]

        assert q_stds.numpy().shape == (1,)
        assert y.numpy().shape == (1,)

        if with_static and not concat_static:
            # 4-tuple: (x, static, q_stds, y)
            static = data[index][1]
            assert len(data[index]) == 4
            assert x.shape == (seq_length, n_variables)
            assert static.shape == (1, n_static_features)

        if with_static and concat_static:
            # 3-tuple: static attributes concatenated onto x
            assert len(data[index]) == 3
            assert x.shape == (seq_length, n_variables + n_static_features)

        if not with_static:
            # 3-tuple: (x, q_stds, y), no static attributes
            assert len(data[index]) == 3
            assert x.shape == (seq_length, n_variables)

    assert data.static_variables == static_variables
    assert data.target_var == target_var

    # smoke test: the dataset should work with a DataLoader
    loader = DataLoader(data, batch_size=32, shuffle=True, num_workers=1)
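# The tuple layouts asserted above dictate how a training loop has to unpack
# each batch: a 4-tuple when static attributes travel separately, a 3-tuple
# when they are concatenated onto x or absent. A minimal sketch of that
# unpacking (hypothetical loop, not the repo's train_epoch):
def iterate_batches(loader: DataLoader, with_static: bool, concat_static: bool):
    for batch in loader:
        if with_static and not concat_static:
            x, static, q_stds, y = batch  # static passed to the model separately
        else:
            x, q_stds, y = batch  # static concatenated into x, or not used
            static = None
        yield x, static, q_stds, y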