def feature_reduction(basin_list: Union[str, Path], run_dir: Union[str, Path],
                      figure_path: Union[str, Path]):
    if isinstance(basin_list, str):
        basin_list = Path(basin_list)
    elif not isinstance(basin_list, Path):
        raise TypeError(f"basin_list must be str or Path, not {type(basin_list)}")
    if isinstance(run_dir, str):
        run_dir = Path(run_dir)
    elif not isinstance(run_dir, Path):
        raise TypeError(f"run_dir must be str or Path, not {type(run_dir)}")
    if isinstance(figure_path, str):
        figure_path = Path(figure_path)
    elif not isinstance(figure_path, Path):
        raise TypeError(f"figure_path must be str or Path, not {type(figure_path)}")

    basins = np.loadtxt(basin_list, dtype=str)
    print(basins)
    features = load_attributes(db_path=run_dir / "attributes.db", basins=basins)
    print(features)

    # hierarchical (Ward) clustering on the correlation matrix of the static features
    corr = features.corr()
    corr_linkage = spc.hierarchy.ward(corr)
    print(corr_linkage)

    fig, ax = plt.subplots(1, 1)
    dendro = spc.hierarchy.dendrogram(corr_linkage, labels=features.columns, ax=ax)
    # plt.show()
    fig.tight_layout()
    fig.savefig(figure_path)

    print(list(features.columns))
    matching = [s for s in list(features.columns) if "_missing" in s]
    print(matching)
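
# The dendrogram above only visualizes the feature hierarchy. A minimal sketch
# of the actual reduction step, assuming we cut the Ward linkage at a distance
# threshold and keep the first feature of each cluster (this helper and the
# threshold value are illustrative, not part of the original pipeline):
def _select_representative_features(features, threshold=1.0):
    corr_linkage = spc.hierarchy.ward(features.corr())
    # assign each feature to a flat cluster at the given cophenetic distance
    cluster_ids = spc.hierarchy.fcluster(corr_linkage, threshold, criterion="distance")
    representatives = {}
    for feature, cluster_id in zip(features.columns, cluster_ids):
        representatives.setdefault(cluster_id, feature)  # keep first feature per cluster
    return list(representatives.values())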
def evaluate(user_cfg: Dict):
    """Evaluate a trained model.

    Parameters
    ----------
    user_cfg : Dict
        Dictionary containing the user entered evaluation config
    """
    with open(user_cfg["run_dir"] / 'cfg.json', 'r') as fp:
        run_cfg = json.load(fp)

    basins = run_cfg["basins"]

    # get attribute means/stds
    db_path = str(user_cfg["run_dir"] / "attributes.db")
    attributes = load_attributes(db_path=db_path, basins=basins, drop_lat_lon=True)
    means = attributes.mean()
    stds = attributes.std()

    # create model
    input_size_stat = 0 if run_cfg["no_static"] else 27
    input_size_dyn = 5 if (run_cfg["no_static"] or not run_cfg["concat_static"]) else 32
    model = Model(input_size_dyn=input_size_dyn,
                  input_size_stat=input_size_stat,
                  hidden_size=run_cfg["hidden_size"],
                  dropout=run_cfg["dropout"],
                  concat_static=run_cfg["concat_static"],
                  no_static=run_cfg["no_static"]).to(DEVICE)

    # load trained model
    weight_file = user_cfg["run_dir"] / 'model_epoch30.pt'
    model.load_state_dict(torch.load(weight_file, map_location=DEVICE))

    date_range = pd.date_range(start=GLOBAL_SETTINGS["val_start"],
                               end=GLOBAL_SETTINGS["val_end"])
    results = {}
    for basin in tqdm(basins):
        ds_test = CamelsTXT(camels_root=user_cfg["camels_root"],
                            basin=basin,
                            dates=[GLOBAL_SETTINGS["val_start"], GLOBAL_SETTINGS["val_end"]],
                            is_train=False,
                            seq_length=run_cfg["seq_length"],
                            with_attributes=True,
                            attribute_means=means,
                            attribute_stds=stds,
                            concat_static=run_cfg["concat_static"],
                            db_path=db_path)
        loader = DataLoader(ds_test, batch_size=1024, shuffle=False, num_workers=4)

        preds, obs = evaluate_basin(model, loader)

        df = pd.DataFrame(data={'qobs': obs.flatten(), 'qsim': preds.flatten()},
                          index=date_range)
        results[basin] = df

    _store_results(user_cfg, run_cfg, results)
def evaluate(user_cfg: Dict):
    """Evaluate a trained model.

    Parameters
    ----------
    user_cfg : Dict
        Dictionary containing the user entered evaluation config
    """
    with open(user_cfg["run_dir"] / 'cfg.json', 'r') as fp:
        run_cfg = json.load(fp)

    basins = run_cfg["basins"]

    # get attribute means/stds
    db_path = str(user_cfg["run_dir"] / "attributes.db")
    attributes = load_attributes(db_path=db_path, basins=basins, drop_lat_lon=True)
    means = attributes.mean()
    stds = attributes.std()

    # load trained model
    model_file = user_cfg["run_dir"] / 'model.pkl'
    with open(model_file, 'rb') as fp:
        model = pickle.load(fp)

    date_range = pd.date_range(start=GLOBAL_SETTINGS["val_start"],
                               end=GLOBAL_SETTINGS["val_end"])
    results = {}
    for basin in tqdm(basins):
        ds_test = CamelsTXT(camels_root=user_cfg["camels_root"],
                            basin=basin,
                            dates=[GLOBAL_SETTINGS["val_start"], GLOBAL_SETTINGS["val_end"]],
                            is_train=False,
                            seq_length=run_cfg["seq_length"],
                            with_attributes=True,
                            attribute_means=means,
                            attribute_stds=stds,
                            concat_static=False,
                            db_path=db_path)

        preds, obs = evaluate_basin(model, ds_test, run_cfg["no_static"])

        df = pd.DataFrame(data={'qobs': obs.flatten(), 'qsim': preds.flatten()},
                          index=date_range)
        results[basin] = df

    _store_results(user_cfg, run_cfg, results)
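
# Both evaluate() variants above only persist the per-basin DataFrames. A
# hedged sketch for summarizing such a results dict in memory; the helper is
# ours, but it reuses the repo's calc_nse(obs, sim) metric:
def _summarize_results(results):
    nses = [calc_nse(df['qobs'].values, df['qsim'].values) for df in results.values()]
    print(f"Mean NSE {np.mean(nses):.3f}, median NSE {np.median(nses):.3f}")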
def static_feature_relation(run_dir: Union[str, Path],
                            camels_dir: Union[str, Path]) -> pd.DataFrame:
    if isinstance(run_dir, str):
        run_dir = Path(run_dir)
    elif not isinstance(run_dir, Path):
        raise TypeError(f"run_dir must be str or Path, not {type(run_dir)}")
    if isinstance(camels_dir, str):
        camels_dir = Path(camels_dir)
    elif not isinstance(camels_dir, Path):
        raise TypeError(f"camels_dir must be str or Path, not {type(camels_dir)}")

    # the random seed is encoded as the last underscore-separated token of the run dir name
    seed = str(run_dir).split("_")[-1]
    results = eval_lstm_models([run_dir], calc_nse)[seed]
    # for basin in results.keys():
    #     print(f"Basin {basin}: {results[basin]}")

    features = load_attributes(run_dir / "attributes.db",
                               basins=list(results.keys())).sort_index()

    df_results = pd.DataFrame.from_dict(results, orient="index", columns=["NSE"])
    df_results.index.name = "gauge_id"
    df_results = df_results.sort_index()

    # one row per basin: static attributes plus the model's NSE
    df = pd.concat([features, df_results], axis=1)
    return df
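
# A hedged follow-up sketch: rank static attributes by absolute Pearson
# correlation with the per-basin NSE. This helper is ours and not part of the
# original analysis; it only assumes the DataFrame returned above:
def _rank_attributes_by_nse(df):
    corr = df.corr()["NSE"].drop("NSE")
    return corr.abs().sort_values(ascending=False)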
def eval_robustness(user_cfg: Dict):
    """Evaluate model robustness of the EA-LSTM.

    In this experiment, Gaussian noise with increasing scale is added to the
    static features to evaluate the model robustness against perturbations of
    the static catchment characteristics. For each scale, 50 noise vectors are
    drawn.

    Parameters
    ----------
    user_cfg : Dict
        Dictionary containing the user entered evaluation config

    Raises
    ------
    NotImplementedError
        If the specified run_dir does not point to an EA-LSTM model folder.
    """
    random.seed(user_cfg["seed"])
    np.random.seed(user_cfg["seed"])

    # fixed settings for this analysis
    n_repetitions = 50
    scales = [0.1 * i for i in range(11)]

    with open(user_cfg["run_dir"] / 'cfg.json', 'r') as fp:
        run_cfg = json.load(fp)

    if run_cfg["concat_static"] or run_cfg["no_static"]:
        raise NotImplementedError("This function is only implemented for EA-LSTM models")

    basins = run_cfg["basins"]

    # get attribute means/stds
    db_path = str(user_cfg["run_dir"] / "attributes.db")
    attributes = load_attributes(db_path=db_path, basins=basins, drop_lat_lon=True)
    means = attributes.mean()
    stds = attributes.std()

    # initialize model and load trained weights
    model = Model(input_size_dyn=5,
                  input_size_stat=27,
                  hidden_size=run_cfg["hidden_size"],
                  dropout=run_cfg["dropout"]).to(DEVICE)
    weight_file = user_cfg["run_dir"] / "model_epoch30.pt"
    model.load_state_dict(torch.load(weight_file, map_location=DEVICE))

    overall_results = {}

    # progress bar handle
    pbar = tqdm(basins, file=sys.stdout)
    for basin in pbar:
        ds_test = CamelsTXT(camels_root=user_cfg["camels_root"],
                            basin=basin,
                            dates=[GLOBAL_SETTINGS["val_start"], GLOBAL_SETTINGS["val_end"]],
                            is_train=False,
                            with_attributes=True,
                            attribute_means=means,
                            attribute_stds=stds,
                            db_path=db_path)
        loader = DataLoader(ds_test, batch_size=len(ds_test), shuffle=False, num_workers=0)

        basin_results = defaultdict(list)
        step = 1
        for scale in scales:
            # a single pass suffices for the noise-free baseline (scale 0.0)
            for _ in range(1 if scale == 0.0 else n_repetitions):
                noise = np.random.normal(loc=0, scale=scale, size=27).astype(np.float32)
                noise = torch.from_numpy(noise).to(DEVICE)
                nse = eval_with_added_noise(model, loader, noise)
                basin_results[scale].append(nse)
                pbar.set_postfix_str(
                    f"Basin progress: {step}/{(len(scales)-1)*n_repetitions+1}")
                step += 1

        overall_results[basin] = basin_results

    out_file = (Path(__file__).absolute().parent /
                f'results/{user_cfg["run_dir"].name}_model_robustness.p')
    if not out_file.parent.is_dir():
        out_file.parent.mkdir(parents=True)
    with out_file.open("wb") as fp:
        pickle.dump(overall_results, fp)
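
# A hedged sketch for inspecting the pickle written above: the basin's median
# NSE per noise scale, then the median over all basins. The helper is ours;
# the nested dict layout {basin: {scale: [nse, ...]}} matches eval_robustness:
def _summarize_robustness(result_file):
    with Path(result_file).open("rb") as fp:
        overall_results = pickle.load(fp)
    rows = []
    for basin, basin_results in overall_results.items():
        for scale, nses in basin_results.items():
            rows.append({"basin": basin, "scale": scale, "median_nse": np.median(nses)})
    # median over basins of each basin's median NSE, per noise scale
    return pd.DataFrame(rows).groupby("scale")["median_nse"].median()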
def evaluate(user_cfg: Dict):
    """Evaluate a trained model.

    Parameters
    ----------
    user_cfg : Dict
        Dictionary containing the user entered evaluation config
    """
    with open(user_cfg["run_dir"] / 'cfg.json', 'r') as fp:
        run_cfg = json.load(fp)

    if user_cfg["split_file"] is not None:
        with Path(user_cfg["split_file"]).open('rb') as fp:
            splits = pickle.load(fp)
        basins = splits[run_cfg["split"]]["test"]
    else:
        basins = get_basin_list(basin_list_file_evaluate)

    # get attribute means/stds
    db_path = str(user_cfg["run_dir"] / "attributes.db")
    attributes = load_attributes(db_path=db_path,
                                 basins=basins,
                                 drop_lat_lon=True,
                                 keep_features=user_cfg["camels_attr"])

    # get remaining scaler from pickle file
    scaler_file = user_cfg["run_dir"] / "data" / "train" / "scaler.p"
    with open(scaler_file, "rb") as fp:
        scaler = pickle.load(fp)
    scaler["camels_attr_mean"] = attributes.mean()
    scaler["camels_attr_std"] = attributes.std()

    # create model
    if run_cfg["concat_static"] and not run_cfg["embedding_hiddens"]:
        input_size_stat = 0
        input_size_dyn = (len(run_cfg["dynamic_inputs"]) + len(run_cfg["camels_attr"]) +
                          len(run_cfg["static_inputs"]))
        concat_static = True
    else:
        input_size_stat = len(run_cfg["camels_attr"]) + len(run_cfg["static_inputs"])
        input_size_dyn = len(run_cfg["dynamic_inputs"])
        concat_static = False
    model = Model(input_size_dyn=input_size_dyn,
                  input_size_stat=input_size_stat,
                  hidden_size=run_cfg["hidden_size"],
                  dropout=run_cfg["dropout"],
                  concat_static=run_cfg["concat_static"],
                  embedding_hiddens=run_cfg["embedding_hiddens"]).to(DEVICE)

    # load trained model
    weight_file = user_cfg["run_dir"] / 'model_epoch30.pt'
    model.load_state_dict(torch.load(weight_file, map_location=DEVICE))

    date_range = pd.date_range(start=user_cfg["val_start"], end=user_cfg["val_end"])
    results = {}
    cell_states = {}
    embeddings = {}
    nses = []

    file_name = Path(__file__).parent / 'data' / 'dynamic_features_nwm_v2.p'
    with file_name.open("rb") as fp:
        additional_features = pickle.load(fp)

    # ad hoc static climate indices:
    # requires the training period for this experiment and
    # overwrites the *_dyn type climate indices in 'additional_features'
    if not user_cfg['use_dynamic_climate']:
        if user_cfg['static_climate'].lower() == 'test':
            eval_clim_indexes = training_period_climate_indices(
                db_path=db_path,
                camels_root=user_cfg['camels_root'],
                basins=basins,
                start_date=user_cfg['val_start'],
                end_date=user_cfg['val_end'])
        elif user_cfg['static_climate'].lower() == 'train':
            eval_clim_indexes = training_period_climate_indices(
                db_path=db_path,
                camels_root=user_cfg['camels_root'],
                basins=basins,
                start_date=user_cfg['train_start'],
                end_date=user_cfg['train_end'])
        else:
            raise RuntimeError(
                f"Unknown static_climate variable: {user_cfg['static_climate']}")

        for basin in basins:
            for col in eval_clim_indexes[basin].columns:
                additional_features[basin][col] = np.tile(
                    eval_clim_indexes[basin][col].values,
                    [additional_features[basin].shape[0], 1])

    for basin in tqdm(basins):
        ds_test = CamelsTXTv2(
            camels_root=user_cfg["camels_root"],
            basin=basin,
            dates=[user_cfg["val_start"], user_cfg["val_end"]],
            is_train=False,
            dynamic_inputs=user_cfg["dynamic_inputs"],
            camels_attr=user_cfg["camels_attr"],
            static_inputs=user_cfg["static_inputs"],
            additional_features=additional_features[basin],
            scaler=scaler,
            seq_length=run_cfg["seq_length"],
            concat_static=concat_static,
            db_path=db_path)
        loader = DataLoader(ds_test, batch_size=2500, shuffle=False,
                            num_workers=user_cfg["num_workers"])

        preds, obs, cells, embeds = evaluate_basin(model, loader)

        # rescale predictions
        preds = preds * scaler["q_std"] + scaler["q_mean"]

        # store predictions; set discharges < 0 to zero
        preds[preds < 0] = 0
        nses.append(calc_nse(obs[obs >= 0], preds[obs >= 0]))

        df = pd.DataFrame(data={'qobs': obs.flatten(), 'qsim': preds.flatten()},
                          index=date_range)
        results[basin] = df

        # store cell states and embedding values
        cell_states[basin] = pd.DataFrame(data=cells, index=date_range)
        embeddings[basin] = pd.DataFrame(data=embeds, index=date_range)

    print(f"Mean NSE {np.mean(nses)}, median NSE {np.median(nses)}")
    _store_results(user_cfg, run_cfg, results, cell_states, embeddings)
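
# The stored embeddings are per-timestep DataFrames. A hedged sketch (our
# helper, not part of the original pipeline) for collapsing them to one vector
# per basin, e.g. as input to clustering or PCA of the basin embeddings:
def _mean_embeddings(embeddings):
    return pd.DataFrame({basin: df.mean(axis=0) for basin, df in embeddings.items()}).T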
def train(cfg):
    """Train model.

    Parameters
    ----------
    cfg : Dict
        Dictionary containing the run config
    """
    # fix random seeds
    random.seed(cfg["seed"])
    np.random.seed(cfg["seed"])
    torch.cuda.manual_seed(cfg["seed"])
    torch.manual_seed(cfg["seed"])

    if cfg["split_file"] is not None:
        with Path(cfg["split_file"]).open('rb') as fp:
            splits = pickle.load(fp)
        basins = splits[cfg["split"]]["train"]
    else:
        basins = get_basin_list(basin_list_file_train)
    # basins = basins[:30]

    # create folder structure for this run
    cfg = _setup_run(cfg)

    # prepare data for training
    cfg = _prepare_data(cfg=cfg, basins=basins)
    with open(cfg["scaler_file"], 'rb') as fp:
        scaler = pickle.load(fp)

    camels_attr = load_attributes(cfg["db_path"],
                                  basins,
                                  drop_lat_lon=True,
                                  keep_features=cfg["camels_attr"])
    scaler["camels_attr_mean"] = camels_attr.mean()
    scaler["camels_attr_std"] = camels_attr.std()

    # create model and optimizer
    if cfg["concat_static"] and not cfg["embedding_hiddens"]:
        input_size_stat = 0
        input_size_dyn = (len(cfg["dynamic_inputs"]) + len(cfg["camels_attr"]) +
                          len(cfg["static_inputs"]))
        concat_static = True
    else:
        input_size_stat = len(cfg["camels_attr"]) + len(cfg["static_inputs"])
        input_size_dyn = len(cfg["dynamic_inputs"])
        concat_static = False
    model = Model(input_size_dyn=input_size_dyn,
                  input_size_stat=input_size_stat,
                  hidden_size=cfg["hidden_size"],
                  initial_forget_bias=cfg["initial_forget_gate_bias"],
                  embedding_hiddens=cfg["embedding_hiddens"],
                  dropout=cfg["dropout"],
                  concat_static=cfg["concat_static"]).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=cfg["learning_rate"])

    # prepare PyTorch DataLoader
    ds = CamelsH5v2(h5_file=cfg["train_file"],
                    basins=basins,
                    db_path=cfg["db_path"],
                    concat_static=concat_static,
                    cache=cfg["cache_data"],
                    camels_attr=cfg["camels_attr"],
                    scaler=scaler)
    loader = DataLoader(ds,
                        batch_size=cfg["batch_size"],
                        shuffle=True,
                        num_workers=cfg["num_workers"])

    # define loss function
    if cfg["use_mse"]:
        loss_func = nn.MSELoss()
    else:
        loss_func = NSELoss()

    # reduce the learning rate after epochs 10 and 20
    learning_rates = {11: 5e-4, 21: 1e-4}

    for epoch in range(1, cfg["epochs"] + 1):
        # set new learning rate
        if epoch in learning_rates.keys():
            for param_group in optimizer.param_groups:
                param_group["lr"] = learning_rates[epoch]

        train_epoch(model, optimizer, loss_func, loader, cfg, epoch, cfg["use_mse"])

        model_path = cfg["run_dir"] / f"model_epoch{epoch}.pt"
        torch.save(model.state_dict(), str(model_path))
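
# Hedged sketch of the config train() expects. The keys mirror the accesses
# above; every value is an illustrative placeholder, not a setting from the
# published runs. _setup_run/_prepare_data later add run_dir, db_path,
# scaler_file and train_file to this dict.
example_cfg = {
    "seed": 0,
    "split_file": None,
    "camels_attr": ["elev_mean", "slope_mean"],      # assumed subset of CAMELS attributes
    "dynamic_inputs": ["PRCP(mm/day)", "Tmax(C)"],   # assumed subset of forcing columns
    "static_inputs": [],
    "concat_static": False,
    "embedding_hiddens": [],
    "hidden_size": 256,
    "initial_forget_gate_bias": 5,
    "dropout": 0.4,
    "learning_rate": 1e-3,
    "batch_size": 256,
    "num_workers": 4,
    "cache_data": False,
    "use_mse": False,
    "epochs": 30,
}
# train(example_cfg)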
# Convert to PosixPaths
CAMELS_DIR = Path(CAMELS_DIR)
BASE_RUN_DIR = Path(BASE_RUN_DIR)

# load run config
with open(BASE_RUN_DIR / "cfg.json", "r") as fp:
    cfg = json.load(fp)
cfg

# get list of modeled basins
basins = get_basin_list()
nbasins = len(basins)

# load attributes of the training period (normalization means/stds are derived from these)
attributes = load_attributes(db_path=str(BASE_RUN_DIR / "attributes.db"),
                             basins=basins,
                             keep_features=cfg["camels_attr"],
                             drop_lat_lon=True)

# Initialize new model
print('initializing model ...')
model = Model(input_size_dyn=51,
              input_size_stat=0,
              hidden_size=cfg["hidden_size"],
              dropout=cfg["dropout"]).to(DEVICE)
print('finished initializing model')

# load pre-trained weights
print('loading model ...')
weight_file = BASE_RUN_DIR / "model_epoch30.pt"
model.load_state_dict(torch.load(weight_file, map_location=DEVICE))
print('finished loading model')
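
# Hedged next step (assumed, not shown in the excerpt): switch the network to
# inference mode so dropout is disabled before any evaluation passes.
model.eval()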
def predict_basin(
    basin: str,
    run_dir: Union[str, Path],
    camels_dir: Union[str, Path],
    period: str = "train",
    epoch: int = 30,
):
    if isinstance(run_dir, str):
        run_dir = Path(run_dir)
    elif not isinstance(run_dir, Path):
        raise TypeError(f"run_dir must be str or Path, not {type(run_dir)}")
    if isinstance(camels_dir, str):
        camels_dir = Path(camels_dir)
    elif not isinstance(camels_dir, Path):
        raise TypeError(f"camels_dir must be str or Path, not {type(camels_dir)}")

    with open(run_dir / "cfg.json", "r") as fp:
        run_cfg = json.load(fp)

    if period not in ["train", "val"]:
        raise ValueError("period must be either train or val")

    basins = get_basin_list()
    db_path = str(run_dir / "attributes.db")
    attributes = load_attributes(db_path=db_path, basins=basins, drop_lat_lon=True)
    means = attributes.mean()
    stds = attributes.std()

    # derive input sizes from the run configuration
    attrs_count = len(attributes.columns)
    timeseries_count = 6
    input_size_stat = timeseries_count if run_cfg["no_static"] else attrs_count
    input_size_dyn = (timeseries_count
                      if (run_cfg["no_static"] or not run_cfg["concat_static"])
                      else timeseries_count + attrs_count)

    model = Model(
        input_size_dyn=input_size_dyn,
        input_size_stat=input_size_stat,
        hidden_size=run_cfg["hidden_size"],
        dropout=run_cfg["dropout"],
        concat_static=run_cfg["concat_static"],
        no_static=run_cfg["no_static"],
    ).to(DEVICE)

    # load trained model
    weight_file = run_dir / f"model_epoch{epoch}.pt"
    model.load_state_dict(torch.load(weight_file, map_location=DEVICE))

    ds_test = CamelsTXT(
        camels_root=camels_dir,
        basin=basin,
        dates=[GLOBAL_SETTINGS[f"{period}_start"], GLOBAL_SETTINGS[f"{period}_end"]],
        is_train=False,
        seq_length=run_cfg["seq_length"],
        with_attributes=True,
        attribute_means=means,
        attribute_stds=stds,
        concat_static=run_cfg["concat_static"],
        db_path=db_path,
    )
    # the first (seq_length - 1) days serve as warm-up for the first prediction
    date_range = ds_test.dates_index[run_cfg["seq_length"] - 1:]
    loader = DataLoader(ds_test, batch_size=1024, shuffle=False, num_workers=4)

    preds, obs = evaluate_basin(model, loader)

    results = pd.DataFrame(data={"qobs": obs.flatten(), "qsim": preds.flatten()},
                           index=date_range)

    # plt.plot(date_range, results["qobs"], label="Obs")
    # plt.plot(date_range, results["qsim"], label="Preds")
    # plt.legend()
    # plt.savefig(f"{run_dir}/pred_basin_{basin}.pdf")
    # plt.close()

    return results, date_range
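
# Hedged usage sketch, following the file's own commented-out style. The basin
# id and run directory are assumptions; calc_nse(obs, sim) is the repo's metric:
# results, date_range = predict_basin("01013500", "runs/run_283956",
#                                     "data/CAMELS", period="val")
# print(calc_nse(results["qobs"].values, results["qsim"].values))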