Example #1
from pathlib import Path
from typing import Union

import matplotlib.pyplot as plt
import numpy as np
import scipy.cluster as spc

# load_attributes is provided by the surrounding project code.


def feature_reduction(basin_list: Union[str, Path], run_dir: Union[str, Path],
                      figure_path: Union[str, Path]):
    if isinstance(basin_list, str):
        basin_list = Path(basin_list)
    elif not isinstance(basin_list, Path):
        raise TypeError(
            f"basin_list must be str or Path, not {type(basin_list)}")
    if isinstance(run_dir, str):
        run_dir = Path(run_dir)
    elif not isinstance(run_dir, Path):
        raise TypeError(f"run_dir must be str or Path, not {type(run_dir)}")
    if isinstance(figure_path, str):
        figure_path = Path(figure_path)
    elif not isinstance(figure_path, Path):
        raise TypeError(f"figure_path must be str or Path, not {type(figure)}")
    basins = np.loadtxt(basin_list, dtype=str)
    print(basins)
    features = load_attributes(db_path=run_dir / "attributes.db",
                               basins=basins)
    print(features)
    # hierarchical clustering of the features via Ward linkage on the correlation matrix
    corr = features.corr()
    corr_linkage = spc.hierarchy.ward(corr)
    print(corr_linkage)
    fig, ax = plt.subplots(1, 1)
    dendro = spc.hierarchy.dendrogram(corr_linkage,
                                      labels=features.columns,
                                      ax=ax)
    #plt.show()
    fig.tight_layout()
    fig.savefig(figure_path)
    print(list(features.columns))
    # columns that are missing-value indicator features
    matching = [s for s in features.columns if "_missing" in s]
    print(matching)
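A minimal usage sketch; the paths below are hypothetical, and feature_reduction accepts either str or Path for each argument:

feature_reduction(basin_list="data/basin_list.txt",
                  run_dir="runs/run_0013_1307_271828",
                  figure_path="figures/feature_dendrogram.png")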
Example #2
def evaluate(user_cfg: Dict):
    """Train model for a single epoch.

    Parameters
    ----------
    user_cfg : Dict
        Dictionary containing the user entered evaluation config
        
    """
    with open(user_cfg["run_dir"] / 'cfg.json', 'r') as fp:
        run_cfg = json.load(fp)

    basins = run_cfg["basins"]

    # get attribute means/stds
    db_path = str(user_cfg["run_dir"] / "attributes.db")
    attributes = load_attributes(db_path=db_path, 
                                 basins=basins,
                                 drop_lat_lon=True)
    means = attributes.mean()
    stds = attributes.std()

    # create model: 27 static catchment attributes, 5 dynamic forcings
    # (5 + 27 = 32 dynamic inputs when the static features are concatenated)
    input_size_stat = 0 if run_cfg["no_static"] else 27
    input_size_dyn = 5 if (run_cfg["no_static"] or not run_cfg["concat_static"]) else 32
    model = Model(input_size_dyn=input_size_dyn,
                  input_size_stat=input_size_stat,
                  hidden_size=run_cfg["hidden_size"],
                  dropout=run_cfg["dropout"],
                  concat_static=run_cfg["concat_static"],
                  no_static=run_cfg["no_static"]).to(DEVICE)

    # load trained model
    weight_file = user_cfg["run_dir"] / 'model_epoch30.pt'
    model.load_state_dict(torch.load(weight_file, map_location=DEVICE))

    date_range = pd.date_range(start=GLOBAL_SETTINGS["val_start"], end=GLOBAL_SETTINGS["val_end"])
    results = {}
    for basin in tqdm(basins):
        ds_test = CamelsTXT(camels_root=user_cfg["camels_root"],
                            basin=basin,
                            dates=[GLOBAL_SETTINGS["val_start"], GLOBAL_SETTINGS["val_end"]],
                            is_train=False,
                            seq_length=run_cfg["seq_length"],
                            with_attributes=True,
                            attribute_means=means,
                            attribute_stds=stds,
                            concat_static=run_cfg["concat_static"],
                            db_path=db_path)
        loader = DataLoader(ds_test, batch_size=1024, shuffle=False, num_workers=4)

        preds, obs = evaluate_basin(model, loader)

        df = pd.DataFrame(data={'qobs': obs.flatten(), 'qsim': preds.flatten()}, index=date_range)

        results[basin] = df

    _store_results(user_cfg, run_cfg, results)
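A minimal call sketch for this variant; evaluate only reads the run_dir and camels_root keys of user_cfg, and both locations below are hypothetical (run_dir must contain cfg.json, attributes.db and model_epoch30.pt):

from pathlib import Path

user_cfg = {"run_dir": Path("runs/run_0013_1307_271828"),
            "camels_root": Path("data/CAMELS_US")}
evaluate(user_cfg)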
Example #3
def evaluate(user_cfg: Dict):
    """Evaluate trained model on the validation period.

    Parameters
    ----------
    user_cfg : Dict
        Dictionary containing the user entered evaluation config
    """
    with open(user_cfg["run_dir"] / 'cfg.json', 'r') as fp:
        run_cfg = json.load(fp)

    basins = run_cfg["basins"]

    # get attribute means/stds
    db_path = str(user_cfg["run_dir"] / "attributes.db")
    attributes = load_attributes(db_path=db_path,
                                 basins=basins,
                                 drop_lat_lon=True)
    means = attributes.mean()
    stds = attributes.std()

    # load trained model from its pickle file
    model_file = user_cfg["run_dir"] / 'model.pkl'
    with open(model_file, 'rb') as fp:
        model = pickle.load(fp)

    date_range = pd.date_range(start=GLOBAL_SETTINGS["val_start"],
                               end=GLOBAL_SETTINGS["val_end"])
    results = {}
    for basin in tqdm(basins):
        ds_test = CamelsTXT(
            camels_root=user_cfg["camels_root"],
            basin=basin,
            dates=[GLOBAL_SETTINGS["val_start"], GLOBAL_SETTINGS["val_end"]],
            is_train=False,
            seq_length=run_cfg["seq_length"],
            with_attributes=True,
            attribute_means=means,
            attribute_stds=stds,
            concat_static=False,
            db_path=db_path)

        preds, obs = evaluate_basin(model, ds_test, run_cfg["no_static"])

        df = pd.DataFrame(data={'qobs': obs.flatten(), 'qsim': preds.flatten()},
                          index=date_range)

        results[basin] = df

    _store_results(user_cfg, run_cfg, results)
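Unlike Example #2, this variant restores the model from a pickle file instead of loading PyTorch weights, and passes the dataset together with the no_static flag straight to evaluate_basin rather than wrapping it in a DataLoader; note it also hard-codes concat_static=False when building CamelsTXT.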
Example #4
def static_feature_relation(run_dir: Union[str, Path],
                            camels_dir: Union[str, Path]) -> pd.DataFrame:
    if isinstance(run_dir, str):
        run_dir = Path(run_dir)
    elif not isinstance(run_dir, Path):
        raise TypeError(f"run_dir must be str or Path, not {type(run_dir)}")
    if isinstance(camels_dir, str):
        camels_dir = Path(camels_dir)
    elif not isinstance(camels_dir, Path):
        raise TypeError(
            f"camels_dir must be str or Path, not {type(camels_dir)}")
    # run directory names are assumed to end in _<seed>
    seed = str(run_dir).split("_")[-1]
    results = eval_lstm_models([run_dir], calc_nse)[seed]
    # for basin in results.keys():
    # print(f"Basin {basin}: {results[basin]}")
    features = load_attributes(run_dir / "attributes.db",
                               basins=list(results.keys())).sort_index()
    df_results = pd.DataFrame.from_dict(results,
                                        orient="index",
                                        columns=["NSE"])
    df_results.index.name = "gauge_id"
    df_results = df_results.sort_index()
    df = pd.concat([features, df_results], axis=1)
    return df
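A usage sketch with hypothetical directories; note that the seed key is parsed from whatever follows the last underscore in the run directory name:

df = static_feature_relation(run_dir="runs/run_0013_1307_271828",
                             camels_dir="data/CAMELS_US")
print(df.sort_values("NSE").head())  # basins with the lowest NSE first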
Example #5
def eval_robustness(user_cfg: Dict):
    """Evaluate model robustness of EA-LSTM

    In this experiment, gaussian noise with increasing scale is added to the static features to
    evaluate the model robustness against pertubations of the static catchment characteristics.
    For each scale, 50 noise vectors are drawn.
    
    Parameters
    ----------
    user_cfg : Dict
        Dictionary containing the user entered evaluation config
    
    Raises
    ------
    NotImplementedError
        If the run_dir specified points not to a EA-LSTM model folder.
    """
    random.seed(user_cfg["seed"])
    np.random.seed(user_cfg["seed"])

    # fixed settings for this analysis
    n_repetitions = 50
    scales = [0.1 * i for i in range(11)]  # noise std from 0.0 to 1.0 in steps of 0.1

    with open(user_cfg["run_dir"] / 'cfg.json', 'r') as fp:
        run_cfg = json.load(fp)

    if run_cfg["concat_static"] or run_cfg["no_static"]:
        raise NotImplementedError("This function is only implemented for EA-LSTM models")

    basins = run_cfg["basins"]

    # get attribute means/stds
    db_path = str(user_cfg["run_dir"] / "attributes.db")
    attributes = load_attributes(db_path=db_path, 
                                 basins=basins,
                                 drop_lat_lon=True)
    means = attributes.mean()
    stds = attributes.std()

    # initialize EA-LSTM: 5 dynamic forcings, 27 static catchment attributes
    model = Model(input_size_dyn=5,
                  input_size_stat=27,
                  hidden_size=run_cfg["hidden_size"],
                  dropout=run_cfg["dropout"]).to(DEVICE)
    weight_file = user_cfg["run_dir"] / "model_epoch30.pt"
    model.load_state_dict(torch.load(weight_file, map_location=DEVICE))

    overall_results = {}
    # process bar handle
    pbar = tqdm(basins, file=sys.stdout)
    for basin in pbar:
        ds_test = CamelsTXT(camels_root=user_cfg["camels_root"],
                            basin=basin,
                            dates=[GLOBAL_SETTINGS["val_start"], GLOBAL_SETTINGS["val_end"]],
                            is_train=False,
                            with_attributes=True,
                            attribute_means=means,
                            attribute_stds=stds,
                            db_path=db_path)
        loader = DataLoader(ds_test, batch_size=len(ds_test), shuffle=False, num_workers=0)
        basin_results = defaultdict(list)
        step = 1
        for scale in scales:
            for _ in range(1 if scale == 0.0 else n_repetitions):
                noise = np.random.normal(loc=0, scale=scale, size=27).astype(np.float32)
                noise = torch.from_numpy(noise).to(DEVICE)
                nse = eval_with_added_noise(model, loader, noise)
                basin_results[scale].append(nse)
                pbar.set_postfix_str(f"Basin progress: {step}/{(len(scales)-1)*n_repetitions+1}")
                step += 1

        overall_results[basin] = basin_results
    out_file = (Path(__file__).absolute().parent /
                f'results/{user_cfg["run_dir"].name}_model_robustness.p')
    if not out_file.parent.is_dir():
        out_file.parent.mkdir(parents=True)
    with out_file.open("wb") as fp:
        pickle.dump(overall_results, fp)
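A call sketch with placeholder values; run_dir must point to an EA-LSTM run, i.e. one whose cfg.json has both concat_static and no_static set to False, otherwise the function raises NotImplementedError:

from pathlib import Path

user_cfg = {"seed": 42,
            "run_dir": Path("runs/run_0013_1307_271828"),
            "camels_root": Path("data/CAMELS_US")}
eval_robustness(user_cfg)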
Example #6
def evaluate(user_cfg: Dict):
    """Train model for a single epoch.

    Parameters
    ----------
    user_cfg : Dict
        Dictionary containing the user entered evaluation config
        
    """
    with open(user_cfg["run_dir"] / 'cfg.json', 'r') as fp:
        run_cfg = json.load(fp)

    if user_cfg["split_file"] is not None:
        with Path(user_cfg["split_file"]).open('rb') as fp:
            splits = pickle.load(fp)
        basins = splits[run_cfg["split"]]["test"]
    else:
        basins = get_basin_list(basin_list_file_evaluate)

    # get attribute means/stds
    db_path = str(user_cfg["run_dir"] / "attributes.db")
    attributes = load_attributes(db_path=db_path,
                                 basins=basins,
                                 drop_lat_lon=True,
                                 keep_features=user_cfg["camels_attr"])

    # get remaining scaler from pickle file
    scaler_file = user_cfg["run_dir"] / "data" / "train" / "scaler.p"
    with open(scaler_file, "rb") as fp:
        scaler = pickle.load(fp)
    scaler["camels_attr_mean"] = attributes.mean()
    scaler["camels_attr_std"] = attributes.std()

    # create model
    if run_cfg["concat_static"] and not run_cfg["embedding_hiddens"]:
        input_size_stat = 0
        input_size_dyn = (len(run_cfg["dynamic_inputs"]) +
                          len(run_cfg["camels_attr"]) +
                          len(run_cfg["static_inputs"]))
        concat_static = True
    else:
        input_size_stat = len(run_cfg["camels_attr"]) + len(
            run_cfg["static_inputs"])
        input_size_dyn = len(run_cfg["dynamic_inputs"])
        concat_static = False
    model = Model(input_size_dyn=input_size_dyn,
                  input_size_stat=input_size_stat,
                  hidden_size=run_cfg["hidden_size"],
                  dropout=run_cfg["dropout"],
                  concat_static=run_cfg["concat_static"],
                  embedding_hiddens=run_cfg["embedding_hiddens"]).to(DEVICE)

    # load trained model
    weight_file = user_cfg["run_dir"] / 'model_epoch30.pt'
    model.load_state_dict(torch.load(weight_file, map_location=DEVICE))

    date_range = pd.date_range(start=user_cfg["val_start"],
                               end=user_cfg["val_end"])
    results = {}
    cell_states = {}
    embeddings = {}
    nses = []

    file_name = Path(__file__).parent / 'data' / 'dynamic_features_nwm_v2.p'
    with file_name.open("rb") as fp:
        additional_features = pickle.load(fp)

    # ad hoc static climate indices
    # requires the training period for this experiment
    # overwrites the *_dyn type climate indices in 'additional_features'
    if not user_cfg['use_dynamic_climate']:
        if user_cfg['static_climate'].lower() == 'test':
            eval_clim_indexes = training_period_climate_indices(
                db_path=db_path,
                camels_root=user_cfg['camels_root'],
                basins=basins,
                start_date=user_cfg['val_start'],
                end_date=user_cfg['val_end'])
        elif user_cfg['static_climate'].lower() == 'train':
            eval_clim_indexes = training_period_climate_indices(
                db_path=db_path,
                camels_root=user_cfg['camels_root'],
                basins=basins,
                start_date=user_cfg['train_start'],
                end_date=user_cfg['train_end'])
        else:
            raise RuntimeError(
                f"Unknown static_climate variable: {user_cfg['static_climate']}")

        for basin in basins:
            for col in eval_clim_indexes[basin].columns:
                additional_features[basin][col] = np.tile(
                    eval_clim_indexes[basin][col].values,
                    [additional_features[basin].shape[0], 1])

    for basin in tqdm(basins):
        ds_test = CamelsTXTv2(
            camels_root=user_cfg["camels_root"],
            basin=basin,
            dates=[user_cfg["val_start"], user_cfg["val_end"]],
            is_train=False,
            dynamic_inputs=user_cfg["dynamic_inputs"],
            camels_attr=user_cfg["camels_attr"],
            static_inputs=user_cfg["static_inputs"],
            additional_features=additional_features[basin],
            scaler=scaler,
            seq_length=run_cfg["seq_length"],
            concat_static=concat_static,
            db_path=db_path)
        loader = DataLoader(ds_test,
                            batch_size=2500,
                            shuffle=False,
                            num_workers=user_cfg["num_workers"])

        preds, obs, cells, embeds = evaluate_basin(model, loader)

        # rescale predictions
        preds = preds * scaler["q_std"] + scaler["q_mean"]

        # store predictions
        # set discharges < 0 to zero
        preds[preds < 0] = 0
        nses.append(calc_nse(obs[obs >= 0], preds[obs >= 0]))
        df = pd.DataFrame(data={'qobs': obs.flatten(), 'qsim': preds.flatten()},
                          index=date_range)
        results[basin] = df

        # store cell states and embedding values
        cell_states[basin] = pd.DataFrame(data=cells, index=date_range)
        embeddings[basin] = pd.DataFrame(data=embeds, index=date_range)

    print(f"Mean NSE {np.mean(nses)}, median NSE {np.median(nses)}")

    _store_results(user_cfg, run_cfg, results, cell_states, embeddings)
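A sketch of the evaluation config this variant expects; every key below is read somewhere in the function, but all values are placeholders rather than the experiment's actual settings:

from pathlib import Path

import pandas as pd

user_cfg = {
    "run_dir": Path("runs/run_0013_1307_271828"),
    "camels_root": Path("data/CAMELS_US"),
    "split_file": None,              # or a pickle holding train/test splits
    "camels_attr": ["elev_mean", "slope_mean"],                # hypothetical subset
    "dynamic_inputs": ["PRCP", "SRAD", "Tmax", "Tmin", "Vp"],  # hypothetical
    "static_inputs": [],
    "val_start": pd.Timestamp("1995-10-01"),
    "val_end": pd.Timestamp("2000-09-30"),
    "train_start": pd.Timestamp("1980-10-01"),
    "train_end": pd.Timestamp("1990-09-30"),
    "use_dynamic_climate": True,     # if False, static_climate must be 'train' or 'test'
    "num_workers": 4,
}
evaluate(user_cfg)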
Example #7
def train(cfg):
    """Train model.

    Parameters
    ----------
    cfg : Dict
        Dictionary containing the run config
    """
    # fix random seeds
    random.seed(cfg["seed"])
    np.random.seed(cfg["seed"])
    torch.cuda.manual_seed(cfg["seed"])
    torch.manual_seed(cfg["seed"])

    if cfg["split_file"] is not None:
        with Path(cfg["split_file"]).open('rb') as fp:
            splits = pickle.load(fp)
        basins = splits[cfg["split"]]["train"]
    else:
        basins = get_basin_list(basin_list_file_train)
        #basins = basins[:30]

    # create folder structure for this run
    cfg = _setup_run(cfg)

    # prepare data for training
    cfg = _prepare_data(cfg=cfg, basins=basins)

    with open(cfg["scaler_file"], 'rb') as fp:
        scaler = pickle.load(fp)

    camels_attr = load_attributes(cfg["db_path"],
                                  basins,
                                  drop_lat_lon=True,
                                  keep_features=cfg["camels_attr"])
    scaler["camels_attr_mean"] = camels_attr.mean()
    scaler["camels_attr_std"] = camels_attr.std()

    # create model and optimizer
    if cfg["concat_static"] and not cfg["embedding_hiddens"]:
        input_size_stat = 0
        input_size_dyn = (len(cfg["dynamic_inputs"]) +
                          len(cfg["camels_attr"]) + len(cfg["static_inputs"]))
        concat_static = True
    else:
        input_size_stat = len(cfg["camels_attr"]) + len(cfg["static_inputs"])
        input_size_dyn = len(cfg["dynamic_inputs"])
        concat_static = False
    model = Model(input_size_dyn=input_size_dyn,
                  input_size_stat=input_size_stat,
                  hidden_size=cfg["hidden_size"],
                  initial_forget_bias=cfg["initial_forget_gate_bias"],
                  embedding_hiddens=cfg["embedding_hiddens"],
                  dropout=cfg["dropout"],
                  concat_static=cfg["concat_static"]).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=cfg["learning_rate"])

    # prepare PyTorch DataLoader
    ds = CamelsH5v2(h5_file=cfg["train_file"],
                    basins=basins,
                    db_path=cfg["db_path"],
                    concat_static=concat_static,
                    cache=cfg["cache_data"],
                    camels_attr=cfg["camels_attr"],
                    scaler=scaler)
    loader = DataLoader(ds,
                        batch_size=cfg["batch_size"],
                        shuffle=True,
                        num_workers=cfg["num_workers"])

    # define loss function
    if cfg["use_mse"]:
        loss_func = nn.MSELoss()
    else:
        loss_func = NSELoss()

    # reduce the learning rate after epochs 10 and 20
    learning_rates = {11: 5e-4, 21: 1e-4}

    for epoch in range(1, cfg["epochs"] + 1):
        # set new learning rate
        if epoch in learning_rates:
            for param_group in optimizer.param_groups:
                param_group["lr"] = learning_rates[epoch]

        train_epoch(model, optimizer, loss_func, loader, cfg, epoch,
                    cfg["use_mse"])

        model_path = cfg["run_dir"] / f"model_epoch{epoch}.pt"
        torch.save(model.state_dict(), str(model_path))
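With the schedule above, epochs 1-10 train at cfg["learning_rate"], epochs 11-20 at 5e-4, and epoch 21 onward at 1e-4; a model checkpoint is written to run_dir after every epoch.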
Example #8
# Convert to PosixPaths
CAMELS_DIR = Path(CAMELS_DIR)
BASE_RUN_DIR = Path(BASE_RUN_DIR)

# load run config
with open(BASE_RUN_DIR / "cfg.json", "r") as fp:
    cfg = json.load(fp)
print(cfg)

# get list of modeled basins
basins = get_basin_list()
nbasins = len(basins)

# load catchment attributes from the training period
attributes = load_attributes(db_path=str(BASE_RUN_DIR / "attributes.db"),
                             basins=basins,
                             keep_features=cfg["camels_attr"],
                             drop_lat_lon=True)

# Initialize new model
print('initializing model ...')
model = Model(input_size_dyn=51,
              input_size_stat=0,
              hidden_size=cfg["hidden_size"],
              dropout=cfg["dropout"]).to(DEVICE)
print('finished initializing model')

# load pre-trained weights
print('loading model ...')
weight_file = BASE_RUN_DIR / "model_epoch30.pt"
model.load_state_dict(torch.load(weight_file, map_location=DEVICE))
print('finished loading model')
def predict_basin(
    basin: str,
    run_dir: Union[str, Path],
    camels_dir: Union[str, Path],
    period: str = "train",
    epoch: int = 30,
):
    if isinstance(run_dir, str):
        run_dir = Path(run_dir)
    elif not isinstance(run_dir, Path):
        raise TypeError(f"run_dir must be str or Path, not {type(run_dir)}")
    if isinstance(camels_dir, str):
        camels_dir = Path(camels_dir)
    elif not isinstance(camels_dir, Path):
        raise TypeError(
            f"camels_dir must be str or Path, not {type(camels_dir)}")

    with open(run_dir / "cfg.json", "r") as fp:
        run_cfg = json.load(fp)

    if period not in ["train", "val"]:
        raise ValueError("period must be either train or val")
    basins = get_basin_list()
    db_path = str(run_dir / "attributes.db")
    attributes = load_attributes(db_path=db_path,
                                 basins=basins,
                                 drop_lat_lon=True)
    means = attributes.mean()
    stds = attributes.std()
    attrs_count = len(attributes.columns)
    timeseries_count = 6  # number of dynamic (time series) input features
    input_size_stat = timeseries_count if run_cfg["no_static"] else attrs_count
    input_size_dyn = (timeseries_count if
                      (run_cfg["no_static"] or not run_cfg["concat_static"])
                      else timeseries_count + attrs_count)
    model = Model(
        input_size_dyn=input_size_dyn,
        input_size_stat=input_size_stat,
        hidden_size=run_cfg["hidden_size"],
        dropout=run_cfg["dropout"],
        concat_static=run_cfg["concat_static"],
        no_static=run_cfg["no_static"],
    ).to(DEVICE)

    # load trained model
    weight_file = run_dir / f"model_epoch{epoch}.pt"
    model.load_state_dict(torch.load(weight_file, map_location=DEVICE))

    ds_test = CamelsTXT(
        camels_root=camels_dir,
        basin=basin,
        dates=[
            GLOBAL_SETTINGS[f"{period}_start"],
            GLOBAL_SETTINGS[f"{period}_end"]
        ],
        is_train=False,
        seq_length=run_cfg["seq_length"],
        with_attributes=True,
        attribute_means=means,
        attribute_stds=stds,
        concat_static=run_cfg["concat_static"],
        db_path=db_path,
    )
    date_range = ds_test.dates_index[run_cfg["seq_length"] - 1:]
    loader = DataLoader(ds_test, batch_size=1024, shuffle=False, num_workers=4)
    preds, obs = evaluate_basin(model, loader)
    df = pd.DataFrame(data={"qobs": obs.flatten(), "qsim": preds.flatten()},
                      index=date_range)

    results = df
    # plt.plot(date_range, results["qobs"], label="Obs")
    # plt.plot(date_range, results["qsim"], label="Preds")
    # plt.legend()
    # plt.savefig(f"{run_dir}/pred_basin_{basin}.pdf")
    # plt.close()
    return results, date_range
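A usage sketch with a hypothetical gauge id and directories; period selects between the train and val date ranges in GLOBAL_SETTINGS:

results, date_range = predict_basin(basin="01013500",
                                    run_dir="runs/run_0013_1307_271828",
                                    camels_dir="data/CAMELS_US",
                                    period="val",
                                    epoch=30)
print(results.head())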