Example #1
 def test_read_sites_id_see_dor(self):
     """Check saved site-id lists and log the degree of regulation (DOR).

     For every ecoregion experiment / sub-directory / json-file combination,
     verify that the serialized "sites_id" list is strictly ascending, compute
     each basin's DOR (normal storage / mean annual runoff volume), log it,
     and finally log the pairwise intersections of the three site lists.
     """
     exp_lst = ["exp18", "exp19", "exp20", "exp21", "exp22", "exp23"]
     sub_lst = ["0", "1"]
     diff_lst = [
         "dictTimeSpace.json", "test_dictTimeSpace.json",
         "test_dictTimeSpace_2.json"
     ]
     for exp_str in exp_lst:
         for sub_str in sub_lst:
             comp_sites = []
             for item in diff_lst:
                 gage_id_file = os.path.join(
                     self.config_data.config_file["ROOT_DIR"], "temp",
                     "gages", "ecoregion", exp_str, sub_str, item)
                 usgs_id = unserialize_json(gage_id_file)["sites_id"]
                 # the serialized id list must be strictly ascending
                 assert all(x < y for x, y in zip(usgs_id, usgs_id[1:]))
                 comp_sites.append(usgs_id)
                 # RUNAVE7100: mean annual runoff, mm/year on a 1-km grid;
                 # STOR_NOR_2009: normal storage, megaliters per sq km
                 # (1 megaliter = 1,000,000 liters = 1,000 cubic meters)
                 attr_lst = ["RUNAVE7100", "STOR_NOR_2009"]
                 source_data = GagesSource.choose_some_basins(
                     self.config_data,
                     self.config_data.model_dict["data"]["tRangeTrain"],
                     screen_basin_area_huc4=False,
                     sites_id=usgs_id)
                 data_attr, var_dict, f_dict = source_data.read_attr(
                     usgs_id, attr_lst)
                 # mm -> m is 1e-3; one 1-km grid cell is 1e6 m^2,
                 # so the product is a volume in m^3 per year
                 run_avg = data_attr[:, 0] * (10**(-3)) * (10**6
                                                           )  # m^3 per year
                 nor_storage = data_attr[:, 1] * 1000  # m^3
                 dors = nor_storage / run_avg
                 results = [round(i, 3) for i in dors]
                 hydro_logger.info(
                     exp_str + "-" + sub_str + "-" + item + " DOR: %s",
                     results)
             hydro_logger.info(
                 "the intersection of each pair of sites: %s, %s, %s",
                 np.intersect1d(comp_sites[0], comp_sites[1]),
                 np.intersect1d(comp_sites[0], comp_sites[2]),
                 np.intersect1d(comp_sites[1], comp_sites[2]))
Example #2
 def test_show_multi_exps_results(self):
     """Evaluate every decade-pair experiment and log its median NSE.

     The four decades yield 12 ordered (train, test) period pairs; each pair
     is matched positionally with one experiment directory, its configuration
     is rebuilt, the ensemble result is loaded, and the median NSE is logged.
     """
     periods = [["1980-01-01", "1990-01-01"], ["1990-01-01", "2000-01-01"],
                ["2000-01-01", "2010-01-01"], ["2010-01-01", "2020-01-01"]]
     # every ordered pair of distinct decades: 4 * 3 = 12 combinations
     train_test_period_pairs = list(itertools.permutations(periods, 2))
     sub_lst = [
         "basic/exp6", "basic/exp46", "basic/exp47", "basic/exp7",
         "basic/exp37", "basic/exp1", "basic/exp48", "basic/exp8",
         "basic/exp5", "basic/exp49", "basic/exp50", "basic/exp9"
     ]
     exp_lst = [["basic_exp6"], ["basic_exp46"], ["basic_exp47"],
                ["basic_exp7"], ["basic_exp37"], ["basic_exp1"],
                ["basic_exp48"], ["basic_exp8"], ["basic_exp5"],
                ["basic_exp49"], ["basic_exp50"], ["basic_exp9"]]
     for sub_tmp, exp_tmp, period_pair in zip(sub_lst, exp_lst,
                                              train_test_period_pairs):
         config_file = copy.deepcopy(cfg)
         args = cmd(
             sub=sub_tmp,
             train_period=period_pair[0],
             train_mode=0,
             test_period=period_pair[1],
             quick_data=0,
             cache_state=1,
             flow_screen={
                 'missing_data_ratio': 1,
                 'zero_value_ratio': 1
             },
             te=300,
             gage_id_file=
             "/mnt/data/owen411/code/hydro-anthropogenic-lstm/example/output/gages/basic/exp37/3557basins_ID_NSE_DOR.csv"
         )
         update_cfg(config_file, args)
         config_data = GagesConfig(config_file)
         test_epoch = config_data.config_file.TEST_EPOCH
         inds_df, pred_mean, obs_mean = load_ensemble_result(
             config_file, exp_tmp, test_epoch, return_value=True)
         hydro_logger.info("the median NSE of %s is %s", sub_tmp,
                           inds_df["NSE"].median())
Example #3
    gages_model_test = GagesModel.update_data_model(
        all_config_Data,
        data_model_test,
        data_attr_update=True,
        train_stat_dict=gages_model_train.stat_dict,
        screen_basin_area_huc4=False)
    save_datamodel(gages_model_test,
                   data_source_file_name='test_data_source.txt',
                   stat_file_name='test_Statistics.json',
                   flow_file_name='test_flow',
                   forcing_file_name='test_forcing',
                   attr_file_name='test_attr',
                   f_dict_file_name='test_dictFactorize.json',
                   var_dict_file_name='test_dictAttribute.json',
                   t_s_dict_file_name='test_dictTimeSpace.json')
    hydro_logger.info("read and save gages conus data model")

    camels531_gageid_file = os.path.join(config_data.data_path["DB"],
                                         "camels531", "camels531.txt")
    gauge_df = pd.read_csv(camels531_gageid_file, dtype={"GaugeID": str})
    gauge_list = gauge_df["GaugeID"].values
    all_sites_camels_531 = np.sort(
        [str(gauge).zfill(8) for gauge in gauge_list])
    gages_model = GagesModels(config_data,
                              screen_basin_area_huc4=False,
                              sites_id=all_sites_camels_531.tolist())
    save_datamodel(gages_model.data_model_test,
                   data_source_file_name='test_data_source.txt',
                   stat_file_name='test_Statistics.json',
                   flow_file_name='test_flow',
                   forcing_file_name='test_forcing',
Example #4
# Column labels used for the result DataFrames and the boxplot axes.
train_set = "training"
test_set = "testing"
show_ind_key = "NSE"

# Previous single-row layout, kept for reference:
# fig = plt.figure(figsize=(12, 4))
# gs = gridspec.GridSpec(1, 11)
# 2x2 grid of panels, one per experiment.
fig = plt.figure(figsize=(8, 9))
gs = gridspec.GridSpec(2, 2)
titles = ["(a)", "(b)", "(c)", "(d)"]

# One seaborn palette name per panel.
colors = ["Greens", "Blues", "Reds", "Greys"]
sns.set(font_scale=1)

# NOTE(review): this fragment references names defined earlier in the original
# script (exp_lst, data_df, idx_lst, attr_lst, conus_config_data, FIGURE_DPI);
# it is kept byte-identical and only annotated here.
for k in range(len(exp_lst)):
    if k == len(exp_lst) - 1:
        # the last experiment is the CAMELS PUB case and gets the
        # bottom-right panel of the 2x2 grid
        hydro_logger.info("camels pub")
        # ax_k = plt.subplot(gs[k * 3: k * 3 + 2])
        ax_k = plt.subplot(gs[1, 1])
        ax_k.set_title(titles[k])
        frames_camels_pub = []

        # only camels test, no pub
        # inds_df_camels, pred_mean, obs_mean = load_ensemble_result(camels_exp_lst, test_epoch, return_value=True)
        # df_camels_pub = pd.DataFrame({train_set: np.full([inds_df_camels.shape[0]], train_data_name_lst[k][0]),
        #                               test_set: np.full([inds_df_camels.shape[0]], test_data_name_lst[k][0]),
        #                               show_ind_key: inds_df_camels[show_ind_key]})
        # frames_camels_pub.append(df_camels_pub)
        # pub in camels
        config_data = load_dataconfig_case_exp(cfg, exp_lst[k][0])
        preds = []
        obss = []
    # map + scatter of NSE vs. latitude/longitude/slope for the zero-dor and
    # small-dor site groups, then save the combined figure
    plot_gages_map_and_scatter(data_df, [show_ind_key, "lat", "lon", "slope"],
                               idx_lst,
                               cmap_strs=["Reds", "Blues"],
                               labels=["zero-dor", "small-dor"],
                               scatter_label=[attr_lst[0], show_ind_key],
                               wspace=2,
                               hspace=1.5,
                               legend_y=.8,
                               sub_fig_ratio=[6, 4, 1])
    plt.tight_layout()
    plt.savefig(os.path.join(conus_config_data.data_path["Out"],
                             'zero-small-dor_western_map_comp.png'),
                dpi=FIGURE_DPI,
                bbox_inches="tight")
elif compare_item == 4:
    hydro_logger.info("Are the differences significant?")
    inds_df_pair1 = load_ensemble_result(cfg, pair1_exps, test_epoch)
    inds_df_pair2 = load_ensemble_result(cfg, pair2_exps, test_epoch)
    inds_df_pair3 = load_ensemble_result(cfg, pair3_exps, test_epoch)
    inds_df_conus = load_ensemble_result(cfg, conus_exps, test_epoch)

    keys_nse = "NSE"

    attr_nodam = "zero_dor"
    cases_exps_legends_nodam = ["LSTM-Z", "LSTM-ZS", "LSTM-ZL", "LSTM-CONUS"]
    inds_df_nodam = load_ensemble_result(cfg, nodam_exp_lst, test_epoch)

    np_nodam_alone_nse = inds_df_nodam[keys_nse]
    np_nodam_in_pair1_nse = inds_df_pair1[keys_nse].iloc[
        idx_lst_nodam_in_pair1]
    np_nodam_in_pair2_nse = inds_df_pair2[keys_nse].iloc[
Example #6
    def test_plot_each_symmetric_exp(self):
        """Box-plot NSE distributions for the symmetric PUB experiments.

        For every experiment in ``self.symmetric_exp_lst``, the k-fold test
        results of three models are gathered -- a baseline ("_base" files),
        PUB split 1 (default file names) and PUB split 2 ("_2" files) --
        their error statistics are computed over the concatenated folds, and
        all experiments are drawn in one seaborn boxplot grouped by training
        set and colored by test set, with median labels.
        """
        train_set = self.train_set
        test_set = self.test_set
        show_ind_key = self.show_ind_key
        test_epoch = self.test_epoch
        split_num = self.split_num
        exp_lst = self.symmetric_exp_lst
        train_data_name_lst = self.symmetric_train_data_name_lst
        test_data_name_lst = self.symmetric_test_data_name_lst

        colors = "Greens"
        sns.set(font_scale=1)
        fig = plt.figure()
        # add_axes() requires an explicit rect since matplotlib 3.5
        # (deprecated without arguments since 3.3); the old bare call raised
        # a TypeError there
        ax_k = fig.add_axes([0.1, 0.1, 0.8, 0.8])
        frames = []
        for j in range(len(exp_lst)):
            config_data = load_dataconfig_case_exp(cfg, exp_lst[j])
            preds = []
            obss = []
            preds2 = []
            obss2 = []
            predsbase = []
            obssbase = []
            for i in range(split_num):
                # one saved data model per k-fold split and per model variant
                data_model_base = GagesModel.load_datamodel(
                    config_data.data_path["Temp"],
                    str(i),
                    data_source_file_name='test_data_source_base.txt',
                    stat_file_name='test_Statistics_base.json',
                    flow_file_name='test_flow_base.npy',
                    forcing_file_name='test_forcing_base.npy',
                    attr_file_name='test_attr_base.npy',
                    f_dict_file_name='test_dictFactorize_base.json',
                    var_dict_file_name='test_dictAttribute_base.json',
                    t_s_dict_file_name='test_dictTimeSpace_base.json')
                data_model = GagesModel.load_datamodel(
                    config_data.data_path["Temp"],
                    str(i),
                    data_source_file_name='test_data_source.txt',
                    stat_file_name='test_Statistics.json',
                    flow_file_name='test_flow.npy',
                    forcing_file_name='test_forcing.npy',
                    attr_file_name='test_attr.npy',
                    f_dict_file_name='test_dictFactorize.json',
                    var_dict_file_name='test_dictAttribute.json',
                    t_s_dict_file_name='test_dictTimeSpace.json')
                data_model_2 = GagesModel.load_datamodel(
                    config_data.data_path["Temp"],
                    str(i),
                    data_source_file_name='test_data_source_2.txt',
                    stat_file_name='test_Statistics_2.json',
                    flow_file_name='test_flow_2.npy',
                    forcing_file_name='test_forcing_2.npy',
                    attr_file_name='test_attr_2.npy',
                    f_dict_file_name='test_dictFactorize_2.json',
                    var_dict_file_name='test_dictAttribute_2.json',
                    t_s_dict_file_name='test_dictTimeSpace_2.json')
                pred_base, obs_base = load_result(
                    data_model_base.data_source.data_config.data_path['Temp'],
                    test_epoch,
                    pred_name='flow_pred_base',
                    obs_name='flow_obs_base')
                # flatten to 2-D (site, time); assumes any extra trailing
                # dimension is singleton -- TODO confirm against load_result
                pred_base = pred_base.reshape(pred_base.shape[0],
                                              pred_base.shape[1])
                obs_base = obs_base.reshape(obs_base.shape[0],
                                            obs_base.shape[1])
                hydro_logger.info("the size of %s %s Train-base %s", j, i,
                                  pred_base.shape[0])
                predsbase.append(pred_base)
                obssbase.append(obs_base)

                pred_i, obs_i = load_result(
                    data_model.data_source.data_config.data_path['Temp'],
                    test_epoch)
                pred_i = pred_i.reshape(pred_i.shape[0], pred_i.shape[1])
                obs_i = obs_i.reshape(obs_i.shape[0], obs_i.shape[1])
                hydro_logger.info("the size of %s %s PUB-1 %s", j, i,
                                  pred_i.shape[0])
                preds.append(pred_i)
                obss.append(obs_i)

                pred_2, obs_2 = load_result(
                    data_model_2.data_source.data_config.data_path['Temp'],
                    test_epoch,
                    pred_name='flow_pred_2',
                    obs_name='flow_obs_2')
                pred_2 = pred_2.reshape(pred_2.shape[0], pred_2.shape[1])
                obs_2 = obs_2.reshape(obs_2.shape[0], obs_2.shape[1])
                hydro_logger.info("the size of %s %s PUB-2 %s", j, i,
                                  pred_2.shape[0])
                preds2.append(pred_2)
                obss2.append(obs_2)

            # stack the k folds into single (site, time) matrices; a single
            # np.vstack over the list replaces the previous pairwise reduce,
            # which copied the growing array on every step
            predsbase_np = np.vstack(predsbase)
            obssbase_np = np.vstack(obssbase)
            indsbase = statError(obssbase_np, predsbase_np)
            inds_df_abase = pd.DataFrame(indsbase)

            preds_np = np.vstack(preds)
            obss_np = np.vstack(obss)
            inds = statError(obss_np, preds_np)
            inds_df_a = pd.DataFrame(inds)

            preds2_np = np.vstack(preds2)
            obss2_np = np.vstack(obss2)
            inds2 = statError(obss2_np, preds2_np)
            inds_df_a2 = pd.DataFrame(inds2)

            if j == 0 or j == 1:
                df_abase = pd.DataFrame({
                    train_set:
                    np.full([inds_df_abase.shape[0]], train_data_name_lst[j]),
                    test_set:
                    np.full([inds_df_abase.shape[0]], test_data_name_lst[j]),
                    show_ind_key:
                    inds_df_abase[show_ind_key]
                })
                frames.append(df_abase)
            # NOTE(review): for j == 1 the test-set labels of the two PUB
            # splits are deliberately swapped ([3] then [2]) relative to the
            # else branch ([2] then [3]) -- confirm against the experiment
            # naming convention
            if j == 1:
                df_a = pd.DataFrame({
                    train_set:
                    np.full([inds_df_a.shape[0]], train_data_name_lst[j]),
                    test_set:
                    np.full([inds_df_a.shape[0]], test_data_name_lst[3]),
                    show_ind_key:
                    inds_df_a[show_ind_key]
                })
                df_a2 = pd.DataFrame({
                    train_set:
                    np.full([inds_df_a2.shape[0]], train_data_name_lst[j]),
                    test_set:
                    np.full([inds_df_a2.shape[0]], test_data_name_lst[2]),
                    show_ind_key:
                    inds_df_a2[show_ind_key]
                })
            else:
                df_a = pd.DataFrame({
                    train_set:
                    np.full([inds_df_a.shape[0]], train_data_name_lst[j]),
                    test_set:
                    np.full([inds_df_a.shape[0]], test_data_name_lst[2]),
                    show_ind_key:
                    inds_df_a[show_ind_key]
                })
                df_a2 = pd.DataFrame({
                    train_set:
                    np.full([inds_df_a2.shape[0]], train_data_name_lst[j]),
                    test_set:
                    np.full([inds_df_a2.shape[0]], test_data_name_lst[3]),
                    show_ind_key:
                    inds_df_a2[show_ind_key]
                })
            frames.append(df_a)
            frames.append(df_a2)

        result = pd.concat(frames)
        sns_box = sns.boxplot(
            ax=ax_k,
            x=train_set,
            y=show_ind_key,
            hue=test_set,  # hue_order=test_data_name_lst,
            data=result,
            showfliers=False,
            palette=colors)  # , width=0.8
        medians = result.groupby([train_set, test_set],
                                 sort=False)[show_ind_key].median().values
        hydro_logger.info(medians)
        create_median_labels(sns_box.axes, has_fliers=False)

        sns.despine()
        plt.tight_layout()
        plt.show()
        hydro_logger.debug("plot successfully")
Example #7
    def test_diff_dor_fig2_in_the_paper(self):
        """Draw Figure 2 of the paper: NSE CDFs of four basin groups.

        Basins are split by water diversion (yes/no) crossed with small/large
        degree of regulation (DOR).  The ECDF of the test-epoch NSE of each
        group, plus the full CONUS ensemble, is plotted with two colors
        (diversion) times two line styles (DOR) and saved to the output dir.
        """
        data_model = GagesModel.load_datamodel(
            self.config_data.data_path["Temp"],
            data_source_file_name='data_source.txt',
            stat_file_name='Statistics.json',
            flow_file_name='flow.npy',
            forcing_file_name='forcing.npy',
            attr_file_name='attr.npy',
            f_dict_file_name='dictFactorize.json',
            var_dict_file_name='dictAttribute.json',
            t_s_dict_file_name='dictTimeSpace.json')
        config_data = self.config_data
        config_file = self.config_file
        test_epoch = self.test_epoch
        exp_lst = self.exp_lst
        figure_dpi = self.FIGURE_DPI
        inds_df, pred_mean, obs_mean = load_ensemble_result(config_file,
                                                            exp_lst,
                                                            test_epoch,
                                                            return_value=True)
        # split the sites by whether water is diverted in the basin
        diversion_yes = True
        diversion_no = False
        source_data_diversion = GagesSource.choose_some_basins(
            config_data,
            config_data.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            diversion=diversion_yes)
        source_data_nodivert = GagesSource.choose_some_basins(
            config_data,
            config_data.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            diversion=diversion_no)
        sites_id_nodivert = source_data_nodivert.all_configs[
            'flow_screen_gage_id']
        sites_id_diversion = source_data_diversion.all_configs[
            'flow_screen_gage_id']

        # NOTE(review): the sign appears to select "DOR below |dor|" vs
        # "DOR above dor" -- confirm in GagesSource.choose_some_basins
        dor_1 = -self.dor
        dor_2 = self.dor
        source_data_dor1 = GagesSource.choose_some_basins(
            config_data,
            config_data.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            DOR=dor_1)
        source_data_dor2 = GagesSource.choose_some_basins(
            config_data,
            config_data.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            DOR=dor_2)
        sites_id_dor1 = source_data_dor1.all_configs['flow_screen_gage_id']
        sites_id_dor2 = source_data_dor2.all_configs['flow_screen_gage_id']

        # basins with dams: restrict the small-dor group to basins that
        # actually have at least one dam
        source_data_withdams = GagesSource.choose_some_basins(
            config_data,
            config_data.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            dam_num=[1, 100000])
        sites_id_withdams = source_data_withdams.all_configs[
            'flow_screen_gage_id']
        sites_id_dor1 = np.intersect1d(np.array(sites_id_dor1),
                                       np.array(sites_id_withdams)).tolist()

        # the four diversion x DOR groups
        no_divert_small_dor = np.intersect1d(sites_id_nodivert, sites_id_dor1)
        no_divert_large_dor = np.intersect1d(sites_id_nodivert, sites_id_dor2)
        diversion_small_dor = np.intersect1d(sites_id_diversion, sites_id_dor1)
        diversion_large_dor = np.intersect1d(sites_id_diversion, sites_id_dor2)

        # positions of each group's sites in the full site list
        all_sites = data_model.t_s_dict["sites_id"]
        idx_lst_nodivert_smalldor = [
            i for i in range(len(all_sites))
            if all_sites[i] in no_divert_small_dor
        ]
        idx_lst_nodivert_largedor = [
            i for i in range(len(all_sites))
            if all_sites[i] in no_divert_large_dor
        ]
        idx_lst_diversion_smalldor = [
            i for i in range(len(all_sites))
            if all_sites[i] in diversion_small_dor
        ]
        idx_lst_diversion_largedor = [
            i for i in range(len(all_sites))
            if all_sites[i] in diversion_large_dor
        ]

        keys_nse = "NSE"
        xs = []
        ys = []
        cases_exps_legends_together = [
            "not_diverted_small_dor", "not_diverted_large_dor",
            "diversion_small_dor", "diversion_large_dor", "CONUS"
        ]

        x1, y1 = ecdf(inds_df[keys_nse].iloc[idx_lst_nodivert_smalldor])
        xs.append(x1)
        ys.append(y1)

        x2, y2 = ecdf(inds_df[keys_nse].iloc[idx_lst_nodivert_largedor])
        xs.append(x2)
        ys.append(y2)

        x3, y3 = ecdf(inds_df[keys_nse].iloc[idx_lst_diversion_smalldor])
        xs.append(x3)
        ys.append(y3)

        x4, y4 = ecdf(inds_df[keys_nse].iloc[idx_lst_diversion_largedor])
        xs.append(x4)
        ys.append(y4)

        x_conus, y_conus = ecdf(inds_df[keys_nse])
        xs.append(x_conus)
        ys.append(y_conus)
        # the median of the ECDF's x-values equals the median of the data
        hydro_logger.info(
            "The median NSEs of all five curves (%s) are \n %.2f, %.2f, %.2f, %.2f, %.2f",
            cases_exps_legends_together, np.median(x1), np.median(x2),
            np.median(x3), np.median(x4), np.median(x_conus))
        # plot_ecdfs_matplot(xs, ys, cases_exps_legends_together,
        #                    colors=["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "grey"],
        #                    dash_lines=[False, False, False, False, True], x_str="NSE", y_str="CDF")
        # plot using two linestyles and two colors for dor and diversion.
        # plot_ecdfs(xs, ys, cases_exps_legends_together, x_str="NSE", y_str="CDF")
        # define color scheme and line style
        colors = ["#1f77b4", "#d62728"]
        linestyles = ['-', "--"]
        markers = ["", "."]

        fig = plt.figure(figsize=(8, 6))
        axes = fig.add_axes([0.1, 0.1, 0.8, 0.8])
        # for i, marker in enumerate(markers):
        for i, linestyle in enumerate(linestyles):
            for j, color in enumerate(colors):
                plt.plot(
                    xs[i * 2 + j],
                    ys[i * 2 + j],
                    color=color,
                    ls=linestyle,  # marker=marker,
                    label=cases_exps_legends_together[i * 2 + j])
        # CONUS reference curve with a custom dash pattern
        line_i, = axes.plot(x_conus,
                            y_conus,
                            color="grey",
                            label=cases_exps_legends_together[4])
        line_i.set_dashes([2, 2, 10, 2])

        x_str = "NSE"
        y_str = "CDF"
        x_lim = (0, 1)
        y_lim = (0, 1)
        x_interval = 0.1
        y_interval = 0.1
        plt.xlabel(x_str, fontsize=18)
        plt.ylabel(y_str, fontsize=18)
        axes.set_xlim(x_lim[0], x_lim[1])
        axes.set_ylim(y_lim[0], y_lim[1])
        # set x y number font size
        plt.xticks(np.arange(x_lim[0], x_lim[1] + x_lim[1] / 100, x_interval),
                   fontsize=16)
        plt.yticks(np.arange(y_lim[0], y_lim[1] + y_lim[1] / 100, y_interval),
                   fontsize=16)
        plt.grid()
        # Hide the right and top spines
        axes.spines['right'].set_visible(False)
        axes.spines['top'].set_visible(False)
        # one legend call is enough: the previous extra axes.legend() was
        # immediately overridden by this plt.legend on the same axes
        plt.legend(prop={'size': 16})
        plt.savefig(os.path.join(config_data.data_path["Out"],
                                 'new_dor_divert_comp_matplotlib.png'),
                    dpi=figure_dpi,
                    bbox_inches="tight")
        plt.show()
Example #8
    def test_gages_nse_dam_attr(self):
        figure_dpi = 600
        config_data = self.config_data
        data_dir = config_data.data_path["Temp"]
        data_model = GagesModel.load_datamodel(
            data_dir,
            data_source_file_name='test_data_source.txt',
            stat_file_name='test_Statistics.json',
            flow_file_name='test_flow.npy',
            forcing_file_name='test_forcing.npy',
            attr_file_name='test_attr.npy',
            f_dict_file_name='test_dictFactorize.json',
            var_dict_file_name='test_dictAttribute.json',
            t_s_dict_file_name='test_dictTimeSpace.json')
        gages_id = data_model.t_s_dict["sites_id"]

        exp_lst = [
            "basic_exp37", "basic_exp39", "basic_exp40", "basic_exp41",
            "basic_exp42", "basic_exp43"
        ]
        self.inds_df, pred_mean, obs_mean = load_ensemble_result(
            config_data.config_file,
            exp_lst,
            config_data.config_file.TEST_EPOCH,
            return_value=True)
        show_ind_key = 'NSE'

        plt.rcParams['font.family'] = 'serif'
        plt.rcParams['font.serif'] = ['Times New Roman'
                                      ] + plt.rcParams['font.serif']
        # plot NSE-DOR
        attr_lst = ["RUNAVE7100", "STOR_NOR_2009"]
        attrs_runavg_stor = data_model.data_source.read_attr(
            gages_id, attr_lst, is_return_dict=False)
        run_avg = attrs_runavg_stor[:, 0] * (10**(-3)) * (10**6
                                                          )  # m^3 per year
        nor_storage = attrs_runavg_stor[:, 1] * 1000  # m^3
        dors = nor_storage / run_avg
        # dor = 0 is not totally same with dam_num=0 (some dammed basins' dor is about 0.00),
        # here for zero-dor we mainly rely on dam_num = 0
        attr_dam_num = ["NDAMS_2009"]
        attrs_dam_num = data_model.data_source.read_attr(gages_id,
                                                         attr_dam_num,
                                                         is_return_dict=False)
        df = pd.DataFrame({
            "DOR": dors,
            "DAM_NUM": attrs_dam_num[:, 0],
            show_ind_key: self.inds_df[show_ind_key].values
        })
        hydro_logger.info("statistics of dors:\n %s", df.describe())
        hydro_logger.info("percentiles of dors:\n %s", df.quantile(q=0.95))
        hydro_logger.info("ecdf of dors:\n %s", ecdf(dors))

        # boxplot
        # add a column to represent the dor range for the df
        dor_value_range_lst = [[0, 0], [0, 0.02], [0.02, 0.05], [0.05, 0.1],
                               [0.1, 0.2], [0.2, 0.4], [0.4, 0.8],
                               [0.8, 10000]]
        dor_range_lst = ["0"] + [
            str(dor_value_range_lst[i][0]) + "-" +
            str(dor_value_range_lst[i][1])
            for i in range(1,
                           len(dor_value_range_lst) - 1)
        ] + [">" + str(dor_value_range_lst[-1][0])]

        # add a column to represent the dam_num range for the df
        dam_num_value_range_lst = [[0, 0], [0, 1], [1, 3], [3, 5], [5, 10],
                                   [10, 20], [20, 50], [50, 10000]]
        dam_num_range_lst = ["0", "1"] + [
            str(dam_num_value_range_lst[i][0]) + "-" +
            str(dam_num_value_range_lst[i][1])
            for i in range(2,
                           len(dam_num_value_range_lst) - 1)
        ] + [">" + str(dam_num_value_range_lst[-1][0])]

        def in_which_range(value_temp):
            """Return the DOR bucket label for one value.

            Closure over ``dor_value_range_lst``: zero gets its own "0"
            bucket, the last bucket is rendered open-ended (">low"), every
            other bucket as "low-high".
            """
            if value_temp == 0:
                return "0"
            # first (low, high] interval containing the value; an empty hit
            # list raises IndexError, same as the original lookup
            hits = [
                bounds for bounds in dor_value_range_lst
                if bounds[0] < value_temp <= bounds[1]
            ]
            low, high = hits[0]
            if low == dor_value_range_lst[-1][0]:
                return ">" + str(low)
            return str(low) + "-" + str(high)

        def in_which_dam_num_range(value_tmp):
            """Return the dam-count bucket label for one basin.

            Closure over ``dam_num_value_range_lst``: 0 and 1 have dedicated
            buckets, the last bucket is rendered open-ended (">low"), every
            other bucket as "low-high".
            """
            if value_tmp == 0:
                return "0"
            if value_tmp == 1:
                return "1"
            # first (low, high] interval containing the value; an empty hit
            # list raises IndexError, same as the original lookup
            hits = [
                bounds for bounds in dam_num_value_range_lst
                if bounds[0] < value_tmp <= bounds[1]
            ]
            low, high = hits[0]
            if low == dam_num_value_range_lst[-1][0]:
                return ">" + str(low)
            return str(low) + "-" + str(high)

        df["DOR_RANGE"] = df["DOR"].apply(in_which_range)
        df["DAM_NUM_RANGE"] = df["DAM_NUM"].apply(in_which_dam_num_range)
        df.loc[(df["DAM_NUM"] > 0) & (df["DOR_RANGE"] == "0"),
               "DOR_RANGE"] = dor_range_lst[1]
        shown_nse_range_boxplots = [-0.5, 1.0]
        sns.set(font="serif", font_scale=1.5, color_codes=True)
        plot_boxs(df,
                  "DOR_RANGE",
                  show_ind_key,
                  ylim=shown_nse_range_boxplots,
                  order=dor_range_lst)
        plt.savefig(os.path.join(
            config_data.data_path["Out"],
            'NSE~DOR-boxplots-' + str(shown_nse_range_boxplots) + '.png'),
                    dpi=figure_dpi,
                    bbox_inches="tight")
        plt.figure()
        shown_nse_range_boxplots = [0, 1.0]
        sns.set(font="serif", font_scale=1.5, color_codes=True)
        plot_boxs(df,
                  "DAM_NUM_RANGE",
                  show_ind_key,
                  ylim=shown_nse_range_boxplots,
                  order=dam_num_range_lst)
        plt.savefig(os.path.join(
            config_data.data_path["Out"],
            'NSE~DAM_NUM-boxplots-' + str(shown_nse_range_boxplots) + '.png'),
                    dpi=figure_dpi,
                    bbox_inches="tight")
        nums_in_dor_range = [
            df[df["DOR_RANGE"] == a_range_rmp].shape[0]
            for a_range_rmp in dor_range_lst
        ]
        ratios_in_dor_range = [
            a_num / df.shape[0] for a_num in nums_in_dor_range
        ]
        hydro_logger.info(
            "the number and ratio of basins in each dor range\n: %s \n %s",
            nums_in_dor_range, ratios_in_dor_range)

        nums_in_dam_num_range = [
            df[df["DAM_NUM_RANGE"] == a_range_rmp].shape[0]
            for a_range_rmp in dam_num_range_lst
        ]
        ratios_in_dam_num_range = [
            a_num / df.shape[0] for a_num in nums_in_dam_num_range
        ]
        hydro_logger.info(
            "the number and ratio of basins in each dam_num range\n: %s \n %s",
            nums_in_dam_num_range, ratios_in_dam_num_range)

        # regplot
        plt.figure()
        sns.set(font="serif", font_scale=1.5, color_codes=True)
        sr = sns.regplot(x="DOR",
                         y=show_ind_key,
                         data=df[df[show_ind_key] >= 0],
                         scatter_kws={'s': 10})
        show_dor_max = df.quantile(
            q=0.95)["DOR"]  # 30  # max(dors)  # 0.8  # 10
        show_dor_min = min(dors)
        plt.ylim(0, 1)
        plt.xlim(show_dor_min, show_dor_max)
        plt.savefig(os.path.join(
            config_data.data_path["Out"],
            'NSE~DOR-shown-max-' + str(show_dor_max) + '.png'),
                    dpi=figure_dpi,
                    bbox_inches="tight")

        # jointplot
        # dor_range = [0.2, 0.9]
        dor_range = [0.002, 0.2]
        # plt.figure()
        sns.set(font="serif", font_scale=1.5, color_codes=True)
        # g = sns.jointplot(x="DOR", y=show_ind_key, data=df[(df["DOR"] < 1) & (df[show_ind_key] >= 0)], kind="reg",
        #                   marginal_kws=dict(bins=25))
        # g = sns.jointplot(x="DOR", y=show_ind_key, data=df[(df["DOR"] < 1) & (df[show_ind_key] >= 0)], kind="hex",
        #                   color="b", marginal_kws=dict(bins=50))
        g = sns.jointplot(
            x="DOR",
            y=show_ind_key,
            data=df[(df["DOR"] < dor_range[1]) & (df["DOR"] > dor_range[0]) &
                    (df[show_ind_key] >= 0)],
            kind="hex",
            color="b")
        g.ax_marg_x.set_xlim(dor_range[0], dor_range[1])
        # g.ax_marg_y.set_ylim(-0.5, 1)
        plt.savefig(os.path.join(
            config_data.data_path["Out"],
            'NSE~DOR(range-)' + str(dor_range) + '-jointplot.png'),
                    dpi=figure_dpi,
                    bbox_inches="tight")

        nid_dir = os.path.join(
            "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
            "test")
        nid_input = NidModel.load_nidmodel(
            nid_dir,
            nid_source_file_name='nid_source.txt',
            nid_data_file_name='nid_data.shp')
        gage_main_dam_purpose = unserialize_json(
            os.path.join(nid_dir, "dam_main_purpose_dict.json"))
        data_input = GagesDamDataModel(data_model, nid_input,
                                       gage_main_dam_purpose)
        dam_coords = unserialize_json_ordered(
            os.path.join(nid_dir, "dam_points_dict.json"))
        dam_storages = unserialize_json_ordered(
            os.path.join(nid_dir, "dam_storages_dict.json"))
        dam_ids_1 = list(gage_main_dam_purpose.keys())
        dam_ids_2 = list(dam_coords.keys())
        dam_ids_3 = list(dam_storages.keys())
        assert (all(x < y for x, y in zip(dam_ids_1, dam_ids_1[1:])))
        assert (all(x < y for x, y in zip(dam_ids_2, dam_ids_2[1:])))
        assert (all(x < y for x, y in zip(dam_ids_3, dam_ids_3[1:])))

        sites = list(dam_coords.keys())
        c, ind1, idx_lst_nse_range = np.intersect1d(sites,
                                                    gages_id,
                                                    return_indices=True)

        std_storage_in_a_basin = list(map(np.std, dam_storages.values()))
        log_std_storage_in_a_basin = list(
            map(np.log,
                np.array(std_storage_in_a_basin) + 1))
        nse_values = self.inds_df["NSE"].values[idx_lst_nse_range]
        df = pd.DataFrame({
            "DAM_STORAGE_STD": log_std_storage_in_a_basin,
            show_ind_key: nse_values
        })
        plt.figure()
        sns.set(font="serif", font_scale=1.5, color_codes=True)
        g = sns.regplot(x="DAM_STORAGE_STD",
                        y=show_ind_key,
                        data=df[df[show_ind_key] >= 0],
                        scatter_kws={'s': 10})
        show_max = max(log_std_storage_in_a_basin)
        show_min = min(log_std_storage_in_a_basin)
        if show_min < 0:
            show_min = 0
        # g.ax_marg_x.set_xlim(show_min, show_max)
        # g.ax_marg_y.set_ylim(0, 1)
        plt.ylim(0, 1)
        plt.xlim(show_min, show_max)
        plt.savefig(os.path.join(config_data.data_path["Out"],
                                 'NSE~' + "DAM_STORAGE_STD" + '.png'),
                    dpi=figure_dpi,
                    bbox_inches="tight")

        gages_loc_lat = data_model.data_source.gage_dict["LAT_GAGE"]
        gages_loc_lon = data_model.data_source.gage_dict["LNG_GAGE"]
        gages_loc = [[gages_loc_lat[i], gages_loc_lon[i]]
                     for i in range(len(gages_id))]
        # calculate index of dispersion, then plot the NSE-dispersion scatterplot
        # Geo coord system of gages_loc and dam_coords are both NAD83
        coefficient_of_var = list(
            map(coefficient_of_variation, gages_loc, dam_coords.values()))
        coefficient_of_var_min = min(coefficient_of_var)
        coefficient_of_var_max = max(coefficient_of_var)
        dispersion_var = "DAM_GAGE_DIS_VAR"
        nse_values = self.inds_df["NSE"].values[idx_lst_nse_range]
        df = pd.DataFrame({
            dispersion_var: coefficient_of_var,
            show_ind_key: nse_values
        })
        plt.figure()
        sns.set(font="serif", font_scale=1.5, color_codes=True)
        g = sns.regplot(x=dispersion_var,
                        y=show_ind_key,
                        data=df[df[show_ind_key] >= 0],
                        scatter_kws={'s': 10})
        show_max = coefficient_of_var_max
        show_min = coefficient_of_var_min
        if show_min < 0:
            show_min = 0
        # g.ax_marg_x.set_xlim(show_min, show_max)
        # g.ax_marg_y.set_ylim(0, 1)
        plt.ylim(0, 1)
        plt.xlim(show_min, show_max)
        plt.savefig(os.path.join(config_data.data_path["Out"],
                                 'NSE~' + dispersion_var + '.png'),
                    dpi=figure_dpi,
                    bbox_inches="tight")

        idx_dispersions = list(
            map(ind_of_dispersion, gages_loc, dam_coords.values()))
        idx_dispersion_min = min(idx_dispersions)
        idx_dispersion_max = max(idx_dispersions)
        dispersion_var = "DAM_DISPERSION_BASIN"
        # nse_range = [0, 1]
        # idx_lst_nse_range = inds_df_now[(inds_df_now[show_ind_key] >= nse_range[0]) & (inds_df_now[show_ind_key] < nse_range[1])].index.tolist()
        nse_values = self.inds_df["NSE"].values[idx_lst_nse_range]
        df = pd.DataFrame({
            dispersion_var: idx_dispersions,
            show_ind_key: nse_values
        })
        # g = sns.regplot(x=dispersion_var, y=show_ind_key, data=df[df[show_ind_key] >= 0], scatter_kws={'s': 10})
        if idx_dispersion_min < 0:
            idx_dispersion_min = 0
        plt.ylim(0, 1)
        plt.xlim(idx_dispersion_min, idx_dispersion_max)
        # plt.figure()
        sns.set(font="serif", font_scale=1.5, color_codes=True)
        g = sns.jointplot(x=dispersion_var,
                          y=show_ind_key,
                          data=df[df[show_ind_key] >= 0],
                          kind="reg")
        g.ax_marg_x.set_xlim(idx_dispersion_min, idx_dispersion_max)
        g.ax_marg_y.set_ylim(0, 1)
        plt.show()
예제 #9
0
    xs.append(x2)
    ys.append(y2)

    x3, y3 = ecdf(inds_df[keys_nse].iloc[idx_lst_diversion_smalldor])
    xs.append(x3)
    ys.append(y3)

    x4, y4 = ecdf(inds_df[keys_nse].iloc[idx_lst_diversion_largedor])
    xs.append(x4)
    ys.append(y4)

    x_conus, y_conus = ecdf(inds_df[keys_nse])
    xs.append(x_conus)
    ys.append(y_conus)
    hydro_logger.info(
        "The median NSEs of all five curves (%s) are \n %.2f, %.2f, %.2f, %.2f, %.2f",
        cases_exps_legends_together, np.median(x1), np.median(x2),
        np.median(x3), np.median(x4), np.median(x_conus))
    plot_ecdfs_matplot(
        xs,
        ys,
        cases_exps_legends_together,
        colors=["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "grey"],
        dash_lines=[False, False, False, False, True],
        x_str="NSE",
        y_str="CDF")
    plt.savefig(os.path.join(config_data.data_path["Out"],
                             'dor_divert_comp_matplotlib.png'),
                dpi=FIGURE_DPI,
                bbox_inches="tight")

    ############################ plot map  ###########################
예제 #10
0
    def read_usge_gage(self, huc, usgs_id, t_lst):
        """Read the daily mean streamflow series of one USGS gage.

        The gage's tab-separated NWIS file is read from
        ``<flow_dir>/<huc>/<usgs_id>.txt``; the discharge column
        (parameter 00060, statistic 00003 = daily mean) is located,
        cleaned of USGS qualification flags ("Ice", "Ssn", ...), and
        aligned to the requested date axis.

        Parameters
        ----------
        huc
            hydrologic unit code; used as the sub-directory name under the
            configured flow directory
        usgs_id
            USGS site id; also the file's base name
        t_lst
            sequence of dates (convertible to numpy ``datetime64[D]``) that
            defines the output axis

        Returns
        -------
        np.ndarray
            1-D float array of length ``len(t_lst)`` with the gage's flow
            values (units as in the source file — presumably cubic feet per
            second, TODO confirm); ``np.nan`` where no valid record exists
        """
        hydro_logger.info("reading %s streamflow data", usgs_id)
        dir_gage_flow = self.all_configs.get("flow_dir")
        usgs_file = os.path.join(dir_gage_flow, str(huc), usgs_id + '.txt')
        # skip '#' comment lines; the first data row is the column-format
        # row ("5s", "15s", ...) and is dropped with iloc[1:, :]
        df_flow = pd.read_csv(usgs_file,
                              comment='#',
                              sep='\t',
                              dtype={
                                  'site_no': str
                              }).iloc[1:, :]
        columns_names = df_flow.columns.tolist()
        # 00060 means "discharge", 00003 represents "mean value".
        # A site may report several mean-discharge columns, e.g.
        #   126801  00060  00003  Discharge, cubic feet per second (Mean)
        #   126805  00060  00003  Discharge, ... (Mean), PUBLISHED
        # so collect all candidates plus their qualification-code columns.
        columns_flow = [
            c for c in columns_names
            if '_00060_00003' in c and '_00060_00003_cd' not in c
        ]
        columns_flow_cd = [c for c in columns_names if '_00060_00003_cd' in c]
        if len(columns_flow) > 1:
            hydro_logger.debug("there are some columns for flow, choose one\n")
            # rank candidates by how many valid records they have inside the
            # requested time window, then keep the best one
            date_temp = pd.to_datetime(
                df_flow['datetime']).values.astype('datetime64[D]')
            c_temp, ind1_temp, ind2_temp = np.intersect1d(date_temp,
                                                          t_lst,
                                                          return_indices=True)
            num_nan_lst = []
            for flow_col in columns_flow:
                out_temp = np.full([len(t_lst)], np.nan)
                # coerce EVERY non-numeric qualification flag to NaN.
                # The previous code only blanked 4 of the 10 flags handled
                # later ("Rat", "Dis", "Ice", "Ssn"), which could mis-rank
                # the candidate columns and even raise when assigning an
                # unhandled flag string into the float array.
                col_numeric = pd.to_numeric(df_flow[flow_col],
                                            errors='coerce')
                out_temp[ind2_temp] = col_numeric.iloc[ind1_temp]
                num_nan_lst.append(np.isnan(out_temp).sum())
            index_flow_num = int(np.argmin(np.array(num_nan_lst)))
            df_flow.rename(columns={columns_flow[index_flow_num]: 'flow'},
                           inplace=True)
            df_flow.rename(columns={columns_flow_cd[index_flow_num]: 'mode'},
                           inplace=True)
        else:
            # zero or one candidate: rename whatever exists
            if columns_flow:
                df_flow.rename(columns={columns_flow[0]: 'flow'},
                               inplace=True)
            if columns_flow_cd:
                df_flow.rename(columns={columns_flow_cd[0]: 'mode'},
                               inplace=True)

        columns = ['agency_cd', 'site_no', 'datetime', 'flow', 'mode']
        if df_flow.empty:
            df_flow = pd.DataFrame(columns=columns)
        if not ('flow' in df_flow.columns.intersection(columns)):
            data_temp = df_flow.loc[:, df_flow.columns.intersection(columns)]
            # no discharge column at all: add empty flow/mode columns so the
            # code below still works and the gage yields an all-NaN series
            data_temp = pd.concat(
                [data_temp, pd.DataFrame(columns=['flow', 'mode'])])
        else:
            data_temp = df_flow.loc[:, columns]
        # qualification flags ("Ice", "Ssn", "Tst", "Eqp", "Rat", "Dis",
        # "Bkw", "***", "Mnt", "ZFL", ...) and any other non-numeric entry
        # become NaN in one pass; this replaces the former hand-written list
        # and no longer crashes on a flag that list missed
        obs = pd.to_numeric(data_temp['flow'],
                            errors='coerce').values.astype(float)
        # negative discharge is treated as invalid
        obs[obs < 0] = np.nan
        # align the record dates with the requested axis; dates absent from
        # the file stay NaN
        nt = len(t_lst)
        out = np.full([nt], np.nan)
        # dates in the file are strings, so parse them first
        date = pd.to_datetime(data_temp['datetime']).values.astype(
            'datetime64[D]')
        c, ind1, ind2 = np.intersect1d(date, t_lst, return_indices=True)
        out[ind2] = obs[ind1]
        return out
예제 #11
0
from data.data_input import GagesModel
from data.gages_input_dataset import load_dataconfig_case_exp, load_ensemble_result
from data.config import cfg, update_cfg, cmd
from utils.hydro_util import hydro_logger
from visual.plot_model import plot_sites_and_attr, plot_scatter_multi_attrs

FIGURE_DPI = 600

# validation data cited from table 1 of https://doi.org/10.1029/2007WR005971
data_validate = pd.read_csv("paper10.1029_2007WR005971-table1.csv")

# NSq-i / NSq-a: model performance when the reservoir volume variations are
# respectively Ignored and Accounted for in control mode (see the cited paper)
nsqi = data_validate['NSq‐i'].astype(float)
nsqa = data_validate['NSq‐a'].astype(float)
# log median and mean of both series, ignoring NaN entries
for _msg, _val in (
    ("nsq-i MEDIAN value is %s", np.nanmedian(nsqi.values)),
    ("nsq-i MEAN value is %s", np.nanmean(nsqi.values)),
    ("nsq-a MEDIAN value is %s", np.nanmedian(nsqa.values)),
    ("nsq-a MEAN value is %s", np.nanmean(nsqa.values)),
):
    hydro_logger.info(_msg, _val)

# calculate the dor values of all basins
idx4paper = 0
paper_dors = []
while idx4paper < data_validate.shape[0]:
    dam_num_tmp = data_validate['Number of Main dams'][idx4paper]
    if math.isnan(dam_num_tmp):
        hydro_logger.error("miss it")
    else:
        dam_num_tmp = int(dam_num_tmp)
        if type(data_validate['Watershed Area, km2'][idx4paper]) == str:
            watershed_area = float(data_validate['Watershed Area, km2'][idx4paper].replace(',', ''))