def test_read_sites_id_see_dor(self):
    exp_lst = ["exp18", "exp19", "exp20", "exp21", "exp22", "exp23"]
    sub_lst = ["0", "1"]
    diff_lst = [
        "dictTimeSpace.json", "test_dictTimeSpace.json",
        "test_dictTimeSpace_2.json"
    ]
    for exp_str in exp_lst:
        for sub_str in sub_lst:
            comp_sites = []
            for item in diff_lst:
                gage_id_file = os.path.join(
                    self.config_data.config_file["ROOT_DIR"], "temp",
                    "gages", "ecoregion", exp_str, sub_str, item)
                usgs_id = unserialize_json(gage_id_file)["sites_id"]
                # site ids should be in strictly ascending order
                assert all(x < y for x, y in zip(usgs_id, usgs_id[1:]))
                comp_sites.append(usgs_id)
                # RUNAVE7100: mean annual runoff, mm/year on a 1-km grid;
                # STOR_NOR_2009: normal storage, megaliters per sq km
                # (1 megaliter = 1,000,000 liters = 1,000 cubic meters)
                # attr_lst = ["RUNAVE7100", "STOR_NID_2009"]
                attr_lst = ["RUNAVE7100", "STOR_NOR_2009"]
                source_data = GagesSource.choose_some_basins(
                    self.config_data,
                    self.config_data.model_dict["data"]["tRangeTrain"],
                    screen_basin_area_huc4=False,
                    sites_id=usgs_id)
                data_attr, var_dict, f_dict = source_data.read_attr(
                    usgs_id, attr_lst)
                run_avg = data_attr[:, 0] * (10 ** (-3)) * (10 ** 6)  # m^3 per year
                nor_storage = data_attr[:, 1] * 1000  # m^3
                dors = nor_storage / run_avg
                results = [round(i, 3) for i in dors]
                hydro_logger.info(
                    exp_str + "-" + sub_str + "-" + item + " DOR: %s",
                    results)
            hydro_logger.info(
                "the intersection of each pair of sites: %s, %s, %s",
                np.intersect1d(comp_sites[0], comp_sites[1]),
                np.intersect1d(comp_sites[0], comp_sites[2]),
                np.intersect1d(comp_sites[1], comp_sites[2]))
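# A quick aside on the DOR arithmetic above: the degree of regulation is normal
# reservoir storage divided by mean annual runoff volume, once both are in
# cubic meters. Below is a minimal, self-contained sketch of that unit
# conversion (the function name and sample values are illustrative, not from
# the original tests):
import numpy as np


def _degree_of_regulation_sketch(runave_mm_per_yr, stor_nor_ml_per_km2):
    """DOR = normal storage volume / mean annual runoff volume (both m^3).

    RUNAVE7100 is mm/year on a 1-km grid: mm * 1e-3 -> m, times 1e6 m^2 per
    km^2 -> m^3 of runoff per km^2 per year. STOR_NOR_2009 is megaliters per
    km^2: 1 megaliter = 1,000 m^3.
    """
    runoff_m3 = np.asarray(runave_mm_per_yr) * 1e-3 * 1e6
    storage_m3 = np.asarray(stor_nor_ml_per_km2) * 1000
    return storage_m3 / runoff_m3


# made-up values: 500 mm/yr runoff and 100 ML/km^2 normal storage -> DOR 0.2
print(_degree_of_regulation_sketch([500.0], [100.0]))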
def test_show_multi_exps_results(self):
    periods = [["1980-01-01", "1990-01-01"], ["1990-01-01", "2000-01-01"],
               ["2000-01-01", "2010-01-01"], ["2010-01-01", "2020-01-01"]]
    # all ordered (train, test) pairs of the four decades: 4 * 3 = 12 pairs,
    # matching the 12 experiments below
    train_test_period_pairs = list(itertools.permutations(periods, 2))
    sub_lst = [
        "basic/exp6", "basic/exp46", "basic/exp47", "basic/exp7",
        "basic/exp37", "basic/exp1", "basic/exp48", "basic/exp8",
        "basic/exp5", "basic/exp49", "basic/exp50", "basic/exp9"
    ]
    exp_lst = [["basic_exp6"], ["basic_exp46"], ["basic_exp47"],
               ["basic_exp7"], ["basic_exp37"], ["basic_exp1"],
               ["basic_exp48"], ["basic_exp8"], ["basic_exp5"],
               ["basic_exp49"], ["basic_exp50"], ["basic_exp9"]]
    for i in range(len(exp_lst)):
        config_file = copy.deepcopy(cfg)
        args = cmd(
            sub=sub_lst[i],
            train_period=train_test_period_pairs[i][0],
            train_mode=0,
            test_period=train_test_period_pairs[i][1],
            quick_data=0,
            cache_state=1,
            flow_screen={
                'missing_data_ratio': 1,
                'zero_value_ratio': 1
            },
            te=300,
            gage_id_file=
            "/mnt/data/owen411/code/hydro-anthropogenic-lstm/example/output/gages/basic/exp37/3557basins_ID_NSE_DOR.csv"
        )
        update_cfg(config_file, args)
        config_data = GagesConfig(config_file)
        test_epoch = config_data.config_file.TEST_EPOCH
        inds_df, pred_mean, obs_mean = load_ensemble_result(
            config_file, exp_lst[i], test_epoch, return_value=True)
        hydro_logger.info("the median NSE of %s is %s", sub_lst[i],
                          inds_df["NSE"].median())
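# The loop above relies on sub_lst being ordered exactly like
# itertools.permutations(periods, 2), which yields ordered (train, test) pairs.
# A quick sanity check of that ordering (decade labels are illustrative):
import itertools

_decades = ["1980s", "1990s", "2000s", "2010s"]
_pairs = list(itertools.permutations(_decades, 2))
print(len(_pairs))  # 12
print(_pairs[0])    # ('1980s', '1990s'): train on the 1980s, test on the 1990s
print(_pairs[-1])   # ('2010s', '2000s')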
gages_model_test = GagesModel.update_data_model(
    all_config_Data,
    data_model_test,
    data_attr_update=True,
    train_stat_dict=gages_model_train.stat_dict,
    screen_basin_area_huc4=False)
save_datamodel(gages_model_test,
               data_source_file_name='test_data_source.txt',
               stat_file_name='test_Statistics.json',
               flow_file_name='test_flow',
               forcing_file_name='test_forcing',
               attr_file_name='test_attr',
               f_dict_file_name='test_dictFactorize.json',
               var_dict_file_name='test_dictAttribute.json',
               t_s_dict_file_name='test_dictTimeSpace.json')
hydro_logger.info("read and save gages conus data model")
camels531_gageid_file = os.path.join(config_data.data_path["DB"],
                                     "camels531", "camels531.txt")
gauge_df = pd.read_csv(camels531_gageid_file, dtype={"GaugeID": str})
gauge_list = gauge_df["GaugeID"].values
# pad gauge ids to the 8-digit USGS format and sort them
all_sites_camels_531 = np.sort(
    [str(gauge).zfill(8) for gauge in gauge_list])
gages_model = GagesModels(config_data,
                          screen_basin_area_huc4=False,
                          sites_id=all_sites_camels_531.tolist())
save_datamodel(gages_model.data_model_test,
               data_source_file_name='test_data_source.txt',
               stat_file_name='test_Statistics.json',
               flow_file_name='test_flow',
               forcing_file_name='test_forcing',
               attr_file_name='test_attr',
               f_dict_file_name='test_dictFactorize.json',
               var_dict_file_name='test_dictAttribute.json',
               t_s_dict_file_name='test_dictTimeSpace.json')
train_set = "training" test_set = "testing" show_ind_key = "NSE" # fig = plt.figure(figsize=(12, 4)) # gs = gridspec.GridSpec(1, 11) fig = plt.figure(figsize=(8, 9)) gs = gridspec.GridSpec(2, 2) titles = ["(a)", "(b)", "(c)", "(d)"] colors = ["Greens", "Blues", "Reds", "Greys"] sns.set(font_scale=1) for k in range(len(exp_lst)): if k == len(exp_lst) - 1: hydro_logger.info("camels pub") # ax_k = plt.subplot(gs[k * 3: k * 3 + 2]) ax_k = plt.subplot(gs[1, 1]) ax_k.set_title(titles[k]) frames_camels_pub = [] # only camels test, no pub # inds_df_camels, pred_mean, obs_mean = load_ensemble_result(camels_exp_lst, test_epoch, return_value=True) # df_camels_pub = pd.DataFrame({train_set: np.full([inds_df_camels.shape[0]], train_data_name_lst[k][0]), # test_set: np.full([inds_df_camels.shape[0]], test_data_name_lst[k][0]), # show_ind_key: inds_df_camels[show_ind_key]}) # frames_camels_pub.append(df_camels_pub) # pub in camels config_data = load_dataconfig_case_exp(cfg, exp_lst[k][0]) preds = [] obss = []
    plot_gages_map_and_scatter(data_df,
                               [show_ind_key, "lat", "lon", "slope"],
                               idx_lst,
                               cmap_strs=["Reds", "Blues"],
                               labels=["zero-dor", "small-dor"],
                               scatter_label=[attr_lst[0], show_ind_key],
                               wspace=2,
                               hspace=1.5,
                               legend_y=.8,
                               sub_fig_ratio=[6, 4, 1])
    plt.tight_layout()
    plt.savefig(os.path.join(conus_config_data.data_path["Out"],
                             'zero-small-dor_western_map_comp.png'),
                dpi=FIGURE_DPI,
                bbox_inches="tight")
elif compare_item == 4:
    hydro_logger.info("Are the differences significant?")
    inds_df_pair1 = load_ensemble_result(cfg, pair1_exps, test_epoch)
    inds_df_pair2 = load_ensemble_result(cfg, pair2_exps, test_epoch)
    inds_df_pair3 = load_ensemble_result(cfg, pair3_exps, test_epoch)
    inds_df_conus = load_ensemble_result(cfg, conus_exps, test_epoch)
    keys_nse = "NSE"
    attr_nodam = "zero_dor"
    cases_exps_legends_nodam = [
        "LSTM-Z", "LSTM-ZS", "LSTM-ZL", "LSTM-CONUS"
    ]
    inds_df_nodam = load_ensemble_result(cfg, nodam_exp_lst, test_epoch)
    np_nodam_alone_nse = inds_df_nodam[keys_nse]
    np_nodam_in_pair1_nse = inds_df_pair1[keys_nse].iloc[
        idx_lst_nodam_in_pair1]
    np_nodam_in_pair2_nse = inds_df_pair2[keys_nse].iloc[
        idx_lst_nodam_in_pair2]
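# "Are the differences significant?" -- this excerpt does not show which test
# the branch applies to the paired per-basin NSE arrays it collects. One
# common choice is a Wilcoxon signed-rank test; a minimal sketch with made-up
# data (the array names and values are illustrative only):
import numpy as np
from scipy import stats

_rng = np.random.default_rng(0)
_nse_model_a = _rng.uniform(0.3, 0.9, size=100)
_nse_model_b = _nse_model_a + _rng.normal(0.02, 0.05, size=100)
# paired, non-parametric: are the per-basin differences centered at zero?
_stat, _p_value = stats.wilcoxon(_nse_model_a, _nse_model_b)
print("Wilcoxon p-value: %.4f" % _p_value)  # small p -> significant difference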
def test_plot_each_symmetric_exp(self):
    train_set = self.train_set
    test_set = self.test_set
    show_ind_key = self.show_ind_key
    test_epoch = self.test_epoch
    split_num = self.split_num
    exp_lst = self.symmetric_exp_lst
    train_data_name_lst = self.symmetric_train_data_name_lst
    test_data_name_lst = self.symmetric_test_data_name_lst
    colors = "Greens"
    sns.set(font_scale=1)
    fig = plt.figure()
    ax_k = fig.add_axes()
    frames = []
    for j in range(len(exp_lst)):
        config_data = load_dataconfig_case_exp(cfg, exp_lst[j])
        preds = []
        obss = []
        preds2 = []
        obss2 = []
        predsbase = []
        obssbase = []
        for i in range(split_num):
            data_model_base = GagesModel.load_datamodel(
                config_data.data_path["Temp"], str(i),
                data_source_file_name='test_data_source_base.txt',
                stat_file_name='test_Statistics_base.json',
                flow_file_name='test_flow_base.npy',
                forcing_file_name='test_forcing_base.npy',
                attr_file_name='test_attr_base.npy',
                f_dict_file_name='test_dictFactorize_base.json',
                var_dict_file_name='test_dictAttribute_base.json',
                t_s_dict_file_name='test_dictTimeSpace_base.json')
            data_model = GagesModel.load_datamodel(
                config_data.data_path["Temp"], str(i),
                data_source_file_name='test_data_source.txt',
                stat_file_name='test_Statistics.json',
                flow_file_name='test_flow.npy',
                forcing_file_name='test_forcing.npy',
                attr_file_name='test_attr.npy',
                f_dict_file_name='test_dictFactorize.json',
                var_dict_file_name='test_dictAttribute.json',
                t_s_dict_file_name='test_dictTimeSpace.json')
            data_model_2 = GagesModel.load_datamodel(
                config_data.data_path["Temp"], str(i),
                data_source_file_name='test_data_source_2.txt',
                stat_file_name='test_Statistics_2.json',
                flow_file_name='test_flow_2.npy',
                forcing_file_name='test_forcing_2.npy',
                attr_file_name='test_attr_2.npy',
                f_dict_file_name='test_dictFactorize_2.json',
                var_dict_file_name='test_dictAttribute_2.json',
                t_s_dict_file_name='test_dictTimeSpace_2.json')
            pred_base, obs_base = load_result(
                data_model_base.data_source.data_config.data_path['Temp'],
                test_epoch,
                pred_name='flow_pred_base',
                obs_name='flow_obs_base')
            pred_base = pred_base.reshape(pred_base.shape[0],
                                          pred_base.shape[1])
            obs_base = obs_base.reshape(obs_base.shape[0], obs_base.shape[1])
            hydro_logger.info("the size of %s %s Train-base %s", j, i,
                              pred_base.shape[0])
            predsbase.append(pred_base)
            obssbase.append(obs_base)
            pred_i, obs_i = load_result(
                data_model.data_source.data_config.data_path['Temp'],
                test_epoch)
            pred_i = pred_i.reshape(pred_i.shape[0], pred_i.shape[1])
            obs_i = obs_i.reshape(obs_i.shape[0], obs_i.shape[1])
            hydro_logger.info("the size of %s %s PUB-1 %s", j, i,
                              pred_i.shape[0])
            preds.append(pred_i)
            obss.append(obs_i)
            pred_2, obs_2 = load_result(
                data_model_2.data_source.data_config.data_path['Temp'],
                test_epoch,
                pred_name='flow_pred_2',
                obs_name='flow_obs_2')
            pred_2 = pred_2.reshape(pred_2.shape[0], pred_2.shape[1])
            obs_2 = obs_2.reshape(obs_2.shape[0], obs_2.shape[1])
            hydro_logger.info("the size of %s %s PUB-2 %s", j, i,
                              pred_2.shape[0])
            preds2.append(pred_2)
            obss2.append(obs_2)
        # stack the splits into one array and compute the error statistics
        predsbase_np = reduce(lambda a, b: np.vstack((a, b)), predsbase)
        obssbase_np = reduce(lambda a, b: np.vstack((a, b)), obssbase)
        indsbase = statError(obssbase_np, predsbase_np)
        inds_df_abase = pd.DataFrame(indsbase)
        preds_np = reduce(lambda a, b: np.vstack((a, b)), preds)
        obss_np = reduce(lambda a, b: np.vstack((a, b)), obss)
        inds = statError(obss_np, preds_np)
        inds_df_a = pd.DataFrame(inds)
        preds2_np = reduce(lambda a, b: np.vstack((a, b)), preds2)
        obss2_np = reduce(lambda a, b: np.vstack((a, b)), obss2)
        inds2 = statError(obss2_np, preds2_np)
        inds_df_a2 = pd.DataFrame(inds2)
        if j == 0 or j == 1:
            df_abase = pd.DataFrame({
                train_set:
                np.full([inds_df_abase.shape[0]], train_data_name_lst[j]),
                test_set:
                np.full([inds_df_abase.shape[0]], test_data_name_lst[j]),
                show_ind_key:
                inds_df_abase[show_ind_key]
            })
            frames.append(df_abase)
        if j == 1:
            df_a = pd.DataFrame({
                train_set:
                np.full([inds_df_a.shape[0]], train_data_name_lst[j]),
                test_set:
                np.full([inds_df_a.shape[0]], test_data_name_lst[3]),
                show_ind_key:
                inds_df_a[show_ind_key]
            })
            df_a2 = pd.DataFrame({
                train_set:
                np.full([inds_df_a2.shape[0]], train_data_name_lst[j]),
                test_set:
                np.full([inds_df_a2.shape[0]], test_data_name_lst[2]),
                show_ind_key:
                inds_df_a2[show_ind_key]
            })
        else:
            df_a = pd.DataFrame({
                train_set:
                np.full([inds_df_a.shape[0]], train_data_name_lst[j]),
                test_set:
                np.full([inds_df_a.shape[0]], test_data_name_lst[2]),
                show_ind_key:
                inds_df_a[show_ind_key]
            })
            df_a2 = pd.DataFrame({
                train_set:
                np.full([inds_df_a2.shape[0]], train_data_name_lst[j]),
                test_set:
                np.full([inds_df_a2.shape[0]], test_data_name_lst[3]),
                show_ind_key:
                inds_df_a2[show_ind_key]
            })
        frames.append(df_a)
        frames.append(df_a2)
    result = pd.concat(frames)
    sns_box = sns.boxplot(
        ax=ax_k,
        x=train_set,
        y=show_ind_key,
        hue=test_set,
        # hue_order=test_data_name_lst,
        data=result,
        showfliers=False,
        palette=colors)  # , width=0.8
    medians = result.groupby([train_set, test_set],
                             sort=False)[show_ind_key].median().values
    hydro_logger.info(medians)
    create_median_labels(sns_box.axes, has_fliers=False)
    sns.despine()
    plt.tight_layout()
    plt.show()
    hydro_logger.debug("plot successfully")
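# statError above returns a dict of per-basin error metrics, including the NSE
# that the boxplots show. Its implementation is not part of this excerpt; the
# following is a minimal sketch of the standard Nash-Sutcliffe efficiency for
# one basin (the function name is illustrative):
import numpy as np


def _nse_sketch(obs, pred):
    """NSE = 1 - SSE / variance of the observations.

    1 is a perfect fit, 0 is no better than predicting the mean, and negative
    values are worse than the mean.
    """
    obs = np.asarray(obs, dtype=float)
    pred = np.asarray(pred, dtype=float)
    mask = ~np.isnan(obs)  # ignore missing observations
    obs, pred = obs[mask], pred[mask]
    return 1 - np.sum((obs - pred) ** 2) / np.sum((obs - obs.mean()) ** 2)


print(_nse_sketch([1.0, 2.0, 3.0, 4.0], [1.1, 1.9, 3.2, 3.8]))  # 0.98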
def test_diff_dor_fig2_in_the_paper(self):
    data_model = GagesModel.load_datamodel(
        self.config_data.data_path["Temp"],
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    config_data = self.config_data
    config_file = self.config_file
    test_epoch = self.test_epoch
    exp_lst = self.exp_lst
    figure_dpi = self.FIGURE_DPI
    inds_df, pred_mean, obs_mean = load_ensemble_result(config_file,
                                                        exp_lst,
                                                        test_epoch,
                                                        return_value=True)
    diversion_yes = True
    diversion_no = False
    source_data_diversion = GagesSource.choose_some_basins(
        config_data,
        config_data.model_dict["data"]["tRangeTrain"],
        screen_basin_area_huc4=False,
        diversion=diversion_yes)
    source_data_nodivert = GagesSource.choose_some_basins(
        config_data,
        config_data.model_dict["data"]["tRangeTrain"],
        screen_basin_area_huc4=False,
        diversion=diversion_no)
    sites_id_nodivert = source_data_nodivert.all_configs[
        'flow_screen_gage_id']
    sites_id_diversion = source_data_diversion.all_configs[
        'flow_screen_gage_id']
    # dor_1 (negative threshold) flags the small-dor group, dor_2 the
    # large-dor group (see the *_small_dor / *_large_dor intersections below)
    dor_1 = -self.dor
    dor_2 = self.dor
    source_data_dor1 = GagesSource.choose_some_basins(
        config_data,
        config_data.model_dict["data"]["tRangeTrain"],
        screen_basin_area_huc4=False,
        DOR=dor_1)
    source_data_dor2 = GagesSource.choose_some_basins(
        config_data,
        config_data.model_dict["data"]["tRangeTrain"],
        screen_basin_area_huc4=False,
        DOR=dor_2)
    sites_id_dor1 = source_data_dor1.all_configs['flow_screen_gage_id']
    sites_id_dor2 = source_data_dor2.all_configs['flow_screen_gage_id']
    # basins with dams
    source_data_withdams = GagesSource.choose_some_basins(
        config_data,
        config_data.model_dict["data"]["tRangeTrain"],
        screen_basin_area_huc4=False,
        dam_num=[1, 100000])
    sites_id_withdams = source_data_withdams.all_configs[
        'flow_screen_gage_id']
    sites_id_dor1 = np.intersect1d(np.array(sites_id_dor1),
                                   np.array(sites_id_withdams)).tolist()
    no_divert_small_dor = np.intersect1d(sites_id_nodivert, sites_id_dor1)
    no_divert_large_dor = np.intersect1d(sites_id_nodivert, sites_id_dor2)
    diversion_small_dor = np.intersect1d(sites_id_diversion, sites_id_dor1)
    diversion_large_dor = np.intersect1d(sites_id_diversion, sites_id_dor2)
    all_sites = data_model.t_s_dict["sites_id"]
    idx_lst_nodivert_smalldor = [
        i for i in range(len(all_sites))
        if all_sites[i] in no_divert_small_dor
    ]
    idx_lst_nodivert_largedor = [
        i for i in range(len(all_sites))
        if all_sites[i] in no_divert_large_dor
    ]
    idx_lst_diversion_smalldor = [
        i for i in range(len(all_sites))
        if all_sites[i] in diversion_small_dor
    ]
    idx_lst_diversion_largedor = [
        i for i in range(len(all_sites))
        if all_sites[i] in diversion_large_dor
    ]
    keys_nse = "NSE"
    xs = []
    ys = []
    cases_exps_legends_together = [
        "not_diverted_small_dor", "not_diverted_large_dor",
        "diversion_small_dor", "diversion_large_dor", "CONUS"
    ]
    x1, y1 = ecdf(inds_df[keys_nse].iloc[idx_lst_nodivert_smalldor])
    xs.append(x1)
    ys.append(y1)
    x2, y2 = ecdf(inds_df[keys_nse].iloc[idx_lst_nodivert_largedor])
    xs.append(x2)
    ys.append(y2)
    x3, y3 = ecdf(inds_df[keys_nse].iloc[idx_lst_diversion_smalldor])
    xs.append(x3)
    ys.append(y3)
    x4, y4 = ecdf(inds_df[keys_nse].iloc[idx_lst_diversion_largedor])
    xs.append(x4)
    ys.append(y4)
    x_conus, y_conus = ecdf(inds_df[keys_nse])
    xs.append(x_conus)
    ys.append(y_conus)
    hydro_logger.info(
        "The median NSEs of all five curves (%s) are \n %.2f, %.2f, %.2f, %.2f, %.2f",
        cases_exps_legends_together, np.median(x1), np.median(x2),
        np.median(x3), np.median(x4), np.median(x_conus))
    # plot_ecdfs_matplot(xs, ys, cases_exps_legends_together,
    #                    colors=["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "grey"],
    #                    dash_lines=[False, False, False, False, True],
    #                    x_str="NSE", y_str="CDF")
    # plot using two linestyles and two colors for dor and diversion
    # plot_ecdfs(xs, ys, cases_exps_legends_together, x_str="NSE", y_str="CDF")
    # define color scheme and line style
    colors = ["#1f77b4", "#d62728"]
    linestyles = ['-', "--"]
    markers = ["", "."]
    fig = plt.figure(figsize=(8, 6))
    axes = fig.add_axes([0.1, 0.1, 0.8, 0.8])
    # for i, marker in enumerate(markers):
    for i, linestyle in enumerate(linestyles):
        for j, color in enumerate(colors):
            plt.plot(
                xs[i * 2 + j],
                ys[i * 2 + j],
                color=color,
                ls=linestyle,
                # marker=marker,
                label=cases_exps_legends_together[i * 2 + j])
    line_i, = axes.plot(x_conus,
                        y_conus,
                        color="grey",
                        label=cases_exps_legends_together[4])
    line_i.set_dashes([2, 2, 10, 2])
    x_str = "NSE"
    y_str = "CDF"
    x_lim = (0, 1)
    y_lim = (0, 1)
    x_interval = 0.1
    y_interval = 0.1
    plt.xlabel(x_str, fontsize=18)
    plt.ylabel(y_str, fontsize=18)
    axes.set_xlim(x_lim[0], x_lim[1])
    axes.set_ylim(y_lim[0], y_lim[1])
    # set the x- and y-tick font sizes
    plt.xticks(np.arange(x_lim[0], x_lim[1] + x_lim[1] / 100, x_interval),
               fontsize=16)
    plt.yticks(np.arange(y_lim[0], y_lim[1] + y_lim[1] / 100, y_interval),
               fontsize=16)
    plt.grid()
    # hide the right and top spines
    axes.spines['right'].set_visible(False)
    axes.spines['top'].set_visible(False)
    axes.legend()
    plt.legend(prop={'size': 16})
    plt.savefig(os.path.join(config_data.data_path["Out"],
                             'new_dor_divert_comp_matplotlib.png'),
                dpi=figure_dpi,
                bbox_inches="tight")
    plt.show()
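# ecdf above returns the sorted sample values and their cumulative
# probabilities, which is why np.median(x1) is the median NSE of a curve. The
# repo's implementation is not shown in this excerpt; a minimal sketch under
# that assumption:
import numpy as np


def _ecdf_sketch(data):
    """Empirical CDF: sorted values x and P(X <= x) at each of them."""
    x = np.sort(np.asarray(data))
    y = np.arange(1, len(x) + 1) / len(x)
    return x, y


_x, _y = _ecdf_sketch([0.4, 0.9, 0.6, 0.8])
print(_x)  # [0.4 0.6 0.8 0.9]
print(_y)  # [0.25 0.5  0.75 1.  ]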
def test_gages_nse_dam_attr(self):
    figure_dpi = 600
    config_data = self.config_data
    data_dir = config_data.data_path["Temp"]
    data_model = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    gages_id = data_model.t_s_dict["sites_id"]
    exp_lst = [
        "basic_exp37", "basic_exp39", "basic_exp40", "basic_exp41",
        "basic_exp42", "basic_exp43"
    ]
    self.inds_df, pred_mean, obs_mean = load_ensemble_result(
        config_data.config_file,
        exp_lst,
        config_data.config_file.TEST_EPOCH,
        return_value=True)
    show_ind_key = 'NSE'
    plt.rcParams['font.family'] = 'serif'
    plt.rcParams['font.serif'] = ['Times New Roman'
                                  ] + plt.rcParams['font.serif']
    # plot NSE~DOR
    attr_lst = ["RUNAVE7100", "STOR_NOR_2009"]
    attrs_runavg_stor = data_model.data_source.read_attr(
        gages_id, attr_lst, is_return_dict=False)
    run_avg = attrs_runavg_stor[:, 0] * (10 ** (-3)) * (10 ** 6)  # m^3 per year
    nor_storage = attrs_runavg_stor[:, 1] * 1000  # m^3
    dors = nor_storage / run_avg
    # dor = 0 is not totally the same as dam_num = 0 (some dammed basins'
    # dor is about 0.00), so for zero-dor we mainly rely on dam_num = 0
    attr_dam_num = ["NDAMS_2009"]
    attrs_dam_num = data_model.data_source.read_attr(gages_id,
                                                     attr_dam_num,
                                                     is_return_dict=False)
    df = pd.DataFrame({
        "DOR": dors,
        "DAM_NUM": attrs_dam_num[:, 0],
        show_ind_key: self.inds_df[show_ind_key].values
    })
    hydro_logger.info("statistics of dors:\n %s", df.describe())
    hydro_logger.info("95th percentile of dors:\n %s", df.quantile(q=0.95))
    hydro_logger.info("ecdf of dors:\n %s", ecdf(dors))
    # boxplot: add a column to represent the dor range for the df
    dor_value_range_lst = [[0, 0], [0, 0.02], [0.02, 0.05], [0.05, 0.1],
                           [0.1, 0.2], [0.2, 0.4], [0.4, 0.8], [0.8, 10000]]
    dor_range_lst = ["0"] + [
        str(dor_value_range_lst[i][0]) + "-" +
        str(dor_value_range_lst[i][1])
        for i in range(1, len(dor_value_range_lst) - 1)
    ] + [">" + str(dor_value_range_lst[-1][0])]
    # add a column to represent the dam_num range for the df
    dam_num_value_range_lst = [[0, 0], [0, 1], [1, 3], [3, 5], [5, 10],
                               [10, 20], [20, 50], [50, 10000]]
    dam_num_range_lst = ["0", "1"] + [
        str(dam_num_value_range_lst[i][0]) + "-" +
        str(dam_num_value_range_lst[i][1])
        for i in range(2, len(dam_num_value_range_lst) - 1)
    ] + [">" + str(dam_num_value_range_lst[-1][0])]

    def in_which_range(value_temp):
        if value_temp == 0:
            return "0"
        the_range = [
            a_range for a_range in dor_value_range_lst
            if a_range[0] < value_temp <= a_range[1]
        ]
        if the_range[0][0] == dor_value_range_lst[-1][0]:
            the_range_str = ">" + str(the_range[0][0])
        else:
            the_range_str = str(the_range[0][0]) + "-" + str(
                the_range[0][1])
        return the_range_str

    def in_which_dam_num_range(value_tmp):
        if value_tmp == 0:
            return "0"
        if value_tmp == 1:
            return "1"
        the_ran = [
            a_ran for a_ran in dam_num_value_range_lst
            if a_ran[0] < value_tmp <= a_ran[1]
        ]
        if the_ran[0][0] == dam_num_value_range_lst[-1][0]:
            the_ran_str = ">" + str(the_ran[0][0])
        else:
            the_ran_str = str(the_ran[0][0]) + "-" + str(the_ran[0][1])
        return the_ran_str

    df["DOR_RANGE"] = df["DOR"].apply(in_which_range)
    df["DAM_NUM_RANGE"] = df["DAM_NUM"].apply(in_which_dam_num_range)
    # dammed basins whose dor rounds to 0 belong in the first nonzero dor
    # bin, not in "0"
    df.loc[(df["DAM_NUM"] > 0) & (df["DOR_RANGE"] == "0"),
           "DOR_RANGE"] = dor_range_lst[1]
    shown_nse_range_boxplots = [-0.5, 1.0]
    sns.set(font="serif", font_scale=1.5, color_codes=True)
    plot_boxs(df,
              "DOR_RANGE",
              show_ind_key,
              ylim=shown_nse_range_boxplots,
              order=dor_range_lst)
    plt.savefig(os.path.join(
        config_data.data_path["Out"],
        'NSE~DOR-boxplots-' + str(shown_nse_range_boxplots) + '.png'),
                dpi=figure_dpi,
                bbox_inches="tight")
    plt.figure()
    shown_nse_range_boxplots = [0, 1.0]
    sns.set(font="serif", font_scale=1.5, color_codes=True)
    plot_boxs(df,
              "DAM_NUM_RANGE",
              show_ind_key,
              ylim=shown_nse_range_boxplots,
              order=dam_num_range_lst)
    plt.savefig(os.path.join(
        config_data.data_path["Out"],
        'NSE~DAM_NUM-boxplots-' + str(shown_nse_range_boxplots) + '.png'),
                dpi=figure_dpi,
                bbox_inches="tight")
    nums_in_dor_range = [
        df[df["DOR_RANGE"] == a_range_rmp].shape[0]
        for a_range_rmp in dor_range_lst
    ]
    ratios_in_dor_range = [
        a_num / df.shape[0] for a_num in nums_in_dor_range
    ]
    hydro_logger.info(
        "the number and ratio of basins in each dor range\n: %s \n %s",
        nums_in_dor_range, ratios_in_dor_range)
    nums_in_dam_num_range = [
        df[df["DAM_NUM_RANGE"] == a_range_rmp].shape[0]
        for a_range_rmp in dam_num_range_lst
    ]
    ratios_in_dam_num_range = [
        a_num / df.shape[0] for a_num in nums_in_dam_num_range
    ]
    hydro_logger.info(
        "the number and ratio of basins in each dam_num range\n: %s \n %s",
        nums_in_dam_num_range, ratios_in_dam_num_range)
    # regplot
    plt.figure()
    sns.set(font="serif", font_scale=1.5, color_codes=True)
    sr = sns.regplot(x="DOR",
                     y=show_ind_key,
                     data=df[df[show_ind_key] >= 0],
                     scatter_kws={'s': 10})
    show_dor_max = df.quantile(q=0.95)["DOR"]  # 30 # max(dors) # 0.8 # 10
    show_dor_min = min(dors)
    plt.ylim(0, 1)
    plt.xlim(show_dor_min, show_dor_max)
    plt.savefig(os.path.join(
        config_data.data_path["Out"],
        'NSE~DOR-shown-max-' + str(show_dor_max) + '.png'),
                dpi=figure_dpi,
                bbox_inches="tight")
    # jointplot
    # dor_range = [0.2, 0.9]
    dor_range = [0.002, 0.2]
    # plt.figure()
    sns.set(font="serif", font_scale=1.5, color_codes=True)
    # g = sns.jointplot(x="DOR", y=show_ind_key, data=df[(df["DOR"] < 1) & (df[show_ind_key] >= 0)],
    #                   kind="reg", marginal_kws=dict(bins=25))
    # g = sns.jointplot(x="DOR", y=show_ind_key, data=df[(df["DOR"] < 1) & (df[show_ind_key] >= 0)],
    #                   kind="hex", color="b", marginal_kws=dict(bins=50))
    g = sns.jointplot(x="DOR",
                      y=show_ind_key,
                      data=df[(df["DOR"] < dor_range[1])
                              & (df["DOR"] > dor_range[0])
                              & (df[show_ind_key] >= 0)],
                      kind="hex",
                      color="b")
    g.ax_marg_x.set_xlim(dor_range[0], dor_range[1])
    # g.ax_marg_y.set_ylim(-0.5, 1)
    plt.savefig(os.path.join(
        config_data.data_path["Out"],
        'NSE~DOR(range-)' + str(dor_range) + '-jointplot.png'),
                dpi=figure_dpi,
                bbox_inches="tight")
    nid_dir = os.path.join(
        "/".join(self.config_data.data_path["DB"].split("/")[:-1]), "nid",
        "test")
    nid_input = NidModel.load_nidmodel(
        nid_dir,
        nid_source_file_name='nid_source.txt',
        nid_data_file_name='nid_data.shp')
    gage_main_dam_purpose = unserialize_json(
        os.path.join(nid_dir, "dam_main_purpose_dict.json"))
    data_input = GagesDamDataModel(data_model, nid_input,
                                   gage_main_dam_purpose)
    dam_coords = unserialize_json_ordered(
        os.path.join(nid_dir, "dam_points_dict.json"))
    dam_storages = unserialize_json_ordered(
        os.path.join(nid_dir, "dam_storages_dict.json"))
    dam_ids_1 = list(gage_main_dam_purpose.keys())
    dam_ids_2 = list(dam_coords.keys())
    dam_ids_3 = list(dam_storages.keys())
    # the three dicts must share the same ascending site order
    assert all(x < y for x, y in zip(dam_ids_1, dam_ids_1[1:]))
    assert all(x < y for x, y in zip(dam_ids_2, dam_ids_2[1:]))
    assert all(x < y for x, y in zip(dam_ids_3, dam_ids_3[1:]))
    sites = list(dam_coords.keys())
    c, ind1, idx_lst_nse_range = np.intersect1d(sites,
                                                gages_id,
                                                return_indices=True)
    std_storage_in_a_basin = list(map(np.std, dam_storages.values()))
    log_std_storage_in_a_basin = list(
        map(np.log, np.array(std_storage_in_a_basin) + 1))
    nse_values = self.inds_df["NSE"].values[idx_lst_nse_range]
    df = pd.DataFrame({
        "DAM_STORAGE_STD": log_std_storage_in_a_basin,
        show_ind_key: nse_values
    })
    plt.figure()
    sns.set(font="serif", font_scale=1.5, color_codes=True)
    g = sns.regplot(x="DAM_STORAGE_STD",
                    y=show_ind_key,
                    data=df[df[show_ind_key] >= 0],
                    scatter_kws={'s': 10})
    show_max = max(log_std_storage_in_a_basin)
    show_min = min(log_std_storage_in_a_basin)
    if show_min < 0:
        show_min = 0
    # g.ax_marg_x.set_xlim(show_min, show_max)
    # g.ax_marg_y.set_ylim(0, 1)
    plt.ylim(0, 1)
    plt.xlim(show_min, show_max)
    plt.savefig(os.path.join(config_data.data_path["Out"],
                             'NSE~' + "DAM_STORAGE_STD" + '.png'),
                dpi=figure_dpi,
                bbox_inches="tight")
    gages_loc_lat = data_model.data_source.gage_dict["LAT_GAGE"]
    gages_loc_lon = data_model.data_source.gage_dict["LNG_GAGE"]
    gages_loc = [[gages_loc_lat[i], gages_loc_lon[i]]
                 for i in range(len(gages_id))]
    # calculate the coefficient of variation of dam-gage distances, then
    # plot the NSE~dispersion scatterplot; the geo coord systems of
    # gages_loc and dam_coords are both NAD83
    coefficient_of_var = list(
        map(coefficient_of_variation, gages_loc, dam_coords.values()))
    coefficient_of_var_min = min(coefficient_of_var)
    coefficient_of_var_max = max(coefficient_of_var)
    dispersion_var = "DAM_GAGE_DIS_VAR"
    nse_values = self.inds_df["NSE"].values[idx_lst_nse_range]
    df = pd.DataFrame({
        dispersion_var: coefficient_of_var,
        show_ind_key: nse_values
    })
    plt.figure()
    sns.set(font="serif", font_scale=1.5, color_codes=True)
    g = sns.regplot(x=dispersion_var,
                    y=show_ind_key,
                    data=df[df[show_ind_key] >= 0],
                    scatter_kws={'s': 10})
    show_max = coefficient_of_var_max
    show_min = coefficient_of_var_min
    if show_min < 0:
        show_min = 0
    # g.ax_marg_x.set_xlim(show_min, show_max)
    # g.ax_marg_y.set_ylim(0, 1)
    plt.ylim(0, 1)
    plt.xlim(show_min, show_max)
    plt.savefig(os.path.join(config_data.data_path["Out"],
                             'NSE~' + dispersion_var + '.png'),
                dpi=figure_dpi,
                bbox_inches="tight")
    idx_dispersions = list(
        map(ind_of_dispersion, gages_loc, dam_coords.values()))
    idx_dispersion_min = min(idx_dispersions)
    idx_dispersion_max = max(idx_dispersions)
    dispersion_var = "DAM_DISPERSION_BASIN"
    # nse_range = [0, 1]
    # idx_lst_nse_range = inds_df_now[(inds_df_now[show_ind_key] >= nse_range[0]) &
    #                                 (inds_df_now[show_ind_key] < nse_range[1])].index.tolist()
    nse_values = self.inds_df["NSE"].values[idx_lst_nse_range]
    df = pd.DataFrame({
        dispersion_var: idx_dispersions,
        show_ind_key: nse_values
    })
    # g = sns.regplot(x=dispersion_var, y=show_ind_key, data=df[df[show_ind_key] >= 0], scatter_kws={'s': 10})
    if idx_dispersion_min < 0:
        idx_dispersion_min = 0
    plt.ylim(0, 1)
    plt.xlim(idx_dispersion_min, idx_dispersion_max)
    # plt.figure()
    sns.set(font="serif", font_scale=1.5, color_codes=True)
    g = sns.jointplot(x=dispersion_var,
                      y=show_ind_key,
                      data=df[df[show_ind_key] >= 0],
                      kind="reg")
    g.ax_marg_x.set_xlim(idx_dispersion_min, idx_dispersion_max)
    g.ax_marg_y.set_ylim(0, 1)
    plt.show()
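# coefficient_of_variation and ind_of_dispersion above map a gage location and
# its basin's dam coordinates to a scalar spread measure; their actual
# implementations are not part of this excerpt. A hypothetical sketch, assuming
# they reduce the gage-to-dam distances (CV = std/mean, index of dispersion =
# variance/mean):
import numpy as np


def _dam_gage_distances(gage_loc, dam_points):
    """Euclidean distances (in degrees) from one gage to each of its dams."""
    gage = np.asarray(gage_loc, dtype=float)
    dams = np.asarray(dam_points, dtype=float)
    return np.linalg.norm(dams - gage, axis=1)


def _coefficient_of_variation_sketch(gage_loc, dam_points):
    d = _dam_gage_distances(gage_loc, dam_points)
    return np.std(d) / np.mean(d)


def _ind_of_dispersion_sketch(gage_loc, dam_points):
    d = _dam_gage_distances(gage_loc, dam_points)
    return np.var(d) / np.mean(d)


# illustrative values: one gage and three dams
print(_coefficient_of_variation_sketch(
    [40.0, -105.0], [[40.1, -105.2], [40.3, -104.9], [39.8, -105.4]]))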
    xs.append(x2)
    ys.append(y2)
    x3, y3 = ecdf(inds_df[keys_nse].iloc[idx_lst_diversion_smalldor])
    xs.append(x3)
    ys.append(y3)
    x4, y4 = ecdf(inds_df[keys_nse].iloc[idx_lst_diversion_largedor])
    xs.append(x4)
    ys.append(y4)
    x_conus, y_conus = ecdf(inds_df[keys_nse])
    xs.append(x_conus)
    ys.append(y_conus)
    hydro_logger.info(
        "The median NSEs of all five curves (%s) are \n %.2f, %.2f, %.2f, %.2f, %.2f",
        cases_exps_legends_together, np.median(x1), np.median(x2),
        np.median(x3), np.median(x4), np.median(x_conus))
    plot_ecdfs_matplot(
        xs,
        ys,
        cases_exps_legends_together,
        colors=["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "grey"],
        dash_lines=[False, False, False, False, True],
        x_str="NSE",
        y_str="CDF")
    plt.savefig(os.path.join(config_data.data_path["Out"],
                             'dor_divert_comp_matplotlib.png'),
                dpi=FIGURE_DPI,
                bbox_inches="tight")
    ############################ plot map ###########################
def read_usge_gage(self, huc, usgs_id, t_lst):
    """read streamflow data of one gage"""
    hydro_logger.info("reading %s streamflow data", usgs_id)
    dir_gage_flow = self.all_configs.get("flow_dir")
    usgs_file = os.path.join(dir_gage_flow, str(huc), usgs_id + '.txt')
    # ignore the comment lines and the first non-value row
    df_flow = pd.read_csv(usgs_file,
                          comment='#',
                          sep='\t',
                          dtype={'site_no': str}).iloc[1:, :]
    # change the original column names
    columns_names = df_flow.columns.tolist()
    columns_flow = []
    columns_flow_cd = []
    for column_name in columns_names:
        # 00060 means "discharge" and 00003 represents "mean value".
        # one special case: 126801_00060_00003 "Discharge, cubic feet per second (Mean)" and
        # 126805_00060_00003 "Discharge, cubic feet per second (Mean), PUBLISHED".
        # both are mean values; here we choose the column with more records
        if '_00060_00003' in column_name and '_00060_00003_cd' not in column_name:
            columns_flow.append(column_name)
    for column_name in columns_names:
        if '_00060_00003_cd' in column_name:
            columns_flow_cd.append(column_name)
    if len(columns_flow) > 1:
        hydro_logger.debug(
            "there are several columns for flow, choose one\n")
        df_date_temp = df_flow['datetime']
        date_temp = pd.to_datetime(df_date_temp).values.astype(
            'datetime64[D]')
        c_temp, ind1_temp, ind2_temp = np.intersect1d(date_temp,
                                                      t_lst,
                                                      return_indices=True)
        # pick the flow column with the fewest missing values in the range
        num_nan_lst = []
        for i in range(len(columns_flow)):
            out_temp = np.full([len(t_lst)], np.nan)
            df_flow_temp = df_flow[columns_flow[i]].copy()
            # qualification codes such as "Rat", "Dis", "Ice", "Ssn" are not numbers
            df_flow_temp.loc[df_flow_temp == "Rat"] = np.nan
            df_flow_temp.loc[df_flow_temp == "Dis"] = np.nan
            df_flow_temp.loc[df_flow_temp == "Ice"] = np.nan
            df_flow_temp.loc[df_flow_temp == "Ssn"] = np.nan
            out_temp[ind2_temp] = df_flow_temp.iloc[ind1_temp]
            num_nan = np.isnan(out_temp).sum()
            num_nan_lst.append(num_nan)
        num_nan_np = np.array(num_nan_lst)
        index_flow_num = np.argmin(num_nan_np)
        df_flow.rename(columns={columns_flow[index_flow_num]: 'flow'},
                       inplace=True)
        df_flow.rename(columns={columns_flow_cd[index_flow_num]: 'mode'},
                       inplace=True)
    else:
        for column_name in columns_names:
            if '_00060_00003' in column_name and '_00060_00003_cd' not in column_name:
                df_flow.rename(columns={column_name: 'flow'}, inplace=True)
                break
        for column_name in columns_names:
            if '_00060_00003_cd' in column_name:
                df_flow.rename(columns={column_name: 'mode'}, inplace=True)
                break
    columns = ['agency_cd', 'site_no', 'datetime', 'flow', 'mode']
    if df_flow.empty:
        df_flow = pd.DataFrame(columns=columns)
    if 'flow' not in df_flow.columns.intersection(columns):
        data_temp = df_flow.loc[:, df_flow.columns.intersection(columns)]
        # add nan columns to data_temp
        data_temp = pd.concat(
            [data_temp, pd.DataFrame(columns=['flow', 'mode'])])
    else:
        data_temp = df_flow.loc[:, columns]
    # replace non-numeric qualification codes in the flow column with nan
    for non_numeric in [
            "Ice", "Ssn", "Tst", "Eqp", "Rat", "Dis", "Bkw", "***", "Mnt",
            "ZFL"
    ]:
        data_temp.loc[data_temp['flow'] == non_numeric, 'flow'] = np.nan
    # set negative values to nan
    obs = data_temp['flow'].astype('float').values
    obs[obs < 0] = np.nan
    # intersect the time ranges and leave points without data as nan
    nt = len(t_lst)
    out = np.full([nt], np.nan)
    # dates in the df are str, so transform them to datetime
    df_date = data_temp['datetime']
    date = pd.to_datetime(df_date).values.astype('datetime64[D]')
    c, ind1, ind2 = np.intersect1d(date, t_lst, return_indices=True)
    out[ind2] = obs[ind1]
    return out
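# The np.intersect1d(..., return_indices=True) pattern above is what aligns
# the file's dates with the requested time range: ind1 indexes the file's
# rows, ind2 the output slots, and days without data stay nan. A small worked
# example of the same pattern (dates and values are made up):
import numpy as np

_t_lst = np.arange('2000-01-01', '2000-01-06', dtype='datetime64[D]')
_file_dates = np.array(['2000-01-02', '2000-01-04'], dtype='datetime64[D]')
_file_values = np.array([3.2, 5.1])
_out = np.full(len(_t_lst), np.nan)
_c, _ind1, _ind2 = np.intersect1d(_file_dates, _t_lst, return_indices=True)
_out[_ind2] = _file_values[_ind1]
print(_out)  # [nan 3.2 nan 5.1 nan]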
import math

import numpy as np
import pandas as pd

from data.data_input import GagesModel
from data.gages_input_dataset import load_dataconfig_case_exp, load_ensemble_result
from data.config import cfg, update_cfg, cmd
from utils.hydro_util import hydro_logger
from visual.plot_model import plot_sites_and_attr, plot_scatter_multi_attrs

FIGURE_DPI = 600
# cite data from this paper (https://doi.org/10.1029/2007WR005971) according to its table 1
data_validate = pd.read_csv("paper10.1029_2007WR005971-table1.csv")
# statistical analysis for NSq. NSq-i, NSq-a: model performance when
# respectively ignoring and accounting for the volume variations in the
# reservoirs in control mode
nsqi = data_validate['NSq‐i'].astype(float)
nsqa = data_validate['NSq‐a'].astype(float)
hydro_logger.info("nsq-i MEDIAN value is %s", np.nanmedian(nsqi.values))
hydro_logger.info("nsq-i MEAN value is %s", np.nanmean(nsqi.values))
hydro_logger.info("nsq-a MEDIAN value is %s", np.nanmedian(nsqa.values))
hydro_logger.info("nsq-a MEAN value is %s", np.nanmean(nsqa.values))
# calculate the dor values of all basins
idx4paper = 0
paper_dors = []
while idx4paper < data_validate.shape[0]:
    dam_num_tmp = data_validate['Number of Main dams'][idx4paper]
    if math.isnan(dam_num_tmp):
        hydro_logger.error("miss it")
    else:
        dam_num_tmp = int(dam_num_tmp)
    # areas such as "1,234" are read as strings with thousands separators
    if type(data_validate['Watershed Area, km2'][idx4paper]) == str:
        watershed_area = float(
            data_validate['Watershed Area, km2'][idx4paper].replace(',', ''))
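# A side note on the manual replace(',', '') above: pandas can strip thousands
# separators at read time instead. A minimal sketch with made-up rows shaped
# like table 1 of the cited paper:
import io

import pandas as pd

_csv_text = '''Basin,"Watershed Area, km2","Number of Main dams"
A,"12,345",3
B,"1,234",
'''
_df = pd.read_csv(io.StringIO(_csv_text), thousands=',')
print(_df['Watershed Area, km2'].tolist())  # [12345, 1234]: separators stripped
print(_df['Number of Main dams'].isna().tolist())  # [False, True]: NaN kept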