示例#1
0
 def test_insert_leap_year_value(self):
     """Interpolate the missing 12.31 rows of leap years in the daymet files.

     Walks every region sub-directory of
     ``<DB>/basin_mean_forcing/daymet``, and for each file that is not
     already a ``*_leap.txt`` output:

     1. reads the whitespace-separated table,
     2. places its rows on the full daily calendar of ``t_range``
        (days absent from the file stay NaN),
     3. fills the NaN rows by forward linear interpolation,
     4. writes ``<name>_leap.txt`` next to the input and deletes the input.

     NOTE: this rewrites the data directory in place (the source file is
     removed after its ``_leap`` counterpart is written).
     """
     data_dir = os.path.join(self.config_data.data_path["DB"],
                             "basin_mean_forcing", "daymet")
     subdir_str = [
         "01", "02", "03", "04", "05", "06", "07", "08", "09", "10L", "10U",
         "11", "12", "13", "14", "15", "16", "17", "18"
     ]
     t_range = ["1980-01-01", "2020-01-01"]
     col_lst = [
         "dayl(s)", "prcp(mm/day)", "srad(W/m2)", "swe(mm)", "tmax(C)",
         "tmin(C)", "vp(Pa)"
     ]
     # The target calendar and its date columns do not depend on the file
     # being processed, so they are built once instead of once per file.
     t_range_list = hydro_time.t_range_days(t_range)
     nt = t_range_list.size
     csv_date = pd.to_datetime(t_range_list)
     year_month_day_hour = pd.DataFrame(
         [[dt.year, dt.month, dt.day, dt.hour] for dt in csv_date],
         columns=['Year', 'Mnth', 'Day', "Hr"])
     for subdir_name in subdir_str:
         subdir = os.path.join(data_dir, subdir_name)
         # sort the paths that were read so files are handled in a stable order
         path_list = sorted(os.listdir(subdir))
         for filename in path_list:
             data_file = os.path.join(subdir, filename)
             # outputs of a previous run end in "_leap.txt"; skip them
             if "leap" in data_file[-8:]:
                 continue
             print("reading", data_file)
             data_temp = pd.read_csv(data_file, sep=r'\s+')
             # pd.to_datetime needs a column literally named 'Month'
             data_temp.rename(columns={'Mnth': 'Month'}, inplace=True)
             df_date = data_temp[['Year', 'Month', 'Day']]
             date = pd.to_datetime(df_date).values.astype('datetime64[D]')
             # daymet file not for leap year, there is no data in 12.31 in leap year
             # the dates in the file must be strictly increasing
             assert (all(x < y for x, y in zip(date, date[1:])))
             # ind1 indexes rows of the file, ind2 the matching calendar days
             _, ind1, ind2 = np.intersect1d(date,
                                            t_range_list,
                                            return_indices=True)
             # the file has to cover the whole target calendar
             assert date[0] <= t_range_list[0] and date[-1] >= t_range_list[
                 -1]
             out = np.full([nt, len(col_lst)], np.nan)
             out[ind2, :] = data_temp[col_lst].values[ind1]
             x = pd.DataFrame(out, columns=col_lst)
             x_intepolate = x.interpolate(method='linear',
                                          limit_direction='forward',
                                          axis=0)
             # concat the date columns with the interpolated values
             new_data_df = pd.concat([year_month_day_hour, x_intepolate],
                                     axis=1)
             # assumes a 4-character suffix such as ".txt" -- TODO confirm
             output_file = data_file[:-4] + "_leap.txt"
             new_data_df.to_csv(output_file,
                                header=True,
                                index=False,
                                sep=' ',
                                float_format='%.2f')
             os.remove(data_file)
示例#2
0
 def test_check_streamflow_data(self):
     """Read the streamflow of one gage and print it with its NaN positions.

     The source is built with the training time range from the config; the
     gage is then read over the day list of ["1990-01-01", "2010-01-01"].
     """
     train_range = self.config_data.model_dict["data"]["tRangeTrain"]
     gages_source = GagesSource(self.config_data,
                                train_range,
                                screen_basin_area_huc4=False)
     days = hydro_time.t_range_days(["1990-01-01", "2010-01-01"])
     # another gage that can be checked the same way: "01", '01052500'
     streamflow = gages_source.read_usge_gage("08", '08013000', days)
     print(streamflow)
     nan_positions = np.argwhere(np.isnan(streamflow))
     print(nan_positions)
示例#3
0
    def data_models_of_train_test(cls, data_model, t_train, t_test):
        """Split ``data_model`` into a training model and a test model.

        The first ``len(t_range_days(t_train))`` time steps of the flow and
        forcing arrays go to the training model, everything after them to
        the test model. Nothing about the test set may be known before
        evaluation, so the test model reuses the statistics computed on the
        training period for normalization.

        Returns:
            tuple ``(data_model_train, data_model_test)``.
        """
        def select_by_time(flow_part,
                           forcing_part,
                           origin,
                           t_part,
                           train_stat_dict=None):
            """Wrap one time slice of ``origin`` in a new ``cls`` instance."""
            attr_part = origin.data_attr[:, :]
            # the source is deep-copied so the origin keeps its own t_range
            source_part = copy.deepcopy(origin.data_source)
            source_part.t_range = t_part
            time_space = {}
            # the dicts are handed over empty and filled in afterwards
            sub_model = cls(source_part, flow_part, forcing_part, attr_part,
                            origin.var_dict, origin.f_dict, {}, time_space)
            time_space['sites_id'] = origin.t_s_dict['sites_id']
            time_space['t_final_range'] = t_part
            sub_model.t_s_dict = time_space
            # own statistics for the training slice, borrowed ones otherwise
            sub_model.stat_dict = (sub_model.cal_stat_all()
                                   if train_stat_dict is None else
                                   train_stat_dict)
            return sub_model

        n_train = hydro_time.t_range_days(t_train).size

        train_model = select_by_time(data_model.data_flow[:, :n_train],
                                     data_model.data_forcing[:, :n_train, :],
                                     data_model, t_train)
        test_model = select_by_time(data_model.data_flow[:, n_train:],
                                    data_model.data_forcing[:, n_train:, :],
                                    data_model, t_test,
                                    train_model.stat_dict)
        return train_model, test_model
示例#4
0
def plot_gages_map_and_ts(data_model,
                          obs,
                          pred,
                          inds_df,
                          show_ind_key,
                          idx_lst,
                          pertile_range,
                          plot_ts=True,
                          fig_size=(8, 8),
                          cmap_str="viridis"):
    """Plot one indicator of the selected gages on a map.

    With ``plot_ts`` true the map is drawn by ``plot_ts_map`` together with
    the observed/predicted series of each selected gage and nothing is
    returned; otherwise ``plot_map_carto`` draws the map alone and its
    return value is handed back.

    Args:
        data_model: provides ``data_source.gage_dict`` (coordinates and ids)
            and ``t_s_dict`` (site ids and time range).
        obs, pred: arrays indexed as ``[site, time]``.
        inds_df: table of indicators; ``show_ind_key`` picks the column.
        idx_lst: positions of the gages to show.
        pertile_range: forwarded unchanged to the plotting function.
        fig_size: only used for the map-only plot, with 2 subtracted from
            the height.
        cmap_str: colormap name for the map-only plot.
    """
    gage_dict = data_model.data_source.gage_dict
    indicator_values = (inds_df.loc[idx_lst])[show_ind_key].values
    lat_all = gage_dict["LAT_GAGE"]
    lon_all = gage_dict["LNG_GAGE"]
    staid_all = gage_dict["STAID"]
    sites = np.array(data_model.t_s_dict['sites_id'])[idx_lst]
    # position of every selected site inside the gage_dict arrays
    matches = [np.where(all_id_match) for all_id_match in
               (staid_all == site for site in sites)]
    sites_index = np.array(matches).flatten()
    lat = lat_all[sites_index]
    lon = lon_all[sites_index]
    obs_rows = obs[idx_lst, :]
    pred_rows = pred[idx_lst, :]
    # one [observed, predicted] pair per selected gage
    data_ts = [[obs_rows[row], pred_rows[row]]
               for row in range(obs_rows.shape[0])]
    t = hydro_time.t_range_days(data_model.t_s_dict["t_final_range"]).tolist()
    if not plot_ts:
        return plot_map_carto(indicator_values,
                              lat=lat,
                              lon=lon,
                              pertile_range=pertile_range,
                              fig_size=(fig_size[0], fig_size[1] - 2),
                              cmap_str=cmap_str)
    plot_ts_map(indicator_values.tolist(),
                data_ts,
                lat,
                lon,
                t,
                sites.tolist(),
                pertile_range=pertile_range)
示例#5
0
 def test_test_gages(self):
     """Run the trained model on the stored test data model and plot results.

     The first 120 days of the stored period are split off as warm-up; the
     remainder is evaluated with the statistics of the full stored model.
     Predictions and observations are de-normalized with ``_basin_norm``,
     saved, and plotted.
     """
     full_model = GagesModel.load_datamodel(
         self.config_data.data_path["Temp"],
         data_source_file_name='test_data_source.txt',
         stat_file_name='test_Statistics.json',
         flow_file_name='test_flow.npy',
         forcing_file_name='test_forcing.npy',
         attr_file_name='test_attr.npy',
         f_dict_file_name='test_dictFactorize.json',
         var_dict_file_name='test_dictAttribute.json',
         t_s_dict_file_name='test_dictTimeSpace.json')
     warmup_len = 120
     all_days = hydro_time.t_range_days(full_model.t_s_dict["t_final_range"])
     warmup_range = hydro_time.t_days_lst2range(all_days[:warmup_len])
     test_range = hydro_time.t_days_lst2range(all_days[warmup_len:])
     # only the second (post warm-up) model is evaluated
     _, data_model = GagesModel.data_models_of_train_test(
         full_model, warmup_range, test_range)
     # override the split's statistics with those loaded from disk
     data_model.stat_dict = full_model.stat_dict
     with torch.cuda.device(0):
         pred, obs = master_test(data_model, epoch=self.test_epoch)
         source = data_model.data_source
         basin_area = source.read_attr(data_model.t_s_dict["sites_id"],
                                       ['DRAIN_SQKM'],
                                       is_return_dict=False)
         mean_prep = source.read_attr(data_model.t_s_dict["sites_id"],
                                      ['PPTAVG_BASIN'],
                                      is_return_dict=False)
         # presumably converts an annual total to a daily value in another
         # unit -- TODO confirm the units of PPTAVG_BASIN
         mean_prep = mean_prep / 365 * 10
         pred = _basin_norm(pred, basin_area, mean_prep, to_norm=False)
         obs = _basin_norm(obs, basin_area, mean_prep, to_norm=False)
         out_dir = data_model.data_source.data_config.data_path['Temp']
         save_result(out_dir, self.test_epoch, pred, obs)
         plot_we_need(data_model,
                      obs,
                      pred,
                      id_col="STAID",
                      lon_col="LNG_GAGE",
                      lat_col="LAT_GAGE")
示例#6
0
    def update_data_model(cls,
                          config_data,
                          data_model_origin,
                          sites_id_update=None,
                          t_range_update=None,
                          data_attr_update=False,
                          train_stat_dict=None,
                          screen_basin_area_huc4=False):
        """Derive a new data model from ``data_model_origin``.

        Args:
            config_data: configuration handed to
                ``GagesSource.choose_some_basins`` to build the new source.
            data_model_origin: the model whose flow/forcing/attribute arrays
                and dictionaries are reused.
            sites_id_update: optional site ids; the new model keeps the
                intersection of these with the original sites. Both id
                sequences must be strictly increasing.
            t_range_update: optional time range (must support ``.copy()``);
                flow and forcing are sliced along axis 1, starting at the
                day offset of ``t_range_update[0]`` from the original start.
            data_attr_update: when true, attributes and their dictionaries
                are re-read from the new source using its ``attr_chosen``
                config instead of being copied from the original.
            train_stat_dict: statistics to attach to the new model; when
                ``None`` they are computed with ``cal_stat_all``.
            screen_basin_area_huc4: forwarded to
                ``GagesSource.choose_some_basins``.

        Returns:
            The new ``cls`` instance.

        Raises:
            AssertionError: if the site ids are not strictly increasing, the
                site intersection is empty, or ``t_range_update`` starts
                before the original range.
        """
        t_s_dict_origin = data_model_origin.t_s_dict
        data_flow_origin = data_model_origin.data_flow
        data_forcing_origin = data_model_origin.data_forcing
        data_attr_origin = data_model_origin.data_attr
        var_dict_origin = data_model_origin.var_dict
        f_dict_origin = data_model_origin.f_dict
        stat_dict_origin = data_model_origin.stat_dict
        t_range_origin_cpy = t_s_dict_origin["t_final_range"].copy()
        if sites_id_update is not None:
            t_s_dict = {}
            sites_id_origin_cpy = t_s_dict_origin["sites_id"].copy()
            # both id sequences have to be strictly increasing
            assert (all(
                x < y
                for x, y in zip(sites_id_origin_cpy, sites_id_origin_cpy[1:])))
            assert (all(
                x < y for x, y in zip(sites_id_update, sites_id_update[1:])))
            sites_id = np.intersect1d(sites_id_origin_cpy, sites_id_update)
            assert sites_id.size > 0
            new_source_data = GagesSource.choose_some_basins(
                config_data,
                t_range_origin_cpy,
                screen_basin_area_huc4=screen_basin_area_huc4,
                sites_id=sites_id.tolist())
            t_s_dict["t_final_range"] = t_range_origin_cpy
            t_s_dict["sites_id"] = sites_id.tolist()
            # rows of the original arrays that belong to the kept sites;
            # one vectorized membership test instead of a scan per site
            chosen_idx = np.where(np.isin(sites_id_origin_cpy, sites_id))[0]
            data_flow = data_flow_origin[chosen_idx, :]
            data_forcing = data_forcing_origin[chosen_idx, :, :]
            data_attr = data_attr_origin[chosen_idx, :]
        else:
            t_s_dict = copy.deepcopy(t_s_dict_origin)
            new_source_data = GagesSource.choose_some_basins(
                config_data,
                t_range_origin_cpy,
                screen_basin_area_huc4=screen_basin_area_huc4)
            data_flow = data_flow_origin.copy()
            data_forcing = data_forcing_origin.copy()
            data_attr = data_attr_origin.copy()
        if data_attr_update:
            attr_lst = new_source_data.all_configs.get("attr_chosen")
            data_attr, var_dict, f_dict = new_source_data.read_attr(
                t_s_dict["sites_id"], attr_lst)
        else:
            var_dict = var_dict_origin.copy()
            f_dict = f_dict_origin.copy()
        data_model = cls(new_source_data, data_flow, data_forcing, data_attr,
                         var_dict, f_dict, stat_dict_origin, t_s_dict)
        if t_range_update is not None:
            sites_id_temp = data_model.t_s_dict['sites_id'].copy()
            t_range = t_range_update.copy()
            stat_dict_temp = {}
            t_s_dict_temp = {}
            # day offset of the new start from the start of the current data
            start_index = int(
                (np.datetime64(t_range[0]) -
                 np.datetime64(data_model.t_s_dict["t_final_range"][0])) /
                np.timedelta64(1, 'D'))
            assert start_index >= 0
            t_lst_temp = hydro_time.t_range_days(t_range)
            end_index = start_index + t_lst_temp.size
            # NOTE(review): a range running past the stored data is silently
            # truncated by the slices below -- confirm this is intended
            data_flow = data_model.data_flow[:, start_index:end_index]
            data_forcing = data_model.data_forcing[:, start_index:end_index, :]

            # the dicts are handed over empty and filled in afterwards
            data_model = cls(new_source_data, data_flow, data_forcing,
                             data_attr, var_dict, f_dict, stat_dict_temp,
                             t_s_dict_temp)
            t_s_dict_temp['sites_id'] = sites_id_temp
            t_s_dict_temp['t_final_range'] = t_range
            data_model.t_s_dict = t_s_dict_temp
            data_model.data_source.t_range = t_range
        # make the source's gage_dict list exactly the sites of the model
        if not data_model.data_source.gage_dict["STAID"].tolist(
        ) == data_model.t_s_dict['sites_id']:
            gage_dict_new = dict()
            usgs_all_sites = data_model.data_source.gage_dict["STAID"]
            usgs_ids = data_model.t_s_dict['sites_id']
            # np.isin replaces np.in1d, which is deprecated in NumPy 2.0
            sites_index = np.where(np.isin(usgs_all_sites, usgs_ids))[0]
            for key, value in data_model.data_source.gage_dict.items():
                gage_dict_new[key] = np.array(
                    [value[i] for i in sites_index.tolist()])
            data_model.data_source.gage_dict = gage_dict_new
            assert (np.array(usgs_ids) == gage_dict_new["STAID"]).all()
        if train_stat_dict is None:
            stat_dict_temp = data_model.cal_stat_all()
        else:
            stat_dict_temp = train_stat_dict
        data_model.stat_dict = stat_dict_temp

        return data_model
示例#7
0
 def usgs_screen_streamflow(self, streamflow, usgs_ids=None, time_range=None):
     """Return ``(None, HUC10 ids of the gages, day list of self.t_range)``.

     No screening is performed here: ``streamflow``, ``usgs_ids`` and
     ``time_range`` are accepted but not used.
     """
     huc10_ids = self.gage_dict["HUC10"]
     days = hydro_time.t_range_days(self.t_range)
     return None, huc10_ids, days
示例#8
0
 def test_t_range_days(self):
     """Print what ``t_range_days`` returns for the fixture's ``t_range``."""
     days = t_range_days(self.t_range)
     print(days)