Пример #1
0
 def test_dam_train(self):
     """Train the natural-flow model with dam-purpose basins as input.

     Loads the cached data models stored under the ``allref`` and
     ``allnonref`` quickdata directories, refreshes each one against its
     config, combines the ``allnonref`` model with the NID dam data and
     finally calls ``master_train_natural_flow`` with CUDA device 2
     selected.
     """
     db_dir = self.config_data.data_path["DB"]
     quick_data_dir = os.path.join(db_dir, "quickdata")
     sim_data_dir = os.path.join(quick_data_dir, "allref_85-05_nan-0.1_00-1.0")
     data_dir = os.path.join(quick_data_dir, "allnonref_85-05_nan-0.1_00-1.0")

     # Both cached data models are stored under the same file names.
     train_files = dict(
         data_source_file_name='data_source.txt',
         stat_file_name='Statistics.json',
         flow_file_name='flow.npy',
         forcing_file_name='forcing.npy',
         attr_file_name='attr.npy',
         f_dict_file_name='dictFactorize.json',
         var_dict_file_name='dictAttribute.json',
         t_s_dict_file_name='dictTimeSpace.json')
     data_model_sim8595 = GagesModel.load_datamodel(sim_data_dir, **train_files)
     data_model_8595 = GagesModel.load_datamodel(data_dir, **train_files)

     sim_gages_model_train = GagesModel.update_data_model(
         self.sim_config_data, data_model_sim8595, data_attr_update=True)
     gages_model_train = GagesModel.update_data_model(
         self.config_data, data_model_8595, data_attr_update=True)
     sim_gages_model_train.update_model_param('train', nEpoch=300)

     # The NID data sits next to the DB directory. os.path.dirname honours
     # the platform path separator, unlike splitting the path on "/".
     nid_dir = os.path.join(os.path.dirname(db_dir), "nid", "quickdata")
     nid_input = NidModel.load_nidmodel(
         nid_dir,
         nid_file=self.nid_file,
         nid_source_file_name='nid_source.txt',
         nid_data_file_name='nid_data.shp')
     gage_main_dam_purpose = unserialize_json(
         os.path.join(nid_dir, "dam_main_purpose_dict.json"))
     data_input = GagesDamDataModel(gages_model_train, nid_input, True,
                                    gage_main_dam_purpose)
     gages_input = choose_which_purpose(data_input)

     with torch.cuda.device(2):
         data_model = GagesSimDataModel(sim_gages_model_train, gages_input)
         # To resume from a saved checkpoint instead, pass
         # pre_trained_model_epoch=<epoch> to master_train_natural_flow.
         master_train_natural_flow(data_model)
Пример #2
0
    def test_gages_data_model_quickdata(self):
        """Reload the cached train/test data models, refresh and re-save them.

        Both models are read from the ``conus-all_90-10_nan-0.0_00-1.0``
        quickdata directory, updated against ``self.config_data`` (the test
        model receives the train model's ``stat_dict``) and written back
        with ``save_datamodel``.
        """
        data_dir = os.path.join(self.config_data.data_path["DB"], "quickdata",
                                "conus-all_90-10_nan-0.0_00-1.0")

        # File names used when loading (array files carry the .npy suffix).
        train_load_names = {
            'data_source_file_name': 'data_source.txt',
            'stat_file_name': 'Statistics.json',
            'flow_file_name': 'flow.npy',
            'forcing_file_name': 'forcing.npy',
            'attr_file_name': 'attr.npy',
            'f_dict_file_name': 'dictFactorize.json',
            'var_dict_file_name': 'dictAttribute.json',
            't_s_dict_file_name': 'dictTimeSpace.json',
        }
        test_load_names = {
            'data_source_file_name': 'test_data_source.txt',
            'stat_file_name': 'test_Statistics.json',
            'flow_file_name': 'test_flow.npy',
            'forcing_file_name': 'test_forcing.npy',
            'attr_file_name': 'test_attr.npy',
            'f_dict_file_name': 'test_dictFactorize.json',
            'var_dict_file_name': 'test_dictAttribute.json',
            't_s_dict_file_name': 'test_dictTimeSpace.json',
        }
        # File names used when saving (array names are given without suffix).
        train_save_names = dict(train_load_names,
                                flow_file_name='flow',
                                forcing_file_name='forcing',
                                attr_file_name='attr')
        test_save_names = dict(test_load_names,
                               flow_file_name='test_flow',
                               forcing_file_name='test_forcing',
                               attr_file_name='test_attr')

        loaded_train = GagesModel.load_datamodel(data_dir, **train_load_names)
        loaded_test = GagesModel.load_datamodel(data_dir, **test_load_names)

        updated_train = GagesModel.update_data_model(
            self.config_data,
            loaded_train,
            data_attr_update=True,
            screen_basin_area_huc4=False)
        updated_test = GagesModel.update_data_model(
            self.config_data,
            loaded_test,
            data_attr_update=True,
            train_stat_dict=updated_train.stat_dict,
            screen_basin_area_huc4=False)

        save_datamodel(updated_train, **train_save_names)
        save_datamodel(updated_test, **test_save_names)
        print("read and save data model")
Пример #3
0
    def test_dam_test(self):
        """Run the model test on dam-purpose basins and save the results.

        Loads the cached train/test data models, refreshes them against
        ``self.config_data``, combines the test model with the NID dam data,
        runs ``master_test`` and stores the de-normalised predictions and
        observations with ``save_result``.
        """
        db_dir = self.config_data.data_path["DB"]
        data_dir = os.path.join(db_dir, "quickdata",
                                "conus-all_90-10_nan-0.0_00-1.0")
        train_files = dict(
            data_source_file_name='data_source.txt',
            stat_file_name='Statistics.json',
            flow_file_name='flow.npy',
            forcing_file_name='forcing.npy',
            attr_file_name='attr.npy',
            f_dict_file_name='dictFactorize.json',
            var_dict_file_name='dictAttribute.json',
            t_s_dict_file_name='dictTimeSpace.json')
        test_files = dict(
            data_source_file_name='test_data_source.txt',
            stat_file_name='test_Statistics.json',
            flow_file_name='test_flow.npy',
            forcing_file_name='test_forcing.npy',
            attr_file_name='test_attr.npy',
            f_dict_file_name='test_dictFactorize.json',
            var_dict_file_name='test_dictAttribute.json',
            t_s_dict_file_name='test_dictTimeSpace.json')
        data_model_train = GagesModel.load_datamodel(data_dir, **train_files)
        data_model_test = GagesModel.load_datamodel(data_dir, **test_files)

        gages_model_train = GagesModel.update_data_model(
            self.config_data, data_model_train)
        # The test model is updated with the statistics of the train model.
        gages_model_test = GagesModel.update_data_model(
            self.config_data,
            data_model_test,
            train_stat_dict=gages_model_train.stat_dict)

        # The NID data sits next to the DB directory. os.path.dirname honours
        # the platform path separator, unlike splitting the path on "/".
        nid_dir = os.path.join(os.path.dirname(db_dir), "nid", "quickdata")
        nid_input = NidModel.load_nidmodel(
            nid_dir,
            nid_file=self.nid_file,
            nid_source_file_name='nid_source.txt',
            nid_data_file_name='nid_data.shp')
        gage_main_dam_purpose = unserialize_json(
            os.path.join(nid_dir, "dam_main_purpose_dict.json"))
        data_input = GagesDamDataModel(gages_model_test, nid_input, True,
                                       gage_main_dam_purpose)
        gages_input = choose_which_purpose(data_input)
        pred, obs = master_test(gages_input)

        # Undo the basin normalisation using drainage area and precipitation.
        sites_id = gages_input.t_s_dict["sites_id"]
        basin_area = gages_input.data_source.read_attr(
            sites_id, ['DRAIN_SQKM'], is_return_dict=False)
        mean_prep = gages_input.data_source.read_attr(
            sites_id, ['PPTAVG_BASIN'], is_return_dict=False)
        # NOTE(review): looks like an annual-to-daily unit conversion of the
        # mean precipitation -- confirm the units of PPTAVG_BASIN.
        mean_prep = mean_prep / 365 * 10
        pred = _basin_norm(pred, basin_area, mean_prep, to_norm=False)
        obs = _basin_norm(obs, basin_area, mean_prep, to_norm=False)
        save_result(gages_input.data_source.data_config.data_path['Temp'],
                    self.test_epoch, pred, obs)
Пример #4
0
    def test_dam_train(self):
        """Train one inverse LSTM model per main dam purpose.

        A single cached data model is refreshed twice, once per config
        (``config_data_1`` / ``config_data_2``) with that config's training
        time range. For every unique main dam purpose the two refreshed
        models are restricted to that purpose, paired in a
        ``GagesInvDataModel`` and passed to ``train_lstm_inv`` with CUDA
        device 1 selected.
        """
        db_dir = self.config_data_1.data_path["DB"]
        data_dir = os.path.join(db_dir, "quickdata",
                                "allnonref_85-05_nan-0.1_00-1.0")
        # For the inverse model, the train and test data models are the same.
        data_model_8595 = GagesModel.load_datamodel(
            data_dir,
            data_source_file_name='data_source.txt',
            stat_file_name='Statistics.json',
            flow_file_name='flow.npy',
            forcing_file_name='forcing.npy',
            attr_file_name='attr.npy',
            f_dict_file_name='dictFactorize.json',
            var_dict_file_name='dictAttribute.json',
            t_s_dict_file_name='dictTimeSpace.json')

        t_range1_train = self.config_data_1.model_dict["data"]["tRangeTrain"]
        gages_model1_train = GagesModel.update_data_model(
            self.config_data_1,
            data_model_8595,
            t_range_update=t_range1_train,
            data_attr_update=True)
        t_range2_train = self.config_data_2.model_dict["data"]["tRangeTrain"]
        gages_model2_train = GagesModel.update_data_model(
            self.config_data_2,
            data_model_8595,
            t_range_update=t_range2_train,
            data_attr_update=True)

        # The NID data sits next to the DB directory. os.path.dirname honours
        # the platform path separator, unlike splitting the path on "/".
        nid_dir = os.path.join(os.path.dirname(db_dir), "nid", "quickdata")
        nid_input = NidModel.load_nidmodel(
            nid_dir,
            nid_file=self.nid_file,
            nid_source_file_name='nid_source.txt',
            nid_data_file_name='nid_data.shp')
        gage_main_dam_purpose = unserialize_json(
            os.path.join(nid_dir, "dam_main_purpose_dict.json"))
        gage_main_dam_purpose_unique = np.unique(
            list(gage_main_dam_purpose.values()))

        with torch.cuda.device(1):
            for purpose in gage_main_dam_purpose_unique:
                # Fresh dam data models are built for every purpose, exactly
                # as before; choose_which_purpose may not be side-effect free.
                data_input1 = GagesDamDataModel(gages_model1_train, nid_input,
                                                True, gage_main_dam_purpose)
                gages_input1 = choose_which_purpose(data_input1,
                                                    purpose=purpose)
                data_input2 = GagesDamDataModel(gages_model2_train, nid_input,
                                                True, gage_main_dam_purpose)
                gages_input2 = choose_which_purpose(data_input2,
                                                    purpose=purpose)
                data_model = GagesInvDataModel(gages_input1, gages_input2)
                train_lstm_inv(data_model)
Пример #5
0
    def test_gages_dam_all_save(self):
        """Build the dam-purpose test data model and save it to disk.

        Loads the cached train/test data models, refreshes them against
        ``self.config_data``, combines the test model with the NID dam data
        found under ``nid/test`` and writes the result with
        ``save_datamodel`` under the ``test_*`` file names.
        """
        db_dir = self.config_data.data_path["DB"]
        data_dir = os.path.join(db_dir, "quickdata",
                                "conus-all_90-10_nan-0.0_00-1.0")
        data_model_train = GagesModel.load_datamodel(
            data_dir,
            data_source_file_name='data_source.txt',
            stat_file_name='Statistics.json',
            flow_file_name='flow.npy',
            forcing_file_name='forcing.npy',
            attr_file_name='attr.npy',
            f_dict_file_name='dictFactorize.json',
            var_dict_file_name='dictAttribute.json',
            t_s_dict_file_name='dictTimeSpace.json')
        gages_model_train = GagesModel.update_data_model(
            self.config_data, data_model_train)

        data_model_test = GagesModel.load_datamodel(
            data_dir,
            data_source_file_name='test_data_source.txt',
            stat_file_name='test_Statistics.json',
            flow_file_name='test_flow.npy',
            forcing_file_name='test_forcing.npy',
            attr_file_name='test_attr.npy',
            f_dict_file_name='test_dictFactorize.json',
            var_dict_file_name='test_dictAttribute.json',
            t_s_dict_file_name='test_dictTimeSpace.json')
        # The test model is updated with the statistics of the train model.
        gages_model_test = GagesModel.update_data_model(
            self.config_data,
            data_model_test,
            train_stat_dict=gages_model_train.stat_dict)

        # The NID data sits next to the DB directory. os.path.dirname honours
        # the platform path separator, unlike splitting the path on "/".
        nid_dir = os.path.join(os.path.dirname(db_dir), "nid", "test")
        nid_input = NidModel.load_nidmodel(
            nid_dir,
            nid_source_file_name='nid_source.txt',
            nid_data_file_name='nid_data.shp')
        gage_main_dam_purpose = unserialize_json(
            os.path.join(nid_dir, "dam_main_purpose_dict.json"))
        # NOTE(review): the purpose dict is passed as the third positional
        # argument here -- confirm this matches GagesDamDataModel's signature.
        data_input = GagesDamDataModel(gages_model_test, nid_input,
                                       gage_main_dam_purpose)
        data_model_dam = choose_which_purpose(data_input)
        save_datamodel(data_model_dam,
                       data_source_file_name='test_data_source.txt',
                       stat_file_name='test_Statistics.json',
                       flow_file_name='test_flow',
                       forcing_file_name='test_forcing',
                       attr_file_name='test_attr',
                       f_dict_file_name='test_dictFactorize.json',
                       var_dict_file_name='test_dictAttribute.json',
                       t_s_dict_file_name='test_dictTimeSpace.json')
Пример #6
0
 def test_da_data_temp(self):
     """Reload the cached train/test data models, refresh and re-save them.

     Both models come from the ``allnonref_85-05_nan-0.1_00-1.0`` quickdata
     directory; each is updated against ``self.config_data`` and written
     back with ``save_datamodel``.
     """
     data_dir = os.path.join(self.config_data.data_path["DB"], "quickdata",
                             "allnonref_85-05_nan-0.1_00-1.0")

     # File names used when loading (array files carry the .npy suffix).
     train_load_names = {
         'data_source_file_name': 'data_source.txt',
         'stat_file_name': 'Statistics.json',
         'flow_file_name': 'flow.npy',
         'forcing_file_name': 'forcing.npy',
         'attr_file_name': 'attr.npy',
         'f_dict_file_name': 'dictFactorize.json',
         'var_dict_file_name': 'dictAttribute.json',
         't_s_dict_file_name': 'dictTimeSpace.json',
     }
     test_load_names = {
         'data_source_file_name': 'test_data_source.txt',
         'stat_file_name': 'test_Statistics.json',
         'flow_file_name': 'test_flow.npy',
         'forcing_file_name': 'test_forcing.npy',
         'attr_file_name': 'test_attr.npy',
         'f_dict_file_name': 'test_dictFactorize.json',
         'var_dict_file_name': 'test_dictAttribute.json',
         't_s_dict_file_name': 'test_dictTimeSpace.json',
     }
     # File names used when saving (array names are given without suffix).
     train_save_names = dict(train_load_names,
                             flow_file_name='flow',
                             forcing_file_name='forcing',
                             attr_file_name='attr')
     test_save_names = dict(test_load_names,
                            flow_file_name='test_flow',
                            forcing_file_name='test_forcing',
                            attr_file_name='test_attr')

     loaded_train = GagesModel.load_datamodel(data_dir, **train_load_names)
     loaded_test = GagesModel.load_datamodel(data_dir, **test_load_names)

     updated_train = GagesModel.update_data_model(self.config_data,
                                                  loaded_train)
     updated_test = GagesModel.update_data_model(self.config_data,
                                                 loaded_test)

     save_datamodel(updated_train, **train_save_names)
     save_datamodel(updated_test, **test_save_names)
     print("read and save data model")
Пример #7
0
 def test_purposes_inds(self):
     """Print per-purpose median and mean of the result indicators.

     Sites are grouped by their main dam purpose, each group is intersected
     with the site ids of the loaded data model, and the resulting index
     groups are handed to ``split_results_to_regions``. The median and mean
     (``axis=0``) of every returned indicator frame are printed.
     """
     db_dir = self.config_data.data_path["DB"]
     data_dir = os.path.join(db_dir, "quickdata",
                             "allnonref-dam_95-05_nan-0.1_00-1.0")
     data_model = GagesModel.load_datamodel(
         data_dir,
         data_source_file_name='test_data_source.txt',
         stat_file_name='test_Statistics.json',
         flow_file_name='test_flow.npy',
         forcing_file_name='test_forcing.npy',
         attr_file_name='test_attr.npy',
         f_dict_file_name='test_dictFactorize.json',
         var_dict_file_name='test_dictAttribute.json',
         t_s_dict_file_name='test_dictTimeSpace.json')
     gages_data_model = GagesModel.update_data_model(
         self.config_data, data_model)

     # The NID data sits next to the DB directory. os.path.dirname honours
     # the platform path separator, unlike splitting the path on "/".
     nid_dir = os.path.join(os.path.dirname(db_dir), "nid", "quickdata")
     gage_main_dam_purpose = unserialize_json(
         os.path.join(nid_dir, "dam_main_purpose_dict.json"))
     gage_main_dam_purpose_unique = np.unique(
         list(gage_main_dam_purpose.values()))

     # Map every unique purpose to the ids of the sites that have it.
     purpose_regions = {}
     for purpose in gage_main_dam_purpose_unique:
         sites_id = [
             site for site, main_purpose in gage_main_dam_purpose.items()
             if main_purpose == purpose
         ]
         # The ids must already be in strictly ascending order.
         assert all(x < y for x, y in zip(sites_id, sites_id[1:]))
         purpose_regions[purpose] = sites_id

     # Locate each purpose group inside the data model's site list.
     id_regions_idx = []
     id_regions_sites_ids = []
     df_id_region = np.array(gages_data_model.t_s_dict["sites_id"])
     for gages_id in purpose_regions.values():
         c, ind1, _ = np.intersect1d(df_id_region,
                                     gages_id,
                                     return_indices=True)
         assert all(x < y for x, y in zip(ind1, ind1[1:]))
         assert all(x < y for x, y in zip(c, c[1:]))
         id_regions_idx.append(ind1)
         id_regions_sites_ids.append(c)

     # Only the indicator frames (third return value) are used below.
     _, _, inds_dfs = split_results_to_regions(
         gages_data_model, self.test_epoch, id_regions_idx,
         id_regions_sites_ids)

     inds_medians = []
     inds_means = []
     # One indicator frame is read per purpose region, by position.
     for region_no in range(len(purpose_regions)):
         region_inds = inds_dfs[region_no]
         inds_medians.append(region_inds.median(axis=0))
         inds_means.append(region_inds.mean(axis=0))
     print(inds_medians)
     print(inds_means)
Пример #8
0
    def test_some_reservoirs(self):
        """Choose some small reservoirs to train and test.

        Sites are selected with ``GagesSource.choose_some_basins`` (called
        with ``major_dam=1``); the cached train/test data models are then
        restricted to those sites and saved again.
        """
        # Read the model configuration.
        config_data = self.config_data
        train_range = config_data.model_dict["data"]["tRangeTrain"]
        source_data = GagesSource.choose_some_basins(config_data,
                                                     train_range,
                                                     major_dam=1)
        sites_id = source_data.all_configs['flow_screen_gage_id']

        data_dir = os.path.join(self.config_data.data_path["DB"], "quickdata",
                                "allnonref_85-05_nan-0.1_00-1.0")

        # File names used when loading (array files carry the .npy suffix).
        train_load_names = {
            'data_source_file_name': 'data_source.txt',
            'stat_file_name': 'Statistics.json',
            'flow_file_name': 'flow.npy',
            'forcing_file_name': 'forcing.npy',
            'attr_file_name': 'attr.npy',
            'f_dict_file_name': 'dictFactorize.json',
            'var_dict_file_name': 'dictAttribute.json',
            't_s_dict_file_name': 'dictTimeSpace.json',
        }
        test_load_names = {
            'data_source_file_name': 'test_data_source.txt',
            'stat_file_name': 'test_Statistics.json',
            'flow_file_name': 'test_flow.npy',
            'forcing_file_name': 'test_forcing.npy',
            'attr_file_name': 'test_attr.npy',
            'f_dict_file_name': 'test_dictFactorize.json',
            'var_dict_file_name': 'test_dictAttribute.json',
            't_s_dict_file_name': 'test_dictTimeSpace.json',
        }
        # File names used when saving (array names are given without suffix).
        train_save_names = dict(train_load_names,
                                flow_file_name='flow',
                                forcing_file_name='forcing',
                                attr_file_name='attr')
        test_save_names = dict(test_load_names,
                               flow_file_name='test_flow',
                               forcing_file_name='test_forcing',
                               attr_file_name='test_attr')

        loaded_train = GagesModel.load_datamodel(data_dir, **train_load_names)
        loaded_test = GagesModel.load_datamodel(data_dir, **test_load_names)

        updated_train = GagesModel.update_data_model(
            self.config_data, loaded_train, sites_id_update=sites_id)
        updated_test = GagesModel.update_data_model(
            self.config_data,
            loaded_test,
            sites_id_update=sites_id,
            train_stat_dict=updated_train.stat_dict)

        save_datamodel(updated_train, **train_save_names)
        save_datamodel(updated_test, **test_save_names)
        print("read and save data model")
Пример #9
0
    def test_damcls_test_datamodel(self):
        """Save one test data model per main dam purpose.

        Loads the cached train/test data models, refreshes them against
        ``self.config_data``, combines the test model with the NID dam data
        and, for every unique main dam purpose, saves the purpose-specific
        data model with the purpose passed as the second positional argument
        of ``save_datamodel``.
        """
        db_dir = self.config_data.data_path["DB"]
        data_dir = os.path.join(db_dir, "quickdata",
                                "allnonref_85-05_nan-0.1_00-1.0")
        data_model_train = GagesModel.load_datamodel(
            data_dir,
            data_source_file_name='data_source.txt',
            stat_file_name='Statistics.json',
            flow_file_name='flow.npy',
            forcing_file_name='forcing.npy',
            attr_file_name='attr.npy',
            f_dict_file_name='dictFactorize.json',
            var_dict_file_name='dictAttribute.json',
            t_s_dict_file_name='dictTimeSpace.json')
        data_model_test = GagesModel.load_datamodel(
            data_dir,
            data_source_file_name='test_data_source.txt',
            stat_file_name='test_Statistics.json',
            flow_file_name='test_flow.npy',
            forcing_file_name='test_forcing.npy',
            attr_file_name='test_attr.npy',
            f_dict_file_name='test_dictFactorize.json',
            var_dict_file_name='test_dictAttribute.json',
            t_s_dict_file_name='test_dictTimeSpace.json')

        gages_model_train = GagesModel.update_data_model(
            self.config_data, data_model_train)
        # The test model is updated with the statistics of the train model.
        gages_model_test = GagesModel.update_data_model(
            self.config_data,
            data_model_test,
            train_stat_dict=gages_model_train.stat_dict)

        # The NID data sits next to the DB directory. os.path.dirname honours
        # the platform path separator, unlike splitting the path on "/".
        nid_dir = os.path.join(os.path.dirname(db_dir), "nid", "quickdata")
        nid_input = NidModel.load_nidmodel(
            nid_dir,
            nid_file=self.nid_file,
            nid_source_file_name='nid_source.txt',
            nid_data_file_name='nid_data.shp')
        gage_main_dam_purpose = unserialize_json(
            os.path.join(nid_dir, "dam_main_purpose_dict.json"))
        gage_main_dam_purpose_unique = np.unique(
            list(gage_main_dam_purpose.values()))
        data_input = GagesDamDataModel(gages_model_test, nid_input, True,
                                       gage_main_dam_purpose)

        for purpose in gage_main_dam_purpose_unique:
            gages_input = choose_which_purpose(data_input, purpose=purpose)
            save_datamodel(gages_input,
                           purpose,
                           data_source_file_name='test_data_source.txt',
                           stat_file_name='test_Statistics.json',
                           flow_file_name='test_flow',
                           forcing_file_name='test_forcing',
                           attr_file_name='test_attr',
                           f_dict_file_name='test_dictFactorize.json',
                           var_dict_file_name='test_dictAttribute.json',
                           t_s_dict_file_name='test_dictTimeSpace.json')
Пример #10
0
 def test_dam_test(self):
     """Test the model on the cached dam data model, then save and plot.

     With CUDA device 1 selected, the cached ``test_*`` data model is
     loaded and refreshed, ``master_test`` is run for ``self.test_epoch``,
     and the de-normalised predictions and observations are saved with
     ``save_result`` and plotted with ``plot_we_need``.
     """
     with torch.cuda.device(1):
         data_dir = os.path.join(self.config_data.data_path["DB"],
                                 "quickdata",
                                 "allnonref-dam_95-05_nan-0.1_00-1.0")
         test_files = {
             'data_source_file_name': 'test_data_source.txt',
             'stat_file_name': 'test_Statistics.json',
             'flow_file_name': 'test_flow.npy',
             'forcing_file_name': 'test_forcing.npy',
             'attr_file_name': 'test_attr.npy',
             'f_dict_file_name': 'test_dictFactorize.json',
             'var_dict_file_name': 'test_dictAttribute.json',
             't_s_dict_file_name': 'test_dictTimeSpace.json',
         }
         loaded_test = GagesModel.load_datamodel(data_dir, **test_files)
         gages_input = GagesModel.update_data_model(self.config_data,
                                                    loaded_test)

         pred, obs = master_test(gages_input, epoch=self.test_epoch)

         # Attributes needed to undo the basin normalisation.
         source = gages_input.data_source
         basin_area = source.read_attr(gages_input.t_s_dict["sites_id"],
                                       ['DRAIN_SQKM'],
                                       is_return_dict=False)
         annual_prep = source.read_attr(gages_input.t_s_dict["sites_id"],
                                        ['PPTAVG_BASIN'],
                                        is_return_dict=False)
         # NOTE(review): looks like an annual-to-daily unit conversion of
         # the mean precipitation -- confirm the units of PPTAVG_BASIN.
         mean_prep = annual_prep / 365 * 10

         pred = _basin_norm(pred, basin_area, mean_prep, to_norm=False)
         obs = _basin_norm(obs, basin_area, mean_prep, to_norm=False)

         result_dir = gages_input.data_source.data_config.data_path['Temp']
         save_result(result_dir, self.test_epoch, pred, obs)
         plot_we_need(gages_input, obs, pred,
                      id_col="STAID",
                      lon_col="LNG_GAGE",
                      lat_col="LAT_GAGE")
Пример #11
0
    def test_siminv_data_temp(self):
        quick_data_dir = os.path.join(self.config_data_sim.data_path["DB"],
                                      "quickdata")
        data_dir_allref = os.path.join(quick_data_dir,
                                       "allref_85-05_nan-0.1_00-1.0")
        data_dir_allnonref = os.path.join(quick_data_dir,
                                          "allnonref_85-05_nan-0.1_00-1.0")
        data_model_allref_8595 = GagesModel.load_datamodel(
            data_dir_allref,
            data_source_file_name='data_source.txt',
            stat_file_name='Statistics.json',
            flow_file_name='flow.npy',
            forcing_file_name='forcing.npy',
            attr_file_name='attr.npy',
            f_dict_file_name='dictFactorize.json',
            var_dict_file_name='dictAttribute.json',
            t_s_dict_file_name='dictTimeSpace.json')
        data_model_allref_9505 = GagesModel.load_datamodel(
            data_dir_allref,
            data_source_file_name='test_data_source.txt',
            stat_file_name='test_Statistics.json',
            flow_file_name='test_flow.npy',
            forcing_file_name='test_forcing.npy',
            attr_file_name='test_attr.npy',
            f_dict_file_name='test_dictFactorize.json',
            var_dict_file_name='test_dictAttribute.json',
            t_s_dict_file_name='test_dictTimeSpace.json')
        data_model_allnonref_8595 = GagesModel.load_datamodel(
            data_dir_allnonref,
            data_source_file_name='data_source.txt',
            stat_file_name='Statistics.json',
            flow_file_name='flow.npy',
            forcing_file_name='forcing.npy',
            attr_file_name='attr.npy',
            f_dict_file_name='dictFactorize.json',
            var_dict_file_name='dictAttribute.json',
            t_s_dict_file_name='dictTimeSpace.json')
        data_model_allnonref_9505 = GagesModel.load_datamodel(
            data_dir_allnonref,
            data_source_file_name='test_data_source.txt',
            stat_file_name='test_Statistics.json',
            flow_file_name='test_flow.npy',
            forcing_file_name='test_forcing.npy',
            attr_file_name='test_attr.npy',
            f_dict_file_name='test_dictFactorize.json',
            var_dict_file_name='test_dictAttribute.json',
            t_s_dict_file_name='test_dictTimeSpace.json')
        t_range_sim_train = self.config_data_sim.model_dict["data"][
            "tRangeTrain"]
        t_range_sim_test = self.config_data_sim.model_dict["data"][
            "tRangeTest"]
        sim_gages_model_train = GagesModel.update_data_model(
            self.config_data_sim,
            data_model_allref_8595,
            t_range_update=t_range_sim_train,
            data_attr_update=True)
        sim_gages_model_test = GagesModel.update_data_model(
            self.config_data_sim,
            data_model_allref_8595,
            t_range_update=t_range_sim_test,
            data_attr_update=True)
        t_range_inv_train = self.config_data_inv.model_dict["data"][
            "tRangeTrain"]
        t_range_inv_test = self.config_data_inv.model_dict["data"][
            "tRangeTest"]
        inv_gages_model_train = GagesModel.update_data_model(
            self.config_data_inv,
            data_model_allnonref_8595,
            t_range_update=t_range_inv_train,
            data_attr_update=True)
        inv_gages_model_test = GagesModel.update_data_model(
            self.config_data_inv,
            data_model_allnonref_8595,
            t_range_update=t_range_inv_test,
            data_attr_update=True)
        t_range_train = self.config_data.model_dict["data"]["tRangeTrain"]
        t_range_test = self.config_data.model_dict["data"]["tRangeTest"]
        gages_model_train = GagesModel.update_data_model(
            self.config_data,
            data_model_allnonref_8595,
            t_range_update=t_range_train,
            data_attr_update=True)
        gages_model_test = GagesModel.update_data_model(
            self.config_data,
            data_model_allnonref_9505,
            t_range_update=t_range_test,
            data_attr_update=True,
            train_stat_dict=gages_model_train.stat_dict)

        save_datamodel(sim_gages_model_train,
                       "1",
                       data_source_file_name='data_source.txt',
                       stat_file_name='Statistics.json',
                       flow_file_name='flow',
                       forcing_file_name='forcing',
                       attr_file_name='attr',
                       f_dict_file_name='dictFactorize.json',
                       var_dict_file_name='dictAttribute.json',
                       t_s_dict_file_name='dictTimeSpace.json')
        save_datamodel(sim_gages_model_test,
                       "1",
                       data_source_file_name='test_data_source.txt',
                       stat_file_name='test_Statistics.json',
                       flow_file_name='test_flow',
                       forcing_file_name='test_forcing',
                       attr_file_name='test_attr',
                       f_dict_file_name='test_dictFactorize.json',
                       var_dict_file_name='test_dictAttribute.json',
                       t_s_dict_file_name='test_dictTimeSpace.json')
        save_datamodel(inv_gages_model_train,
                       "2",
                       data_source_file_name='data_source.txt',
                       stat_file_name='Statistics.json',
                       flow_file_name='flow',
                       forcing_file_name='forcing',
                       attr_file_name='attr',
                       f_dict_file_name='dictFactorize.json',
                       var_dict_file_name='dictAttribute.json',
                       t_s_dict_file_name='dictTimeSpace.json')
        save_datamodel(inv_gages_model_test,
                       "2",
                       data_source_file_name='test_data_source.txt',
                       stat_file_name='test_Statistics.json',
                       flow_file_name='test_flow',
                       forcing_file_name='test_forcing',
                       attr_file_name='test_attr',
                       f_dict_file_name='test_dictFactorize.json',
                       var_dict_file_name='test_dictAttribute.json',
                       t_s_dict_file_name='test_dictTimeSpace.json')
        save_datamodel(gages_model_train,
                       "3",
                       data_source_file_name='data_source.txt',
                       stat_file_name='Statistics.json',
                       flow_file_name='flow',
                       forcing_file_name='forcing',
                       attr_file_name='attr',
                       f_dict_file_name='dictFactorize.json',
                       var_dict_file_name='dictAttribute.json',
                       t_s_dict_file_name='dictTimeSpace.json')
        save_datamodel(gages_model_test,
                       "3",
                       data_source_file_name='test_data_source.txt',
                       stat_file_name='test_Statistics.json',
                       flow_file_name='test_flow',
                       forcing_file_name='test_forcing',
                       attr_file_name='test_attr',
                       f_dict_file_name='test_dictFactorize.json',
                       var_dict_file_name='test_dictAttribute.json',
                       t_s_dict_file_name='test_dictTimeSpace.json')
        print("read and save data model")
Пример #12
0
 def test_inv_data_temp(self):
     """Derive and save the train/test data models for both configs.

     The "allnonref_85-05_nan-0.1_00-1.0" quick data is loaded twice (plain
     file names and ``test_``-prefixed file names).  Train and test models
     are then derived for ``config_data_1`` and ``config_data_2`` and handed
     to ``save_datamodel`` with the labels "1" and "2" respectively.
     """
     # Keyword sets for the file names. The load names carry the ``.npy``
     # extension for the arrays, the save names do not.
     train_load_names = dict(
         data_source_file_name='data_source.txt',
         stat_file_name='Statistics.json',
         flow_file_name='flow.npy',
         forcing_file_name='forcing.npy',
         attr_file_name='attr.npy',
         f_dict_file_name='dictFactorize.json',
         var_dict_file_name='dictAttribute.json',
         t_s_dict_file_name='dictTimeSpace.json')
     test_load_names = dict(
         data_source_file_name='test_data_source.txt',
         stat_file_name='test_Statistics.json',
         flow_file_name='test_flow.npy',
         forcing_file_name='test_forcing.npy',
         attr_file_name='test_attr.npy',
         f_dict_file_name='test_dictFactorize.json',
         var_dict_file_name='test_dictAttribute.json',
         t_s_dict_file_name='test_dictTimeSpace.json')
     train_save_names = dict(
         data_source_file_name='data_source.txt',
         stat_file_name='Statistics.json',
         flow_file_name='flow',
         forcing_file_name='forcing',
         attr_file_name='attr',
         f_dict_file_name='dictFactorize.json',
         var_dict_file_name='dictAttribute.json',
         t_s_dict_file_name='dictTimeSpace.json')
     test_save_names = dict(
         data_source_file_name='test_data_source.txt',
         stat_file_name='test_Statistics.json',
         flow_file_name='test_flow',
         forcing_file_name='test_forcing',
         attr_file_name='test_attr',
         f_dict_file_name='test_dictFactorize.json',
         var_dict_file_name='test_dictAttribute.json',
         t_s_dict_file_name='test_dictTimeSpace.json')

     db_root = self.config_data_1.data_path["DB"]
     cached_dir = os.path.join(db_root, "quickdata",
                               "allnonref_85-05_nan-0.1_00-1.0")
     # The first data set is the historical input of LSTM-Inv, which acts as
     # a kernel for the second LSTM; the inv model uses the same origin data
     # model for its train and test periods.
     origin_train = GagesModel.load_datamodel(cached_dir, **train_load_names)
     # The second model takes its train and test periods from the test time.
     origin_test = GagesModel.load_datamodel(cached_dir, **test_load_names)

     periods_1 = self.config_data_1.model_dict["data"]
     train_range_1 = periods_1["tRangeTrain"]
     test_range_1 = periods_1["tRangeTest"]
     model1_train = GagesModel.update_data_model(
         self.config_data_1, origin_train,
         t_range_update=train_range_1, data_attr_update=True)
     # The "90-95" period is known, so its statistics are computed from that
     # period itself (no train_stat_dict is passed here).
     model1_test = GagesModel.update_data_model(
         self.config_data_1, origin_train,
         t_range_update=test_range_1, data_attr_update=True)

     periods_2 = self.config_data_2.model_dict["data"]
     train_range_2 = periods_2["tRangeTrain"]
     test_range_2 = periods_2["tRangeTest"]
     model2_train = GagesModel.update_data_model(
         self.config_data_2, origin_train,
         t_range_update=train_range_2, data_attr_update=True)
     # The second model's test data reuses the statistics of its train data.
     model2_test = GagesModel.update_data_model(
         self.config_data_2, origin_test,
         t_range_update=test_range_2, data_attr_update=True,
         train_stat_dict=model2_train.stat_dict)

     # Persist in the order: model 1 train/test, then model 2 train/test.
     save_jobs = (
         (model1_train, "1", train_save_names),
         (model1_test, "1", test_save_names),
         (model2_train, "2", train_save_names),
         (model2_test, "2", test_save_names),
     )
     for model, label, file_names in save_jobs:
         save_datamodel(model, label, **file_names)
     print("read and save data model")
Пример #13
0
 def test_dam_test(self):
     """Run ``test_lstm_inv`` on the two test data models and save the result.

     Both test models are derived from the "allnonref_85-05_nan-0.1_00-1.0"
     quick data, wrapped in ``GagesDamDataModel`` together with the NID data
     and passed through ``choose_which_purpose``.  Predictions and
     observations are de-normalized with ``_basin_norm`` before being handed
     to ``save_result``.
     """
     train_load_names = dict(
         data_source_file_name='data_source.txt',
         stat_file_name='Statistics.json',
         flow_file_name='flow.npy',
         forcing_file_name='forcing.npy',
         attr_file_name='attr.npy',
         f_dict_file_name='dictFactorize.json',
         var_dict_file_name='dictAttribute.json',
         t_s_dict_file_name='dictTimeSpace.json')
     test_load_names = dict(
         data_source_file_name='test_data_source.txt',
         stat_file_name='test_Statistics.json',
         flow_file_name='test_flow.npy',
         forcing_file_name='test_forcing.npy',
         attr_file_name='test_attr.npy',
         f_dict_file_name='test_dictFactorize.json',
         var_dict_file_name='test_dictAttribute.json',
         t_s_dict_file_name='test_dictTimeSpace.json')

     db_root = self.config_data_1.data_path["DB"]
     cached_dir = os.path.join(db_root, "quickdata",
                               "allnonref_85-05_nan-0.1_00-1.0")
     # The inv model uses the same origin data model for train and test.
     origin_train = GagesModel.load_datamodel(cached_dir, **train_load_names)
     # The second model takes its train and test periods from the test time.
     origin_test = GagesModel.load_datamodel(cached_dir, **test_load_names)

     test_range_1 = self.config_data_1.model_dict["data"]["tRangeTest"]
     # The "90-95" period is known, so its statistics are computed from that
     # period itself (no train_stat_dict is passed here).
     model1_test = GagesModel.update_data_model(
         self.config_data_1, origin_train,
         t_range_update=test_range_1, data_attr_update=True)

     periods_2 = self.config_data_2.model_dict["data"]
     train_range_2 = periods_2["tRangeTrain"]
     test_range_2 = periods_2["tRangeTest"]
     model2_train = GagesModel.update_data_model(
         self.config_data_2, origin_train,
         t_range_update=train_range_2, data_attr_update=True)
     model2_test = GagesModel.update_data_model(
         self.config_data_2, origin_test,
         t_range_update=test_range_2, data_attr_update=True,
         train_stat_dict=model2_train.stat_dict)

     # The NID quick data lives next to the DB directory: drop the last
     # "/"-separated component of the DB path and descend into nid/quickdata.
     db_parent = "/".join(self.config_data_2.data_path["DB"].split("/")[:-1])
     nid_quick_dir = os.path.join(db_parent, "nid", "quickdata")
     nid_model = NidModel.load_nidmodel(
         nid_quick_dir,
         nid_file=self.nid_file,
         nid_source_file_name='nid_source.txt',
         nid_data_file_name='nid_data.shp')
     main_purposes = unserialize_json(
         os.path.join(nid_quick_dir, "dam_main_purpose_dict.json"))

     df1 = choose_which_purpose(
         GagesDamDataModel(model1_test, nid_model, True, main_purposes))
     df2 = choose_which_purpose(
         GagesDamDataModel(model2_test, nid_model, True, main_purposes))

     with torch.cuda.device(2):
         inv_data_model = GagesInvDataModel(df1, df2)
         pred, obs = test_lstm_inv(inv_data_model, epoch=self.test_epoch)
         site_ids = df2.t_s_dict["sites_id"]
         basin_area = df2.data_source.read_attr(site_ids, ['DRAIN_SQKM'],
                                                is_return_dict=False)
         mean_prep = df2.data_source.read_attr(site_ids, ['PPTAVG_BASIN'],
                                               is_return_dict=False)
         # Rescale the basin precipitation attribute before de-normalizing
         # (presumably a unit conversion to a daily value; confirm units).
         mean_prep = mean_prep / 365 * 10
         pred = _basin_norm(pred, basin_area, mean_prep, to_norm=False)
         obs = _basin_norm(obs, basin_area, mean_prep, to_norm=False)
         temp_dir = df2.data_source.data_config.data_path['Temp']
         save_result(temp_dir, self.test_epoch, pred, obs)
Пример #14
0
 # Excerpt: for every split index i, load the data model saved for that
 # split and test its pretrained network on a separately prepared test set.
 # NOTE(review): `j`, `data_model_test`, `gages_configs_test`,
 # `gages_sites_test`, `camels_exp_lst` and `test_epoch` are bound outside
 # this excerpt -- confirm that `j` (and not `i`) is the intended index.
 for i in range(camels_pub_split_num):
     # Load the i-th data model; str(i) is passed as the second positional
     # argument after the "Temp" path.
     data_model_i = GagesModel.load_datamodel(
         config_data.data_path["Temp"],
         str(i),
         data_source_file_name='data_source.txt',
         stat_file_name='Statistics.json',
         flow_file_name='flow.npy',
         forcing_file_name='forcing.npy',
         attr_file_name='attr.npy',
         f_dict_file_name='dictFactorize.json',
         var_dict_file_name='dictAttribute.json',
         t_s_dict_file_name='dictTimeSpace.json')
     # Restrict the test data model to the j-th site list and reuse the
     # statistics of the loaded i-th model as train_stat_dict.
     gages_model_test = GagesModel.update_data_model(
         gages_configs_test[j],
         data_model_test,
         sites_id_update=gages_sites_test[j],
         data_attr_update=True,
         train_stat_dict=data_model_i.stat_dict,
         screen_basin_area_huc4=False)
     with torch.cuda.device(0):
         # Weights file of the i-th model at epoch `test_epoch`, looked up in
         # that model's "Out" directory.
         pretrained_model_file = os.path.join(
             data_model_i.data_source.data_config.data_path["Out"],
             "model_Ep" + str(test_epoch) + ".pt")
         # Name built from the first entry of camels_exp_lst and the split
         # index.
         pretrained_model_name = camels_exp_lst[
             0] + "_pretrained_model" + str(i)
         pred, obs = master_test_with_pretrained_model(
             gages_model_test, pretrained_model_file,
             pretrained_model_name)
         # Read the 'DRAIN_SQKM' attribute for the tested sites; the excerpt
         # ends before `basin_area` is used.
         basin_area = gages_model_test.data_source.read_attr(
             gages_model_test.t_s_dict["sites_id"], ['DRAIN_SQKM'],
             is_return_dict=False)
Пример #15
0
def synergy_ecoregion(args):
    """Train (optionally) and test one model per ecoregion code.

    ``cfg`` is first updated from ``args``.  For every ``("ECO2_CODE", code)``
    pair the gauges returned by ``GagesSource.choose_some_basins`` are
    intersected with the sites of the "conus-all_90-10_nan-0.0_00-1.0" quick
    data; ecoregions without a common site are skipped.  For the remaining
    ones a per-ecoregion config is created with ``GagesConfig.set_subdir``,
    train and test data models are derived, optionally saved, and the model
    is trained (when ``cfg.TRAIN_MODE`` is truthy) and tested.

    Args:
        args: Options forwarded to ``update_cfg(cfg, args)``.

    Returns:
        None.  Results are written through ``save_result`` (and through
        ``save_datamodel`` when ``cfg.CACHE.STATE`` is truthy).
    """
    update_cfg(cfg, args)
    cache = cfg.CACHE.STATE
    train_mode = cfg.TRAIN_MODE
    test_epoch = cfg.TEST_EPOCH
    # Base configuration; it must stay untouched inside the loop so that
    # every ecoregion is selected with the same settings.
    config_data = GagesConfig(cfg)
    eco_names = [("ECO2_CODE", 5.2), ("ECO2_CODE", 5.3), ("ECO2_CODE", 6.2),
                 ("ECO2_CODE", 7.1), ("ECO2_CODE", 8.1), ("ECO2_CODE", 8.2),
                 ("ECO2_CODE", 8.3), ("ECO2_CODE", 8.4), ("ECO2_CODE", 8.5),
                 ("ECO2_CODE", 9.2), ("ECO2_CODE", 9.3), ("ECO2_CODE", 9.4),
                 ("ECO2_CODE", 9.5), ("ECO2_CODE", 9.6), ("ECO2_CODE", 10.1),
                 ("ECO2_CODE", 10.2), ("ECO2_CODE", 10.4), ("ECO2_CODE", 11.1),
                 ("ECO2_CODE", 12.1), ("ECO2_CODE", 13.1)]

    # File-name keyword sets. The load names carry the ``.npy`` extension for
    # the arrays, the save names do not.
    train_load_names = dict(
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow.npy',
        forcing_file_name='forcing.npy',
        attr_file_name='attr.npy',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    test_load_names = dict(
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')
    train_save_names = dict(
        data_source_file_name='data_source.txt',
        stat_file_name='Statistics.json',
        flow_file_name='flow',
        forcing_file_name='forcing',
        attr_file_name='attr',
        f_dict_file_name='dictFactorize.json',
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    test_save_names = dict(
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow',
        forcing_file_name='test_forcing',
        attr_file_name='test_attr',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')

    quick_data_dir = os.path.join(config_data.data_path["DB"], "quickdata")
    data_dir = os.path.join(quick_data_dir, "conus-all_90-10_nan-0.0_00-1.0")
    data_model_train = GagesModel.load_datamodel(data_dir,
                                                 **train_load_names)
    data_model_test = GagesModel.load_datamodel(data_dir, **test_load_names)

    for eco_name in eco_names:
        # Always select with the base config: rebinding `config_data` here
        # would make later iterations select basins with the previous
        # ecoregion's sub-directory config.
        source_data = GagesSource.choose_some_basins(
            config_data,
            config_data.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            ecoregion=eco_name)
        sites_id = source_data.all_configs['flow_screen_gage_id']
        sites_id_inter = np.intersect1d(data_model_train.t_s_dict["sites_id"],
                                        sites_id)
        if sites_id_inter.size < 1:
            continue
        eco_config_data = GagesConfig.set_subdir(cfg, str(eco_name[1]))
        # Only request sites that exist in the loaded data model, i.e. the
        # intersection computed above rather than the raw selection.
        gages_model_train = GagesModel.update_data_model(
            eco_config_data,
            data_model_train,
            sites_id_update=sites_id_inter,
            data_attr_update=True,
            screen_basin_area_huc4=False)
        gages_model_test = GagesModel.update_data_model(
            eco_config_data,
            data_model_test,
            sites_id_update=sites_id_inter,
            data_attr_update=True,
            train_stat_dict=gages_model_train.stat_dict,
            screen_basin_area_huc4=False)
        if cache:
            save_datamodel(gages_model_train, **train_save_names)
            save_datamodel(gages_model_test, **test_save_names)
            print("save ecoregion " + str(eco_name[1]) + " data model")

        with torch.cuda.device(0):
            if train_mode:
                master_train(gages_model_train)
            pred, obs = master_test(gages_model_test, epoch=test_epoch)
            basin_area = gages_model_test.data_source.read_attr(
                gages_model_test.t_s_dict["sites_id"], ['DRAIN_SQKM'],
                is_return_dict=False)
            mean_prep = gages_model_test.data_source.read_attr(
                gages_model_test.t_s_dict["sites_id"], ['PPTAVG_BASIN'],
                is_return_dict=False)
            # Rescale the basin precipitation attribute before
            # de-normalizing (presumably a unit conversion; confirm units).
            mean_prep = mean_prep / 365 * 10
            pred = _basin_norm(pred, basin_area, mean_prep, to_norm=False)
            obs = _basin_norm(obs, basin_area, mean_prep, to_norm=False)
            save_result(
                gages_model_test.data_source.data_config.data_path['Temp'],
                test_epoch, pred, obs)
Пример #16
0
 def test_gages_data_model(self):
     """Derive and save train/test data models for the ``dam_num=0`` basins.

     The gauge ids come from ``GagesSource.choose_some_basins`` with
     ``dam_num=0``; the "conus-all_85-05_nan-0.1_00-1.0" quick data is then
     restricted to those ids and both resulting models are saved.
     """
     base_config = self.config_data
     selection = GagesSource.choose_some_basins(
         base_config,
         base_config.model_dict["data"]["tRangeTrain"],
         screen_basin_area_huc4=False,
         dam_num=0)
     chosen_sites = selection.all_configs['flow_screen_gage_id']

     train_load_names = dict(
         data_source_file_name='data_source.txt',
         stat_file_name='Statistics.json',
         flow_file_name='flow.npy',
         forcing_file_name='forcing.npy',
         attr_file_name='attr.npy',
         f_dict_file_name='dictFactorize.json',
         var_dict_file_name='dictAttribute.json',
         t_s_dict_file_name='dictTimeSpace.json')
     test_load_names = dict(
         data_source_file_name='test_data_source.txt',
         stat_file_name='test_Statistics.json',
         flow_file_name='test_flow.npy',
         forcing_file_name='test_forcing.npy',
         attr_file_name='test_attr.npy',
         f_dict_file_name='test_dictFactorize.json',
         var_dict_file_name='test_dictAttribute.json',
         t_s_dict_file_name='test_dictTimeSpace.json')
     cached_dir = os.path.join(base_config.data_path["DB"], "quickdata",
                               "conus-all_85-05_nan-0.1_00-1.0")
     origin_train = GagesModel.load_datamodel(cached_dir, **train_load_names)
     origin_test = GagesModel.load_datamodel(cached_dir, **test_load_names)

     model_train = GagesModel.update_data_model(
         base_config, origin_train,
         sites_id_update=chosen_sites,
         screen_basin_area_huc4=False)
     # The test model reuses the statistics of the train model.
     model_test = GagesModel.update_data_model(
         base_config, origin_test,
         sites_id_update=chosen_sites,
         train_stat_dict=model_train.stat_dict,
         screen_basin_area_huc4=False)

     # The save names carry no ``.npy`` extension for the arrays.
     train_save_names = dict(
         data_source_file_name='data_source.txt',
         stat_file_name='Statistics.json',
         flow_file_name='flow',
         forcing_file_name='forcing',
         attr_file_name='attr',
         f_dict_file_name='dictFactorize.json',
         var_dict_file_name='dictAttribute.json',
         t_s_dict_file_name='dictTimeSpace.json')
     test_save_names = dict(
         data_source_file_name='test_data_source.txt',
         stat_file_name='test_Statistics.json',
         flow_file_name='test_flow',
         forcing_file_name='test_forcing',
         attr_file_name='test_attr',
         f_dict_file_name='test_dictFactorize.json',
         var_dict_file_name='test_dictAttribute.json',
         t_s_dict_file_name='test_dictTimeSpace.json')
     save_datamodel(model_train, **train_save_names)
     save_datamodel(model_test, **test_save_names)
     print("read and save data model")
Пример #17
0
 def test_gages_data_model(self):
     """Build train/test data models for basins with major dams and cache them.

     When ``cfg.CACHE.QUICK_DATA`` is truthy the models are derived from the
     "conus-all_90-10_nan-0.0_00-1.0" quick data; otherwise they are taken
     from a freshly constructed ``GagesModels``.  They are saved only when
     ``cfg.CACHE.STATE`` is truthy.
     """
     base_config = self.config_data
     dam_range = [1, 200]  # max major dam num is 155
     if not cfg.CACHE.QUICK_DATA:
         built = GagesModels(base_config,
                             screen_basin_area_huc4=False,
                             major_dam_num=dam_range)
         model_train = built.data_model_train
         model_test = built.data_model_test
     else:
         selection = GagesSource.choose_some_basins(
             base_config,
             base_config.model_dict["data"]["tRangeTrain"],
             screen_basin_area_huc4=False,
             major_dam_num=dam_range)
         chosen_sites = selection.all_configs['flow_screen_gage_id']
         print("The binary data has exsited")
         cached_dir = os.path.join(base_config.data_path["DB"], "quickdata",
                                   "conus-all_90-10_nan-0.0_00-1.0")
         train_load_names = dict(
             data_source_file_name='data_source.txt',
             stat_file_name='Statistics.json',
             flow_file_name='flow.npy',
             forcing_file_name='forcing.npy',
             attr_file_name='attr.npy',
             f_dict_file_name='dictFactorize.json',
             var_dict_file_name='dictAttribute.json',
             t_s_dict_file_name='dictTimeSpace.json')
         test_load_names = dict(
             data_source_file_name='test_data_source.txt',
             stat_file_name='test_Statistics.json',
             flow_file_name='test_flow.npy',
             forcing_file_name='test_forcing.npy',
             attr_file_name='test_attr.npy',
             f_dict_file_name='test_dictFactorize.json',
             var_dict_file_name='test_dictAttribute.json',
             t_s_dict_file_name='test_dictTimeSpace.json')
         origin_train = GagesModel.load_datamodel(cached_dir,
                                                  **train_load_names)
         origin_test = GagesModel.load_datamodel(cached_dir,
                                                 **test_load_names)
         model_train = GagesModel.update_data_model(
             base_config, origin_train,
             sites_id_update=chosen_sites,
             screen_basin_area_huc4=False)
         # The test model reuses the statistics of the train model.
         model_test = GagesModel.update_data_model(
             base_config, origin_test,
             sites_id_update=chosen_sites,
             train_stat_dict=model_train.stat_dict,
             screen_basin_area_huc4=False)

     if cfg.CACHE.STATE:
         # The save names carry no ``.npy`` extension for the arrays.
         train_save_names = dict(
             data_source_file_name='data_source.txt',
             stat_file_name='Statistics.json',
             flow_file_name='flow',
             forcing_file_name='forcing',
             attr_file_name='attr',
             f_dict_file_name='dictFactorize.json',
             var_dict_file_name='dictAttribute.json',
             t_s_dict_file_name='dictTimeSpace.json')
         test_save_names = dict(
             data_source_file_name='test_data_source.txt',
             stat_file_name='test_Statistics.json',
             flow_file_name='test_flow',
             forcing_file_name='test_forcing',
             attr_file_name='test_attr',
             f_dict_file_name='test_dictFactorize.json',
             var_dict_file_name='test_dictAttribute.json',
             t_s_dict_file_name='test_dictTimeSpace.json')
         save_datamodel(model_train, **train_save_names)
         save_datamodel(model_test, **test_save_names)
         print("read and save data model")
Пример #18
0
    def test_siminv_data_temp(self):
        """Derive and save the natural-flow and LSTM train/test data models.

        The sites of the "conus-all_90-10_nan-0.0_00-1.0" quick data are
        intersected with two selections from ``choose_some_basins``
        (``major_dam_num=0`` and ``major_dam_num=[1, 2000]``).  The first
        intersection feeds the ``config_data_natflow`` models (saved with the
        label "1"), the second the ``config_data_lstm`` models (label "2").
        """
        train_load_names = dict(
            data_source_file_name='data_source.txt',
            stat_file_name='Statistics.json',
            flow_file_name='flow.npy',
            forcing_file_name='forcing.npy',
            attr_file_name='attr.npy',
            f_dict_file_name='dictFactorize.json',
            var_dict_file_name='dictAttribute.json',
            t_s_dict_file_name='dictTimeSpace.json')
        test_load_names = dict(
            data_source_file_name='test_data_source.txt',
            stat_file_name='test_Statistics.json',
            flow_file_name='test_flow.npy',
            forcing_file_name='test_forcing.npy',
            attr_file_name='test_attr.npy',
            f_dict_file_name='test_dictFactorize.json',
            var_dict_file_name='test_dictAttribute.json',
            t_s_dict_file_name='test_dictTimeSpace.json')
        # The save names carry no ``.npy`` extension for the arrays.
        train_save_names = dict(
            data_source_file_name='data_source.txt',
            stat_file_name='Statistics.json',
            flow_file_name='flow',
            forcing_file_name='forcing',
            attr_file_name='attr',
            f_dict_file_name='dictFactorize.json',
            var_dict_file_name='dictAttribute.json',
            t_s_dict_file_name='dictTimeSpace.json')
        test_save_names = dict(
            data_source_file_name='test_data_source.txt',
            stat_file_name='test_Statistics.json',
            flow_file_name='test_flow',
            forcing_file_name='test_forcing',
            attr_file_name='test_attr',
            f_dict_file_name='test_dictFactorize.json',
            var_dict_file_name='test_dictAttribute.json',
            t_s_dict_file_name='test_dictTimeSpace.json')

        natflow_config = self.config_data_natflow
        cached_dir = os.path.join(natflow_config.data_path["DB"],
                                  "quickdata",
                                  "conus-all_90-10_nan-0.0_00-1.0")
        origin_train = GagesModel.load_datamodel(cached_dir,
                                                 **train_load_names)
        origin_test = GagesModel.load_datamodel(cached_dir,
                                                **test_load_names)
        conus_sites = origin_train.t_s_dict["sites_id"]

        # Selection with major_dam_num=0, limited to the loaded sites.
        no_dam_selection = GagesSource.choose_some_basins(
            natflow_config,
            natflow_config.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            major_dam_num=0)
        no_dam_sites = np.intersect1d(
            conus_sites, no_dam_selection.all_configs['flow_screen_gage_id'])

        # Selection with major_dam_num=[1, 2000], limited to the loaded sites.
        dam_selection = GagesSource.choose_some_basins(
            natflow_config,
            natflow_config.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            major_dam_num=[1, 2000])
        dam_sites = np.intersect1d(
            conus_sites, dam_selection.all_configs['flow_screen_gage_id'])

        natflow_train = GagesModel.update_data_model(
            natflow_config, origin_train,
            sites_id_update=no_dam_sites,
            data_attr_update=True,
            screen_basin_area_huc4=False)
        natflow_test = GagesModel.update_data_model(
            natflow_config, origin_test,
            sites_id_update=no_dam_sites,
            data_attr_update=True,
            train_stat_dict=natflow_train.stat_dict,
            screen_basin_area_huc4=False)

        lstm_train = GagesModel.update_data_model(
            self.config_data_lstm, origin_train,
            sites_id_update=dam_sites,
            data_attr_update=True,
            screen_basin_area_huc4=False)
        lstm_test = GagesModel.update_data_model(
            self.config_data_lstm, origin_test,
            sites_id_update=dam_sites,
            data_attr_update=True,
            train_stat_dict=lstm_train.stat_dict,
            screen_basin_area_huc4=False)

        # Persist in the order: natflow train/test, then lstm train/test.
        save_jobs = (
            (natflow_train, "1", train_save_names),
            (natflow_test, "1", test_save_names),
            (lstm_train, "2", train_save_names),
            (lstm_test, "2", test_save_names),
        )
        for model, label, file_names in save_jobs:
            save_datamodel(model, label, **file_names)
        print("read and save data model")
Пример #19
0
    def test_some_reservoirs(self):
        """Derive and save data models for the union of two basin selections.

        One selection uses ``DOR=0.02``, the other ``dam_num=0``.  The sorted
        union of their gauge ids restricts the
        "conus-all_90-10_nan-0.0_00-1.0" quick data, and the resulting train
        and test models are saved.
        """
        base_config = self.config_data
        train_range = base_config.model_dict["data"]["tRangeTrain"]

        dor_selection = GagesSource.choose_some_basins(
            base_config,
            train_range,
            screen_basin_area_huc4=False,
            DOR=0.02)
        # Selection with dam_num=0 (presumably the basins without any dam;
        # confirm against choose_some_basins).
        no_dam_selection = GagesSource.choose_some_basins(
            base_config,
            base_config.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            dam_num=0)

        dor_sites = np.array(dor_selection.all_configs['flow_screen_gage_id'])
        no_dam_sites = np.array(
            no_dam_selection.all_configs['flow_screen_gage_id'])
        merged_sites = np.union1d(dor_sites, no_dam_sites)
        chosen_sites = np.sort(merged_sites).tolist()

        train_load_names = dict(
            data_source_file_name='data_source.txt',
            stat_file_name='Statistics.json',
            flow_file_name='flow.npy',
            forcing_file_name='forcing.npy',
            attr_file_name='attr.npy',
            f_dict_file_name='dictFactorize.json',
            var_dict_file_name='dictAttribute.json',
            t_s_dict_file_name='dictTimeSpace.json')
        test_load_names = dict(
            data_source_file_name='test_data_source.txt',
            stat_file_name='test_Statistics.json',
            flow_file_name='test_flow.npy',
            forcing_file_name='test_forcing.npy',
            attr_file_name='test_attr.npy',
            f_dict_file_name='test_dictFactorize.json',
            var_dict_file_name='test_dictAttribute.json',
            t_s_dict_file_name='test_dictTimeSpace.json')
        cached_dir = os.path.join(base_config.data_path["DB"], "quickdata",
                                  "conus-all_90-10_nan-0.0_00-1.0")
        origin_train = GagesModel.load_datamodel(cached_dir,
                                                 **train_load_names)
        origin_test = GagesModel.load_datamodel(cached_dir,
                                                **test_load_names)

        model_train = GagesModel.update_data_model(
            base_config, origin_train,
            sites_id_update=chosen_sites,
            data_attr_update=True,
            screen_basin_area_huc4=False)
        # The test model reuses the statistics of the train model.
        model_test = GagesModel.update_data_model(
            base_config, origin_test,
            sites_id_update=chosen_sites,
            data_attr_update=True,
            train_stat_dict=model_train.stat_dict,
            screen_basin_area_huc4=False)

        # The save names carry no ``.npy`` extension for the arrays.
        train_save_names = dict(
            data_source_file_name='data_source.txt',
            stat_file_name='Statistics.json',
            flow_file_name='flow',
            forcing_file_name='forcing',
            attr_file_name='attr',
            f_dict_file_name='dictFactorize.json',
            var_dict_file_name='dictAttribute.json',
            t_s_dict_file_name='dictTimeSpace.json')
        test_save_names = dict(
            data_source_file_name='test_data_source.txt',
            stat_file_name='test_Statistics.json',
            flow_file_name='test_flow',
            forcing_file_name='test_forcing',
            attr_file_name='test_attr',
            f_dict_file_name='test_dictFactorize.json',
            var_dict_file_name='test_dictAttribute.json',
            t_s_dict_file_name='test_dictTimeSpace.json')
        save_datamodel(model_train, **train_save_names)
        save_datamodel(model_test, **test_save_names)
        print("read and save data model")
Пример #20
0
 def test_gages_sim_data_model(self):
     """Reload the cached "allref" / "allnonref" data models and save them again.

     The train and test data models are loaded from both quickdata folders,
     updated with ``self.sim_config_data`` (allref) or ``self.config_data``
     (allnonref), and written out with ``save_datamodel``. Each test model is
     updated with the ``stat_dict`` of its matching train model.
     """
     # Names of the cached train-period files handed to load_datamodel.
     train_load_names = {
         'data_source_file_name': 'data_source.txt',
         'stat_file_name': 'Statistics.json',
         'flow_file_name': 'flow.npy',
         'forcing_file_name': 'forcing.npy',
         'attr_file_name': 'attr.npy',
         'f_dict_file_name': 'dictFactorize.json',
         'var_dict_file_name': 'dictAttribute.json',
         't_s_dict_file_name': 'dictTimeSpace.json',
     }
     # Names of the cached test-period files handed to load_datamodel.
     test_load_names = {
         'data_source_file_name': 'test_data_source.txt',
         'stat_file_name': 'test_Statistics.json',
         'flow_file_name': 'test_flow.npy',
         'forcing_file_name': 'test_forcing.npy',
         'attr_file_name': 'test_attr.npy',
         'f_dict_file_name': 'test_dictFactorize.json',
         'var_dict_file_name': 'test_dictAttribute.json',
         't_s_dict_file_name': 'test_dictTimeSpace.json',
     }
     # save_datamodel is given the array names without the ".npy" suffix.
     train_save_names = dict(train_load_names,
                             flow_file_name='flow',
                             forcing_file_name='forcing',
                             attr_file_name='attr')
     test_save_names = dict(test_load_names,
                            flow_file_name='test_flow',
                            forcing_file_name='test_forcing',
                            attr_file_name='test_attr')

     cache_root = os.path.join(self.config_data.data_path["DB"], "quickdata")
     ref_dir = os.path.join(cache_root, "allref_85-05_nan-0.1_00-1.0")
     nonref_dir = os.path.join(cache_root, "allnonref_85-05_nan-0.1_00-1.0")

     # Load order: allref train, allnonref train, allref test, allnonref test.
     ref_train_loaded = GagesModel.load_datamodel(ref_dir, **train_load_names)
     nonref_train_loaded = GagesModel.load_datamodel(nonref_dir,
                                                     **train_load_names)
     ref_test_loaded = GagesModel.load_datamodel(ref_dir, **test_load_names)
     nonref_test_loaded = GagesModel.load_datamodel(nonref_dir,
                                                    **test_load_names)

     sim_gages_model_train = GagesModel.update_data_model(
         self.sim_config_data, ref_train_loaded, data_attr_update=True)
     gages_model_train = GagesModel.update_data_model(
         self.config_data, nonref_train_loaded, data_attr_update=True)
     sim_gages_model_test = GagesModel.update_data_model(
         self.sim_config_data,
         ref_test_loaded,
         data_attr_update=True,
         train_stat_dict=sim_gages_model_train.stat_dict)
     gages_model_test = GagesModel.update_data_model(
         self.config_data,
         nonref_test_loaded,
         data_attr_update=True,
         train_stat_dict=gages_model_train.stat_dict)

     # NOTE(review): the second positional argument ("1" / "2") presumably
     # tells save_datamodel which of the two models is being written --
     # confirm against its signature.
     save_datamodel(sim_gages_model_train, "1", **train_save_names)
     save_datamodel(sim_gages_model_test, "1", **test_save_names)
     save_datamodel(gages_model_train, "2", **train_save_names)
     save_datamodel(gages_model_test, "2", **test_save_names)
     print("read and save data model")
Пример #21
0
    def test_dam_test(self):
        """Test the natural-flow model once per main dam purpose and plot it.

        The cached "allref" and "allnonref" data models are loaded and updated,
        the NID model and the gage -> main-dam-purpose mapping are read from
        the "nid/quickdata" folder, and then, for every distinct purpose, the
        matching basins are selected, tested with ``master_test_natural_flow``,
        de-normalised with ``_basin_norm``, saved and plotted.
        """
        # Names of the cached train-period files handed to load_datamodel.
        train_load_names = {
            'data_source_file_name': 'data_source.txt',
            'stat_file_name': 'Statistics.json',
            'flow_file_name': 'flow.npy',
            'forcing_file_name': 'forcing.npy',
            'attr_file_name': 'attr.npy',
            'f_dict_file_name': 'dictFactorize.json',
            'var_dict_file_name': 'dictAttribute.json',
            't_s_dict_file_name': 'dictTimeSpace.json',
        }
        # Names of the cached test-period files handed to load_datamodel.
        test_load_names = {
            'data_source_file_name': 'test_data_source.txt',
            'stat_file_name': 'test_Statistics.json',
            'flow_file_name': 'test_flow.npy',
            'forcing_file_name': 'test_forcing.npy',
            'attr_file_name': 'test_attr.npy',
            'f_dict_file_name': 'test_dictFactorize.json',
            'var_dict_file_name': 'test_dictAttribute.json',
            't_s_dict_file_name': 'test_dictTimeSpace.json',
        }

        cache_root = os.path.join(self.config_data.data_path["DB"],
                                  "quickdata")
        ref_dir = os.path.join(cache_root, "allref_85-05_nan-0.1_00-1.0")
        nonref_dir = os.path.join(cache_root, "allnonref_85-05_nan-0.1_00-1.0")

        # Load order: allref train, allnonref train, allref test, allnonref test.
        ref_train_loaded = GagesModel.load_datamodel(ref_dir,
                                                     **train_load_names)
        nonref_train_loaded = GagesModel.load_datamodel(nonref_dir,
                                                        **train_load_names)
        ref_test_loaded = GagesModel.load_datamodel(ref_dir, **test_load_names)
        nonref_test_loaded = GagesModel.load_datamodel(nonref_dir,
                                                       **test_load_names)

        # The train models are only needed for the stat_dict they provide to
        # the test models below.
        sim_gages_model_train = GagesModel.update_data_model(
            self.sim_config_data, ref_train_loaded, data_attr_update=True)
        gages_model_train = GagesModel.update_data_model(
            self.config_data, nonref_train_loaded, data_attr_update=True)
        sim_gages_model_test = GagesModel.update_data_model(
            self.sim_config_data,
            ref_test_loaded,
            data_attr_update=True,
            train_stat_dict=sim_gages_model_train.stat_dict)
        gages_model_test = GagesModel.update_data_model(
            self.config_data,
            nonref_test_loaded,
            data_attr_update=True,
            train_stat_dict=gages_model_train.stat_dict)

        # The NID folder is a sibling of the "DB" directory: drop the last
        # "/"-separated component of the DB path and descend into nid/quickdata.
        db_path_parts = self.config_data.data_path["DB"].split("/")
        db_parent = "/".join(db_path_parts[:-1])
        nid_dir = os.path.join(db_parent, "nid", "quickdata")
        nid_input = NidModel.load_nidmodel(
            nid_dir,
            nid_file=self.nid_file,
            nid_source_file_name='nid_source.txt',
            nid_data_file_name='nid_data.shp')
        purpose_json = os.path.join(nid_dir, "dam_main_purpose_dict.json")
        gage_main_dam_purpose = unserialize_json(purpose_json)
        distinct_purposes = np.unique(list(gage_main_dam_purpose.values()))
        data_input = GagesDamDataModel(gages_model_test, nid_input, True,
                                       gage_main_dam_purpose)

        for purpose in distinct_purposes:
            sim_gages_model_test.update_model_param('train', nEpoch=300)
            gages_input = choose_which_purpose(data_input, purpose=purpose)
            # Give every purpose its own Temp/Out sub-directory.
            dir_settings = gages_input.data_source.data_config.model_dict["dir"]
            new_temp_dir = os.path.join(dir_settings["Temp"], purpose)
            new_out_dir = os.path.join(dir_settings["Out"], purpose)
            gages_input.update_datamodel_dir(new_temp_dir, new_out_dir)

            model_input = GagesSimDataModel(sim_gages_model_test, gages_input)
            pred, obs = master_test_natural_flow(model_input,
                                                 epoch=self.test_epoch)

            target_model = model_input.data_model2
            basin_area = target_model.data_source.read_attr(
                target_model.t_s_dict["sites_id"], ['DRAIN_SQKM'],
                is_return_dict=False)
            mean_prep = target_model.data_source.read_attr(
                target_model.t_s_dict["sites_id"], ['PPTAVG_BASIN'],
                is_return_dict=False)
            # NOTE(review): presumably converts an annual precipitation total
            # to a daily value with a x10 unit change -- confirm the units.
            mean_prep = mean_prep / 365 * 10
            pred = _basin_norm(pred, basin_area, mean_prep, to_norm=False)
            obs = _basin_norm(obs, basin_area, mean_prep, to_norm=False)

            result_dir = target_model.data_source.data_config.data_path['Temp']
            save_result(result_dir, str(self.test_epoch), pred, obs)
            plot_we_need(gages_input,
                         obs,
                         pred,
                         id_col="STAID",
                         lon_col="LNG_GAGE",
                         lat_col="LAT_GAGE")
Пример #22
0
    def test_some_reservoirs(self):
        """Choose some small reservoirs for the 2nd LSTM (not for the simulation).

        Two pairs of train/test data models are derived from the cached
        "conus-all" data and saved: one restricted to basins screened with
        ``major_dam_num=0`` (saved with "1") and one restricted to basins that
        pass both the ``DOR=0.02`` screen and the ``major_dam_num=[1, 2000]``
        screen (saved with "2").
        """
        # Names of the cached train-period files handed to load_datamodel.
        train_load_names = {
            'data_source_file_name': 'data_source.txt',
            'stat_file_name': 'Statistics.json',
            'flow_file_name': 'flow.npy',
            'forcing_file_name': 'forcing.npy',
            'attr_file_name': 'attr.npy',
            'f_dict_file_name': 'dictFactorize.json',
            'var_dict_file_name': 'dictAttribute.json',
            't_s_dict_file_name': 'dictTimeSpace.json',
        }
        # Names of the cached test-period files handed to load_datamodel.
        test_load_names = {
            'data_source_file_name': 'test_data_source.txt',
            'stat_file_name': 'test_Statistics.json',
            'flow_file_name': 'test_flow.npy',
            'forcing_file_name': 'test_forcing.npy',
            'attr_file_name': 'test_attr.npy',
            'f_dict_file_name': 'test_dictFactorize.json',
            'var_dict_file_name': 'test_dictAttribute.json',
            't_s_dict_file_name': 'test_dictTimeSpace.json',
        }
        # save_datamodel is given the array names without the ".npy" suffix.
        train_save_names = dict(train_load_names,
                                flow_file_name='flow',
                                forcing_file_name='forcing',
                                attr_file_name='attr')
        test_save_names = dict(test_load_names,
                               flow_file_name='test_flow',
                               forcing_file_name='test_forcing',
                               attr_file_name='test_attr')

        # Read the model configuration.
        config_data = self.config_data_lstm
        # Threshold taken from the paper "High-resolution mapping of the
        # world's reservoirs and dams for sustainable river-flow management".
        dor_threshold = 0.02
        dor_source = GagesSource.choose_some_basins(
            config_data,
            config_data.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            DOR=dor_threshold)
        dor_gage_ids = dor_source.all_configs['flow_screen_gage_id']

        cache_root = os.path.join(self.config_data_lstm.data_path["DB"],
                                  "quickdata")
        cache_dir = os.path.join(cache_root, "conus-all_90-10_nan-0.0_00-1.0")
        loaded_train = GagesModel.load_datamodel(cache_dir, **train_load_names)
        loaded_test = GagesModel.load_datamodel(cache_dir, **test_load_names)
        all_conus_ids = loaded_train.t_s_dict["sites_id"]

        no_major_dam_source = GagesSource.choose_some_basins(
            self.config_data_natflow,
            self.config_data_natflow.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            major_dam_num=0)
        no_major_dam_ids = no_major_dam_source.all_configs[
            'flow_screen_gage_id']
        # In no major dam case, all sites are chosen as natural flow generator
        natflow_site_ids = np.intersect1d(all_conus_ids, no_major_dam_ids)

        dor_ids_in_conus = np.intersect1d(all_conus_ids, dor_gage_ids)
        major_dam_source = GagesSource.choose_some_basins(
            self.config_data_natflow,
            self.config_data_natflow.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            major_dam_num=[1, 2000])
        major_dam_ids = major_dam_source.all_configs['flow_screen_gage_id']
        lstm_site_ids = np.intersect1d(dor_ids_in_conus, major_dam_ids)

        gages_model_train_natflow = GagesModel.update_data_model(
            self.config_data_natflow,
            loaded_train,
            sites_id_update=natflow_site_ids,
            data_attr_update=True,
            screen_basin_area_huc4=False)
        gages_model_test_natflow = GagesModel.update_data_model(
            self.config_data_natflow,
            loaded_test,
            sites_id_update=natflow_site_ids,
            data_attr_update=True,
            train_stat_dict=gages_model_train_natflow.stat_dict,
            screen_basin_area_huc4=False)

        gages_model_train_lstm = GagesModel.update_data_model(
            self.config_data_lstm,
            loaded_train,
            sites_id_update=lstm_site_ids,
            data_attr_update=True,
            screen_basin_area_huc4=False)
        gages_model_test_lstm = GagesModel.update_data_model(
            self.config_data_lstm,
            loaded_test,
            sites_id_update=lstm_site_ids,
            data_attr_update=True,
            train_stat_dict=gages_model_train_lstm.stat_dict,
            screen_basin_area_huc4=False)

        # "1" goes with the natural-flow pair, "2" with the LSTM pair.
        save_datamodel(gages_model_train_natflow, "1", **train_save_names)
        save_datamodel(gages_model_test_natflow, "1", **test_save_names)
        save_datamodel(gages_model_train_lstm, "2", **train_save_names)
        save_datamodel(gages_model_test_lstm, "2", **test_save_names)
        print("read and save data model")
Пример #23
0
    def test_some_reservoirs(self):
        """Cache train/test data models for a control group of basins.

        The chosen sites are the sorted concatenation of (a) CONUS basins
        screened with ``major_dam_num=0`` and (b) CONUS basins that pass both
        the ``DOR=-0.02`` screen and the ``major_dam_num=[1, 2000]`` screen.
        The cached "conus-all" train and test data models are restricted to
        those sites and saved with ``save_datamodel``.
        """
        # Names of the cached train-period files handed to load_datamodel.
        train_load_names = {
            'data_source_file_name': 'data_source.txt',
            'stat_file_name': 'Statistics.json',
            'flow_file_name': 'flow.npy',
            'forcing_file_name': 'forcing.npy',
            'attr_file_name': 'attr.npy',
            'f_dict_file_name': 'dictFactorize.json',
            'var_dict_file_name': 'dictAttribute.json',
            't_s_dict_file_name': 'dictTimeSpace.json',
        }
        # Names of the cached test-period files handed to load_datamodel.
        test_load_names = {
            'data_source_file_name': 'test_data_source.txt',
            'stat_file_name': 'test_Statistics.json',
            'flow_file_name': 'test_flow.npy',
            'forcing_file_name': 'test_forcing.npy',
            'attr_file_name': 'test_attr.npy',
            'f_dict_file_name': 'test_dictFactorize.json',
            'var_dict_file_name': 'test_dictAttribute.json',
            't_s_dict_file_name': 'test_dictTimeSpace.json',
        }
        # save_datamodel is given the array names without the ".npy" suffix.
        train_save_names = dict(train_load_names,
                                flow_file_name='flow',
                                forcing_file_name='forcing',
                                attr_file_name='attr')
        test_save_names = dict(test_load_names,
                               flow_file_name='test_flow',
                               forcing_file_name='test_forcing',
                               attr_file_name='test_attr')

        # A control group for simulate/exp3.
        # The negative sign means "dor < 0.02".
        dor_threshold = -0.02
        dor_source = GagesSource.choose_some_basins(
            self.config_data,
            self.config_data.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            DOR=dor_threshold)
        dor_gage_ids = dor_source.all_configs['flow_screen_gage_id']

        cache_root = os.path.join(self.config_data.data_path["DB"],
                                  "quickdata")
        cache_dir = os.path.join(cache_root, "conus-all_90-10_nan-0.0_00-1.0")
        loaded_train = GagesModel.load_datamodel(cache_dir, **train_load_names)
        loaded_test = GagesModel.load_datamodel(cache_dir, **test_load_names)
        all_conus_ids = loaded_train.t_s_dict["sites_id"]

        no_major_dam_source = GagesSource.choose_some_basins(
            self.config_data,
            self.config_data.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            major_dam_num=0)
        no_major_dam_ids = no_major_dam_source.all_configs[
            'flow_screen_gage_id']
        # In no major dam case, all sites are chosen as natural flow generator
        no_major_dam_in_conus = np.intersect1d(all_conus_ids, no_major_dam_ids)

        dor_ids_in_conus = np.intersect1d(all_conus_ids, dor_gage_ids)
        major_dam_source = GagesSource.choose_some_basins(
            self.config_data,
            self.config_data.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            major_dam_num=[1, 2000])
        major_dam_ids = major_dam_source.all_configs['flow_screen_gage_id']
        major_dam_in_conus = np.intersect1d(dor_ids_in_conus, major_dam_ids)

        # Concatenate both groups, then sort the combined id array.
        combined_ids = np.append(no_major_dam_in_conus, major_dam_in_conus)
        chosen_sites_id = np.sort(combined_ids)

        gages_model_train_lstm = GagesModel.update_data_model(
            self.config_data,
            loaded_train,
            sites_id_update=chosen_sites_id,
            data_attr_update=True,
            screen_basin_area_huc4=False)
        # The test model is updated with the statistics of the train model.
        gages_model_test_lstm = GagesModel.update_data_model(
            self.config_data,
            loaded_test,
            sites_id_update=chosen_sites_id,
            data_attr_update=True,
            train_stat_dict=gages_model_train_lstm.stat_dict,
            screen_basin_area_huc4=False)

        save_datamodel(gages_model_train_lstm, **train_save_names)
        save_datamodel(gages_model_test_lstm, **test_save_names)
        print("read and save data model")
Пример #24
0
def pub_lstm(args):
    update_cfg(cfg, args)
    random_seed = cfg.RANDOM_SEED
    test_epoch = cfg.TEST_EPOCH
    gpu_num = cfg.CTX
    train_mode = cfg.TRAIN_MODE
    cache = cfg.CACHE.STATE
    pub_plan = cfg.PUB_PLAN
    plus = cfg.PLUS
    dor = cfg.GAGES.attrScreenParams.DOR
    split_num = cfg.SPLIT_NUM
    print("train and test for PUB: \n")
    config_data = GagesConfig(cfg)
    if cache:
        eco_names = [
            ("ECO2_CODE", 5.2), ("ECO2_CODE", 5.3), ("ECO2_CODE", 6.2),
            ("ECO2_CODE", 7.1), ("ECO2_CODE", 8.1), ("ECO2_CODE", 8.2),
            ("ECO2_CODE", 8.3), ("ECO2_CODE", 8.4), ("ECO2_CODE", 8.5),
            ("ECO2_CODE", 9.2), ("ECO2_CODE", 9.3), ("ECO2_CODE", 9.4),
            ("ECO2_CODE", 9.5), ("ECO2_CODE", 9.6), ("ECO2_CODE", 10.1),
            ("ECO2_CODE", 10.2), ("ECO2_CODE", 10.4), ("ECO2_CODE", 11.1),
            ("ECO2_CODE", 12.1), ("ECO2_CODE", 13.1)
        ]
        quick_data_dir = os.path.join(config_data.data_path["DB"], "quickdata")
        data_dir = os.path.join(quick_data_dir,
                                "conus-all_90-10_nan-0.0_00-1.0")
        data_model_train = GagesModel.load_datamodel(
            data_dir,
            data_source_file_name='data_source.txt',
            stat_file_name='Statistics.json',
            flow_file_name='flow.npy',
            forcing_file_name='forcing.npy',
            attr_file_name='attr.npy',
            f_dict_file_name='dictFactorize.json',
            var_dict_file_name='dictAttribute.json',
            t_s_dict_file_name='dictTimeSpace.json')
        data_model_test = GagesModel.load_datamodel(
            data_dir,
            data_source_file_name='test_data_source.txt',
            stat_file_name='test_Statistics.json',
            flow_file_name='test_flow.npy',
            forcing_file_name='test_forcing.npy',
            attr_file_name='test_attr.npy',
            f_dict_file_name='test_dictFactorize.json',
            var_dict_file_name='test_dictAttribute.json',
            t_s_dict_file_name='test_dictTimeSpace.json')
        conus_sites_id = data_model_train.t_s_dict["sites_id"]
        if pub_plan == 0:
            """do a pub test like freddy's"""
            camels531_gageid_file = os.path.join(config_data.data_path["DB"],
                                                 "camels531", "camels531.txt")
            gauge_df = pd.read_csv(camels531_gageid_file,
                                   dtype={"GaugeID": str})
            gauge_list = gauge_df["GaugeID"].values
            all_sites_camels_531 = np.sort(
                [str(gauge).zfill(8) for gauge in gauge_list])
            sites_id_train = np.intersect1d(conus_sites_id,
                                            all_sites_camels_531)
            # basins not in CAMELS
            sites_id_test = [
                a_temp_site for a_temp_site in conus_sites_id
                if a_temp_site not in all_sites_camels_531
            ]
            assert (all(x < y
                        for x, y in zip(sites_id_test, sites_id_test[1:])))
        elif pub_plan == 1 or pub_plan == 4:
            source_data_dor1 = GagesSource.choose_some_basins(
                config_data,
                config_data.model_dict["data"]["tRangeTrain"],
                screen_basin_area_huc4=False,
                DOR=-dor)
            # basins with dams
            source_data_withdams = GagesSource.choose_some_basins(
                config_data,
                config_data.model_dict["data"]["tRangeTrain"],
                screen_basin_area_huc4=False,
                dam_num=[1, 100000])
            # basins without dams
            source_data_withoutdams = GagesSource.choose_some_basins(
                config_data,
                config_data.model_dict["data"]["tRangeTrain"],
                screen_basin_area_huc4=False,
                dam_num=0)

            sites_id_dor1 = source_data_dor1.all_configs['flow_screen_gage_id']
            sites_id_withdams = source_data_withdams.all_configs[
                'flow_screen_gage_id']

            if pub_plan == 1:
                sites_id_train = source_data_withoutdams.all_configs[
                    'flow_screen_gage_id']
                sites_id_test = np.intersect1d(
                    np.array(sites_id_dor1),
                    np.array(sites_id_withdams)).tolist()
            else:
                sites_id_train = np.intersect1d(
                    np.array(sites_id_dor1),
                    np.array(sites_id_withdams)).tolist()
                sites_id_test = source_data_withoutdams.all_configs[
                    'flow_screen_gage_id']

        elif pub_plan == 2 or pub_plan == 5:
            source_data_dor1 = GagesSource.choose_some_basins(
                config_data,
                config_data.model_dict["data"]["tRangeTrain"],
                screen_basin_area_huc4=False,
                DOR=dor)
            # basins without dams
            source_data_withoutdams = GagesSource.choose_some_basins(
                config_data,
                config_data.model_dict["data"]["tRangeTrain"],
                screen_basin_area_huc4=False,
                dam_num=0)

            if pub_plan == 2:
                sites_id_train = source_data_withoutdams.all_configs[
                    'flow_screen_gage_id']
                sites_id_test = source_data_dor1.all_configs[
                    'flow_screen_gage_id']
            else:
                sites_id_train = source_data_dor1.all_configs[
                    'flow_screen_gage_id']
                sites_id_test = source_data_withoutdams.all_configs[
                    'flow_screen_gage_id']

        elif pub_plan == 3 or pub_plan == 6:
            dor_1 = -dor
            dor_2 = dor
            source_data_dor1 = GagesSource.choose_some_basins(
                config_data,
                config_data.model_dict["data"]["tRangeTrain"],
                screen_basin_area_huc4=False,
                DOR=dor_1)
            # basins with dams
            source_data_withdams = GagesSource.choose_some_basins(
                config_data,
                config_data.model_dict["data"]["tRangeTrain"],
                screen_basin_area_huc4=False,
                dam_num=[1, 100000])
            sites_id_dor1 = source_data_dor1.all_configs['flow_screen_gage_id']
            sites_id_withdams = source_data_withdams.all_configs[
                'flow_screen_gage_id']

            source_data_dor2 = GagesSource.choose_some_basins(
                config_data,
                config_data.model_dict["data"]["tRangeTrain"],
                screen_basin_area_huc4=False,
                DOR=dor_2)

            if pub_plan == 3:
                sites_id_train = np.intersect1d(
                    np.array(sites_id_dor1),
                    np.array(sites_id_withdams)).tolist()
                sites_id_test = source_data_dor2.all_configs[
                    'flow_screen_gage_id']
            else:
                sites_id_train = source_data_dor2.all_configs[
                    'flow_screen_gage_id']
                sites_id_test = np.intersect1d(
                    np.array(sites_id_dor1),
                    np.array(sites_id_withdams)).tolist()

        else:
            print("wrong plan")
            sites_id_train = None
            sites_id_test = None

        train_sites_in_conus = np.intersect1d(conus_sites_id, sites_id_train)
        test_sites_in_conus = np.intersect1d(conus_sites_id, sites_id_test)

        if plus == 0:
            all_index_lst_train_1 = []
            # all sites come from train1 dataset
            sites_lst_train = []
            all_index_lst_test_1 = []
            sites_lst_test_1 = []
            all_index_lst_test_2 = []
            sites_lst_test_2 = []
            np.random.seed(random_seed)
            kf = KFold(n_splits=split_num,
                       shuffle=True,
                       random_state=random_seed)
            eco_name_chosen = []
            for eco_name in eco_names:
                eco_source_data = GagesSource.choose_some_basins(
                    config_data,
                    config_data.model_dict["data"]["tRangeTrain"],
                    screen_basin_area_huc4=False,
                    ecoregion=eco_name)
                eco_sites_id = eco_source_data.all_configs[
                    'flow_screen_gage_id']
                train_sites_id_inter = np.intersect1d(train_sites_in_conus,
                                                      eco_sites_id)
                test_sites_id_inter = np.intersect1d(test_sites_in_conus,
                                                     eco_sites_id)
                if train_sites_id_inter.size < split_num or test_sites_id_inter.size < 1:
                    continue
                for train, test in kf.split(train_sites_id_inter):
                    all_index_lst_train_1.append(train)
                    sites_lst_train.append(train_sites_id_inter[train])
                    all_index_lst_test_1.append(test)
                    sites_lst_test_1.append(train_sites_id_inter[test])
                    if test_sites_id_inter.size < test.size:
                        all_index_lst_test_2.append(
                            np.arange(test_sites_id_inter.size))
                        sites_lst_test_2.append(test_sites_id_inter)
                    else:
                        test2_chosen_idx = np.random.choice(
                            test_sites_id_inter.size, test.size, replace=False)
                        all_index_lst_test_2.append(test2_chosen_idx)
                        sites_lst_test_2.append(
                            test_sites_id_inter[test2_chosen_idx])
                eco_name_chosen.append(eco_name)
        elif plus == -1:
            print("camels pub, only do pub on the camels basins")
            all_index_lst_train_1 = []
            # all sites come from train1 dataset
            sites_lst_train = []
            all_index_lst_test_1 = []
            sites_lst_test_1 = []
            np.random.seed(random_seed)
            kf = KFold(n_splits=split_num,
                       shuffle=True,
                       random_state=random_seed)
            eco_name_chosen = []
            for eco_name in eco_names:
                eco_source_data = GagesSource.choose_some_basins(
                    config_data,
                    config_data.model_dict["data"]["tRangeTrain"],
                    screen_basin_area_huc4=False,
                    ecoregion=eco_name)
                eco_sites_id = eco_source_data.all_configs[
                    'flow_screen_gage_id']
                train_sites_id_inter = np.intersect1d(train_sites_in_conus,
                                                      eco_sites_id)
                if train_sites_id_inter.size < split_num:
                    continue
                for train, test in kf.split(train_sites_id_inter):
                    all_index_lst_train_1.append(train)
                    sites_lst_train.append(train_sites_id_inter[train])
                    all_index_lst_test_1.append(test)
                    sites_lst_test_1.append(train_sites_id_inter[test])
                eco_name_chosen.append(eco_name)
        elif plus == -2:
            print(
                "camels pub, only do pub on the camels basins, same with freddy's split method"
            )
            all_index_lst_train_1 = []
            # all sites come from train1 dataset
            sites_lst_train = []
            all_index_lst_test_1 = []
            sites_lst_test_1 = []
            np.random.seed(random_seed)
            kf = KFold(n_splits=split_num,
                       shuffle=True,
                       random_state=random_seed)

            for train, test in kf.split(train_sites_in_conus):
                all_index_lst_train_1.append(train)
                sites_lst_train.append(train_sites_in_conus[train])
                all_index_lst_test_1.append(test)
                sites_lst_test_1.append(train_sites_in_conus[test])
        else:
            sites_lst_train = []
            sites_lst_test_1 = []
            sites_lst_test_2 = []

            np.random.seed(random_seed)
            kf = KFold(n_splits=split_num,
                       shuffle=True,
                       random_state=random_seed)
            eco_name_chosen = []
            for eco_name in eco_names:
                eco_source_data = GagesSource.choose_some_basins(
                    config_data,
                    config_data.model_dict["data"]["tRangeTrain"],
                    screen_basin_area_huc4=False,
                    ecoregion=eco_name)
                eco_sites_id = eco_source_data.all_configs[
                    'flow_screen_gage_id']
                sites_id_inter_1 = np.intersect1d(train_sites_in_conus,
                                                  eco_sites_id)
                sites_id_inter_2 = np.intersect1d(test_sites_in_conus,
                                                  eco_sites_id)

                if sites_id_inter_1.size < sites_id_inter_2.size:
                    if sites_id_inter_1.size < split_num:
                        continue
                    for train, test in kf.split(sites_id_inter_1):
                        sites_lst_train_1 = sites_id_inter_1[train]
                        sites_lst_test_1.append(sites_id_inter_1[test])

                        chosen_lst_2 = random_choice_no_return(
                            sites_id_inter_2, [train.size, test.size])
                        sites_lst_train_2 = chosen_lst_2[0]
                        sites_lst_test_2.append(chosen_lst_2[1])

                        sites_lst_train.append(
                            np.sort(
                                np.append(sites_lst_train_1,
                                          sites_lst_train_2)))

                else:
                    if sites_id_inter_2.size < split_num:
                        continue
                    for train, test in kf.split(sites_id_inter_2):
                        sites_lst_train_2 = sites_id_inter_2[train]
                        sites_lst_test_2.append(sites_id_inter_2[test])

                        chosen_lst_1 = random_choice_no_return(
                            sites_id_inter_1, [train.size, test.size])
                        sites_lst_train_1 = chosen_lst_1[0]
                        sites_lst_test_1.append(chosen_lst_1[1])

                        sites_lst_train.append(
                            np.sort(
                                np.append(sites_lst_train_1,
                                          sites_lst_train_2)))

                eco_name_chosen.append(eco_name)
        for i in range(split_num):
            sites_ids_train_ilst = [
                sites_lst_train[j] for j in range(len(sites_lst_train))
                if j % split_num == i
            ]
            sites_ids_train_i = np.sort(
                reduce(lambda x, y: np.hstack((x, y)), sites_ids_train_ilst))
            sites_ids_test_ilst_1 = [
                sites_lst_test_1[j] for j in range(len(sites_lst_test_1))
                if j % split_num == i
            ]
            sites_ids_test_i_1 = np.sort(
                reduce(lambda x, y: np.hstack((x, y)), sites_ids_test_ilst_1))

            if plus >= 0:
                sites_ids_test_ilst_2 = [
                    sites_lst_test_2[j] for j in range(len(sites_lst_test_2))
                    if j % split_num == i
                ]
                sites_ids_test_i_2 = np.sort(
                    reduce(lambda x, y: np.hstack((x, y)),
                           sites_ids_test_ilst_2))
            config_data_i = GagesConfig.set_subdir(cfg, str(i))

            gages_model_train_i = GagesModel.update_data_model(
                config_data_i,
                data_model_train,
                sites_id_update=sites_ids_train_i,
                data_attr_update=True,
                screen_basin_area_huc4=False)
            gages_model_test_baseline_i = GagesModel.update_data_model(
                config_data_i,
                data_model_test,
                sites_id_update=sites_ids_train_i,
                data_attr_update=True,
                train_stat_dict=gages_model_train_i.stat_dict,
                screen_basin_area_huc4=False)
            gages_model_test_i_1 = GagesModel.update_data_model(
                config_data_i,
                data_model_test,
                sites_id_update=sites_ids_test_i_1,
                data_attr_update=True,
                train_stat_dict=gages_model_train_i.stat_dict,
                screen_basin_area_huc4=False)
            if plus >= 0:
                gages_model_test_i_2 = GagesModel.update_data_model(
                    config_data_i,
                    data_model_test,
                    sites_id_update=sites_ids_test_i_2,
                    data_attr_update=True,
                    train_stat_dict=gages_model_train_i.stat_dict,
                    screen_basin_area_huc4=False)
            save_datamodel(gages_model_train_i,
                           data_source_file_name='data_source.txt',
                           stat_file_name='Statistics.json',
                           flow_file_name='flow',
                           forcing_file_name='forcing',
                           attr_file_name='attr',
                           f_dict_file_name='dictFactorize.json',
                           var_dict_file_name='dictAttribute.json',
                           t_s_dict_file_name='dictTimeSpace.json')
            save_datamodel(gages_model_test_baseline_i,
                           data_source_file_name='test_data_source_base.txt',
                           stat_file_name='test_Statistics_base.json',
                           flow_file_name='test_flow_base',
                           forcing_file_name='test_forcing_base',
                           attr_file_name='test_attr_base',
                           f_dict_file_name='test_dictFactorize_base.json',
                           var_dict_file_name='test_dictAttribute_base.json',
                           t_s_dict_file_name='test_dictTimeSpace_base.json')
            save_datamodel(gages_model_test_i_1,
                           data_source_file_name='test_data_source.txt',
                           stat_file_name='test_Statistics.json',
                           flow_file_name='test_flow',
                           forcing_file_name='test_forcing',
                           attr_file_name='test_attr',
                           f_dict_file_name='test_dictFactorize.json',
                           var_dict_file_name='test_dictAttribute.json',
                           t_s_dict_file_name='test_dictTimeSpace.json')
            if plus >= 0:
                save_datamodel(gages_model_test_i_2,
                               data_source_file_name='test_data_source_2.txt',
                               stat_file_name='test_Statistics_2.json',
                               flow_file_name='test_flow_2',
                               forcing_file_name='test_forcing_2',
                               attr_file_name='test_attr_2',
                               f_dict_file_name='test_dictFactorize_2.json',
                               var_dict_file_name='test_dictAttribute_2.json',
                               t_s_dict_file_name='test_dictTimeSpace_2.json')
            print("save ecoregion " + str(i) + " data model")
    with torch.cuda.device(gpu_num):
        if train_mode:
            for i in range(split_num):
                data_model = GagesModel.load_datamodel(
                    config_data.data_path["Temp"],
                    str(i),
                    data_source_file_name='data_source.txt',
                    stat_file_name='Statistics.json',
                    flow_file_name='flow.npy',
                    forcing_file_name='forcing.npy',
                    attr_file_name='attr.npy',
                    f_dict_file_name='dictFactorize.json',
                    var_dict_file_name='dictAttribute.json',
                    t_s_dict_file_name='dictTimeSpace.json')
                master_train(data_model, random_seed=random_seed)
        for i in range(split_num):
            data_model_baseline = GagesModel.load_datamodel(
                config_data.data_path["Temp"],
                str(i),
                data_source_file_name='test_data_source_base.txt',
                stat_file_name='test_Statistics_base.json',
                flow_file_name='test_flow_base.npy',
                forcing_file_name='test_forcing_base.npy',
                attr_file_name='test_attr_base.npy',
                f_dict_file_name='test_dictFactorize_base.json',
                var_dict_file_name='test_dictAttribute_base.json',
                t_s_dict_file_name='test_dictTimeSpace_base.json')
            data_model = GagesModel.load_datamodel(
                config_data.data_path["Temp"],
                str(i),
                data_source_file_name='test_data_source.txt',
                stat_file_name='test_Statistics.json',
                flow_file_name='test_flow.npy',
                forcing_file_name='test_forcing.npy',
                attr_file_name='test_attr.npy',
                f_dict_file_name='test_dictFactorize.json',
                var_dict_file_name='test_dictAttribute.json',
                t_s_dict_file_name='test_dictTimeSpace.json')
            if plus >= 0:
                data_model_2 = GagesModel.load_datamodel(
                    config_data.data_path["Temp"],
                    str(i),
                    data_source_file_name='test_data_source_2.txt',
                    stat_file_name='test_Statistics_2.json',
                    flow_file_name='test_flow_2.npy',
                    forcing_file_name='test_forcing_2.npy',
                    attr_file_name='test_attr_2.npy',
                    f_dict_file_name='test_dictFactorize_2.json',
                    var_dict_file_name='test_dictAttribute_2.json',
                    t_s_dict_file_name='test_dictTimeSpace_2.json')
            pred_baseline, obs_baseline = master_test(data_model_baseline,
                                                      epoch=test_epoch,
                                                      save_file_suffix="base")
            basin_area_baseline = data_model_baseline.data_source.read_attr(
                data_model_baseline.t_s_dict["sites_id"], ['DRAIN_SQKM'],
                is_return_dict=False)
            mean_prep_baseline = data_model_baseline.data_source.read_attr(
                data_model_baseline.t_s_dict["sites_id"], ['PPTAVG_BASIN'],
                is_return_dict=False)
            mean_prep_baseline = mean_prep_baseline / 365 * 10
            pred_baseline = _basin_norm(pred_baseline,
                                        basin_area_baseline,
                                        mean_prep_baseline,
                                        to_norm=False)
            obs_baseline = _basin_norm(obs_baseline,
                                       basin_area_baseline,
                                       mean_prep_baseline,
                                       to_norm=False)
            save_result(
                data_model_baseline.data_source.data_config.data_path['Temp'],
                test_epoch,
                pred_baseline,
                obs_baseline,
                pred_name='flow_pred_base',
                obs_name='flow_obs_base')

            pred, obs = master_test(data_model, epoch=test_epoch)
            basin_area = data_model.data_source.read_attr(
                data_model.t_s_dict["sites_id"], ['DRAIN_SQKM'],
                is_return_dict=False)
            mean_prep = data_model.data_source.read_attr(
                data_model.t_s_dict["sites_id"], ['PPTAVG_BASIN'],
                is_return_dict=False)
            mean_prep = mean_prep / 365 * 10
            pred = _basin_norm(pred, basin_area, mean_prep, to_norm=False)
            obs = _basin_norm(obs, basin_area, mean_prep, to_norm=False)
            save_result(data_model.data_source.data_config.data_path['Temp'],
                        test_epoch, pred, obs)
            if plus >= 0:
                pred_2, obs_2 = master_test(data_model_2,
                                            epoch=test_epoch,
                                            save_file_suffix="2")
                basin_area_2 = data_model_2.data_source.read_attr(
                    data_model_2.t_s_dict["sites_id"], ['DRAIN_SQKM'],
                    is_return_dict=False)
                mean_prep_2 = data_model_2.data_source.read_attr(
                    data_model_2.t_s_dict["sites_id"], ['PPTAVG_BASIN'],
                    is_return_dict=False)
                mean_prep_2 = mean_prep_2 / 365 * 10
                pred_2 = _basin_norm(pred_2,
                                     basin_area_2,
                                     mean_prep_2,
                                     to_norm=False)
                obs_2 = _basin_norm(obs_2,
                                    basin_area_2,
                                    mean_prep_2,
                                    to_norm=False)
                save_result(
                    data_model_2.data_source.data_config.data_path['Temp'],
                    test_epoch,
                    pred_2,
                    obs_2,
                    pred_name='flow_pred_2',
                    obs_name='flow_obs_2')
Пример #25
0
        var_dict_file_name='dictAttribute.json',
        t_s_dict_file_name='dictTimeSpace.json')
    data_model_test = GagesModel.load_datamodel(
        data_dir,
        data_source_file_name='test_data_source.txt',
        stat_file_name='test_Statistics.json',
        flow_file_name='test_flow.npy',
        forcing_file_name='test_forcing.npy',
        attr_file_name='test_attr.npy',
        f_dict_file_name='test_dictFactorize.json',
        var_dict_file_name='test_dictAttribute.json',
        t_s_dict_file_name='test_dictTimeSpace.json')

    gages_model_train = GagesModel.update_data_model(
        all_config_Data,
        data_model_train,
        data_attr_update=True,
        screen_basin_area_huc4=False)
    gages_model_test = GagesModel.update_data_model(
        all_config_Data,
        data_model_test,
        data_attr_update=True,
        train_stat_dict=gages_model_train.stat_dict,
        screen_basin_area_huc4=False)
    save_datamodel(gages_model_test,
                   data_source_file_name='test_data_source.txt',
                   stat_file_name='test_Statistics.json',
                   flow_file_name='test_flow',
                   forcing_file_name='test_forcing',
                   attr_file_name='test_attr',
                   f_dict_file_name='test_dictFactorize.json',
Пример #26
0
    def test_some_reservoirs(self):
        """Prepare and save the train/test data models of two configurations.

        Basins are selected through ``GagesSource.choose_some_basins`` with
        ``DOR=-0.02``. The data models cached under
        ``quickdata/allnonref_85-05_nan-0.1_00-1.0`` are then restricted to
        those sites and to the train/test time range of ``config_data_1`` and
        ``config_data_2``, and the four resulting models are written with
        ``save_datamodel``.
        """
        # File-name keyword sets shared by the load/save calls below.  Saving
        # uses bare array names, loading uses the same names with ".npy".
        train_save_names = {
            'data_source_file_name': 'data_source.txt',
            'stat_file_name': 'Statistics.json',
            'flow_file_name': 'flow',
            'forcing_file_name': 'forcing',
            'attr_file_name': 'attr',
            'f_dict_file_name': 'dictFactorize.json',
            'var_dict_file_name': 'dictAttribute.json',
            't_s_dict_file_name': 'dictTimeSpace.json',
        }
        train_load_names = dict(train_save_names,
                                flow_file_name='flow.npy',
                                forcing_file_name='forcing.npy',
                                attr_file_name='attr.npy')
        test_save_names = {
            'data_source_file_name': 'test_data_source.txt',
            'stat_file_name': 'test_Statistics.json',
            'flow_file_name': 'test_flow',
            'forcing_file_name': 'test_forcing',
            'attr_file_name': 'test_attr',
            'f_dict_file_name': 'test_dictFactorize.json',
            'var_dict_file_name': 'test_dictAttribute.json',
            't_s_dict_file_name': 'test_dictTimeSpace.json',
        }
        test_load_names = dict(test_save_names,
                               flow_file_name='test_flow.npy',
                               forcing_file_name='test_forcing.npy',
                               attr_file_name='test_attr.npy')

        # read the model configuration
        cfg_1 = self.config_data_1
        # according to paper "High-resolution mapping of the world's reservoirs and dams for sustainable river-flow management"
        # NOTE(review): the negative sign presumably encodes "DOR < 0.02" --
        # confirm against GagesSource.choose_some_basins.
        dor_threshold = -0.02
        basin_source = GagesSource.choose_some_basins(
            cfg_1, cfg_1.model_dict["data"]["tRangeTrain"], DOR=dor_threshold)
        chosen_sites = basin_source.all_configs['flow_screen_gage_id']

        # data1 is historical data as input of LSTM-Inv, which will be a kernel for the second LSTM
        cache_dir = os.path.join(cfg_1.data_path["DB"], "quickdata",
                                 "allnonref_85-05_nan-0.1_00-1.0")
        # for inv model, datamodel of train and test are same
        cached_8595 = GagesModel.load_datamodel(cache_dir, **train_load_names)
        # for 2nd model, datamodel of train and test belong to parts of the test time
        cached_9505 = GagesModel.load_datamodel(cache_dir, **test_load_names)

        data_cfg_1 = cfg_1.model_dict["data"]
        range1_train = data_cfg_1["tRangeTrain"]
        range1_test = data_cfg_1["tRangeTest"]
        model1_train = GagesModel.update_data_model(
            cfg_1,
            cached_8595,
            sites_id_update=chosen_sites,
            t_range_update=range1_train,
            data_attr_update=True)
        # Because we know data of period "90-95", so that we can get its statistics according to this period
        # (hence no train_stat_dict is passed for this test model)
        model1_test = GagesModel.update_data_model(
            cfg_1,
            cached_8595,
            sites_id_update=chosen_sites,
            t_range_update=range1_test,
            data_attr_update=True)

        cfg_2 = self.config_data_2
        data_cfg_2 = cfg_2.model_dict["data"]
        range2_train = data_cfg_2["tRangeTrain"]
        range2_test = data_cfg_2["tRangeTest"]
        # NOTE(review): the second model's training set is cut from the 85-95
        # cache while its test set comes from the 95-05 cache -- confirm this
        # matches the intent stated above.
        model2_train = GagesModel.update_data_model(
            cfg_2,
            cached_8595,
            sites_id_update=chosen_sites,
            t_range_update=range2_train,
            data_attr_update=True)
        model2_test = GagesModel.update_data_model(
            cfg_2,
            cached_9505,
            sites_id_update=chosen_sites,
            t_range_update=range2_test,
            data_attr_update=True,
            train_stat_dict=model2_train.stat_dict)

        # Second positional argument is presumably the sub-directory the model
        # is stored in -- verify against save_datamodel.
        save_plan = (
            (model1_train, "1", train_save_names),
            (model1_test, "1", test_save_names),
            (model2_train, "2", train_save_names),
            (model2_test, "2", test_save_names),
        )
        for model, sub_dir, file_names in save_plan:
            save_datamodel(model, sub_dir, **file_names)
        print("read and save data model")
Пример #27
0
    def test_some_reservoirs(self):
        """Build and save train/test data models for basins selected by DOR.

        Sites come from ``GagesSource.choose_some_basins`` called with
        ``DOR=-0.02`` and ``screen_basin_area_huc4=False``. The train and test
        data models cached under ``quickdata/conus-all_90-10_nan-0.0_00-1.0``
        are restricted to those sites; the test model reuses the training
        model's ``stat_dict``. Both are then written with ``save_datamodel``.
        """
        # File-name keyword sets shared by the load/save calls below.  Saving
        # uses bare array names, loading uses the same names with ".npy".
        train_save_names = {
            'data_source_file_name': 'data_source.txt',
            'stat_file_name': 'Statistics.json',
            'flow_file_name': 'flow',
            'forcing_file_name': 'forcing',
            'attr_file_name': 'attr',
            'f_dict_file_name': 'dictFactorize.json',
            'var_dict_file_name': 'dictAttribute.json',
            't_s_dict_file_name': 'dictTimeSpace.json',
        }
        train_load_names = dict(train_save_names,
                                flow_file_name='flow.npy',
                                forcing_file_name='forcing.npy',
                                attr_file_name='attr.npy')
        test_save_names = {
            'data_source_file_name': 'test_data_source.txt',
            'stat_file_name': 'test_Statistics.json',
            'flow_file_name': 'test_flow',
            'forcing_file_name': 'test_forcing',
            'attr_file_name': 'test_attr',
            'f_dict_file_name': 'test_dictFactorize.json',
            'var_dict_file_name': 'test_dictAttribute.json',
            't_s_dict_file_name': 'test_dictTimeSpace.json',
        }
        test_load_names = dict(test_save_names,
                               flow_file_name='test_flow.npy',
                               forcing_file_name='test_forcing.npy',
                               attr_file_name='test_attr.npy')

        # read the model configuration
        cfg = self.config_data
        # according to paper "High-resolution mapping of the world's reservoirs and dams for sustainable river-flow management"
        dor_threshold = -0.02  # meaning dor < 0.02
        basin_source = GagesSource.choose_some_basins(
            cfg,
            cfg.model_dict["data"]["tRangeTrain"],
            screen_basin_area_huc4=False,
            DOR=dor_threshold)
        chosen_sites = basin_source.all_configs['flow_screen_gage_id']

        cache_dir = os.path.join(cfg.data_path["DB"], "quickdata",
                                 "conus-all_90-10_nan-0.0_00-1.0")
        cached_train = GagesModel.load_datamodel(cache_dir, **train_load_names)
        cached_test = GagesModel.load_datamodel(cache_dir, **test_load_names)

        model_train = GagesModel.update_data_model(
            cfg,
            cached_train,
            sites_id_update=chosen_sites,
            data_attr_update=True,
            screen_basin_area_huc4=False)
        # The test model is handed the statistics of the training model.
        model_test = GagesModel.update_data_model(
            cfg,
            cached_test,
            sites_id_update=chosen_sites,
            data_attr_update=True,
            train_stat_dict=model_train.stat_dict,
            screen_basin_area_huc4=False)

        for model, file_names in ((model_train, train_save_names),
                                  (model_test, test_save_names)):
            save_datamodel(model, **file_names)
        print("read and save data model")
Пример #28
0
 def test_purposes_seperate(self):
     """Evaluate saved predictions separately for each main dam purpose.

     Loads the cached test data model, groups gage IDs by the purpose stored
     in ``dam_main_purpose_dict.json``, intersects every group with the
     sites of the data model, and then, for the purposes from index 9 on,
     writes a statistics CSV plus box, time-series, ECDF and map figures
     into the configured ``Out`` directory.
     """
     cfg = self.config_data
     cache_dir = os.path.join(cfg.data_path["DB"], "quickdata",
                              "allnonref-dam_95-05_nan-0.1_00-1.0")
     cached_test_model = GagesModel.load_datamodel(
         cache_dir,
         data_source_file_name='test_data_source.txt',
         stat_file_name='test_Statistics.json',
         flow_file_name='test_flow.npy',
         forcing_file_name='test_forcing.npy',
         attr_file_name='test_attr.npy',
         f_dict_file_name='test_dictFactorize.json',
         var_dict_file_name='test_dictAttribute.json',
         t_s_dict_file_name='test_dictTimeSpace.json')
     data_model = GagesModel.update_data_model(cfg, cached_test_model)

     # The "nid" directory sits next to the DB directory (one level up).
     db_parent = "/".join(cfg.data_path["DB"].split("/")[:-1])
     nid_dir = os.path.join(db_parent, "nid", "quickdata")
     purpose_of_gage = unserialize_json(
         os.path.join(nid_dir, "dam_main_purpose_dict.json"))
     purposes = np.unique(list(purpose_of_gage.values()))

     # purpose -> gage IDs having that main purpose, in dict order
     purpose_regions = {}
     for purpose in purposes:
         member_ids = [
             gage for gage, main_purpose in purpose_of_gage.items()
             if main_purpose == purpose
         ]
         # IDs are expected to be strictly increasing already
         assert (all(a < b for a, b in zip(member_ids, member_ids[1:])))
         purpose_regions[purpose] = member_ids

     id_regions_idx = []
     id_regions_sites_ids = []
     df_id_region = np.array(data_model.t_s_dict["sites_id"])
     for gages_id in purpose_regions.values():
         common_ids, model_pos, _ = np.intersect1d(df_id_region,
                                                   gages_id,
                                                   return_indices=True)
         assert (all(a < b for a, b in zip(model_pos, model_pos[1:])))
         assert (all(a < b for a, b in zip(common_ids, common_ids[1:])))
         id_regions_idx.append(model_pos)
         id_regions_sites_ids.append(common_ids)

     pred_all, obs_all = load_result(cfg.data_path["Temp"], self.test_epoch)
     # keep only the first two axes of the loaded arrays
     pred_all = pred_all.reshape(pred_all.shape[0], pred_all.shape[1])
     obs_all = obs_all.reshape(obs_all.shape[0], obs_all.shape[1])

     # NOTE(review): the loop starts at index 9, so the first nine purposes
     # are not processed -- the reason is not visible here; confirm.
     for i in range(9, len(purposes)):
         row_idx = id_regions_idx[i]
         out_dir = cfg.data_path["Out"]
         # common prefix of every output file of this purpose
         file_stem = purposes[i] + "epoch" + str(self.test_epoch)

         pred = pred_all[row_idx, :]
         obs = obs_all[row_idx, :]
         inds = statError(obs, pred)
         inds['STAID'] = id_regions_sites_ids[i]
         inds_df = pd.DataFrame(inds)
         inds_df.to_csv(os.path.join(out_dir, file_stem + 'data_df.csv'))

         # box plot (uses the seaborn library)
         keys = ["Bias", "RMSE", "NSE"]
         inds_test = subset_of_dict(inds, keys)
         box_fig = plot_diff_boxes(inds_test)
         box_fig.savefig(os.path.join(out_dir, file_stem + "box_fig.png"))

         # plot ts
         sites = np.array(df_id_region[row_idx])
         t_range = np.array(data_model.t_s_dict["t_final_range"])
         show_me_num = 1
         ts_fig = plot_ts_obs_pred(obs, pred, sites, t_range, show_me_num)
         ts_fig.savefig(os.path.join(out_dir, file_stem + "ts_fig.png"))

         # plot nse ecdf
         nse_key = keys[2]
         sites_df_nse = pd.DataFrame({
             "sites": sites,
             nse_key: inds_test[nse_key]
         })
         plot_ecdf(sites_df_nse, nse_key,
                   os.path.join(out_dir, file_stem + "ecdf_fig.png"))

         # plot map
         gauge_dict = data_model.data_source.gage_dict
         save_map_file = os.path.join(out_dir, file_stem + "map_fig.png")
         plot_map(gauge_dict,
                  sites_df_nse,
                  save_file=save_map_file,
                  id_col="STAID",
                  lon_col="LNG_GAGE",
                  lat_col="LAT_GAGE")
Пример #29
0
    def test_split_nomajordam_ecoregion(self):
        """Save k-fold train/test data models that mix two dam groups per ecoregion.

        The train-period and test-period data models are loaded from the
        "conus-all_90-10_nan-0.0_00-1.0" quick-data directory. Basins are
        screened twice with ``GagesSource.choose_some_basins``
        (``major_dam_num=0`` and ``major_dam_num=[1, 2000]``) and restricted to
        the sites of the loaded train data model.

        For every ecoregion in ``self.eco_names`` the smaller of the two groups
        (the ``major_dam_num=[1, 2000]`` group on ties) is split with a
        shuffled ``KFold``; for each fold the other group is sampled through
        ``random_choice_no_return`` with the same train/test sizes, and both
        training parts are merged and sorted. Ecoregions whose smaller group
        has fewer than ``self.split_num`` basins are skipped.

        For each fold ``i`` one train model and two test models (one per dam
        group) are written with ``save_datamodel`` using a config whose subdir
        is ``os.path.join(self.subdir, str(i))``.

        Raises:
            ValueError: if every ecoregion was skipped, so no fold exists.
        """

        def file_names(prefix='', suffix='', array_ext=''):
            """Build the file-name keyword arguments shared by load/save calls.

            E.g. ``file_names('test_', '_majordam')`` gives
            ``flow_file_name='test_flow_majordam'`` and
            ``stat_file_name='test_Statistics_majordam.json'``.
            """
            return dict(
                data_source_file_name=prefix + 'data_source' + suffix + '.txt',
                stat_file_name=prefix + 'Statistics' + suffix + '.json',
                flow_file_name=prefix + 'flow' + suffix + array_ext,
                forcing_file_name=prefix + 'forcing' + suffix + array_ext,
                attr_file_name=prefix + 'attr' + suffix + array_ext,
                f_dict_file_name=prefix + 'dictFactorize' + suffix + '.json',
                var_dict_file_name=prefix + 'dictAttribute' + suffix + '.json',
                t_s_dict_file_name=prefix + 'dictTimeSpace' + suffix + '.json')

        def screened_gage_ids(**criteria):
            """Return the 'flow_screen_gage_id' list of a freshly screened source."""
            source_data = GagesSource.choose_some_basins(
                self.config_data,
                self.config_data.model_dict["data"]["tRangeTrain"],
                screen_basin_area_huc4=False, **criteria)
            return source_data.all_configs['flow_screen_gage_id']

        quick_data_dir = os.path.join(self.config_data.data_path["DB"], "quickdata")
        data_dir = os.path.join(quick_data_dir, "conus-all_90-10_nan-0.0_00-1.0")
        data_model_train = GagesModel.load_datamodel(data_dir, **file_names(array_ext='.npy'))
        data_model_test = GagesModel.load_datamodel(data_dir, **file_names('test_', array_ext='.npy'))

        conus_sites_id = data_model_train.t_s_dict["sites_id"]
        nomajordam_in_conus = np.intersect1d(conus_sites_id, screened_gage_ids(major_dam_num=0))
        majordam_in_conus = np.intersect1d(conus_sites_id, screened_gage_ids(major_dam_num=[1, 2000]))

        # Each kept ecoregion appends exactly ``split_num`` consecutive entries
        # to these lists, so fold i of every ecoregion sits at i, i + k, ...
        sites_lst_train = []
        sites_lst_test_nomajordam = []
        sites_lst_test_majordam = []

        random_seed = 1
        # Seeds the global NumPy RNG; presumably random_choice_no_return draws
        # from it -- TODO confirm against its definition.
        np.random.seed(random_seed)
        kf = KFold(n_splits=self.split_num, shuffle=True, random_state=random_seed)
        for eco_name in self.eco_names:
            eco_sites_id = screened_gage_ids(ecoregion=eco_name)
            nomajordam_sites_id_inter = np.intersect1d(nomajordam_in_conus, eco_sites_id)
            majordam_sites_id_inter = np.intersect1d(majordam_in_conus, eco_sites_id)

            # K-fold the smaller group, sample the other one to the same sizes.
            if nomajordam_sites_id_inter.size < majordam_sites_id_inter.size:
                folded, sampled = nomajordam_sites_id_inter, majordam_sites_id_inter
                folded_tests, sampled_tests = sites_lst_test_nomajordam, sites_lst_test_majordam
            else:
                folded, sampled = majordam_sites_id_inter, nomajordam_sites_id_inter
                folded_tests, sampled_tests = sites_lst_test_majordam, sites_lst_test_nomajordam
            if folded.size < self.split_num:
                continue
            for train, test in kf.split(folded):
                folded_tests.append(folded[test])
                chosen_lst = random_choice_no_return(sampled, [train.size, test.size])
                sampled_tests.append(chosen_lst[1])
                # The merged training ids are sorted, so the append order of
                # the two groups does not affect the result.
                sites_lst_train.append(np.sort(np.append(folded[train], chosen_lst[0])))

        if not sites_lst_train:
            raise ValueError(
                "no ecoregion has enough basins to build " + str(self.split_num) + " folds")

        for i in range(self.split_num):
            # Gather fold i from every kept ecoregion.
            sites_ids_train_i = np.sort(np.hstack(sites_lst_train[i::self.split_num]))
            sites_ids_test_i = np.sort(np.hstack(sites_lst_test_nomajordam[i::self.split_num]))
            sites_ids_test_majordam_i = np.sort(np.hstack(sites_lst_test_majordam[i::self.split_num]))
            subdir_i = os.path.join(self.subdir, str(i))
            config_data_i = GagesConfig.set_subdir(self.config_file, subdir_i)
            gages_model_train_i = GagesModel.update_data_model(config_data_i, data_model_train,
                                                               sites_id_update=sites_ids_train_i,
                                                               data_attr_update=True, screen_basin_area_huc4=False)
            # Both test models receive the statistics of this fold's train model.
            gages_model_test_i = GagesModel.update_data_model(config_data_i, data_model_test,
                                                              sites_id_update=sites_ids_test_i,
                                                              data_attr_update=True,
                                                              train_stat_dict=gages_model_train_i.stat_dict,
                                                              screen_basin_area_huc4=False)
            gages_model_test_majordam_i = GagesModel.update_data_model(config_data_i, data_model_test,
                                                                       sites_id_update=sites_ids_test_majordam_i,
                                                                       data_attr_update=True,
                                                                       train_stat_dict=gages_model_train_i.stat_dict,
                                                                       screen_basin_area_huc4=False)
            save_datamodel(gages_model_train_i, **file_names())
            save_datamodel(gages_model_test_i, **file_names('test_'))
            save_datamodel(gages_model_test_majordam_i, **file_names('test_', '_majordam'))
            print("save ecoregion " + str(i) + " data model")
Пример #30
0
    def test_split_dor(self):
        """Save k-fold train/test data models for a train-group vs test-group experiment.

        The train-period and test-period data models are loaded from the
        "conus-all_90-10_nan-0.0_00-1.0" quick-data directory, then:

        1. ``self.pub_plan`` selects the "train" and "test" basin groups:

           * 0 -- train: ids listed in ``camels531/camels531.txt``;
             test: all other sites of the loaded train data model.
           * 1 / 4 -- basins screened with ``dam_num=0`` versus basins that
             pass both ``DOR=-dor`` and ``dam_num=[1, 100000]``
             (plan 4 swaps the two roles).
           * 2 / 5 -- ``dam_num=0`` versus ``DOR=dor`` (plan 5 swaps).
           * 3 / 6 -- (``DOR=-dor`` and ``dam_num=[1, 100000]``) versus
             ``DOR=dor`` (plan 6 swaps).

        2. ``self.plus`` selects how folds are built:

           * 0 -- per ecoregion, shuffled ``KFold`` on the train group; a
             second test set of at most the same size is drawn from the test
             group without replacement.
           * -1 -- per ecoregion ``KFold`` on the train group only.
           * -2 -- one ``KFold`` over the whole train group, no ecoregions.
           * anything else -- per ecoregion, the smaller group (the test group
             on ties) is k-folded and the other group is sampled through
             ``random_choice_no_return`` with the same sizes; both training
             parts are merged.

        3. For each fold ``i`` a train model, a baseline test model (the
           training basins taken from the test-period data model), a test model
           and -- only when ``plus >= 0`` -- a second test model are written
           with ``save_datamodel`` under a config whose subdir is ``str(i)``.

        Raises:
            ValueError: if ``self.pub_plan`` is not in 0..6, or if no fold
                could be built because every ecoregion was skipped.
        """
        pub_plan = self.pub_plan
        config_file = self.config_file
        config_data = self.config_data
        plus = self.plus
        random_seed = self.random_seed
        split_num = self.split_num
        eco_names = self.eco_names
        dor = self.dor

        def file_names(prefix='', suffix='', array_ext=''):
            """Build the file-name keyword arguments shared by load/save calls.

            E.g. ``file_names('test_', '_base')`` gives
            ``flow_file_name='test_flow_base'`` and
            ``stat_file_name='test_Statistics_base.json'``.
            """
            return dict(
                data_source_file_name=prefix + 'data_source' + suffix + '.txt',
                stat_file_name=prefix + 'Statistics' + suffix + '.json',
                flow_file_name=prefix + 'flow' + suffix + array_ext,
                forcing_file_name=prefix + 'forcing' + suffix + array_ext,
                attr_file_name=prefix + 'attr' + suffix + array_ext,
                f_dict_file_name=prefix + 'dictFactorize' + suffix + '.json',
                var_dict_file_name=prefix + 'dictAttribute' + suffix + '.json',
                t_s_dict_file_name=prefix + 'dictTimeSpace' + suffix + '.json')

        def screened_gage_ids(**criteria):
            """Return the 'flow_screen_gage_id' list of a freshly screened source."""
            source_data = GagesSource.choose_some_basins(
                config_data,
                config_data.model_dict["data"]["tRangeTrain"],
                screen_basin_area_huc4=False,
                **criteria)
            return source_data.all_configs['flow_screen_gage_id']

        def dammed_dor1_ids():
            """Ids passing both the ``DOR=-dor`` and ``dam_num=[1, 100000]`` screens."""
            sites_id_dor1 = screened_gage_ids(DOR=-dor)
            sites_id_withdams = screened_gage_ids(dam_num=[1, 100000])
            return np.intersect1d(np.array(sites_id_dor1),
                                  np.array(sites_id_withdams)).tolist()

        quick_data_dir = os.path.join(config_data.data_path["DB"], "quickdata")
        data_dir = os.path.join(quick_data_dir,
                                "conus-all_90-10_nan-0.0_00-1.0")
        data_model_train = GagesModel.load_datamodel(
            data_dir, **file_names(array_ext='.npy'))
        data_model_test = GagesModel.load_datamodel(
            data_dir, **file_names('test_', array_ext='.npy'))
        conus_sites_id = data_model_train.t_s_dict["sites_id"]

        if pub_plan == 0:
            # do a pub test like freddy's
            camels531_gageid_file = os.path.join(config_data.data_path["DB"],
                                                 "camels531", "camels531.txt")
            gauge_df = pd.read_csv(camels531_gageid_file,
                                   dtype={"GaugeID": str})
            all_sites_camels_531 = np.sort(
                [str(gauge).zfill(8) for gauge in gauge_df["GaugeID"].values])
            sites_id_train = np.intersect1d(conus_sites_id,
                                            all_sites_camels_531)
            # basins not in CAMELS; a set gives constant-time membership tests
            camels_lookup = set(all_sites_camels_531.tolist())
            sites_id_test = [
                a_temp_site for a_temp_site in conus_sites_id
                if a_temp_site not in camels_lookup
            ]
            # the remaining ids must still be strictly increasing
            assert (all(x < y
                        for x, y in zip(sites_id_test, sites_id_test[1:])))
        elif pub_plan in (1, 4):
            sites_id_dammed = dammed_dor1_ids()
            sites_id_withoutdams = screened_gage_ids(dam_num=0)
            if pub_plan == 1:
                sites_id_train, sites_id_test = sites_id_withoutdams, sites_id_dammed
            else:
                sites_id_train, sites_id_test = sites_id_dammed, sites_id_withoutdams
        elif pub_plan in (2, 5):
            sites_id_dor = screened_gage_ids(DOR=dor)
            sites_id_withoutdams = screened_gage_ids(dam_num=0)
            if pub_plan == 2:
                sites_id_train, sites_id_test = sites_id_withoutdams, sites_id_dor
            else:
                sites_id_train, sites_id_test = sites_id_dor, sites_id_withoutdams
        elif pub_plan in (3, 6):
            sites_id_dammed = dammed_dor1_ids()
            sites_id_dor2 = screened_gage_ids(DOR=dor)
            if pub_plan == 3:
                sites_id_train, sites_id_test = sites_id_dammed, sites_id_dor2
            else:
                sites_id_train, sites_id_test = sites_id_dor2, sites_id_dammed
        else:
            # Fail here with a clear message instead of passing None on to
            # np.intersect1d below.
            raise ValueError("wrong plan: pub_plan must be an integer in 0..6, got "
                             + repr(pub_plan))

        train_sites_in_conus = np.intersect1d(conus_sites_id, sites_id_train)
        test_sites_in_conus = np.intersect1d(conus_sites_id, sites_id_test)

        # Each kept ecoregion (or the single global split for plus == -2)
        # appends exactly ``split_num`` consecutive entries to these lists,
        # so fold i of every ecoregion sits at positions i, i + k, ...
        sites_lst_train = []
        sites_lst_test_1 = []
        sites_lst_test_2 = []

        if plus == -1:
            print("camels pub, only do pub on the camels basins")
        elif plus == -2:
            print(
                "camels pub, only do pub on the camels basins, same with freddy's split method"
            )
        # Seeds the global NumPy RNG used by np.random.choice below and
        # presumably by random_choice_no_return -- TODO confirm.
        np.random.seed(random_seed)
        kf = KFold(n_splits=split_num,
                   shuffle=True,
                   random_state=random_seed)

        if plus in (0, -1):
            # all sites of train and test 1 come from the train group
            for eco_name in eco_names:
                eco_sites_id = screened_gage_ids(ecoregion=eco_name)
                train_sites_id_inter = np.intersect1d(train_sites_in_conus,
                                                      eco_sites_id)
                if train_sites_id_inter.size < split_num:
                    continue
                if plus == 0:
                    test_sites_id_inter = np.intersect1d(test_sites_in_conus,
                                                         eco_sites_id)
                    if test_sites_id_inter.size < 1:
                        continue
                for train, test in kf.split(train_sites_id_inter):
                    sites_lst_train.append(train_sites_id_inter[train])
                    sites_lst_test_1.append(train_sites_id_inter[test])
                    if plus != 0:
                        continue
                    if test_sites_id_inter.size < test.size:
                        # not enough basins to match the fold size: take all
                        sites_lst_test_2.append(test_sites_id_inter)
                    else:
                        test2_chosen_idx = np.random.choice(
                            test_sites_id_inter.size, test.size, replace=False)
                        sites_lst_test_2.append(
                            test_sites_id_inter[test2_chosen_idx])
        elif plus == -2:
            for train, test in kf.split(train_sites_in_conus):
                sites_lst_train.append(train_sites_in_conus[train])
                sites_lst_test_1.append(train_sites_in_conus[test])
        else:
            for eco_name in eco_names:
                eco_sites_id = screened_gage_ids(ecoregion=eco_name)
                sites_id_inter_1 = np.intersect1d(train_sites_in_conus,
                                                  eco_sites_id)
                sites_id_inter_2 = np.intersect1d(test_sites_in_conus,
                                                  eco_sites_id)
                # K-fold the smaller group, sample the other to the same sizes.
                if sites_id_inter_1.size < sites_id_inter_2.size:
                    folded, sampled = sites_id_inter_1, sites_id_inter_2
                    folded_tests, sampled_tests = sites_lst_test_1, sites_lst_test_2
                else:
                    folded, sampled = sites_id_inter_2, sites_id_inter_1
                    folded_tests, sampled_tests = sites_lst_test_2, sites_lst_test_1
                if folded.size < split_num:
                    continue
                for train, test in kf.split(folded):
                    folded_tests.append(folded[test])
                    chosen_lst = random_choice_no_return(
                        sampled, [train.size, test.size])
                    sampled_tests.append(chosen_lst[1])
                    # The merged training ids are sorted, so the append order
                    # of the two groups does not affect the result.
                    sites_lst_train.append(
                        np.sort(np.append(folded[train], chosen_lst[0])))

        if not sites_lst_train:
            raise ValueError(
                "no train/test folds were generated; check eco_names, "
                "split_num and the selected basins")

        for i in range(split_num):
            # Gather fold i from every kept ecoregion.
            sites_ids_train_i = np.sort(np.hstack(sites_lst_train[i::split_num]))
            sites_ids_test_i_1 = np.sort(
                np.hstack(sites_lst_test_1[i::split_num]))
            if plus >= 0:
                sites_ids_test_i_2 = np.sort(
                    np.hstack(sites_lst_test_2[i::split_num]))
            config_data_i = GagesConfig.set_subdir(config_file, str(i))

            gages_model_train_i = GagesModel.update_data_model(
                config_data_i,
                data_model_train,
                sites_id_update=sites_ids_train_i,
                data_attr_update=True,
                screen_basin_area_huc4=False)
            # Baseline: the training basins, taken from the test-period model.
            gages_model_test_baseline_i = GagesModel.update_data_model(
                config_data_i,
                data_model_test,
                sites_id_update=sites_ids_train_i,
                data_attr_update=True,
                train_stat_dict=gages_model_train_i.stat_dict,
                screen_basin_area_huc4=False)
            gages_model_test_i_1 = GagesModel.update_data_model(
                config_data_i,
                data_model_test,
                sites_id_update=sites_ids_test_i_1,
                data_attr_update=True,
                train_stat_dict=gages_model_train_i.stat_dict,
                screen_basin_area_huc4=False)
            if plus >= 0:
                gages_model_test_i_2 = GagesModel.update_data_model(
                    config_data_i,
                    data_model_test,
                    sites_id_update=sites_ids_test_i_2,
                    data_attr_update=True,
                    train_stat_dict=gages_model_train_i.stat_dict,
                    screen_basin_area_huc4=False)

            save_datamodel(gages_model_train_i, **file_names())
            save_datamodel(gages_model_test_baseline_i,
                           **file_names('test_', '_base'))
            save_datamodel(gages_model_test_i_1, **file_names('test_'))
            if plus >= 0:
                save_datamodel(gages_model_test_i_2,
                               **file_names('test_', '_2'))
            print("save ecoregion " + str(i) + " data model")