def _load_measured_record(facade, obs_keys):
    """Return a MeasuredData record for *obs_keys* with all standard filters applied.

    Failed realizations and inactive observations are removed, then the
    ensemble-mean and ensemble-std filters are applied using the facade's
    configured alpha and std cutoff.
    """
    record = MeasuredData(facade, obs_keys)
    record.remove_failed_realizations()
    record.remove_inactive_observations()
    record.filter_ensemble_mean_obs(facade.get_alpha())
    record.filter_ensemble_std(facade.get_std_cutoff())
    return record
def test_history_obs(monkeypatch, facade_snake_oil):
    """A HISTORY_OBSERVATION exposes one data point per report step (0..198)."""
    fopr = MeasuredData(facade_snake_oil, ["FOPR"])
    fopr.remove_inactive_observations()

    data_index = fopr.data.columns.get_level_values("data_index").values
    assert all(data_index == list(range(199)))
def test_summary_obs(monkeypatch, facade_snake_oil):
    """A single SUMMARY_OBSERVATION maps to data_index 71 and its report date."""
    summary_obs = MeasuredData(facade_snake_oil, ["WOPR_OP1_72"])
    summary_obs.remove_inactive_observations()

    columns = summary_obs.data.columns
    assert all(columns.get_level_values("data_index").values == [71])
    # Only one observation, we check the key_index is what we expect:
    key_index = columns.get_level_values("key_index").values[0]
    assert key_index == np.datetime64("2011-12-21")
def _get_measured_data(
    facade, observation_keys, observation_index_list, alpha, std_cutoff
):
    """Load MeasuredData for the given keys/indices with explicit filter limits.

    Unlike the facade-driven loader, *alpha* and *std_cutoff* are passed in
    by the caller rather than read from the facade configuration.
    """
    md = MeasuredData(facade, observation_keys, observation_index_list)
    md.remove_failed_realizations()
    md.remove_inactive_observations()
    md.filter_ensemble_mean_obs(alpha)
    md.filter_ensemble_std(std_cutoff)
    return md
def test_gen_obs(monkeypatch, facade_snake_oil):
    """GENERAL_OBSERVATION indices appear in both data_index and key_index."""
    measured = MeasuredData(facade_snake_oil, ["WPR_DIFF_1"])
    measured.remove_inactive_observations()

    expected_indices = [400, 800, 1200, 1800]
    for level in ("data_index", "key_index"):
        assert all(
            measured.data.columns.get_level_values(level).values == expected_indices
        )
def test_remove_failed_realizations(
    input_dataframe, expected_result, monkeypatch, facade, measured_data_setup
):
    """Realizations flagged as failed are dropped from the measured data."""
    measured_data_setup(input_dataframe, monkeypatch)
    md = MeasuredData(facade, ["test_key"])
    md.remove_failed_realizations()

    expected_result.columns = _set_multiindex(expected_result)
    expected = pd.concat({"test_key": expected_result.astype(float)}, axis=1)
    assert md.data.equals(expected)
def test_get_simulated_data(
    input_dataframe, expected_result, monkeypatch, facade, measured_data_setup
):
    """get_simulated_data returns the response rows keyed under the obs key."""
    measured_data_setup(input_dataframe, monkeypatch)
    md = MeasuredData(facade, ["test_key"])

    expected_result.columns = _set_multiindex(expected_result)
    expected = pd.concat({"test_key": expected_result.astype(float)}, axis=1)
    assert md.get_simulated_data().equals(expected)
# NOTE(review): "ensamble" in the name looks like a typo for "ensemble";
# kept as-is since renaming would change which test pytest collects.
def test_filter_ensamble_std(
    std_cutoff, expected_result, monkeypatch, facade, measured_data_setup
):
    """Columns whose ensemble std falls below the cutoff are filtered out."""
    expected_result.columns = _set_multiindex(expected_result)

    input_dataframe = pd.DataFrame(data=[[1, 1.5], [1, 2.5]], index=[1, 2])
    input_obs = pd.DataFrame(data=[[1, 2], [0.1, 0.2]], index=["OBS", "STD"])
    input_obs.columns = _set_multiindex(input_obs)
    measured_data_setup(input_dataframe, input_obs, monkeypatch)

    md = MeasuredData(facade, ["obs_key"])
    md.filter_ensemble_std(std_cutoff)

    assert md.data.equals(pd.concat({"obs_key": expected_result}, axis=1))
def _load_measured_record(enkf_main):
    """Build a fully filtered MeasuredData record covering every observation key."""
    facade = LibresFacade(enkf_main)
    obs_keys = [
        facade.get_observation_key(nr)
        for nr, _ in enumerate(facade.get_observations())
    ]
    record = MeasuredData(facade, obs_keys)
    record.remove_failed_realizations()
    record.remove_inactive_observations()
    record.filter_ensemble_mean_obs(facade.get_alpha())
    record.filter_ensemble_std(facade.get_std_cutoff())
    return record
def test_filter_ens_mean_obs(
    alpha, expected_result, monkeypatch, facade, measured_data_setup
):
    """Observations whose ensemble mean deviates beyond alpha are filtered out."""
    expected_result.columns = _set_multiindex(expected_result)

    input_dataframe = pd.DataFrame(
        data=[[1, 2], [0.1, 0.2], [1.1, 1.6], [1, 2.5]],
        index=["OBS", "STD", 1, 2],
    )
    measured_data_setup(input_dataframe, monkeypatch)

    md = MeasuredData(facade, ["test_key"])
    md.filter_ensemble_mean_obs(alpha)

    assert md.data.equals(pd.concat({"test_key": expected_result}, axis=1))
def test_empty_dataset_from_remove_inactive_observations(
    input_header,
    measured_data,
    monkeypatch,
    facade,
    measured_data_setup,
):
    """Removing every observation must raise instead of yielding an empty frame."""
    input_header.columns = _set_multiindex(input_header)
    measured_data_setup(measured_data, input_header, monkeypatch)
    md = MeasuredData(facade, ["obs_key"])

    with pytest.raises(ValueError, match="This operation results in an empty dataset"):
        md.remove_inactive_observations()
def test_invalid_set_data(
    facade,
    monkeypatch,
    invalid_input,
    expected_error,
    valid_dataframe,
    measured_data_setup,
):
    """_set_data rejects malformed input with the expected exception type."""
    measured_data_setup(valid_dataframe, monkeypatch)
    md = MeasuredData(facade, ["test_key"], index_lists=[[1, 2]])

    with pytest.raises(expected_error):
        md._set_data(invalid_input)
def test_gen_obs_runtime(monkeypatch, copy_snake_oil):
    """Loading ~500 general observations yields the expected data shape."""
    obs_file = pathlib.Path.cwd() / "observations" / "observations.txt"
    with obs_file.open(mode="a") as fin:
        fin.write(create_general_observation())

    ert = EnKFMain(ResConfig("snake_oil.ert"))
    facade = LibresFacade(ert)

    keys = [f"CUSTOM_DIFF_{restart}" for restart in range(1, 500)]
    df = MeasuredData(facade, keys)
    df.remove_inactive_observations()
    assert df.data.shape == (27, 1995)
def test_gen_obs_and_summary_index_range(monkeypatch, facade_snake_oil):
    """Index lists select a single point from each of two observation types."""
    df = MeasuredData(facade_snake_oil, ["WPR_DIFF_1", "FOPR"], [[800], [10]])
    df.remove_inactive_observations()

    columns = df.data.columns
    assert columns.get_level_values(0).to_list() == ["WPR_DIFF_1", "FOPR"]
    assert columns.get_level_values("data_index").to_list() == [800, 10]

    assert df.data.loc["OBS"].values == pytest.approx([0.1, 0.23281], abs=0.00001)
    assert df.data.loc["STD"].values == pytest.approx([0.2, 0.1])
def _extract_and_dump_observations(rdb_api, blob_api):
    """Collect OBS/STD rows for every observation key and dump them via the APIs."""
    facade = ERT.enkf_facade
    keys = [
        facade.get_observation_key(nr)
        for nr, _ in enumerate(facade.get_observations())
    ]

    measured = MeasuredData(facade, keys)
    measured.remove_inactive_observations()

    obs_and_std = measured.data.loc[["OBS", "STD"]]
    _dump_observations(rdb_api=rdb_api, blob_api=blob_api, observations=obs_and_std)
def _load_measured_record(enkf_main):
    """Return an unfiltered MeasuredData record for every observation key."""
    facade = LibresFacade(enkf_main)
    keys = [
        facade.get_observation_key(nr)
        for nr, _ in enumerate(facade.get_observations())
    ]
    return MeasuredData(facade, keys)
def test_remove_failed_realizations(
    input_dataframe,
    expected_result,
    monkeypatch,
    facade,
    measured_data_setup,
    valid_obs_data,
):
    """Failed realizations are dropped when observation data is supplied."""
    measured_data_setup(input_dataframe, valid_obs_data, monkeypatch)
    md = MeasuredData(facade, ["obs_key"])
    md.remove_failed_realizations()

    expected_result.columns = _set_multiindex(expected_result)
    assert md.data.equals(pd.concat({"obs_key": expected_result}, axis=1))
def test_remove_inactive_observations(
    input_header,
    measured_data,
    expected_result,
    monkeypatch,
    facade,
    measured_data_setup,
):
    """Inactive observations are removed, leaving the expected columns."""
    input_header.columns = _set_multiindex(input_header)
    measured_data_setup(measured_data, input_header, monkeypatch)
    md = MeasuredData(facade, ["obs_key"])

    expected_result.columns = _set_multiindex(expected_result)
    expected = pd.concat({"obs_key": expected_result}, axis=1)

    md.remove_inactive_observations()
    assert md.data.equals(expected)
def test_no_storage_obs_only(monkeypatch, obs_key):
    """With load_data=False the obs key is still present even without storage."""
    shutil.rmtree("storage")
    ert = EnKFMain(ResConfig("snake_oil.ert"))
    md = MeasuredData(LibresFacade(ert), [obs_key], load_data=False)
    assert set(md.data.columns.get_level_values(0)) == {obs_key}
def test_gen_obs_and_summary(monkeypatch, facade_snake_oil):
    """Mixed general/summary keys keep their per-key column layout."""
    df = MeasuredData(facade_snake_oil, ["WPR_DIFF_1", "WOPR_OP1_9"])
    df.remove_inactive_observations()

    columns = df.data.columns
    # Four general-observation points, then the single summary point.
    assert columns.get_level_values(0).to_list() == (
        ["WPR_DIFF_1"] * 4 + ["WOPR_OP1_9"]
    )
    assert columns.get_level_values("data_index").to_list() == [
        400,
        800,
        1200,
        1800,
        8,
    ]
def test_all_measured_snapshot(snapshot, facade_snake_oil):
    """
    While there is no guarantee that this snapshot is 100% correct, it does
    represent the current state of loading from storage for the snake_oil case.
    """
    obs_keys = facade_snake_oil.get_matching_wildcards()("*").strings
    measured = MeasuredData(facade_snake_oil, obs_keys)
    snapshot.assert_match(measured.data.to_csv(), "snake_oil_measured_output.csv")
def test_no_storage(monkeypatch, obs_key, expected_msg):
    """Loading responses without a storage directory raises ResponseError."""
    shutil.rmtree("storage")
    ert = EnKFMain(ResConfig("snake_oil.ert"))
    facade = LibresFacade(ert)

    with pytest.raises(loader.ResponseError, match=expected_msg):
        MeasuredData(facade, [obs_key])
def test_block_obs(monkeypatch, tmpdir):
    """
    This test causes util_abort on some runs, so it will not be run by default
    as it is too flaky. I have chosen to leave it here as it could be useful
    when debugging. To run the test, run an ensemble_experiment on the
    snake_oil_field case to create a storage with BLOCK_OBS.
    """
    with tmpdir.as_cwd():
        test_data_dir = pathlib.Path(test_data_root) / "snake_oil_field"
        # The test needs a pre-built storage with BLOCK_OBS; skip otherwise.
        if not (test_data_dir / "storage").exists():
            pytest.skip()
        else:
            shutil.copytree(test_data_dir, "test_data")
            os.chdir("test_data")

            # Raw observation-config snippet appended to the case's
            # observations file before loading.
            block_obs = """
            \nBLOCK_OBSERVATION RFT_2006
            {
                FIELD = PRESSURE;
                DATE = 10/01/2010;
                SOURCE = SUMMARY;
                OBS P1 { I = 5; J = 5; K = 5; VALUE = 100; ERROR = 5; };
                OBS P2 { I = 1; J = 3; K = 8; VALUE = 50; ERROR = 2; };
            };
            """
            obs_file = pathlib.Path.cwd() / "observations" / "observations.txt"
            with obs_file.open(mode="a") as fin:
                fin.write(block_obs)

            res_config = ResConfig("snake_oil.ert")
            ert = EnKFMain(res_config)
            facade = LibresFacade(ert)

            df = MeasuredData(facade, ["RFT_2006"])
            df.remove_inactive_observations()
            # Two block points -> indices 0 and 1 on both column levels.
            assert all(
                df.data.columns.get_level_values("data_index").values == [0, 1]
            )
            assert all(
                df.data.columns.get_level_values("key_index").values == [0, 1]
            )
def _observation_scaling(facade, config):
    """
    Collects data, performs scaling and applies scaling, assumes validated input.
    """
    calc_events = config.CALCULATE_KEYS.keys
    calculate_keys = [event.key for event in calc_events]
    index_lists = [event.index for event in calc_events]

    measured = MeasuredData(facade, calculate_keys, index_lists)
    measured.remove_failed_realizations()
    measured.remove_inactive_observations()
    measured.filter_ensemble_mean_obs(config.CALCULATE_KEYS.alpha)
    measured.filter_ensemble_std(config.CALCULATE_KEYS.std_cutoff)

    # Normalize in place, then derive the scaling factor from the config keys.
    matrix = DataMatrix(measured.data)
    matrix.std_normalization(inplace=True)
    scale_factor = matrix.get_scaling_factor(config.CALCULATE_KEYS)

    update_data = _create_active_lists(
        facade.get_observations(), config.UPDATE_KEYS.keys
    )
    _update_scaling(facade.get_observations(), scale_factor, update_data)
def test_summary_obs_runtime(monkeypatch, copy_snake_oil):
    """
    Mostly a regression test: reading SUMMARY_OBS was very slow when using
    SUMMARY_OBSERVATION (rather than HISTORY_OBSERVATION) with multiple
    observations pointing to the same response. The same observations are
    loaded both as individual points and in one go; to avoid flakiness the
    assertion allows a 10x runtime difference though it should be around 2x.
    """
    obs_file = pathlib.Path.cwd() / "observations" / "observations.txt"
    with obs_file.open(mode="a") as fin:
        fin.write(create_summary_observation())

    res_config = ResConfig("snake_oil.ert")
    ert = EnKFMain(res_config)
    facade = LibresFacade(ert)

    start_time = time.time()
    foprh = MeasuredData(facade, [f"FOPR_{restart}" for restart in range(1, 201)])
    summary_obs_time = time.time() - start_time

    start_time = time.time()
    fopr = MeasuredData(facade, ["FOPR"])
    history_obs_time = time.time() - start_time

    foprh_index = foprh.data.columns.get_level_values("data_index")
    fopr_index = fopr.data.columns.get_level_values("data_index")
    assert fopr_index.values.tolist() == foprh_index.values.tolist()

    result = foprh.get_simulated_data().values == fopr.get_simulated_data().values
    assert np.logical_and.reduce(result).all()

    assert summary_obs_time < 10 * history_obs_time
def test_gen_obs_runtime(monkeypatch, copy_snake_oil, snapshot):
    """Snapshot the CSV output of loading 500 general observations."""
    obs_file = pathlib.Path.cwd() / "observations" / "observations.txt"
    with obs_file.open(mode="a") as fin:
        fin.write(create_general_observation())

    facade = LibresFacade(EnKFMain(ResConfig("snake_oil.ert")))
    keys = [f"CUSTOM_DIFF_{restart}" for restart in range(500)]
    df = MeasuredData(facade, keys)

    snapshot.assert_match(df.data.to_csv(), "snake_oil_gendata_output.csv")
def _create_observation_transformation(ert, db_observations) -> List[dict]:
    """Build one transformation dict per observation (grouped by data key for
    summary observations) from the active-observation masks and OBS/STD data.

    Each transformation carries a name, prepared x_axis, per-point scale
    (currently mocked to 1) and active mask, plus the matching observation id
    from *db_observations*.
    """
    observation_vectors = ert.get_observations()
    summary_obs_keys = observation_vectors.getTypedKeylist(
        EnkfObservationImplementationType.SUMMARY_OBS
    )
    active_obs = _extract_active_observations(ert)
    transformations: Dict = dict()
    keys = [ert.get_observation_key(i) for i, _ in enumerate(observation_vectors)]
    # load_data=False: only the OBS/STD rows are needed, not the responses.
    data = MeasuredData(ert, keys, load_data=False)
    observations = data.data.loc[["OBS", "STD"]]

    for obs_key, active_mask in active_obs.items():
        obs_data = _get_obs_data(obs_key, observations[obs_key])
        if obs_key in summary_obs_keys:
            # Summary observations sharing a data key are merged into one
            # transformation, accumulating x_axis/active/scale per point.
            obs_vec = observation_vectors[obs_key]
            data_key = obs_vec.getDataKey()
            if data_key in transformations:
                transformations[data_key]["x_axis"] += obs_data["x_axis"]
                transformations[data_key]["active"] += active_mask
                transformations[data_key]["scale"] += [1 for _ in active_mask]
            else:
                transformations[data_key] = dict(
                    name=data_key,
                    x_axis=obs_data["x_axis"],
                    scale=[1 for _ in active_mask],
                    active=active_mask,
                )
        else:
            # Scale is now mocked to 1 for now
            transformations[obs_key] = dict(
                name=obs_key,
                x_axis=obs_data["x_axis"],
                scale=[1 for _ in active_mask],
                active=active_mask,
            )

    observation_ids = {obs["name"]: obs["id"] for obs in db_observations}
    # Sorting by x_axis matches the transformation with the observation,
    # mostly needed for grouped summary obs
    for key, obs in transformations.items():
        x_axis, active, scale = (
            list(t)
            for t in zip(*sorted(zip(obs["x_axis"], obs["active"], obs["scale"])))
        )
        x_axis = _prepare_x_axis(x_axis)
        transformations[key]["x_axis"] = x_axis
        transformations[key]["active"] = active
        transformations[key]["scale"] = scale
        transformations[key]["observation_id"] = observation_ids[key]

    return [transformation for _, transformation in transformations.items()]
def _spearman_correlation(facade, obs_keys, threshold, dry_run):
    """
    Collects data, performs scaling and applies scaling, assumes validated input.
    """
    measured = MeasuredData(facade, obs_keys)
    measured.remove_failed_realizations()
    measured.remove_inactive_observations()
    measured.filter_ensemble_std(1.0e-6)

    simulated = measured.get_simulated_data()
    correlation_matrix = _calculate_correlation_matrix(simulated)
    clusters = _cluster_analysis(correlation_matrix, threshold)

    # Here the clusters are joined with the key and data index to group the
    # observations; the column level values are the column headers, where
    # key_index is the observation key and data_index is a range.
    columns = correlation_matrix.columns
    data = list(
        zip(
            clusters,
            columns.get_level_values(0),
            columns.get_level_values("data_index"),
        )
    )

    clustered_data = _cluster_data(data)
    job_configs = _config_creation(clustered_data)
    _output_clusters(clustered_data)

    if not dry_run:
        _run_scaling(facade, job_configs)
def test_get_data(obs_type, monkeypatch, facade, valid_dataframe, measured_data_setup):
    """The loader factory is resolved by obs type and invoked once per key."""
    facade.get_impl_type_name_for_obs_key.return_value = obs_type
    factory = measured_data_setup(valid_dataframe, monkeypatch)
    md = MeasuredData(facade, ["test_key"], index_lists=[[1, 2]])

    factory.assert_called_once_with(obs_type)
    mocked_loader = factory()
    mocked_loader.assert_called_once_with(facade, "test_key", "test_case", True)

    frame = pd.DataFrame(
        data=[[2.0, 3.0], [5.0, 6.0]], index=["OBS", "STD"], columns=[1, 2]
    )
    frame.columns = _set_multiindex(frame)
    assert md._data.equals(pd.concat({"test_key": frame}, axis=1))
def run(self, job_config):
    """Load the user config, build one ScalingJob per entry and scale each
    over freshly loaded measured data."""
    facade = LibresFacade(self.ert())
    user_config = _insert_default_group(load_yaml(job_config))

    obs = facade.get_observations()
    obs_keys = [facade.get_observation_key(nr) for nr, _ in enumerate(obs)]
    obs_with_data = keys_with_data(
        obs,
        obs_keys,
        facade.get_ensemble_size(),
        facade.get_current_fs(),
    )

    for config in user_config:
        job = ScalingJob(obs_keys, obs, obs_with_data, config)
        measured_data = MeasuredData(
            facade, job.get_calc_keys(), job.get_index_lists()
        )
        job.scale(measured_data)