def test_manual_aggregation():
    """Compare a hand-computed mean of realizations with agg("mean").

    The five ensemble members are summed and scaled by 1/5, and the
    RMS_SEED parameter of that combination is compared with the one in
    the virtual realization returned by agg("mean") on the ensemble.
    """
    # Fall back to the working directory when __file__ is undefined, to
    # ease copying test code into interactive sessions.
    testdir = (
        os.path.dirname(os.path.abspath(__file__))
        if "__file__" in globals()
        else os.path.abspath(".")
    )
    realization_glob = (
        testdir + "/data/testensemble-reek001/" + "realization-*/iter-0"
    )

    reekensemble = ScratchEnsemble("reektest", realization_glob)
    reekensemble.load_smry(time_index="yearly", column_keys=["F*"])
    reekensemble.load_csv("share/results/volumes/simulator_volume_fipnum.csv")

    # Virtual "mean" realization, aggregated by the ensemble itself
    mean = reekensemble.agg("mean")

    # The same mean, computed directly from the ensemble members.
    # This also yields a virtual realization.
    realization_sum = (
        reekensemble[0]
        + reekensemble[1]
        + reekensemble[2]
        + reekensemble[3]
        + reekensemble[4]
    )
    manualmean = 1 / 5 * realization_sum

    # Both routes must give the same result:
    aggregated_seed = mean["parameters"]["RMS_SEED"]
    manual_seed = manualmean["parameters"]["RMS_SEED"]
    assert aggregated_seed == manual_seed
def test_get_df_merge():
    """Check the merge argument of get_df() on a virtual ensemble.

    Column counts of merged frames are compared with the column counts
    of the individual frames, and a mocked FIPNUM-to-zone table is
    merged with the volume data.
    """
    # Fall back to the working directory when __file__ is undefined, to
    # ease copying test code into interactive sessions.
    testdir = (
        os.path.dirname(os.path.abspath(__file__))
        if "__file__" in globals()
        else os.path.abspath(".")
    )

    scratch_ens = ScratchEnsemble(
        "reektest",
        testdir + "/data/testensemble-reek001/" + "realization-*/iter-0",
    )
    scratch_ens.load_smry(time_index="yearly", column_keys=["F*"])
    scratch_ens.load_scalar("npv.txt")
    scratch_ens.load_csv("share/results/volumes/simulator_volume_fipnum.csv")
    outputs = scratch_ens.load_txt("outputs.txt")
    vens = scratch_ens.to_virtual()

    params = vens.get_df("parameters.txt")
    smrycount = len(vens.get_df("unsmry--yearly").columns)

    # Every merge shares the REAL column with the frame it is merged
    # into, hence one column is subtracted per merged frame.
    smryparams = vens.get_df("unsmry--yearly", merge="parameters")
    assert len(smryparams.columns) == len(params.columns) + smrycount - 1

    paramsoutputs = vens.get_df("parameters", merge=["outputs"])
    expected_count = len(params.columns) + len(outputs.columns) - 1
    assert len(paramsoutputs.columns) == expected_count

    smry_params_outputs = vens.get_df(
        "unsmry--yearly", merge=["parameters", "outputs"]
    )
    expected_count = smrycount + len(params.columns) + len(outputs.columns) - 2
    assert len(smry_params_outputs.columns) == expected_count

    # Merging in the npv.txt scalar adds exactly one column
    params_npv = vens.get_df("parameters", merge="npv.txt")
    assert len(params_npv.columns) == len(params.columns) + 1

    # Symmetry:
    npv_params = vens.get_df("npv.txt", merge="parameters.txt")
    assert len(npv_params.columns) == len(params.columns) + 1

    # Merge with zone data, inject a mocked dataframe to the realization:
    zone_rows = [
        [1, "UpperReek"],
        [2, "MidReek"],
        [3, "LowerReek"],
        [4, "UpperReek"],
        [5, "MidReek"],
        [6, "LowerReek"],
    ]
    vens.data["fipnum2zone"] = pd.DataFrame(
        columns=["FIPNUM", "ZONE"], data=zone_rows
    )
    volframe = vens.get_df("simulator_volume_fipnum", merge="fipnum2zone")
    for column in ("ZONE", "FIPNUM", "STOIIP_OIL"):
        assert column in volframe
    assert len(volframe["ZONE"].unique()) == 3
def load_per_real_csv_file_using_fmu(ens_path: str, csv_file_rel_path: str) -> pd.DataFrame:
    """Load a CSV file from an ensemble through ScratchEnsemble.load_csv().

    A temporary ScratchEnsemble is built on ``ens_path`` with
    autodiscovery enabled, and ``csv_file_rel_path`` is passed on to its
    load_csv() method. The start of the operation and the elapsed time
    are logged at debug level.

    Args:
        ens_path: Path handed to the ScratchEnsemble constructor.
        csv_file_rel_path: CSV path handed to load_csv().

    Returns:
        Whatever load_csv() returns, annotated as a pandas DataFrame.
    """
    LOGGER.debug(
        f"load_per_real_csv_file_using_fmu() starting - {csv_file_rel_path}")
    timer = PerfTimer()

    loaded_frame = ScratchEnsemble(
        "tempEnsName", ens_path, autodiscovery=True
    ).load_csv(csv_file_rel_path)

    LOGGER.debug(
        f"load_per_real_csv_file_using_fmu() finished in: {timer.elapsed_s():.2f}s"
    )
    return loaded_frame
def test_ensemble_aggregations(tmpdir):
    """Test aggregating an ensemble into virtual realizations.

    Covers mean, median, min, max and percentile aggregations, dumping
    some of them to disk, and the keylist/excludekeys arguments of agg().
    """
    # Fall back to the working directory when __file__ is undefined, to
    # ease copying test code into interactive sessions.
    testdir = (
        os.path.dirname(os.path.abspath(__file__))
        if "__file__" in globals()
        else os.path.abspath(".")
    )

    reekensemble = ScratchEnsemble(
        "reektest",
        testdir + "/data/testensemble-reek001/" + "realization-*/iter-0",
    )
    reekensemble.load_smry(time_index="monthly", column_keys=["F*"])
    reekensemble.load_smry(time_index="yearly", column_keys=["F*"])
    reekensemble.load_csv("share/results/volumes/simulator_volume_fipnum.csv")
    reekensemble.load_scalar("npv.txt", convert_numeric=True)

    # p10 is the low estimate, p90 the high estimate
    stats = {
        aggname: reekensemble.agg(aggname)
        for aggname in ("mean", "median", "min", "max", "p10", "p90")
    }

    tmpdir.chdir()
    dump_targets = (
        ("min", "virtreal_min"),
        ("max", "virtreal_max"),
        ("mean", "virtreal_mean"),
    )
    for aggname, dirname in dump_targets:
        stats[aggname].to_disk(dirname, delete=True)

    def rms_seed(aggname):
        """Look up RMS_SEED in the parameters of an aggregated realization"""
        return stats[aggname]["parameters.txt"]["RMS_SEED"]

    assert rms_seed("min") < rms_seed("max")
    assert rms_seed("min") <= rms_seed("p10")
    assert rms_seed("p10") <= rms_seed("median")
    assert rms_seed("median") <= rms_seed("p90")
    assert rms_seed("p90") <= rms_seed("max")

    assert rms_seed("min") <= rms_seed("mean")
    assert rms_seed("min") <= rms_seed("max")

    final_fopt_min = stats["min"]["unsmry--monthly"]["FOPT"].iloc[-1]
    final_fopt_max = stats["max"]["unsmry--monthly"]["FOPT"].iloc[-1]
    assert final_fopt_min < final_fopt_max

    # .loc[2] corresponds to FIPNUM=3
    min_volumes = stats["min"]["simulator_volume_fipnum"]
    mean_volumes = stats["mean"]["simulator_volume_fipnum"]
    assert min_volumes.iloc[2]["STOIIP_OIL"] < mean_volumes.iloc[2]["STOIIP_OIL"]
    max_volumes = stats["max"]["simulator_volume_fipnum"]
    assert mean_volumes.loc[2]["STOIIP_OIL"] < max_volumes.loc[2]["STOIIP_OIL"]

    # Aggregation of STATUS also works. Note that min and max
    # works for string columns, so the available data will vary
    # depending on aggregation method
    eclipse_job = 49  # job 49 is the Eclipse forward model
    p10_duration = stats["p10"]["STATUS"].iloc[eclipse_job]["DURATION"]
    max_duration = stats["max"]["STATUS"].iloc[eclipse_job]["DURATION"]
    assert p10_duration < max_duration

    assert "npv.txt" in stats["mean"].keys()
    assert stats["mean"]["npv.txt"] == 3382.5

    # Test agg(excludekeys=..)
    without_status = reekensemble.agg("mean", excludekeys="STATUS")
    assert "STATUS" not in without_status.keys()
    only_parameters = reekensemble.agg("mean", keylist=["parameters.txt"])
    assert "STATUS" not in only_parameters.keys()

    p01_seed = reekensemble.agg("p01")["parameters"]["RMS_SEED"]
    p99_seed = reekensemble.agg("p99")["parameters"]["RMS_SEED"]
    assert p01_seed < p99_seed

    with pytest.raises(ValueError):
        reekensemble.agg("foobar")

    # Check that include/exclude functionality in agg() works, with
    # both string and list arguments:
    for excluded in ("parameters.txt", ["parameters.txt"]):
        aggregated = reekensemble.agg("mean", excludekeys=excluded)
        assert "parameters.txt" not in aggregated.keys()
    for included in ("STATUS", ["STATUS"]):
        aggregated = reekensemble.agg("mean", keylist=included)
        assert "parameters.txt" not in aggregated.keys()

    # Shorthand notion works for keys to include, but they
    # should get returned with fully qualified paths.
    for included in ("unsmry--yearly", ["unsmry--yearly"]):
        aggregated = reekensemble.agg("mean", keylist=included)
        assert "share/results/tables/unsmry--yearly.csv" in aggregated.keys()

    yearly_only = reekensemble.agg("mean", keylist="unsmry--yearly")
    assert isinstance(yearly_only.get_df("unsmry--yearly"), pd.DataFrame)
from fmu.ensemble import ScratchEnsemble

# Gather selected CSV files from each realization and dump them
# (each individually merged with parameters.txt) to share/results.
ens = ScratchEnsemble("", "realization-*/iter-0")

# Paths are relative to share/results
csv_files = [
    "volumes/geogrid--oil.csv",
    "volumes/simgrid--oil.csv",
    "volumes/simulator_volume_fipnum.csv",
    "tables/rft.csv",
    "tables/unsmry--monthly.csv",
    "tables/equil.csv",
    "tables/relperm.csv",
    "tables/pvt.csv",
]

for csv_rel_path in csv_files:
    results_path = "share/results/" + csv_rel_path
    ens.load_csv(results_path)
    # The loaded data is looked up by the filename part of the path
    csv_basename = csv_rel_path.split("/")[1]
    merged_frame = ens.get_df(csv_basename, merge="parameters.txt")
    merged_frame.to_csv(results_path, index=False)
def test_reek001(tmp="TMP"):
    """Test loading of a stripped-down ensemble with 5 realizations.

    Exercises file discovery, STATUS and parameters data, txt and CSV
    loading, and removing and re-adding realizations. Some dataframes
    are dumped as CSV files into the directory given by ``tmp``, which
    is created if it does not exist.
    """
    # Fall back to the working directory when __file__ is undefined, to
    # ease copying test code into interactive sessions.
    testdir = (
        os.path.dirname(os.path.abspath(__file__))
        if "__file__" in globals()
        else os.path.abspath(".")
    )
    reek_root = testdir + "/data/testensemble-reek001/"

    reekensemble = ScratchEnsemble("reektest", reek_root + "realization-*/iter-0")
    assert isinstance(reekensemble, ScratchEnsemble)
    assert reekensemble.name == "reektest"
    assert len(reekensemble) == 5

    assert isinstance(reekensemble[0], ScratchRealization)

    # These files are expected once in each of the 5 realizations
    for localpath in ("jobs.json", "parameters.txt", "STATUS"):
        matches = reekensemble.files[reekensemble.files.LOCALPATH == localpath]
        assert len(matches) == 5

    status = reekensemble.get_df("STATUS")
    assert len(status) == 250  # 5 realizations, 50 jobs in each
    for column in ("REAL", "FORWARD_MODEL"):
        assert column in status.columns
    status = status.set_index(["REAL", "FORWARD_MODEL"]).sort_index()
    assert "DURATION" in status.columns  # calculated
    assert "argList" in status.columns  # from jobs.json

    # Sample check the duration for RMS in realization 4:
    rms_duration = status.loc[4, "RMS_BATCH"]["DURATION"].values[0]
    assert int(rms_duration) == 195

    # STATUS in real4 is modified to simulate that Eclipse never finished:
    eclipse_duration = status.loc[4, "ECLIPSE100_2014.2"]["DURATION"].values[0]
    assert numpy.isnan(eclipse_duration)

    if not os.path.exists(tmp):
        os.mkdir(tmp)
    status.to_csv(os.path.join(tmp, "status.csv"), index=False)

    # Parameters.txt
    params = reekensemble.load_txt("parameters.txt")
    assert len(params) == 5  # 5 realizations
    params = reekensemble.parameters  # also test as property
    params = reekensemble.get_df("parameters.txt")
    assert len(params) == 5
    assert len(params.columns) == 26  # 25 parameters, + REAL column
    params.to_csv(os.path.join(tmp, "params.csv"), index=False)

    # Check that the ensemble object has not tainted the realization dataframe:
    first_real_params = reekensemble._realizations[0].get_df("parameters.txt")
    assert "REAL" not in first_real_params

    # The column FOO in parameters is only present in some, and
    # is present with NaN in real0:
    assert "FOO" in reekensemble.parameters.columns
    assert len(reekensemble.parameters["FOO"].dropna()) == 1
    # (NaN in one real, and non-existing in the others is the same thing)

    # Test loading of another txt file:
    reekensemble.load_txt("outputs.txt")
    assert "NPV" in reekensemble.load_txt("outputs.txt").columns
    # Check implicit discovery
    assert "outputs.txt" in reekensemble.files["LOCALPATH"].values
    assert all(os.path.isabs(path) for path in reekensemble.files["FULLPATH"])

    # File discovery:
    simgrid_volfiles = reekensemble.find_files(
        "share/results/volumes/*csv", metadata={"GRID": "simgrid"}
    )
    assert isinstance(simgrid_volfiles, pd.DataFrame)
    for column in ("REAL", "FULLPATH", "LOCALPATH", "BASENAME"):
        assert column in simgrid_volfiles
    # Check the explicit metadata:
    assert "GRID" in simgrid_volfiles
    assert simgrid_volfiles["GRID"].unique() == ["simgrid"]

    reekensemble.files.to_csv(os.path.join(tmp, "files.csv"), index=False)

    # Check that rediscovery does not mess things up:
    files_before = len(reekensemble.files)
    rediscovered = reekensemble.find_files("share/results/volumes/*csv")
    # Also note that we skipped metadata here in rediscovery:

    assert len(reekensemble.files) == files_before
    assert len(rediscovered) == len(simgrid_volfiles)

    # The last invocation of find_files() should not return the metadata
    assert len(rediscovered.columns) + 1 == len(simgrid_volfiles.columns)

    # FULLPATH should always contain absolute paths
    assert all(os.path.isabs(path) for path in reekensemble.files["FULLPATH"])

    # The metadata in the rediscovered files should have been removed
    still_tagged = reekensemble.files[reekensemble.files["GRID"] == "simgrid"]
    assert len(still_tagged) == 0

    # CSV files
    csvpath = "share/results/volumes/simulator_volume_fipnum.csv"
    volumes = reekensemble.load_csv(csvpath)

    # Check that we have not tainted the realization dataframes:
    assert "REAL" not in reekensemble._realizations[0].get_df(csvpath)

    assert "REAL" in volumes
    assert len(volumes["REAL"].unique()) == 3  # missing in 2 reals
    volumes.to_csv(os.path.join(tmp, "simulatorvolumes.csv"), index=False)

    # Test retrieval of cached data
    cached_volumes = reekensemble.get_df(csvpath)

    assert "REAL" in cached_volumes
    assert len(cached_volumes["REAL"].unique()) == 3  # missing in 2 reals

    # Realization deletion:
    reekensemble.remove_realizations([1, 3])
    assert len(reekensemble) == 3

    # Readd the same realizations
    readded_paths = [
        reek_root + "realization-1/iter-0",
        reek_root + "realization-3/iter-0",
    ]
    reekensemble.add_realizations(readded_paths)
    assert len(reekensemble) == 5
    assert len(reekensemble.files) == 24

    # File discovery must be repeated for the newly added realizations
    reekensemble.find_files(
        "share/results/volumes/" + "simulator_volume_fipnum.csv",
        metadata={"GRID": "simgrid"},
    )
    assert len(reekensemble.files) == 25

    # Test addition of already added realization:
    reekensemble.add_realizations(reek_root + "realization-1/iter-0")
    assert len(reekensemble) == 5
    assert len(reekensemble.files) == 24  # discovered files are lost!

    keys_before = len(reekensemble.keys())
    reekensemble.remove_data("parameters.txt")
    assert len(reekensemble.keys()) == keys_before - 1
def test_get_df():
    """Test the data retrieval functionality of get_df() on an ensemble.

    get_df() in the ensemble context is an aggregator, that will
    aggregate data from individual realizations to the ensemble level,
    with optional merging capabilities performed on realization level.
    """
    if "__file__" in globals():
        # Easen up copying test code into interactive sessions
        testdir = os.path.dirname(os.path.abspath(__file__))
    else:
        testdir = os.path.abspath(".")

    ens = ScratchEnsemble(
        "reektest", testdir + "/data/testensemble-reek001/" + "realization-*/iter-0"
    )
    smry = ens.load_smry(column_keys="FO*", time_index="yearly")

    # The loaded data can be addressed with or without directory and
    # file extension:
    assert not ens.get_df("unsmry--yearly").empty
    assert not ens.get_df("unsmry--yearly.csv").empty
    assert not ens.get_df("share/results/tables/unsmry--yearly").empty
    assert not ens.get_df("share/results/tables/unsmry--yearly.csv").empty

    # Monthly data is not loaded yet
    with pytest.raises(KeyError):
        # pylint: disable=pointless-statement
        ens.get_df("unsmry--monthly")
    ens.load_smry(column_keys="FO*", time_index="monthly")
    assert not ens.get_df("unsmry--monthly").empty

    # Single dash instead of double dash in the name, not a valid key
    with pytest.raises(KeyError):
        # pylint: disable=pointless-statement
        ens.get_df("unsmry-monthly")

    # Tests that we can do merges directly:
    params = ens.get_df("parameters.txt")
    smryparams = ens.get_df("unsmry--yearly", merge="parameters")
    # The set union is to handle the REAL column present in both smry and params:
    assert len(smryparams.columns) == len(set(smry.columns).union(params.columns))

    # Test multiple merges:
    outputs = ens.load_txt("outputs.txt")
    assert len(
        ens.get_df("unsmry--yearly", merge=["parameters", "outputs.txt"]).columns
    ) == len(set(smry.columns).union(params.columns).union(outputs.columns))

    # Try merging dataframes:
    ens.load_csv("share/results/volumes/simulator_volume_fipnum.csv")

    # Inject a mocked dataframe to the realization, there is
    # no "add_data" API for ensembles, but we can use the apply()
    # functionality
    def fipnum2zone():
        """Helper function for injecting mocked frame into
        each realization"""
        return pd.DataFrame(
            columns=["FIPNUM", "ZONE"],
            data=[
                [1, "UpperReek"],
                [2, "MidReek"],
                [3, "LowerReek"],
                [4, "UpperReek"],
                [5, "MidReek"],
                [6, "LowerReek"],
            ],
        )

    ens.apply(fipnum2zone, localpath="fipnum2zone")
    volframe = ens.get_df("simulator_volume_fipnum", merge="fipnum2zone")

    assert "ZONE" in volframe
    assert "FIPNUM" in volframe
    assert "STOIIP_OIL" in volframe
    assert len(volframe["ZONE"].unique()) == 3

    # Merge with scalar data:
    ens.load_scalar("npv.txt")
    vol_npv = ens.get_df("simulator_volume_fipnum", merge="npv.txt")
    # (this particular data combination does not really make sense)
    assert "STOIIP_OIL" in vol_npv
    assert "npv.txt" in vol_npv