def test_summary2df_dates():
    """Test that we have some API possibilities with ISO dates"""
    eclfiles = EclFiles(DATAFILE)

    frame = summary.df(
        eclfiles,
        start_date=datetime.date(2002, 1, 2),
        end_date="2002-03-01",
        time_index="daily",
        datetime=True,
    )
    assert frame.index.name == "DATE"
    assert frame.index.dtype in ("datetime64[ns]", "datetime64")
    assert len(frame) == 59
    assert str(frame.index.values[0])[0:10] == "2002-01-02"
    assert frame.index.values[0] == np.datetime64("2002-01-02")
    assert frame.index.values[-1] == np.datetime64("2002-03-01")

    # "last" should yield exactly one row, at the final summary date:
    frame = summary.df(eclfiles, time_index="last", datetime=True)
    assert len(frame) == 1
    assert frame.index.values[0] == np.datetime64("2003-01-02")

    # Leave this test for the datetime=False behaviour:
    frame = summary.df(eclfiles, time_index="first")
    assert len(frame) == 1
    assert str(frame.index.values[0]) == "2000-01-01"
def test_df_column_keys():
    """Test that we can slice the dataframe on columns"""
    frame = summary.df(EclFiles(REEK), column_keys="FOPT")
    assert set(frame.columns) == {"FOPT"}
    assert set(frame.attrs["meta"].keys()) == {"FOPT"}

    # All vectors matching the FOP* glob in the REEK dataset:
    fop_cols = {
        "FOPRS",
        "FOPT",
        "FOPRH",
        "FOPTH",
        "FOPRF",
        "FOPR",
        "FOPTS",
        "FOPTF",
        "FOPP",
    }
    # Globbing works both as a bare string and wrapped in a list:
    for keys in ("FOP*", ["FOP*"]):
        frame = summary.df(EclFiles(REEK), column_keys=keys)
        assert set(frame.columns) == fop_cols
        assert set(frame.attrs["meta"].keys()) == fop_cols

    frame = summary.df(EclFiles(REEK), column_keys=["FOPR", "FOPT"])
    assert set(frame.columns) == {"FOPT", "FOPR"}
    assert set(frame.attrs["meta"].keys()) == {"FOPT", "FOPR"}

    # An unknown key gives no columns, but the index is retained:
    frame_no_columns = summary.df(EclFiles(REEK), column_keys=["BOGUS"])
    assert frame_no_columns.columns.empty
    assert all(frame_no_columns.index == frame.index)
def test_datenormalization():
    """Test normalization of dates, where dates can be
    ensured to be on dategrid boundaries"""
    start = datetime.date(1997, 11, 5)
    end = datetime.date(2020, 3, 2)
    assert normalize_dates(start, end, "monthly") == (
        datetime.date(1997, 11, 1),
        datetime.date(2020, 4, 1),
    )
    assert normalize_dates(start, end, "yearly") == (
        datetime.date(1997, 1, 1),
        datetime.date(2021, 1, 1),
    )

    # Check it does not touch already aligned dates
    aligned_monthly = (datetime.date(1997, 11, 1), datetime.date(2020, 4, 1))
    assert normalize_dates(aligned_monthly[0], aligned_monthly[1], "monthly") == (
        aligned_monthly
    )
    aligned_yearly = (datetime.date(1997, 1, 1), datetime.date(2021, 1, 1))
    assert normalize_dates(aligned_yearly[0], aligned_yearly[1], "yearly") == (
        aligned_yearly
    )

    # Check that we normalize correctly with get_smry():
    # realization-0 here has its last summary date at 2003-01-02
    eclfiles = EclFiles(DATAFILE)
    daily = summary.df(eclfiles, column_keys="FOPT", time_index="daily")
    assert str(daily.index[-1]) == "2003-01-02"
    monthly = summary.df(eclfiles, column_keys="FOPT", time_index="monthly")
    assert str(monthly.index[-1]) == "2003-02-01"
    yearly = summary.df(eclfiles, column_keys="FOPT", time_index="yearly")
    assert str(yearly.index[-1]) == "2004-01-01"
def test_foreseeable_future(tmpdir):
    """The foreseeable future in reservoir simulation is "defined" as 500 years.

    Check that we support summary files with this timespan"""
    tmpdir.chdir()
    src_dframe = pd.DataFrame(
        {
            "DATE": ["2000-01-01", "2500-01-01"],
            "FPR": [200, 180],
        }
    )
    eclsum = df2eclsum(src_dframe, casename="PLUGABANDON")

    dframe = summary.df(eclsum)
    assert (
        dframe.index
        == [
            dt(2000, 1, 1),
            # This discrepancy is due to seconds as a 32-bit float
            # having an accuracy limit (roundoff-error)
            # https://github.com/equinor/ecl/issues/803
            dt(2499, 12, 31, 23, 55, 44),
        ]
    ).all()

    # Try with time interpolation involved:
    dframe = summary.df(eclsum, time_index="yearly")
    assert len(dframe) == 501
    assert dframe.index.max() == datetime.date(year=2500, month=1, day=1)

    # Try with one-year timesteps:
    src_dframe = pd.DataFrame(
        {
            "DATE": pd.date_range("2000-01-01", "2069-01-01", freq="YS"),
            "FPR": range(70),
        }
    )
    eclsum = df2eclsum(src_dframe, casename="PLUGABANDON")
    dframe = summary.df(eclsum)
    # Still buggy:
    assert dframe.index[-1] == dt(2068, 12, 31, 23, 57, 52)

    # Try with one-year timesteps, starting late:
    src_dframe = pd.DataFrame(
        {
            "DATE": [datetime.date(2400 + year, 1, 1) for year in range(69)],
            "FPR": range(69),
        }
    )
    eclsum = df2eclsum(src_dframe, casename="PLUGABANDON")
    dframe = summary.df(eclsum)
    # Works fine when stepping only 68 years:
    assert dframe.index[-1] == dt(2468, 1, 1, 0, 0, 0)
def test_datenormalization():
    """Test normalization of dates, where dates can be
    ensured to be on dategrid boundaries"""
    # realization-0 here has its last summary date at 2003-01-02
    eclfiles = EclFiles(REEK)
    # The last index entry should be rolled forward to the enclosing
    # dategrid boundary for each supported frequency:
    for freq, expected_last in [
        ("daily", "2003-01-02"),
        ("monthly", "2003-02-01"),
        ("yearly", "2004-01-01"),
    ]:
        frame = summary.df(
            eclfiles, column_keys="FOPT", time_index=freq, datetime=True
        )
        assert str(frame.index[-1])[0:10] == expected_last
def test_summary2df():
    """Test that dataframes are produced"""
    eclfiles = EclFiles(DATAFILE)
    frame = summary.df(eclfiles)

    assert not frame.empty
    assert frame.index.name == "DATE"
    assert frame.index.dtype in ("datetime64[ns]", "datetime64")
    assert not frame.columns.empty
    assert "FOPT" in frame.columns

    # (datetime=True is superfluous when raw time reports are requested)
    frame = summary.df(eclfiles, datetime=True)
    assert frame.index.name == "DATE"
    assert frame.index.dtype in ("datetime64[ns]", "datetime64")
def test_df_column_keys():
    """Test that we can slice the dataframe on columns"""
    frame = summary.df(EclFiles(DATAFILE), column_keys="FOPT")
    assert set(frame.columns) == {"FOPT"}
    assert set(frame.attrs["meta"].keys()) == {"FOPT"}

    # All vectors matching the FOP* glob in this dataset:
    fop_cols = {
        "FOPRS",
        "FOPT",
        "FOPRH",
        "FOPTH",
        "FOPRF",
        "FOPR",
        "FOPTS",
        "FOPTF",
        "FOPP",
    }
    # Globbing works both as a bare string and wrapped in a list:
    for keys in ("FOP*", ["FOP*"]):
        frame = summary.df(EclFiles(DATAFILE), column_keys=keys)
        assert set(frame.columns) == fop_cols
        assert set(frame.attrs["meta"].keys()) == fop_cols

    frame = summary.df(EclFiles(DATAFILE), column_keys=["FOPR", "FOPT"])
    assert set(frame.columns) == {"FOPT", "FOPR"}
    assert set(frame.attrs["meta"].keys()) == {"FOPT", "FOPR"}

    # Unknown keys are a hard error in this code path:
    with pytest.raises(ValueError, match="No valid key"):
        summary.df(EclFiles(DATAFILE), column_keys=["BOGUS"])
def test_df():
    """Test that dataframes are produced"""
    eclfiles = EclFiles(DATAFILE)
    frame = summary.df(eclfiles)

    assert not frame.empty
    assert frame.index.name == "DATE"
    assert frame.index.dtype in ("datetime64[ns]", "datetime64")
    assert not frame.columns.empty
    assert "FOPT" in frame.columns

    # (datetime=True is implicit when raw time reports are requested)
    frame = summary.df(eclfiles, datetime=True)
    assert frame.index.name == "DATE"
    assert frame.index.dtype in ("datetime64[ns]", "datetime64")

    # Metadata should be attached using the attrs attribute on a Pandas
    # Dataframe (considered experimental by Pandas)
    assert "meta" in frame.attrs
    assert frame.attrs["meta"]["FOPR"]["unit"] == "SM3/DAY"
def test_df2eclsum_datetimeindex():
    """Test that providing a dataframe with a datetimeindex also works"""
    dframe = pd.DataFrame(
        [
            {"DATE": "2016-01-01", "FOPT": 1000, "FOPR": 100},
        ]
    )
    dframe["DATE"] = pd.to_datetime(dframe["DATE"])
    # BUG FIX: DataFrame.set_index() is not in-place; the original code
    # discarded its return value, so df2eclsum never actually received a
    # frame carrying a DatetimeIndex and the test title was a lie.
    dframe = dframe.set_index("DATE")

    roundtrip = df(df2eclsum(dframe))
    assert isinstance(roundtrip.index, pd.DatetimeIndex)
    assert roundtrip["FOPR"].values == [100]
    assert roundtrip["FOPT"].values == [1000]
def df(eclfiles: EclFiles) -> pd.DataFrame:
    """Extract connection status history for each compdat connection that
    is included in the summary data on the form CPI:WELL,I,J,K. CPI stands
    for connection productivity index.

    One line is added to the export every time a connection changes status.
    It is OPEN when CPI>0 and SHUT when CPI=0. The earliest date for any
    connection will be OPEN, i.e a cell can not be SHUT before it has been
    OPEN. This means that any cells that are always SHUT will be excluded.

    The output data set is very sparse compared to the CPI summary data.
    """
    # Pull only the CPI vectors from the summary data, then reduce them
    # to the sparse list of status changes:
    cpi_data = summary.df(eclfiles, column_keys="CPI*")
    return _extract_status_changes(cpi_data)
def test_ecl2df_errors(tmpdir):
    """Test error handling on bogus/corrupted summary files"""
    tmpdir.chdir()

    # Write random bytes where a summary file pair is expected:
    Path("FOO.UNSMRY").write_bytes(os.urandom(100))
    Path("FOO.SMSPEC").write_bytes(os.urandom(100))
    with pytest.raises(OSError, match="Failed to create summary instance"):
        # This is how libecl reacts to bogus binary data
        ecl.summary.EclSum("FOO.UNSMRY")

    # But EclFiles should be more tolerant, as it should be possible
    # to extract other data if SMRY is corrupted
    Path("FOO.DATA").write_text("RUNSPEC")
    assert str(EclFiles("FOO").get_ecldeck()).strip() == "RUNSPEC"
    with pytest.raises(OSError):
        EclFiles("FOO").get_eclsum()

    # Getting a dataframe from bogus data should give empty data:
    assert df(EclFiles("FOO")).empty
def test_df2eclsum(dframe):
    """Test that a dataframe can be converted to an EclSum object, and then
    read back again"""
    # Massage the dframe first so we can assert on equivalence after.
    dframe = _fix_dframe_for_libecl(dframe)

    eclsum = df2eclsum(dframe)
    if dframe.empty:
        # An empty dataframe cannot be represented as an EclSum object:
        assert eclsum is None
        return

    roundtripped = df(eclsum)
    pd.testing.assert_frame_equal(
        dframe.sort_index(axis=1),
        roundtripped.sort_index(axis=1),
        check_dtype=False,
    )
def test_extrapolation():
    """Summary data should be possible to extrapolate into
    the future, rates should be zero, cumulatives should be constant"""
    eclfiles = EclFiles(DATAFILE)
    lastfopt = summary.df(
        eclfiles, column_keys="FOPT", time_index="last", datetime=True
    )["FOPT"].values[0]
    answer = pd.DataFrame(
        # This is the maximal date for datetime64[ns]
        index=[np.datetime64("2262-04-11")],
        columns=["FOPT", "FOPR"],
        data=[[lastfopt, 0.0]],
    ).rename_axis("DATE")

    # Request the extrapolation date both as an ISO string and as a
    # list of datetime.date (NB: df() does not support datetime64
    # for time_index):
    for time_index in ("2262-04-11", [datetime.date(2262, 4, 11)]):
        pd.testing.assert_frame_equal(
            summary.df(
                eclfiles,
                column_keys=["FOPT", "FOPR"],
                time_index=time_index,
                datetime=True,
            ),
            answer,
        )

    # Pandas does not support DatetimeIndex beyond 2262:
    with pytest.raises(pd.errors.OutOfBoundsDatetime):
        summary.df(
            eclfiles,
            column_keys=["FOPT"],
            time_index=[datetime.date(2300, 1, 1)],
            datetime=True,
        )

    # But without datetime, we can get it extrapolated by libecl:
    assert summary.df(
        eclfiles, column_keys=["FOPT"], time_index=[datetime.date(2300, 1, 1)]
    )["FOPT"].values == [lastfopt]
def test_duplicated_summary_vectors(caplog):
    """EclSum files on disk may contain repeated vectors
    if the user has inserted a vector name twice in the
    SUMMARY section

    ecl2df.summary.df() should deduplicate this, and give a warning.
    """
    # ecl2df.df2eclsum() is not able to mock such a UNSMRY file.
    dupe_datafile = (
        TESTDIR
        / "data"
        / "eightcells"
        / "eightcells_duplicated_summary_vector"
        / "EIGHTCELLS_DUPES.DATA"
    )
    # Sanity-check that the deck really does repeat FOPR:
    assert "SUMMARY\nFOPR\nFOPR" in dupe_datafile.read_text()

    deduplicated_dframe = df(EclFiles(dupe_datafile))
    assert (deduplicated_dframe.columns == ["YEARS", "FOPR"]).all()
    assert "Duplicated columns detected" in caplog.text
def test_paramsupport(tmpdir):
    """Test that we can merge in parameters.txt

    This test code manipulates the paths in the checked out
    repository (as it involves some pointing upwards in the directory
    structure) It should not leave any extra files around,
    but requires certain filenames not to be under version control.
    """
    tmpcsvfile = tmpdir / "sum.csv"
    eclfiles = EclFiles(DATAFILE)

    # First try a key-value parameters.txt file:
    parameterstxt = Path(eclfiles.get_path()) / "parameters.txt"
    if parameterstxt.is_file():
        parameterstxt.unlink()
    parameterstxt.write_text("FOO 1\nBAR 3", encoding="utf-8")
    sys.argv = ["ecl2csv", "summary", DATAFILE, "-o", str(tmpcsvfile), "-p"]
    ecl2csv.main()
    disk_df = pd.read_csv(tmpcsvfile)
    assert "FOPT" in disk_df
    assert "FOO" in disk_df
    assert "BAR" in disk_df
    assert disk_df["BAR"].unique()[0] == 3
    parameterstxt.unlink()

    # Then the same parameters via a yaml file:
    parametersyml = Path(eclfiles.get_path()) / "parameters.yml"
    if parametersyml.is_file():
        parametersyml.unlink()
    parametersyml.write_text(yaml.dump({"FOO": 1, "BAR": 3}), encoding="utf-8")
    sys.argv = ["ecl2csv", "summary", DATAFILE, "-o", str(tmpcsvfile), "-p"]
    ecl2csv.main()
    disk_df = pd.read_csv(str(tmpcsvfile))
    assert "FOPT" in disk_df
    assert "FOO" in disk_df
    assert len(disk_df["FOO"].unique()) == 1
    assert disk_df["FOO"].unique()[0] == 1
    assert "BAR" in disk_df
    assert len(disk_df["BAR"].unique()) == 1
    assert disk_df["BAR"].unique()[0] == 3

    # Test the merging from summary.df() explicitly:
    assert "FOO" in summary.df(eclfiles, params=True, paramfile=None)
    assert "FOO" not in summary.df(eclfiles, params=False, paramfile=None)
    assert "FOO" not in summary.df(eclfiles, params=None, paramfile=None)
    assert "FOO" in summary.df(eclfiles, params=False, paramfile=parametersyml)
    assert "FOO" in summary.df(eclfiles, params=None, paramfile=parametersyml)
    assert "FOO" in summary.df(eclfiles, params=None, paramfile="parameters.yml")

    # Non-existing relative path is a soft error:
    assert "FOO" not in summary.df(
        eclfiles, params=None, paramfile="notexisting/parameters.yml"
    )

    # Non-existing absolute path is a hard error:
    with pytest.raises(FileNotFoundError):
        summary.df(
            eclfiles, params=None, paramfile="/tmp/notexisting/parameters.yml"
        )

    parametersyml.unlink()
def test_summary2df_dates(tmpdir):
    """Test that we have some API possibilities with ISO dates"""
    eclfiles = EclFiles(DATAFILE)

    frame = summary.df(
        eclfiles,
        start_date=datetime.date(2002, 1, 2),
        end_date="2002-03-01",
        time_index="daily",
    )
    assert frame.index.name == "DATE"
    # This is the default when daily index is requested:
    assert frame.index.dtype == "object"
    assert len(frame) == 59
    assert str(frame.index.values[0]) == "2002-01-02"
    assert str(frame.index.values[-1]) == "2002-03-01"

    frame = summary.df(eclfiles, time_index="last")
    assert len(frame) == 1
    assert str(frame.index.values[0]) == "2003-01-02"

    frame = summary.df(eclfiles, time_index="first")
    assert len(frame) == 1
    assert str(frame.index.values[0]) == "2000-01-01"

    # With datetime=True the index becomes a proper datetime index:
    frame = summary.df(
        eclfiles,
        start_date=datetime.date(2002, 1, 2),
        end_date="2002-03-01",
        time_index="daily",
        datetime=True,
    )
    assert frame.index.name == "DATE"
    assert frame.index.dtype in ("datetime64[ns]", "datetime64")

    # Exercise the command line client with date filtering:
    tmpcsvfile = tmpdir.join(".TMP-sum.csv")
    sys.argv = [
        "ecl2csv",
        "summary",
        "-v",
        DATAFILE,
        "-o",
        str(tmpcsvfile),
        "--start_date",
        "2002-01-02",
        "--end_date",
        "2003-01-02",
    ]
    ecl2csv.main()
    disk_df = pd.read_csv(tmpcsvfile)
    assert len(disk_df) == 97
    # Includes timestamps
    assert str(disk_df["DATE"].values[0]) == "2002-01-02 00:00:00"
    assert str(disk_df["DATE"].values[-1]) == "2003-01-02 00:00:00"

    tmpcsvfile = tmpdir.join(".TMP-sum.csv")
    sys.argv = [
        "ecl2csv",
        "summary",
        DATAFILE,
        "-o",
        str(tmpcsvfile),
        "--time_index",
        "daily",
        "--start_date",
        "2002-01-02",
        "--end_date",
        "2003-01-02",
    ]
    ecl2csv.main()
    disk_df = pd.read_csv(tmpcsvfile)
    assert len(disk_df) == 366
    assert str(disk_df["DATE"].values[0]) == "2002-01-02"
    assert str(disk_df["DATE"].values[-1]) == "2003-01-02"