def test_assert_date_column_is_datetime_object_datetime_year_2263() -> None:
    """INPUT_DATETIME_YEAR_2263_DF must pass the "DATE" column validation.

    The test fails if the validation raises any exception for this input.
    """
    try:
        assert_date_column_is_datetime_object(INPUT_DATETIME_YEAR_2263_DF)
    # Catch `Exception` rather than using a bare `except:` so that
    # KeyboardInterrupt/SystemExit still propagate, and report what was raised.
    except Exception as exc:  # pylint: disable=broad-except
        pytest.fail(f"Expected no error to be raised, got {exc!r}")
def test_assert_date_column_is_datetime_object_no_rows_df() -> None:
    """INPUT_NO_ROWS_DF must pass the "DATE" column validation.

    The test fails if the validation raises any exception for this input.
    """
    try:
        assert_date_column_is_datetime_object(INPUT_NO_ROWS_DF)
    # Catch `Exception` rather than using a bare `except:` so that
    # KeyboardInterrupt/SystemExit still propagate, and report what was raised.
    except Exception as exc:  # pylint: disable=broad-except
        pytest.fail(f"Expected no error to be raised, got {exc!r}")
def test_assert_date_column_is_datetime_object_timestamp_datetime_input_error() -> None:
    """INPUT_TIMESTAMP_DATETIME_DF must be rejected with a ValueError.

    The error text has to match the "not on datetime.datetime format" message
    exactly.
    """
    expected = '"DATE"-column in dataframe is not on datetime.datetime format!'

    with pytest.raises(ValueError) as exc_info:
        assert_date_column_is_datetime_object(INPUT_TIMESTAMP_DATETIME_DF)

    raised_message = str(exc_info.value)
    assert raised_message == expected
def create_relative_to_date_df(
    df: pd.DataFrame, relative_date: datetime.datetime
) -> pd.DataFrame:
    """
    Create dataframe where data for relative_date is subtracted from respective
    vector data.

    I.e. subtract realization data at given relative date from corresponding
    realizations at each individual date for each vector column in dataframe.

    `Assume:`
    Set of realizations are equal for each date in "DATE" column of input dataframe.

    `Input:`
    * df - `Columns` in dataframe: ["DATE", "REAL", vector1, ..., vectorN]
    * relative_date - Date whose per-realization values are subtracted

    `Output:`
    * df - `Columns` in dataframe: ["DATE", "REAL", vector1, ..., vectorN]

    `Raises:`
    * ValueError - If "DATE" or "REAL" column is missing in input dataframe.
      The "DATE" column is additionally validated by
      assert_date_column_is_datetime_object before that check.

    NOTE:
    - This function iterates over realization groups in input dataframe
    - A realization without a row at relative_date is excluded from the output.
    - If no row at all matches relative_date, a dataframe with the input columns
      and dtypes, but no rows, is returned.
    """
    assert_date_column_is_datetime_object(df)

    if not {"DATE", "REAL"}.issubset(df.columns):
        raise ValueError('Expect column "DATE" and "REAL" in input dataframe!')

    # Empty dataframe with the same columns and dtypes as the input
    output_df = pd.DataFrame(
        {name: pd.Series(dtype=df.dtypes[name]) for name in df.columns}
    )

    relative_date_df: pd.DataFrame = df.loc[df["DATE"] == relative_date].drop(
        columns=["DATE"]
    )
    if relative_date_df.empty:
        # Dataframe with columns, but no rows
        return output_df

    vectors = [elm for elm in df.columns if elm not in ("DATE", "REAL")]

    # Collect the per-realization results and concatenate once after the loop;
    # concatenating inside the loop re-copies the accumulated data each iteration.
    adjusted_frames = []

    # NOTE: This for-loop makes it possible to get real not represented in
    # relative_date_df!
    for real, real_df in df.groupby("REAL"):
        relative_date_data = relative_date_df.loc[
            relative_date_df["REAL"] == real
        ].drop(columns=["REAL"])

        # If realization does not exist in relative_date_df
        if relative_date_data.empty:
            continue

        # Work on an explicit copy so the assignment never targets a slice
        # of the grouped input dataframe.
        adjusted_df = real_df.copy()
        adjusted_df[vectors] = adjusted_df[vectors].sub(
            relative_date_data.iloc[0], axis=1
        )
        adjusted_frames.append(adjusted_df)

    if adjusted_frames:
        # Keep the typed, empty frame first, as in the incremental concatenation
        output_df = pd.concat([output_df, *adjusted_frames], ignore_index=True)

    make_date_column_datetime_object(output_df)

    return output_df
def test_assert_date_column_is_datetime_object_timestamp_input_error() -> None:
    """A "DATE" column holding pd.Timestamp values must raise ValueError.

    The error text has to match the "not on datetime.datetime format" message
    exactly.
    """
    timestamp_rows = [
        [pd.Timestamp(2000, 1, 15), 1.0],
        [pd.Timestamp(2000, 2, 15), 1.0],
    ]
    input_timestamp_df = pd.DataFrame(data=timestamp_rows, columns=["DATE", "A"])
    expected = '"DATE"-column in dataframe is not on datetime.datetime format!'

    with pytest.raises(ValueError) as exc_info:
        assert_date_column_is_datetime_object(input_timestamp_df)

    raised_message = str(exc_info.value)
    assert raised_message == expected
def test_assert_date_column_is_datetime_object_datetime_timestamp_df() -> None:
    """A "DATE" column mixing datetime.datetime and pd.Timestamp must not raise.

    The first row holds a datetime.datetime and the second a pd.Timestamp; the
    validation is expected to accept this input without raising.
    """
    # fmt: off
    input_datetime_timestamp_df = pd.DataFrame(
        data=[
            [datetime.datetime(2263, 2, 1), 1.0],
            [pd.Timestamp(2000, 1, 1), 1.0],  # pd.Timestamp NOT detected in df["DATE"][1]
        ],
        columns=["DATE", "A"],
    )
    # fmt: on

    try:
        assert_date_column_is_datetime_object(input_datetime_timestamp_df)
    # Catch `Exception` rather than using a bare `except:` so that
    # KeyboardInterrupt/SystemExit still propagate, and report what was raised.
    except Exception as exc:  # pylint: disable=broad-except
        pytest.fail(f"Expected no error to be raised, got {exc!r}")
def test_assert_date_column_is_datetime_object_datetime_inconsistent_index_df() -> None:
    """To verify iloc usage.

    The dataframe uses a non-contiguous index ([2, 5, 9]) that does not start
    at 0; the validation must still accept it without raising.
    """
    # fmt: off
    input_datetime_year_2263_inconsistent_index_df = pd.DataFrame(
        columns=["DATE", "A"],
        data=[
            [datetime.datetime(2263, 1, 15), 1.0],
            [datetime.datetime(2263, 2, 15), 2.0],
            [datetime.datetime(2263, 3, 15), 3.0],
        ],
        index=[2, 5, 9],
    )
    # fmt: on

    try:
        assert_date_column_is_datetime_object(
            input_datetime_year_2263_inconsistent_index_df
        )
    # Catch `Exception` rather than using a bare `except:` so that
    # KeyboardInterrupt/SystemExit still propagate, and report what was raised.
    except Exception as exc:  # pylint: disable=broad-except
        pytest.fail(f"Expected no error to be raised, got {exc!r}")
def test_assert_date_column_is_datetime_object_no_date_column_error() -> None:
    """INPUT_EMPTY_DF must be rejected with the missing "DATE" column error."""
    expected = 'df does not contain column "DATE"'

    with pytest.raises(ValueError) as exc_info:
        assert_date_column_is_datetime_object(INPUT_EMPTY_DF)

    raised_message = str(exc_info.value)
    assert raised_message == expected
def create_vectors_statistics_df(vectors_df: pd.DataFrame) -> pd.DataFrame:
    """
    Create vectors statistics dataframe for given vectors in columns of provided
    vectors dataframe.

    Calculate min, max, mean, p10, p90 and p50 for each vector in dataframe column,
    grouped by "DATE".

    `Input:`
    * vectors_df: pd.DataFrame - Dataframe with vectors data and columns:
      ["DATE", "REAL", vector1, ... , vectorN]

    `Returns:`
    * Dataframe with double column level:
      [ "DATE",       vector1,                        ...       vectorN
                      MEAN, MIN, MAX, P10, P90, P50   ...       MEAN, MIN, MAX, P10, P90, P50]

    NOTE:
    - P10 and P90 are inverted relative to the numerical percentiles, i.e. P10 is
      the 90th percentile and P90 is the 10th percentile.
    - If the input has no rows, a dataframe with the column layout above and no
      rows is returned.
    """
    assert_date_column_is_datetime_object(vectors_df)

    # Get vectors names, keep order.
    # Use set difference (not symmetric difference): with `^` a missing "REAL"
    # column would be added as a vector name and the index lookup below would
    # raise ValueError.
    columns_list = list(vectors_df.columns)
    vector_names = sorted(
        set(columns_list) - {"DATE", "REAL"}, key=columns_list.index
    )

    # If no rows of data:
    if not vectors_df.shape[0]:
        statistics_order = (
            StatisticsOptions.MEAN,
            StatisticsOptions.MIN,
            StatisticsOptions.MAX,
            StatisticsOptions.P10,
            StatisticsOptions.P90,
            StatisticsOptions.P50,
        )
        columns_tuples = [("DATE", "")]
        for vector in vector_names:
            columns_tuples.extend(
                (vector, statistic) for statistic in statistics_order
            )
        return pd.DataFrame(columns=pd.MultiIndex.from_tuples(columns_tuples))

    # Invert p10 and p90 due to oil industry convention.
    # NOTE: The function names below become the level 1 column labels of the
    # aggregated dataframe and are the keys of the rename map further down.
    def p10(x: List[float]) -> np.floating:
        return np.nanpercentile(x, q=90)

    def p90(x: List[float]) -> np.floating:
        return np.nanpercentile(x, q=10)

    def p50(x: List[float]) -> np.floating:
        return np.nanpercentile(x, q=50)

    statistics_df: pd.DataFrame = (
        vectors_df[["DATE"] + vector_names]
        .groupby(["DATE"])
        .agg([np.nanmean, np.nanmin, np.nanmax, p10, p90, p50])
        .reset_index(level=["DATE"], col_level=0)
    )

    # Rename columns to StatisticsOptions enum types for strongly typed format
    col_stat_label_map = {
        "nanmin": StatisticsOptions.MIN,
        "nanmax": StatisticsOptions.MAX,
        "nanmean": StatisticsOptions.MEAN,
        "p10": StatisticsOptions.P10,
        "p90": StatisticsOptions.P90,
        "p50": StatisticsOptions.P50,
    }
    statistics_df.rename(columns=col_stat_label_map, level=1, inplace=True)

    make_date_column_datetime_object(statistics_df)

    return statistics_df