示例#1
0
def test_assert_date_column_is_datetime_object_datetime_year_2263() -> None:
    """
    The check must accept INPUT_DATETIME_YEAR_2263_DF without raising.

    NOTE(review): the fixture is defined outside this block; presumably its "DATE"
    column holds datetime.datetime objects in year 2263 - confirm against fixture.
    """
    try:
        assert_date_column_is_datetime_object(INPUT_DATETIME_YEAR_2263_DF)

    # Catch Exception rather than using a bare except, so KeyboardInterrupt and
    # SystemExit still propagate and the unexpected error is reported.
    except Exception as err:  # pylint: disable=broad-except
        pytest.fail(f"Expected no raise of ERROR! Got: {err!r}")
示例#2
0
def test_assert_date_column_is_datetime_object_no_rows_df() -> None:
    """
    The check must accept INPUT_NO_ROWS_DF without raising.

    NOTE(review): the fixture is defined outside this block; presumably it is a
    dataframe with a "DATE" column but no rows - confirm against fixture.
    """
    try:
        assert_date_column_is_datetime_object(INPUT_NO_ROWS_DF)

    # Catch Exception rather than using a bare except, so KeyboardInterrupt and
    # SystemExit still propagate and the unexpected error is reported.
    except Exception as err:  # pylint: disable=broad-except
        pytest.fail(f"Expected no raise of ERROR! Got: {err!r}")
示例#3
0
def test_assert_date_column_is_datetime_object_timestamp_datetime_input_error() -> None:
    """
    The check must raise ValueError, with the exact message below, for
    INPUT_TIMESTAMP_DATETIME_DF.
    """
    expected_message = (
        '"DATE"-column in dataframe is not on datetime.datetime format!'
    )

    with pytest.raises(ValueError) as raised:
        assert_date_column_is_datetime_object(INPUT_TIMESTAMP_DATETIME_DF)

    actual_message = str(raised.value)
    assert actual_message == expected_message
示例#4
0
def create_relative_to_date_df(
    df: pd.DataFrame, relative_date: datetime.datetime
) -> pd.DataFrame:
    """
    Create dataframe where data for relative_date is subtracted from respective
    vector data.

    I.e. Subtract realization data at given relative date from corresponding
    realizations at each individual date for each vector column in dataframe.

    `Assume:`
    Set of realizations are equal for each date in "DATE" column of input dataframe.

    `Input:`
    * df - `Columns` in dataframe: ["DATE", "REAL", vector1, ..., vectorN]
    * relative_date - Date whose vector data is subtracted from the data at every date

    `Output:`
    * df - `Columns` in dataframe: ["DATE", "REAL", vector1, ..., vectorN]

    `Raises:`
    * ValueError - If column "DATE" or "REAL" is missing in the input dataframe

    NOTE:
    - This function iterates over realization groups in the input dataframe
    - A realization that has no row for relative_date is excluded from the output
    - If no row matches relative_date, a dataframe with the input columns and
    dtypes, but no rows, is returned
    """

    assert_date_column_is_datetime_object(df)

    if not {"DATE", "REAL"}.issubset(df.columns):
        raise ValueError('Expect column "DATE" and "REAL" in input dataframe!')

    # Empty dataframe with the same columns and dtypes as the input
    _columns = {name: pd.Series(dtype=df.dtypes[name]) for name in df.columns}
    output_df = pd.DataFrame(_columns)

    relative_date_df: pd.DataFrame = df.loc[df["DATE"] == relative_date].drop(
        columns=["DATE"]
    )
    if relative_date_df.empty:
        # Dataframe with columns, but no rows
        return output_df

    vectors = [elm for elm in df.columns if elm not in ("DATE", "REAL")]

    # Collect the adjusted realization groups and concatenate once after the loop;
    # concatenating inside the loop re-copies all accumulated rows per iteration.
    adjusted_dfs: list = []

    # NOTE: Looping over all realizations in df makes it possible to encounter a
    # realization which is not represented in relative_date_df!
    for real, real_df in df.groupby("REAL"):
        relative_date_data = relative_date_df.loc[
            relative_date_df["REAL"] == real
        ].drop(columns=["REAL"])

        # If realization does not exist in relative_date_df
        if relative_date_data.empty:
            continue

        # Work on an explicit copy so the assignment never targets a view/slice
        # of the input dataframe.
        adjusted_df = real_df.copy()

        # First row holds the vector values of this realization at relative_date;
        # axis=1 aligns the subtraction on the vector column names.
        adjusted_df[vectors] = adjusted_df[vectors].sub(
            relative_date_data.iloc[0], axis=1
        )
        adjusted_dfs.append(adjusted_df)

    # The typed empty dataframe is kept as first element, as in the previous
    # per-iteration concatenation, so the result is well defined with no groups.
    output_df = pd.concat([output_df, *adjusted_dfs], ignore_index=True)

    make_date_column_datetime_object(output_df)
    return output_df
示例#5
0
def test_assert_date_column_is_datetime_object_timestamp_input_error() -> None:
    """
    The check must raise ValueError, with the exact message below, when the rows
    of the "DATE" column are given as pd.Timestamp values.
    """
    expected_message = (
        '"DATE"-column in dataframe is not on datetime.datetime format!'
    )

    # Row-wise data: [DATE, A]
    rows = [
        [pd.Timestamp(2000, 1, 15), 1.0],
        [pd.Timestamp(2000, 2, 15), 1.0],
    ]
    input_timestamp_df = pd.DataFrame(data=rows, columns=["DATE", "A"])

    with pytest.raises(ValueError) as raised:
        assert_date_column_is_datetime_object(input_timestamp_df)

    actual_message = str(raised.value)
    assert actual_message == expected_message
示例#6
0
def test_assert_date_column_is_datetime_object_datetime_timestamp_df() -> None:
    """
    The check must not raise for a "DATE" column with a datetime.datetime in the
    first row and a pd.Timestamp in the second row.

    This test expects the pd.Timestamp in the second row to go undetected.
    """
    # fmt: off
    input_datetime_timestamp_df = pd.DataFrame(
        data=[
            [datetime.datetime(2263, 2, 1), 1.0],
            [pd.Timestamp(2000, 1, 1),      1.0],  # pd.Timestamp NOT detected in df["DATE"][1]
        ],
        columns=["DATE", "A"],
    )
    # fmt: on
    try:
        assert_date_column_is_datetime_object(input_datetime_timestamp_df)

    # Catch Exception rather than using a bare except, so KeyboardInterrupt and
    # SystemExit still propagate and the unexpected error is reported.
    except Exception as err:  # pylint: disable=broad-except
        pytest.fail(f"Expected no raise of ERROR! Got: {err!r}")
示例#7
0
def test_assert_date_column_is_datetime_object_datetime_inconsistent_index_df() -> None:
    """
    To verify iloc usage.

    The dataframe index is [2, 5, 9], i.e. it does not start at 0 and is not
    contiguous. The check must still accept the datetime.datetime "DATE" column
    without raising.
    """
    # fmt: off
    input_datetime_year_2263_inconsistent_index_df = pd.DataFrame(
        columns=["DATE", "A"],
        data=[
            [datetime.datetime(2263, 1, 15), 1.0],
            [datetime.datetime(2263, 2, 15), 2.0],
            [datetime.datetime(2263, 3, 15), 3.0],
        ],
        index = [2,5,9]
    )
    # fmt: on
    try:
        assert_date_column_is_datetime_object(
            input_datetime_year_2263_inconsistent_index_df
        )

    # Catch Exception rather than using a bare except, so KeyboardInterrupt and
    # SystemExit still propagate and the unexpected error is reported.
    except Exception as err:  # pylint: disable=broad-except
        pytest.fail(f"Expected no raise of ERROR! Got: {err!r}")
示例#8
0
def test_assert_date_column_is_datetime_object_no_date_column_error() -> None:
    """
    The check must raise ValueError, with the exact message below, for
    INPUT_EMPTY_DF.
    """
    expected_message = 'df does not contain column "DATE"'

    with pytest.raises(ValueError) as raised:
        assert_date_column_is_datetime_object(INPUT_EMPTY_DF)

    actual_message = str(raised.value)
    assert actual_message == expected_message
def create_vectors_statistics_df(vectors_df: pd.DataFrame) -> pd.DataFrame:
    """
    Create vectors statistics dataframe for given vectors in columns of provided vectors dataframe

    Calculate min, max, mean, p10, p90 and p50 for each vector in dataframe column,
    grouped by the "DATE" column. NaN values are ignored in the calculations.

    `Input:`
    * vectors_df: pd.DataFrame - Dataframe with vectors data and columns:
        ["DATE", "REAL", vector1, ... , vectorN]

    `Returns:`
    * Dataframe with double column level:
      [ "DATE",     vector1,                        ... vectorN
                    MEAN, MIN, MAX, P10, P90, P50   ... MEAN, MIN, MAX, P10, P90, P50]

    NOTE:
    - If the input has no rows, a dataframe with the column layout above and no
    rows is returned
    - P10 and P90 follow the oil industry convention, i.e. P10 is the 90th
    percentile and P90 is the 10th percentile
    """
    assert_date_column_is_datetime_object(vectors_df)

    # Get vector names: every column except "DATE" and "REAL", keeping column order.
    # A plain exclusion is used (not a symmetric difference), so a dataframe without
    # "REAL" column does not end up with "REAL" listed as a vector name.
    vector_names = list(
        dict.fromkeys(
            name for name in vectors_df.columns if name not in ("DATE", "REAL")
        )
    )

    # If no rows of data:
    if not vectors_df.shape[0]:
        statistics_order = (
            StatisticsOptions.MEAN,
            StatisticsOptions.MIN,
            StatisticsOptions.MAX,
            StatisticsOptions.P10,
            StatisticsOptions.P90,
            StatisticsOptions.P50,
        )
        columns_tuples = [("DATE", "")]
        for vector in vector_names:
            columns_tuples.extend(
                (vector, statistic) for statistic in statistics_order
            )
        return pd.DataFrame(columns=pd.MultiIndex.from_tuples(columns_tuples))

    # Invert p10 and p90 due to oil industry convention.
    # The function names become the second level column labels after aggregation.
    def p10(x: List[float]) -> np.floating:
        return np.nanpercentile(x, q=90)

    def p90(x: List[float]) -> np.floating:
        return np.nanpercentile(x, q=10)

    def p50(x: List[float]) -> np.floating:
        return np.nanpercentile(x, q=50)

    # One row per date, with "DATE" moved from index to a column on the top level
    statistics_df: pd.DataFrame = (
        vectors_df[["DATE"] + vector_names]
        .groupby(["DATE"])
        .agg([np.nanmean, np.nanmin, np.nanmax, p10, p90, p50])
        .reset_index(level=["DATE"], col_level=0)
    )

    # Rename columns to StatisticsOptions enum types for strongly typed format
    col_stat_label_map = {
        "nanmin": StatisticsOptions.MIN,
        "nanmax": StatisticsOptions.MAX,
        "nanmean": StatisticsOptions.MEAN,
        "p10": StatisticsOptions.P10,
        "p90": StatisticsOptions.P90,
        "p50": StatisticsOptions.P50,
    }
    statistics_df.rename(columns=col_stat_label_map, level=1, inplace=True)

    make_date_column_datetime_object(statistics_df)

    return statistics_df