Пример #1
0
    def test_resample(self):
        """Check ``proc.resample`` when the time axis is a regular column.

        A copy of ``timeseries_df`` has its index moved into a column named
        ``time_column``; that column is then passed via ``time_column`` for a
        daily (``"1D"``) resample, once with ``ffill`` and once with
        ``asfreq`` plus ``fill_value=0``.
        """
        source = timeseries_df.copy()
        source.index.name = "time_column"
        source = source.reset_index()

        # Forward fill: every inserted day repeats the previous row's values.
        forward_filled = proc.resample(
            df=source, rule="1D", method="ffill", time_column="time_column"
        )
        expected_labels = ["x", "y", "y", "y", "z", "z", "q"]
        expected_values = [1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 4.0]
        self.assertListEqual(forward_filled["label"].tolist(), expected_labels)
        self.assertListEqual(forward_filled["y"].tolist(), expected_values)

        # asfreq with fill_value=0: every inserted day holds 0 in all columns.
        zero_filled = proc.resample(
            df=source,
            rule="1D",
            method="asfreq",
            time_column="time_column",
            fill_value=0,
        )
        expected_labels = ["x", "y", 0, 0, "z", 0, "q"]
        expected_values = [1.0, 2.0, 0, 0, 3.0, 0, 4.0]
        self.assertListEqual(zero_filled["label"].tolist(), expected_labels)
        self.assertListEqual(zero_filled["y"].tolist(), expected_values)
Пример #2
0
def test_resample_should_raise_ex():
    """``pp.resample`` must raise ``InvalidPostProcessingError`` on bad input.

    Two calls are exercised: one with ``categories_df`` as the frame, and one
    with ``timeseries_df`` but ``method="foobar"``.
    """
    # NOTE(review): presumably the first case fails because of the frame and
    # the second because of the method name -- confirm against resample().
    failing_calls = (
        {"df": categories_df, "rule": "1D", "method": "asfreq"},
        {"df": timeseries_df, "rule": "1D", "method": "foobar"},
    )
    for call_kwargs in failing_calls:
        with pytest.raises(InvalidPostProcessingError):
            pp.resample(**call_kwargs)
Пример #3
0
def test_resample():
    """Daily ``ffill`` resample of ``timeseries_df`` yields a gap-free week."""
    # Expected result:
    #
    #                label    y
    #     2019-01-01     x  1.0
    #     2019-01-02     y  2.0
    #     2019-01-03     y  2.0
    #     2019-01-04     y  2.0
    #     2019-01-05     z  3.0
    #     2019-01-06     z  3.0
    #     2019-01-07     q  4.0
    expected_index = pd.to_datetime(
        [
            "2019-01-01",
            "2019-01-02",
            "2019-01-03",
            "2019-01-04",
            "2019-01-05",
            "2019-01-06",
            "2019-01-07",
        ]
    )
    expected = pd.DataFrame(
        index=expected_index,
        data={
            "label": ["x", "y", "y", "y", "z", "z", "q"],
            "y": [1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 4.0],
        },
    )

    actual = pp.resample(df=timeseries_df, rule="1D", method="ffill")

    assert actual.equals(expected)
Пример #4
0
def test_resample():
    """Check ``resample`` with the time axis supplied as a named column.

    The index of a ``timeseries_df`` copy is turned into a ``time_column``
    column, then the frame is resampled daily with ``ffill`` and with
    ``asfreq`` using ``fill_value=0``.
    """
    frame = timeseries_df.copy()
    frame.index.name = "time_column"
    frame = frame.reset_index()

    # Arguments common to both resample calls below.
    shared = {"df": frame, "rule": "1D", "time_column": "time_column"}

    # Forward fill repeats the last seen row for each inserted day.
    forward_filled = resample(method="ffill", **shared)
    labels = forward_filled["label"].tolist()
    values = forward_filled["y"].tolist()
    assert labels == ["x", "y", "y", "y", "z", "z", "q"]
    assert values == [1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 4.0]

    # asfreq with fill_value=0 puts 0 into every column of inserted days.
    zero_filled = resample(method="asfreq", fill_value=0, **shared)
    labels = zero_filled["label"].tolist()
    values = zero_filled["y"].tolist()
    assert labels == ["x", "y", 0, 0, "z", 0, "q"]
    assert values == [1.0, 2.0, 0, 0, 3.0, 0, 4.0]
Пример #5
0
def test_resample_zero_fill():
    """Daily ``asfreq`` resample with ``fill_value=0`` zero-fills new rows.

    Both the ``label`` and the ``y`` column are expected to hold ``0`` on the
    days that the resample inserts.
    """
    expected_days = pd.to_datetime(
        [
            "2019-01-01",
            "2019-01-02",
            "2019-01-03",
            "2019-01-04",
            "2019-01-05",
            "2019-01-06",
            "2019-01-07",
        ]
    )
    expected = pd.DataFrame(
        index=expected_days,
        data={
            "label": ["x", "y", 0, 0, "z", 0, "q"],
            "y": [1.0, 2.0, 0, 0, 3.0, 0, 4.0],
        },
    )

    actual = pp.resample(
        df=timeseries_df,
        rule="1D",
        method="asfreq",
        fill_value=0,
    )

    assert actual.equals(expected)
Пример #6
0
def test_resample_linear():
    """Daily ``linear`` resample interpolates ``y`` and leaves ``label`` gaps.

    The input has rows on 2019-01-01, 2019-01-05 and 2019-01-08. The expected
    output has one row per day, with ``y`` increasing by 1.0 per day and
    ``label`` set to NaN on the inserted days.
    """
    df = pd.DataFrame(
        index=pd.to_datetime(["2019-01-01", "2019-01-05", "2019-01-08"]),
        data={"label": ["a", "e", "j"], "y": [1.0, 5.0, 8.0]},
    )
    post_df = pp.resample(df=df, rule="1D", method="linear")
    # Expected result:
    #
    #                label    y
    #     2019-01-01     a  1.0
    #     2019-01-02   NaN  2.0
    #     2019-01-03   NaN  3.0
    #     2019-01-04   NaN  4.0
    #     2019-01-05     e  5.0
    #     2019-01-06   NaN  6.0
    #     2019-01-07   NaN  7.0
    #     2019-01-08     j  8.0
    #
    # ``np.nan`` is used rather than ``np.NaN``: the capitalised alias was
    # removed in NumPy 2.0 and raises AttributeError there.
    assert post_df.equals(
        pd.DataFrame(
            index=pd.to_datetime(
                [
                    "2019-01-01",
                    "2019-01-02",
                    "2019-01-03",
                    "2019-01-04",
                    "2019-01-05",
                    "2019-01-06",
                    "2019-01-07",
                    "2019-01-08",
                ]
            ),
            data={
                "label": ["a", np.nan, np.nan, np.nan, "e", np.nan, np.nan, "j"],
                "y": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
            },
        )
    )
Пример #7
0
def test_resample_with_groupby():
    """Check ``resample`` when ``groupby_columns`` is supplied.

    The input frame contains a timestamp column, a string column and a
    numeric column:

           __timestamp     city  val
        0   2022-01-13  Chicago  6.0
        1   2022-01-13       LA  5.0
        2   2022-01-13       NY  4.0
        3   2022-01-11  Chicago  3.0
        4   2022-01-11       LA  2.0
        5   2022-01-11       NY  1.0
    """
    timestamps = to_datetime(["2022-01-13"] * 3 + ["2022-01-11"] * 3)
    df = DataFrame(
        {
            "__timestamp": timestamps,
            "city": ["Chicago", "LA", "NY", "Chicago", "LA", "NY"],
            "val": [6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
        }
    )
    # Arguments shared by every resample call in this test.
    base_kwargs = {
        "df": df,
        "rule": "1D",
        "method": "asfreq",
        "fill_value": 0,
        "time_column": "__timestamp",
    }

    post_df = resample(groupby_columns=("city",), **base_kwargs)

    expected_columns = ["__timestamp", "city", "val"]
    assert list(post_df.columns) == expected_columns

    # Three rows per day, including the inserted 2022-01-12.
    expected_days = ["2022-01-11"] * 3 + ["2022-01-12"] * 3 + ["2022-01-13"] * 3
    actual_days = [str(stamp.date()) for stamp in post_df["__timestamp"]]
    assert actual_days == expected_days

    expected_vals = [3.0, 2.0, 1.0, 0, 0, 0, 6.0, 5.0, 4.0]
    assert list(post_df["val"]) == expected_vals

    # An error is expected both for a groupby column that does not exist in
    # the frame and for a None entry in the groupby list.
    invalid_groupbys = (
        ("city", "unkonw_column"),
        ("city", None),
    )
    for groupby_columns in invalid_groupbys:
        with pytest.raises(QueryObjectValidationError):
            resample(groupby_columns=groupby_columns, **base_kwargs)
Пример #8
0
def test_resample_after_pivot():
    """Pivot by city, resample daily with zero fill, then flatten.

    Runs ``pp.pivot`` -> ``pp.resample`` -> ``pp.flatten`` on a small frame
    and compares the flattened result with a hand-built expected frame.
    """
    timestamps = pd.to_datetime(
        [
            "2022-01-13",
            "2022-01-13",
            "2022-01-13",
            "2022-01-11",
            "2022-01-11",
            "2022-01-11",
        ]
    )
    source = pd.DataFrame(
        data={
            "__timestamp": timestamps,
            "city": ["Chicago", "LA", "NY", "Chicago", "LA", "NY"],
            "val": [6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
        }
    )

    # Pivoted frame:
    #
    #                     val
    #     city        Chicago   LA   NY
    #     __timestamp
    #     2022-01-11      3.0  2.0  1.0
    #     2022-01-13      6.0  5.0  4.0
    pivoted = pp.pivot(
        df=source,
        index=["__timestamp"],
        columns=["city"],
        aggregates={"val": {"operator": "sum"}},
        flatten_columns=False,
        reset_index=False,
    )

    # Resampled frame (2022-01-12 inserted and zero-filled):
    #
    #                     val
    #     city        Chicago   LA   NY
    #     __timestamp
    #     2022-01-11      3.0  2.0  1.0
    #     2022-01-12      0.0  0.0  0.0
    #     2022-01-13      6.0  5.0  4.0
    resampled = pp.resample(
        df=pivoted, rule="1D", method="asfreq", fill_value=0
    )

    # Flattened frame:
    #
    #       __timestamp  val, Chicago  val, LA  val, NY
    #     0  2022-01-11           3.0      2.0      1.0
    #     1  2022-01-12           0.0      0.0      0.0
    #     2  2022-01-13           6.0      5.0      4.0
    flattened = pp.flatten(resampled)

    expected = pd.DataFrame(
        data={
            "__timestamp": pd.to_datetime(
                ["2022-01-11", "2022-01-12", "2022-01-13"]
            ),
            "val, Chicago": [3.0, 0, 6.0],
            "val, LA": [2.0, 0, 5.0],
            "val, NY": [1.0, 0, 4.0],
        }
    )
    assert flattened.equals(expected)
Пример #9
0
def test_resample_should_not_side_effect():
    """``pp.resample`` must leave the frame passed as ``df`` unchanged."""
    working_copy = timeseries_df.copy()

    # The return value is ignored on purpose; only the input frame matters.
    pp.resample(df=working_copy, rule="1D", method="ffill")

    still_identical = working_copy.equals(timeseries_df)
    assert still_identical