def test_resample(self):
    """Check ``proc.resample`` on a frame whose timestamps live in a column.

    The index of ``timeseries_df`` is moved into a regular column named
    ``time_column``. The frame is then resampled with a ``"1D"`` rule, first
    with ``ffill`` and then with ``asfreq`` plus ``fill_value=0``.
    """
    frame = timeseries_df.copy()
    frame.index.name = "time_column"
    frame.reset_index(inplace=True)
    shared_args = {"df": frame, "rule": "1D", "time_column": "time_column"}

    # Forward fill: the inserted rows repeat the preceding row's values.
    forward_filled = proc.resample(method="ffill", **shared_args)
    expected_labels = ["x", "y", "y", "y", "z", "z", "q"]
    expected_values = [1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 4.0]
    self.assertListEqual(forward_filled["label"].tolist(), expected_labels)
    self.assertListEqual(forward_filled["y"].tolist(), expected_values)

    # asfreq with fill_value=0: the inserted rows hold 0 in every column.
    zero_filled = proc.resample(method="asfreq", fill_value=0, **shared_args)
    expected_labels = ["x", "y", 0, 0, "z", 0, "q"]
    expected_values = [1.0, 2.0, 0, 0, 3.0, 0, 4.0]
    self.assertListEqual(zero_filled["label"].tolist(), expected_labels)
    self.assertListEqual(zero_filled["y"].tolist(), expected_values)
def test_resample_should_raise_ex():
    """``pp.resample`` must raise ``InvalidPostProcessingError`` on bad input.

    Two cases are exercised in order: ``categories_df`` with ``asfreq``
    (NOTE(review): presumably rejected because that frame has no datetime
    index -- confirm against the fixture), and ``timeseries_df`` with the
    method name ``"foobar"``.
    """
    failing_cases = (
        (categories_df, "asfreq"),
        (timeseries_df, "foobar"),
    )
    for frame, method_name in failing_cases:
        with pytest.raises(InvalidPostProcessingError):
            pp.resample(df=frame, rule="1D", method=method_name)
def test_resample():
    """Daily ``ffill`` resampling of ``timeseries_df`` yields seven filled rows."""
    # Expected result:
    #
    #            label    y
    # 2019-01-01     x  1.0
    # 2019-01-02     y  2.0
    # 2019-01-03     y  2.0
    # 2019-01-04     y  2.0
    # 2019-01-05     z  3.0
    # 2019-01-06     z  3.0
    # 2019-01-07     q  4.0
    expected_days = [f"2019-01-0{day}" for day in range(1, 8)]
    expected_df = pd.DataFrame(
        index=pd.to_datetime(expected_days),
        data={
            "label": ["x", "y", "y", "y", "z", "z", "q"],
            "y": [1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 4.0],
        },
    )

    post_df = pp.resample(df=timeseries_df, rule="1D", method="ffill")

    assert post_df.equals(expected_df)
def test_resample():
    """Resample via an explicit ``time_column`` using ``ffill`` and ``asfreq``.

    The index of ``timeseries_df`` is first turned into a regular column
    called ``time_column``; each case then resamples that frame with a
    ``"1D"`` rule and checks the ``label`` and ``y`` columns.
    """
    source = timeseries_df.copy()
    source.index.name = "time_column"
    source.reset_index(inplace=True)

    # (extra keyword arguments, expected "label" values, expected "y" values)
    cases = (
        (
            {"method": "ffill"},
            ["x", "y", "y", "y", "z", "z", "q"],
            [1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 4.0],
        ),
        (
            {"method": "asfreq", "fill_value": 0},
            ["x", "y", 0, 0, "z", 0, "q"],
            [1.0, 2.0, 0, 0, 3.0, 0, 4.0],
        ),
    )
    for extra_kwargs, want_labels, want_values in cases:
        post_df = resample(
            df=source,
            rule="1D",
            time_column="time_column",
            **extra_kwargs,
        )
        assert post_df["label"].tolist() == want_labels
        assert post_df["y"].tolist() == want_values
def test_resample_zero_fill():
    """Daily ``asfreq`` resampling with ``fill_value=0`` zero-fills new rows."""
    expected_days = [f"2019-01-0{day}" for day in range(1, 8)]
    expected_df = pd.DataFrame(
        index=pd.to_datetime(expected_days),
        data={
            # Rows that the assertions expect to be inserted carry 0 in
            # both columns.
            "label": ["x", "y", 0, 0, "z", 0, "q"],
            "y": [1.0, 2.0, 0, 0, 3.0, 0, 4.0],
        },
    )

    post_df = pp.resample(
        df=timeseries_df,
        rule="1D",
        method="asfreq",
        fill_value=0,
    )

    assert post_df.equals(expected_df)
def test_resample_linear():
    """Daily ``linear`` resampling interpolates numbers but not strings.

    The input has rows on 2019-01-01, 2019-01-05 and 2019-01-08. After
    resampling, ``y`` is expected to be linearly interpolated for every day
    in between, while the string column ``label`` is expected to be NaN on
    the inserted rows.
    """
    df = pd.DataFrame(
        index=pd.to_datetime(["2019-01-01", "2019-01-05", "2019-01-08"]),
        data={"label": ["a", "e", "j"], "y": [1.0, 5.0, 8.0]},
    )

    post_df = pp.resample(df=df, rule="1D", method="linear")

    # Expected result:
    #
    #            label    y
    # 2019-01-01     a  1.0
    # 2019-01-02   NaN  2.0
    # 2019-01-03   NaN  3.0
    # 2019-01-04   NaN  4.0
    # 2019-01-05     e  5.0
    # 2019-01-06   NaN  6.0
    # 2019-01-07   NaN  7.0
    # 2019-01-08     j  8.0
    #
    # ``np.nan`` is used rather than the ``np.NaN`` alias, which NumPy 2.0
    # removed.
    expected_df = pd.DataFrame(
        index=pd.to_datetime(
            [
                "2019-01-01",
                "2019-01-02",
                "2019-01-03",
                "2019-01-04",
                "2019-01-05",
                "2019-01-06",
                "2019-01-07",
                "2019-01-08",
            ]
        ),
        data={
            "label": ["a", np.nan, np.nan, np.nan, "e", np.nan, np.nan, "j"],
            "y": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
        },
    )
    assert post_df.equals(expected_df)
def test_resample_with_groupby():
    """Resample with ``groupby_columns`` and validate the groupby argument.

    The input frame has a timestamp column, a string column and a numeric
    column:

           __timestamp     city  val
        0   2022-01-13  Chicago  6.0
        1   2022-01-13       LA  5.0
        2   2022-01-13       NY  4.0
        3   2022-01-11  Chicago  3.0
        4   2022-01-11       LA  2.0
        5   2022-01-11       NY  1.0

    2022-01-12 is absent from the input; the assertions expect three
    zero-valued rows for that day in the output. Passing a groupby tuple
    that contains an unknown column name or ``None`` must raise
    ``QueryObjectValidationError``.
    """
    cities = ["Chicago", "LA", "NY"]
    df = DataFrame(
        {
            "__timestamp": to_datetime(["2022-01-13"] * 3 + ["2022-01-11"] * 3),
            "city": cities * 2,
            "val": [6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
        }
    )
    resample_args = {
        "df": df,
        "rule": "1D",
        "method": "asfreq",
        "fill_value": 0,
        "time_column": "__timestamp",
    }

    post_df = resample(groupby_columns=("city",), **resample_args)

    assert list(post_df.columns) == ["__timestamp", "city", "val"]

    observed_days = [str(stamp.date()) for stamp in post_df["__timestamp"]]
    expected_days = (
        ["2022-01-11"] * 3 + ["2022-01-12"] * 3 + ["2022-01-13"] * 3
    )
    assert observed_days == expected_days

    assert list(post_df["val"]) == [3.0, 2.0, 1.0, 0, 0, 0, 6.0, 5.0, 4.0]

    invalid_groupbys = (
        # a column name that does not exist in the frame
        ("city", "unkonw_column"),
        # a None entry in the groupby list
        ("city", None),
    )
    for bad_columns in invalid_groupbys:
        with pytest.raises(QueryObjectValidationError):
            resample(groupby_columns=bad_columns, **resample_args)
def test_resample_after_pivot():
    """Pivot by city, resample the pivoted frame daily, then flatten it."""
    timestamps = pd.to_datetime(
        [
            "2022-01-13",
            "2022-01-13",
            "2022-01-13",
            "2022-01-11",
            "2022-01-11",
            "2022-01-11",
        ]
    )
    raw_df = pd.DataFrame(
        data={
            "__timestamp": timestamps,
            "city": ["Chicago", "LA", "NY", "Chicago", "LA", "NY"],
            "val": [6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
        }
    )

    pivot_df = pp.pivot(
        df=raw_df,
        index=["__timestamp"],
        columns=["city"],
        aggregates={"val": {"operator": "sum"}},
        flatten_columns=False,
        reset_index=False,
    )
    # Expected pivot_df:
    #
    #                 val
    # city        Chicago   LA   NY
    # __timestamp
    # 2022-01-11      3.0  2.0  1.0
    # 2022-01-13      6.0  5.0  4.0

    resample_df = pp.resample(
        df=pivot_df,
        rule="1D",
        method="asfreq",
        fill_value=0,
    )
    # Expected resample_df:
    #
    #                 val
    # city        Chicago   LA   NY
    # __timestamp
    # 2022-01-11      3.0  2.0  1.0
    # 2022-01-12      0.0  0.0  0.0
    # 2022-01-13      6.0  5.0  4.0

    flat_df = pp.flatten(resample_df)
    # Expected flat_df:
    #
    #   __timestamp  val, Chicago  val, LA  val, NY
    # 0  2022-01-11           3.0      2.0      1.0
    # 1  2022-01-12           0.0      0.0      0.0
    # 2  2022-01-13           6.0      5.0      4.0

    expected_df = pd.DataFrame(
        data={
            "__timestamp": pd.to_datetime(
                ["2022-01-11", "2022-01-12", "2022-01-13"]
            ),
            "val, Chicago": [3.0, 0, 6.0],
            "val, LA": [2.0, 0, 5.0],
            "val, NY": [1.0, 0, 4.0],
        }
    )
    assert flat_df.equals(expected_df)
def test_resample_should_not_side_effect():
    """``pp.resample`` must leave the frame it is given unchanged.

    A copy of ``timeseries_df`` is passed in, the return value is
    discarded, and the copy is then compared with the untouched fixture.
    """
    input_frame = timeseries_df.copy()

    pp.resample(df=input_frame, rule="1D", method="ffill")

    unchanged = input_frame.equals(timeseries_df)
    assert unchanged