def test_geodetic_parse():
    """Check ``geodetic_parse`` with and without an altitude output column.

    In both cases the parsed columns are compared against the matching
    columns of the ``lonlat_df`` fixture.
    """
    # parse geodetic string with altitude into lon/lat/altitude
    post_df = geodetic_parse(
        df=lonlat_df[["city", "geodetic"]],
        geodetic="geodetic",
        latitude="latitude",
        longitude="longitude",
        altitude="altitude",
    )
    assert sorted(post_df.columns.tolist()) == sorted(
        ["city", "geodetic", "latitude", "longitude", "altitude"]
    )
    for column in ("longitude", "latitude", "altitude"):
        assert series_to_list(post_df[column]) == series_to_list(lonlat_df[column])

    # parse geodetic string into lon/lat
    post_df = geodetic_parse(
        df=lonlat_df[["city", "geodetic"]],
        geodetic="geodetic",
        latitude="latitude",
        longitude="longitude",
    )
    assert sorted(post_df.columns.tolist()) == sorted(
        ["city", "geodetic", "latitude", "longitude"]
    )
    # Compare with ``==``: the form ``assert a, b`` only checks that ``a`` is
    # truthy and treats ``b`` as the failure message.
    for column in ("longitude", "latitude"):
        assert series_to_list(post_df[column]) == series_to_list(lonlat_df[column])
def test_diff():
    """Exercise ``diff`` for overwrite, aliasing, look-ahead, bad input and axis=1."""
    # overwrite column
    overwritten = diff(df=timeseries_df, columns={"y": "y"})
    assert overwritten.columns.tolist() == ["label", "y"]
    assert series_to_list(overwritten["y"]) == [None, 1.0, 1.0, 1.0]

    # add column
    aliased = diff(df=timeseries_df, columns={"y": "y1"})
    assert aliased.columns.tolist() == ["label", "y", "y1"]
    original_values = series_to_list(aliased["y"])
    assert original_values == [1.0, 2.0, 3.0, 4.0]
    diffed_values = series_to_list(aliased["y1"])
    assert diffed_values == [None, 1.0, 1.0, 1.0]

    # look ahead
    look_ahead = diff(df=timeseries_df, columns={"y": "y1"}, periods=-1)
    assert series_to_list(look_ahead["y1"]) == [-1.0, -1.0, -1.0, None]

    # invalid column reference
    with pytest.raises(QueryObjectValidationError):
        diff(df=timeseries_df, columns={"abc": "abc"})

    # diff by columns
    by_columns = diff(df=timeseries_df2, columns={"y": "y", "z": "z"}, axis=1)
    assert by_columns.columns.tolist() == ["label", "y", "z"]
    assert series_to_list(by_columns["z"]) == [0.0, 2.0, 8.0, 6.0]
def test_aggregate():
    """Aggregate ``categories_df`` by ``constant`` using a sum and two percentiles."""
    # Key order is kept as-is: the test asserts the resulting column order.
    aggregates = {
        "asc sum": {"column": "asc_idx", "operator": "sum"},
        "asc q2": {
            "column": "asc_idx",
            "operator": "percentile",
            "options": {"q": 75},
        },
        "desc q1": {
            "column": "desc_idx",
            "operator": "percentile",
            "options": {"q": 25},
        },
    }
    result_df = aggregate(
        df=categories_df,
        groupby=["constant"],
        aggregates=aggregates,
    )

    expected_columns = ["constant", "asc sum", "asc q2", "desc q1"]
    assert result_df.columns.tolist() == expected_columns

    # Only the first row of each aggregate column is checked.
    assert series_to_list(result_df["asc sum"])[0] == 5050
    assert series_to_list(result_df["asc q2"])[0] == 75
    assert series_to_list(result_df["desc q1"])[0] == 25
def test_geohash_encode():
    """Encode lon/lat into a geohash column and compare it with the fixture."""
    coordinates_df = lonlat_df[["city", "latitude", "longitude"]]
    encoded_df = geohash_encode(
        df=coordinates_df,
        latitude="latitude",
        longitude="longitude",
        geohash="geohash",
    )

    actual_columns = sorted(encoded_df.columns.tolist())
    assert actual_columns == sorted(["city", "geohash", "latitude", "longitude"])

    expected_hashes = series_to_list(lonlat_df["geohash"])
    assert series_to_list(encoded_df["geohash"]) == expected_hashes
def test_geohash_decode():
    """Decode lon/lat from geohash and compare with the fixture to 6 decimals."""
    decoded_df = geohash_decode(
        df=lonlat_df[["city", "geohash"]],
        geohash="geohash",
        latitude="latitude",
        longitude="longitude",
    )

    actual_columns = sorted(decoded_df.columns.tolist())
    assert actual_columns == sorted(["city", "geohash", "latitude", "longitude"])

    # Both sides are rounded to 6 places before comparison.
    for column in ("longitude", "latitude"):
        decoded_values = round_floats(series_to_list(decoded_df[column]), 6)
        fixture_values = round_floats(series_to_list(lonlat_df[column]), 6)
        assert decoded_values == fixture_values
def test_compare():
    """Run ``compare`` on columns ``y`` and ``z`` for each comparison type."""

    def run_compare(compare_type, **extra):
        # Fresh list arguments are built for every call, as in separate calls.
        return compare(
            df=timeseries_df2,
            source_columns=["y"],
            compare_columns=["z"],
            compare_type=compare_type,
            **extra,
        )

    # `difference` comparison
    difference_df = run_compare("difference")
    assert difference_df.columns.tolist() == ["label", "y", "z", "difference__y__z"]
    difference_values = series_to_list(difference_df["difference__y__z"])
    assert difference_values == [0.0, -2.0, -8.0, -6.0]

    # drop original columns
    dropped_df = run_compare("difference", drop_original_columns=True)
    assert dropped_df.columns.tolist() == ["label", "difference__y__z"]

    # `percentage` comparison
    percentage_df = run_compare("percentage")
    assert percentage_df.columns.tolist() == ["label", "y", "z", "percentage__y__z"]
    percentage_values = series_to_list(percentage_df["percentage__y__z"])
    assert percentage_values == [0.0, -0.5, -0.8, -0.75]

    # `ratio` comparison
    ratio_df = run_compare("ratio")
    assert ratio_df.columns.tolist() == ["label", "y", "z", "ratio__y__z"]
    assert series_to_list(ratio_df["ratio__y__z"]) == [1.0, 0.5, 0.2, 0.25]
def test_cum():
    """Exercise ``pp.cum`` with sum, prod and min, plus an invalid operator."""
    # create new column (cumsum)
    summed_df = pp.cum(df=timeseries_df, columns={"y": "y2"}, operator="sum")
    assert summed_df.columns.tolist() == ["label", "y", "y2"]
    expected_by_column = {
        "label": ["x", "y", "z", "q"],
        "y": [1.0, 2.0, 3.0, 4.0],
        "y2": [1.0, 3.0, 6.0, 10.0],
    }
    for column, expected in expected_by_column.items():
        assert series_to_list(summed_df[column]) == expected

    # overwrite column (cumprod, then cummin)
    overwrite_cases = (
        ("prod", [1.0, 2.0, 6.0, 24.0]),
        ("min", [1.0, 1.0, 1.0, 1.0]),
    )
    for operator, expected in overwrite_cases:
        overwritten_df = pp.cum(
            df=timeseries_df,
            columns={"y": "y"},
            operator=operator,
        )
        assert overwritten_df.columns.tolist() == ["label", "y"]
        assert series_to_list(overwritten_df["y"]) == expected

    # invalid operator
    with pytest.raises(InvalidPostProcessingError):
        pp.cum(df=timeseries_df, columns={"y": "y"}, operator="abc")
def test_rolling():
    """Exercise ``pp.rolling`` for sum, mean, count, quantile and invalid input."""
    # sum rolling type
    sum_df = pp.rolling(
        df=timeseries_df,
        columns={"y": "y"},
        rolling_type="sum",
        window=2,
        min_periods=0,
    )
    assert sum_df.columns.tolist() == ["label", "y"]
    assert series_to_list(sum_df["y"]) == [1.0, 3.0, 5.0, 7.0]

    # mean rolling type with alias
    mean_df = pp.rolling(
        df=timeseries_df,
        rolling_type="mean",
        columns={"y": "y_mean"},
        window=10,
        min_periods=0,
    )
    assert mean_df.columns.tolist() == ["label", "y", "y_mean"]
    assert series_to_list(mean_df["y_mean"]) == [1.0, 1.5, 2.0, 2.5]

    # count rolling type
    count_df = pp.rolling(
        df=timeseries_df,
        rolling_type="count",
        columns={"y": "y"},
        window=10,
        min_periods=0,
    )
    assert count_df.columns.tolist() == ["label", "y"]
    assert series_to_list(count_df["y"]) == [1.0, 2.0, 3.0, 4.0]

    # quantile rolling type
    quantile_options = {"quantile": 0.25}
    quantile_df = pp.rolling(
        df=timeseries_df,
        columns={"y": "q1"},
        rolling_type="quantile",
        rolling_type_options=quantile_options,
        window=10,
        min_periods=0,
    )
    assert quantile_df.columns.tolist() == ["label", "y", "q1"]
    assert series_to_list(quantile_df["q1"]) == [1.0, 1.25, 1.5, 1.75]

    # incorrect rolling type
    with pytest.raises(InvalidPostProcessingError):
        pp.rolling(
            df=timeseries_df,
            columns={"y": "y"},
            rolling_type="abc",
            window=2,
        )

    # incorrect rolling type options
    with pytest.raises(InvalidPostProcessingError):
        pp.rolling(
            df=timeseries_df,
            columns={"y": "y"},
            rolling_type="quantile",
            rolling_type_options={"abc": 123},
            window=2,
        )
def test_sort():
    """Sort ``categories_df`` and check one value plus the unknown-column error.

    NOTE(review): the visible portion of this file defines ``test_sort`` a
    second time further down, expecting ``InvalidPostProcessingError``. A
    later definition with the same name replaces this one at import time, so
    this version would not be collected - confirm which one is intended.
    """
    sort_spec = {"category": True, "asc_idx": False}
    sorted_df = sort(df=categories_df, columns=sort_spec)

    asc_idx_values = series_to_list(sorted_df["asc_idx"])
    assert asc_idx_values[1] == 96

    # A column name that is not in the frame must be rejected.
    with pytest.raises(QueryObjectValidationError):
        sort(df=sorted_df, columns={"abc": True})
def test_sort():
    """Sort ``categories_df`` and check one value plus the unknown-column error.

    NOTE(review): the visible portion of this file also contains an earlier
    ``test_sort`` that expects ``QueryObjectValidationError``; this definition
    replaces it at import time - confirm the duplicate is intentional.
    """
    sort_spec = {"category": True, "asc_idx": False}
    sorted_df = sort(df=categories_df, columns=sort_spec)

    asc_idx_values = series_to_list(sorted_df["asc_idx"])
    assert asc_idx_values[1] == 96

    # A column name that is not in the frame must be rejected.
    with pytest.raises(InvalidPostProcessingError):
        sort(df=sorted_df, columns={"abc": True})