def test_arrow():
    """Convert a Series to a pyarrow array and read an arrow array back."""
    # Series -> arrow: the result must equal the directly built arrow array.
    expected = pa.array([1, 2, 3, None])
    assert Series("a", [1, 2, 3, None]).to_arrow() == expected

    # A dictionary-typed arrow array of strings must load with dtype Utf8.
    dictionary_type = pa.dictionary(pa.int32(), pa.utf8())
    encoded = pa.array(["foo", "bar"], dictionary_type)
    assert pl.Series("a", encoded).dtype == pl.Utf8
def test_rechunk():
    """Appending yields two chunks; rechunk brings the count back to one."""
    series = Series("a", [1, 2, 3])
    series.append(Series("b", [4, 5, 6]))
    assert series.n_chunks() == 2

    # in_place=False: the returned Series reports a single chunk.
    rechunked = series.rechunk(in_place=False)
    assert rechunked.n_chunks() == 1

    # in_place=True: the Series itself reports a single chunk afterwards.
    series.rechunk(in_place=True)
    assert series.n_chunks() == 1
def test_ufunc():
    """NumPy ufuncs applied to a Series return a Series and keep nulls."""
    a = Series("a", [1.0, 2.0, 3.0, 4.0])
    b = np.multiply(a, 4)
    assert isinstance(b, Series)
    # Compare the materialised values: `b == [...]` produces a Series, and
    # asserting on that object does not verify the individual elements.
    assert b.to_list() == [4, 8, 12, 16]

    # test if null bitmask is preserved
    a = Series("a", [1.0, None, 3.0], nullable=True)
    b = np.exp(a)
    assert b.null_count() == 1
def test_strategy_shape(
    df1: pl.DataFrame, df2: pl.DataFrame, s1: pl.Series, s2: pl.Series
) -> None:
    """Check the dimensions and names of the supplied frames and series."""
    # df1 must be exactly 5x5 with the default column names.
    assert df1.shape == (5, 5)
    assert df1.columns == ["col0", "col1", "col2", "col3", "col4"]

    # df2 only has to fall inside the allowed width/height ranges.
    n_columns = len(df2.columns)
    assert 2 <= n_columns <= 5
    n_rows = len(df2)
    assert 3 <= n_rows <= 8

    # s1 has a fixed length, s2 a bounded one.
    assert s1.len() == 5
    s2_length = s2.len()
    assert 3 <= s2_length <= 8

    assert s1.name == ""
    assert s2.name == "col"
def test_cast():
    """Each dedicated cast_* method must report the matching dtype string."""
    a = Series("a", range(20))
    casts = (
        (a.cast_f32, "f32"),
        (a.cast_f64, "f64"),
        (a.cast_i32, "i32"),
        (a.cast_u32, "u32"),
        (a.cast_date64, "date64"),
        (a.cast_time64ns, "time64(ns)"),
        (a.cast_date32, "date32"),
    )
    for cast, dtype_name in casts:
        assert cast().dtype == dtype_name
def test_df_fold():
    """Fold the columns of a DataFrame into a single Series."""

    def add(left, right):
        return left + right

    def pick_smaller(left, right):
        return left.zip_with(left < right, right)

    numeric = DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})
    # Adding the columns together gives the per-row totals.
    totals = numeric.fold(add)
    assert totals.series_equal(Series("a", [4.0, 5.0, 9.0]))
    # The expected values here are the per-row minima.
    minima = numeric.fold(pick_smaller)
    assert minima.series_equal(Series("a", [1.0, 1.0, 3.0]))

    mixed = DataFrame({
        "a": ["foo", "bar", "2"],
        "b": [1, 2, 3],
        "c": [1.0, 2.0, 3.0],
    })
    out = mixed.fold(add)
    # NOTE(review): the comparison result is discarded, so this only checks
    # that the calls run. Confirm the expected strings before asserting.
    out.series_equal(Series("", ["foo11", "bar22", "233"]))
def test_filter():
    """Indexing with a comparison mask keeps the expected number of rows."""
    a = Series("a", range(20))
    # (mask, number of rows expected to survive the filter)
    cases = [
        (a > 1, 18),
        (a < 1, 1),
        (a <= 1, 2),
        (a >= 1, 19),
        (a == 1, 1),
        (a != 1, 19),
    ]
    for mask, expected_len in cases:
        assert a[mask].len() == expected_len
def test_replace():
    """Replacing column "a" leaves the new Series "c" in its place."""
    df = DataFrame({"a": [2, 1, 3], "b": [1, 2, 3]})
    replacement = Series("c", [True, False, True])
    df.replace("a", replacement)

    expected = DataFrame({"c": [True, False, True], "b": [1, 2, 3]})
    assert df.frame_equal(expected)
def test_join():
    """Inner, left and outer joins of two frames on the string key "a"."""
    df_left = DataFrame(
        {"a": ["a", "b", "a", "z"], "b": [1, 2, 3, 4], "c": [6, 5, 4, 3]}
    )
    df_right = DataFrame(
        {"a": ["b", "c", "b", "a"], "k": [0, 3, 9, 6], "c": [1, 0, 2, 1]}
    )
    # Every join below uses the same key on both sides.
    on_a = {"left_on": "a", "right_on": "a"}

    inner = df_left.join(df_right, **on_a).sort("a")
    assert inner["b"].series_equal(Series("", [1, 3, 2, 2]))

    left = df_left.join(df_right, how="left", **on_a).sort("a")
    assert left["c_right"].is_null().sum() == 1
    assert left["b"].series_equal(Series("", [1, 3, 2, 2, 4]))

    outer = df_left.join(df_right, how="outer", **on_a).sort("a")
    assert outer["c_right"].null_count() == 1
    assert outer["c"].null_count() == 2
    assert outer["b"].null_count() == 2
def test_to_pandas():
    """Smoke test: arrow and pandas conversions run, also after a shift."""
    complete = get_complete_df()
    complete.to_arrow()
    complete.to_pandas()
    # test shifted df
    complete.shift(2).to_pandas()

    bool_frame = DataFrame({"col": Series([True, False, True])})
    bool_frame.shift(2).to_pandas()
def test_df_fold():
    """Fold columns into one Series and dispatch the axis=1 aggregations."""

    def add(left, right):
        return left + right

    def pick_smaller(left, right):
        return left.zip_with(left < right, right)

    numeric = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})
    # Adding the columns together gives the per-row totals.
    totals = numeric.fold(add)
    assert totals.series_equal(Series("a", [4.0, 5.0, 9.0]))
    # The expected values here are the per-row minima.
    minima = numeric.fold(pick_smaller)
    assert minima.series_equal(Series("a", [1.0, 1.0, 3.0]))

    mixed = pl.DataFrame(
        {"a": ["foo", "bar", "2"], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}
    )
    out = mixed.fold(add)
    # NOTE(review): the comparison result is discarded, so this only checks
    # that the calls run. Confirm the expected strings before asserting.
    out.series_equal(Series("", ["foo11", "bar22", "233"]))

    df = pl.DataFrame({"a": [3, 2, 1], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})
    # just check dispatch. values are tested on rust side.
    for aggregate in (df.sum, df.mean, df.min, df.max):
        assert aggregate(axis=1).shape == (3, 1)
def test_to_python():
    """to_list() must give a plain list, with a null coming back as None."""
    as_list = Series("a", range(20)).to_list()
    assert isinstance(as_list, list)
    assert len(as_list) == 20

    with_null = Series("a", [1, None, 2], nullable=True)
    assert with_null.null_count() == 1
    assert with_null.to_list() == [1, None, 2]
def test_downsample():
    """Downsample 20 evenly spaced Date64 values with rule="minute", n=5."""
    # 20 values spaced 60000 apart; presumably epoch milliseconds, i.e. one
    # sample per minute, given the cast to Date64.
    first_value = 946684800000
    spacing = 60000
    timestamps = [first_value + spacing * i for i in range(20)]
    s = Series("datetime", timestamps).cast(Date64)
    df = DataFrame({"a": s, "b": s.clone()})

    firsts = df.downsample("a", rule="minute", n=5).first()
    assert firsts.shape == (4, 2)

    # OLHC
    aggregations = {"b": ["first", "min", "max", "last"]}
    olhc = df.downsample("a", rule="minute", n=5).agg(aggregations)
    assert olhc.shape == (4, 5)

    # test to_pandas as well.
    as_pandas = df.to_pandas()
    assert as_pandas["a"].dtype == "datetime64[ns]"
def test_apply():
    """Series.apply maps a callable over the values; nulls stay null.

    Results are compared through ``to_list()``: ``b == [...]`` produces a
    Series, and asserting on that object does not verify the elements.
    """
    a = Series("a", [1, 2, None], nullable=True)
    b = a.apply(lambda x: x**2)
    assert b.to_list() == [1, 4, None]

    a = Series("a", ["foo", "bar", None], nullable=True)
    b = a.apply(lambda x: x + "py")
    assert b.to_list() == ["foopy", "barpy", None]

    # Same callable with and without an explicit output dtype.
    b = a.apply(lambda x: len(x), dtype_out=Int32)
    assert b.to_list() == [3, 3, None]

    b = a.apply(lambda x: len(x))
    assert b.to_list() == [3, 3, None]
def test_join():
    """Joins on one key (inner/left/outer), on several keys, and lazily."""
    df_left = DataFrame(
        {"a": ["a", "b", "a", "z"], "b": [1, 2, 3, 4], "c": [6, 5, 4, 3]}
    )
    df_right = DataFrame(
        {"a": ["b", "c", "b", "a"], "k": [0, 3, 9, 6], "c": [1, 0, 2, 1]}
    )
    # The single-key joins below use the same key on both sides.
    on_a = {"left_on": "a", "right_on": "a"}

    inner = df_left.join(df_right, **on_a).sort("a")
    assert inner["b"].series_equal(Series("", [1, 3, 2, 2]))

    left = df_left.join(df_right, how="left", **on_a).sort("a")
    assert left["c_right"].is_null().sum() == 1
    assert left["b"].series_equal(Series("", [1, 3, 2, 2, 4]))

    outer = df_left.join(df_right, how="outer", **on_a).sort("a")
    assert outer["c_right"].null_count() == 1
    assert outer["c"].null_count() == 2
    assert outer["b"].null_count() == 2

    df_a = DataFrame({"a": [1, 2, 1, 1], "b": ["a", "b", "c", "c"]})
    df_b = DataFrame(
        {
            "foo": [1, 1, 1],
            "bar": ["a", "c", "c"],
            "ham": ["let", "var", "const"],
        }
    )

    # just check if join on multiple columns runs
    df_a.join(df_b, left_on=["a", "b"], right_on=["foo", "bar"])

    # The lazy join must produce a frame of the same shape as the eager one.
    eager_join = df_a.join(df_b, left_on="a", right_on="foo")
    lazy_query = df_a.lazy().join(df_b.lazy(), left_on="a", right_on="foo")
    lazy_join = lazy_query.collect()
    assert lazy_join.shape == eager_join.shape
def test_custom_groupby():
    """Apply custom Python callables in eager and lazy groupby contexts."""
    df = DataFrame({"A": ["a", "a", "c", "c"], "B": [1, 3, 5, 2]})

    # Callable returning the result of Series.sum().
    summed = df.groupby("A").select("B").apply(lambda x: x.sum())
    assert summed.shape == (2, 2)

    # Callable returning a new Series built from a numpy array.
    rebuilt = df.groupby("A").select("B").apply(
        lambda x: Series("", np.array(x))
    )
    assert rebuilt.shape == (2, 2)

    df = DataFrame({"a": [1, 2, 1, 1], "b": ["a", "b", "c", "c"]})
    sum_of_a = col("a").apply(lambda x: x.sum(), dtype_out=int)
    out = df.lazy().groupby("b").agg([sum_of_a]).collect()
    assert out.shape == (3, 2)
def test_series_slice(
    srs: pl.Series,
    start: int | None,
    stop: int | None,
    step: int | None,
) -> None:
    """Slicing a Series must agree with slicing the equivalent Python list."""
    window = slice(start, stop, step)
    # Reference result from the plain list, then the same slice on the Series.
    expected = srs.to_list()[window]
    actual = srs[window].to_list()
    assert expected == actual, f"slice [{start}:{stop}:{step}] failed"
    # NOTE(review): this compares the Series with itself; presumably meant
    # to show that slicing left `srs` intact. Confirm the intent.
    assert_series_equal(srs, srs, check_exact=True)
def test_cast():
    """Casting to a dtype must yield a Series reporting that dtype."""
    a = Series("a", range(20))
    for target in (Float32, Float64, Int32, UInt32, Date64, Date32):
        assert a.cast(target).dtype == target
def verify_series_and_expr_api(
    input: pl.Series, expected: pl.Series, op: str, *args: Any, **kwargs: Any
) -> None:
    """
    Check that ``op`` gives ``expected`` through both the expression API
    and the Series API.

    ``op`` is resolved with ``_getattr_multi`` on ``pl.col("*")`` and on
    ``input``; ``*args`` and ``**kwargs`` are forwarded to both calls.

    Examples
    --------
    >>> s = pl.Series([1, 3, 2])
    >>> expected = pl.Series([1, 2, 3])
    >>> verify_series_and_expr_api(s, expected, "sort")
    """
    # Expression API: build the expression, select it on the frame made from
    # `input`, and take the first column of the result.
    expr_method = _getattr_multi(pl.col("*"), op)
    expr = expr_method(*args, **kwargs)
    result_expr: pl.Series = input.to_frame().select(expr)[:, 0]  # type: ignore

    # Series API: call the same operation directly on the input.
    series_method = _getattr_multi(input, op)
    result_series = series_method(*args, **kwargs)

    testing.assert_series_equal(result_expr, expected)
    testing.assert_series_equal(result_series, expected)
def test_equality():
    """Comparison operators return boolean Series; check their true counts."""
    a = create_series()
    b = a  # same object, so every element compares equal to itself

    same = a == b
    assert isinstance(same, Series)
    assert same.sum() == 2

    # (comparison result, expected number of true entries)
    true_counts = [
        (a != b, 0),
        (a >= b, 2),
        (a <= b, 2),
        (a > b, 0),
        (a < b, 0),
    ]
    for mask, expected in true_counts:
        assert mask.sum() == expected

    assert a.sum() == 3
    assert a.series_equal(b)

    # Comparing a string Series against a scalar string.
    names = Series("name", ["ham", "foo", "bar"])
    is_ham = names == "ham"
    assert is_ham.to_list() == [True, False, False]
def test_strategy_null_probability(
    s: pl.Series,
    df1: pl.DataFrame,
    df2: pl.DataFrame,
    df3: pl.DataFrame,
) -> None:
    """Check that null counts increase from ``s`` through ``df1``..``df3``."""
    for obj in (s, df1, df2, df3):
        assert len(obj) == 50  # type: ignore[arg-type]

    # Total number of nulls in each frame, summed over its columns.
    df1_nulls = df1.null_count().fold(sum).sum()
    df2_nulls = df2.null_count().fold(sum).sum()
    df3_nulls = df3.null_count().fold(sum).sum()
    assert s.null_count() < df1_nulls
    assert df1_nulls < df2_nulls
    assert df2_nulls < df3_nulls

    # df2: the first column has more nulls than the second, but not all rows.
    nulls_col0, nulls_col1 = df2.null_count().rows()[0]
    assert nulls_col0 > nulls_col1
    assert nulls_col0 < 50

    # df3: the first column is null in every one of the 50 rows.
    nulls_col0, nulls_colx = df3.null_count().rows()[0]
    assert nulls_col0 > nulls_colx
    assert nulls_col0 == 50
def test_hstack():
    """In-place hstack adds the new Series as a third column."""
    df = DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"]})
    extra_columns = [Series("stacked", [-1, -1, -1])]
    df.hstack(extra_columns, in_place=True)

    assert df.shape == (3, 3)
    assert df.columns == ["a", "b", "stacked"]
def test_groupby():
    """Eager groupby: aggregations, apply, groups, iteration and get_group."""
    df = DataFrame(
        {
            "a": ["a", "b", "a", "b", "b", "c"],
            "b": [1, 2, 3, 4, 5, 6],
            "c": [6, 5, 4, 3, 2, 1],
        }
    )

    def expected(values):
        # Expected frames hold the group key plus an unnamed value column.
        return DataFrame({"a": ["a", "b", "c"], "": values})

    # use __getitem__ to map to select
    via_getitem = df.groupby("a")["b"].sum().sort(by="a")
    assert via_getitem.frame_equal(expected([4, 11, 6]))

    b_sum = df.groupby("a").select("b").sum().sort(by="a")
    assert b_sum.frame_equal(expected([4, 11, 6]))

    c_sum = df.groupby("a").select("c").sum().sort(by="a")
    assert c_sum.frame_equal(expected([10, 10, 1]))

    b_min = df.groupby("a").select("b").min().sort(by="a")
    assert b_min.frame_equal(expected([1, 2, 6]))

    b_max = df.groupby("a").select("b").max().sort(by="a")
    assert b_max.frame_equal(expected([3, 5, 6]))

    b_mean = df.groupby("a").select("b").mean().sort(by="a")
    assert b_mean.frame_equal(expected([2.0, (2 + 4 + 5) / 3, 6.0]))

    b_last = df.groupby("a").select("b").last().sort(by="a")
    assert b_last.frame_equal(expected([3, 5, 6]))

    # check if it runs
    df.groupby("a").select("b").n_unique()
    df.groupby("a").select("b").quantile(0.3)
    df.groupby("a").select("b").agg_list()

    gb_df = df.groupby("a").agg({"b": ["sum", "min"], "c": "count"})
    assert "b_sum" in gb_df.columns
    assert "b_min" in gb_df.columns

    # TODO: is false because count is u32
    # df.groupby(by="a", select="b", agg="count").frame_equal(
    #     DataFrame({"a": ["a", "b", "c"], "": [2, 3, 1]})
    # )

    # Custom apply per group: the smallest per-group sum of "c" is 1.
    c_sums = df.groupby("a").apply(lambda group: group[["c"]].sum()).sort("c")
    assert c_sums["c"][0] == 1

    group_keys = df.groupby("a").groups().sort("a")["a"]
    assert group_keys.series_equal(Series(["a", "b", "c"]))

    # Iterating a groupby yields sub-frames; only group "b" is checked.
    for subdf in df.groupby("a"):
        if subdf["a"][0] != "b":
            continue
        assert subdf.shape == (3, 3)

    # (group key, expected number of rows)
    for key, n_rows in (("c", 1), ("b", 3), ("a", 2)):
        assert df.groupby("a").get_group(key).shape == (n_rows, 3)

    # Use lazy API in eager groupby
    lazy_agg = df.groupby("a").agg([pl.sum("b")])
    assert lazy_agg.shape == (3, 2)
def create_series() -> "Series":
    """Build the Series named "a" holding [1, 2] that several tests share."""
    values = [1, 2]
    return Series("a", values)
def test_various():
    """Exercise assorted Series methods: naming, slicing, sorting, taking."""
    a = create_series()
    assert a.is_null().sum() == 0

    # rename() changes the name on the Series itself.
    assert a.name == "a"
    a.rename("b")
    assert a.name == "b"

    assert a.len() == 2
    assert len(a) == 2

    # slice(1, 1) gives the single element [2]; appending it extends `a`.
    b = a.slice(1, 1)
    assert b.len() == 1
    assert b.series_equal(Series("", [2]))
    a.append(b)
    assert a.series_equal(Series("", [1, 2, 2]))

    a = Series("a", range(20))
    assert a.head(5).len() == 5
    assert a.tail(5).len() == 5
    # Compare the materialised values: `head != tail` produces a Series, and
    # asserting on that object does not verify that the contents differ.
    assert a.head(5).to_list() != a.tail(5).to_list()

    a = Series("a", [2, 1, 4])
    a.sort(in_place=True)
    assert a.series_equal(Series("", [1, 2, 4]))

    a = Series("a", [2, 1, 1, 4, 4, 4])
    # Expected indices are those of the first occurrence of each value.
    assert a.arg_unique().to_list() == [0, 1, 3]
    assert a.take([2, 3]).series_equal(Series("", [1, 4]))

    assert a.is_numeric()
    a = Series("bool", [True, False])
    assert not a.is_numeric()
def test_shape():
    """A three-element Series reports the one-dimensional shape (3,)."""
    values = [1, 2, 3]
    assert Series(values).shape == (3,)
def test_quantile():
    """The 0.5 quantile of [1, 2, 3] is 2."""
    values = [1, 2, 3]
    assert Series(values).quantile(0.5) == 2
def test_median():
    """The median of [1, 2, 3] is 2."""
    values = [1, 2, 3]
    assert Series(values).median() == 2
def test_object():
    """Values of mixed Python types are stored with the Object dtype."""
    mixed_values = [[12], "foo", 9]
    series = Series("a", mixed_values)

    assert series.dtype == Object
    # The original Python objects come back both in bulk and by index.
    assert series.to_list() == mixed_values
    assert series[1] == "foo"
def test_rolling():
    """Rolling min/max/sum with a window of 2; the first slot is null.

    Results are compared through ``to_list()``: ``series == [...]`` produces
    a Series, and asserting on that object does not verify the elements.
    """
    a = Series("a", [1, 2, 3, 2, 1])
    assert a.rolling_min(2).to_list() == [None, 1, 2, 2, 1]
    assert a.rolling_max(2).to_list() == [None, 2, 3, 3, 2]
    assert a.rolling_sum(2).to_list() == [None, 3, 5, 5, 3]