def test_cast_inner() -> None:
    """Check that casting a List series changes its inner dtype."""
    nested = pl.Series([[1, 2]])
    # both the python ``bool`` type and ``pl.Boolean`` are accepted as inner type
    for inner in (bool, pl.Boolean):
        casted = nested.cast(pl.List(inner))
        assert casted.dtype == pl.List(pl.Boolean)
        assert casted.to_list() == [[True, True]]

    # this creates an inner null type
    empty_lists = pd.DataFrame(data=[[[]], [[]]], columns=["A"])
    frame = pl.from_pandas(empty_lists)
    assert frame["A"].cast(pl.List(int)).dtype.inner == pl.Int64  # type: ignore[arg-type, attr-defined]
def test_init_only_columns() -> None:
    """Check DataFrame construction when only column names/dtypes are given."""
    # Column names only: three empty Float32 columns
    names_only = pl.DataFrame(columns=["a", "b", "c"])
    expected = pl.DataFrame({"a": [], "b": [], "c": []})
    assert names_only.shape == (0, 3)
    assert names_only.frame_equal(expected, null_equal=True)
    assert names_only.dtypes == [pl.Float32] * 3

    # Validate construction with various flavours of no/empty data
    empty_input: Any
    for empty_input in (None, {}, []):
        typed = pl.DataFrame(
            data=empty_input,
            columns=[  # type: ignore[arg-type]
                ("a", pl.Date),
                ("b", pl.UInt64),
                ("c", pl.datatypes.Int8),
                ("d", pl.List(pl.UInt8)),
            ],
        )

        # Build the reference frame by casting empty columns, then
        # inserting the List column at position 3.
        casts = [
            pl.col("a").cast(pl.Date),
            pl.col("b").cast(pl.UInt64),
            pl.col("c").cast(pl.Int8),
        ]
        expected = pl.DataFrame({"a": [], "b": [], "c": []}).with_columns(casts)
        expected.insert_at_idx(3, pl.Series("d", [], pl.List(pl.UInt8)))

        assert typed.shape == (0, 4)
        assert typed.frame_equal(expected, null_equal=True)
        assert typed.dtypes == [pl.Date, pl.UInt64, pl.Int8, pl.List]
        assert typed.schema["d"].inner == pl.UInt8  # type: ignore[attr-defined]

        # cleared() must keep both schema and shape of the (empty) frame
        emptied = typed.cleared()
        assert typed.schema == emptied.schema
        assert emptied.shape == typed.shape
def recursive_logical_type() -> None:
    """Round-trip a List(Categorical) column through parquet using pyarrow.

    NOTE(review): unlike the other functions in this file, the name has no
    ``test_`` prefix, so a default pytest configuration would presumably
    not collect it -- confirm whether that is intentional.
    """
    frame = pl.DataFrame(
        {
            "str": ["A", "B", "A", "B", "C"],
            "group": [1, 1, 2, 1, 2],
        }
    ).with_column(pl.col("str").cast(pl.Categorical))

    # Aggregate the categorical values of each group into a list column
    cat_lists = pl.col("str").list().alias("cat_list")
    grouped = frame.groupby("group").agg([cat_lists])

    # Write to an in-memory buffer and read it back from the start
    buffer = io.BytesIO()
    grouped.write_parquet(buffer, use_pyarrow=True)
    buffer.seek(0)
    roundtrip = pl.read_parquet(buffer, use_pyarrow=True)

    assert roundtrip.dtypes == [pl.Int64, pl.List(pl.Categorical)]
    assert roundtrip.shape == (2, 2)
def test_dtype() -> None:
    """Check List dtypes, both inferred from data and explicitly declared."""
    # inferred
    series = pl.Series("a", [[1, 2, 3], [2, 5], [6, 7, 8, 9]])
    assert series.dtype == pl.List
    assert series.inner_dtype == pl.Int64
    assert series.dtype.inner == pl.Int64  # type: ignore[attr-defined]

    # explicit
    tm_value = time(10, 30, 45)
    dt_value = date(2022, 12, 31)
    dtm_value = datetime(2022, 12, 31, 1, 2, 3)

    frame = pl.DataFrame(
        data={
            "i": [[1, 2, 3]],
            "tm": [[tm_value]],
            "dt": [[dt_value]],
            "dtm": [[dtm_value]],
        },
        columns=[
            ("i", pl.List(pl.Int8)),
            ("tm", pl.List(pl.Time)),
            ("dt", pl.List(pl.Date)),
            ("dtm", pl.List(pl.Datetime)),
        ],
    )

    expected_schema = {
        "i": pl.List(pl.Int8),
        "tm": pl.List(pl.Time),
        "dt": pl.List(pl.Date),
        "dtm": pl.List(pl.Datetime),
    }
    assert frame.schema == expected_schema
    assert frame.schema["i"].inner == pl.Int8  # type: ignore[attr-defined]

    # a single row holding one list per column
    expected_row = ([1, 2, 3], [tm_value], [dt_value], [dtm_value])
    assert frame.rows() == [expected_row]
def test_list_hash() -> None:
    """Check that hashing a List column gives UInt64 and equal lists hash equal."""
    frame = pl.DataFrame({"a": [[1, 2, 3], [3, 4], [1, 2, 3]]})
    hashed = frame.with_column(pl.col("a").hash().alias("b"))

    assert hashed.dtypes == [pl.List(pl.Int64), pl.UInt64]
    # rows 0 and 2 hold the same list, so their hashes must match
    assert hashed[0, "b"] == hashed[2, "b"]
def test_init_dict() -> None:
    """Check DataFrame construction from dictionaries in various forms."""
    # Empty dictionary
    frame = pl.DataFrame({})
    assert frame.shape == (0, 0)

    # Empty dictionary/values
    frame = pl.DataFrame({"a": [], "b": []})
    assert frame.shape == (0, 2)
    assert frame.schema == {"a": pl.Float32, "b": pl.Float32}

    # Empty data combined with an explicit column -> dtype mapping
    typed_empty_frames = (
        pl.DataFrame({}, columns={"a": pl.Date, "b": pl.Utf8}),
        pl.DataFrame({"a": [], "b": []}, columns={"a": pl.Date, "b": pl.Utf8}),
    )
    for frame in typed_empty_frames:
        assert frame.shape == (0, 2)
        assert frame.schema == {"a": pl.Date, "b": pl.Utf8}

    # List of empty list/tuple
    frame = pl.DataFrame({"a": [[]], "b": [()]})
    assert frame.schema == {"a": pl.List(pl.Float64), "b": pl.List(pl.Float64)}
    assert frame.rows() == [([], [])]

    # Mixed dtypes
    frame = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    assert frame.shape == (3, 2)
    assert frame.columns == ["a", "b"]
    assert frame.dtypes == [pl.Int64, pl.Float64]

    frame = pl.DataFrame(
        data={"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]},
        columns=[("a", pl.Int8), ("b", pl.Float32)],
    )
    assert frame.schema == {"a": pl.Int8, "b": pl.Float32}

    # Values contained in tuples
    frame = pl.DataFrame({"a": (1, 2, 3), "b": [1.0, 2.0, 3.0]})
    assert frame.shape == (3, 2)

    # Datetime/Date types (from both python and integer values)
    py_datetimes = (
        datetime(2022, 12, 31, 23, 59, 59),
        datetime(2022, 12, 31, 23, 59, 59),
    )
    py_dates = (date(2022, 12, 31), date(2022, 12, 31))
    int_datetimes = [1672531199000000, 1672531199000000]
    int_dates = [19357, 19357]

    # test inferred and explicit (given both py/polars dtypes)
    scenarios = (
        (py_dates, py_datetimes, None),
        (py_dates, py_datetimes, [("dt", date), ("dtm", datetime)]),
        (py_dates, py_datetimes, [("dt", pl.Date), ("dtm", pl.Datetime)]),
        (int_dates, int_datetimes, [("dt", date), ("dtm", datetime)]),
        (int_dates, int_datetimes, [("dt", pl.Date), ("dtm", pl.Datetime)]),
    )
    for date_values, datetime_values, column_defs in scenarios:
        frame = pl.DataFrame(
            data={"dt": date_values, "dtm": datetime_values},
            columns=column_defs,
        )
        assert frame.schema == {"dt": pl.Date, "dtm": pl.Datetime}
        # rows always come back as python date/datetime objects
        assert frame.rows() == list(zip(py_dates, py_datetimes))

    # Overriding dict column names/types
    frame = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, columns=["c", "d"])
    assert frame.columns == ["c", "d"]

    # partial type info (allowed, but mypy doesn't like it ;p)
    frame = pl.DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6]},
        columns=["c", ("d", pl.Int8)],  # type: ignore[arg-type]
    )
    assert frame.schema == {"c": pl.Int64, "d": pl.Int8}

    frame = pl.DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6]},
        columns=[("c", pl.Int8), ("d", pl.Int16)],
    )
    assert frame.schema == {"c": pl.Int8, "d": pl.Int16}

    # cleared() keeps the schema but drops all rows
    emptied = frame.cleared()
    assert frame.schema == emptied.schema
    assert len(emptied) == 0