def test_init_series() -> None:
    """Check DataFrame construction from Series inputs.

    Covers lists and tuples of named Series, lists of unnamed Series, and a
    single Series, each with and without a ``columns`` override.
    """
    # --- list of named Series ---------------------------------------------
    from_list = pl.DataFrame(
        [pl.Series("a", [1, 2, 3]), pl.Series("b", [4, 5, 6])]
    )
    expected = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    assert from_list.frame_equal(expected)

    # --- tuple of named Series (same expected frame) ----------------------
    from_tuple = pl.DataFrame(
        (pl.Series("a", (1, 2, 3)), pl.Series("b", (4, 5, 6)))
    )
    assert from_tuple.frame_equal(expected)

    # ``columns`` given as (name, dtype) pairs renames and casts the inputs.
    recast = pl.DataFrame(
        (pl.Series("a", (1, 2, 3)), pl.Series("b", (4, 5, 6))),
        columns=[("x", pl.Float64), ("y", pl.Float64)],
    )
    assert recast.schema == {"x": pl.Float64, "y": pl.Float64}
    assert recast.rows() == [(1.0, 4.0), (2.0, 5.0), (3.0, 6.0)]

    # --- list of unnamed Series -------------------------------------------
    unnamed = pl.DataFrame([pl.Series([1, 2, 3]), pl.Series([4, 5, 6])])
    expected = pl.DataFrame(
        [pl.Series("column_0", [1, 2, 3]), pl.Series("column_1", [4, 5, 6])]
    )
    assert unnamed.frame_equal(expected)

    unnamed_floats = pl.DataFrame([pl.Series([0.0]), pl.Series([1.0])])
    assert unnamed_floats.schema == {
        "column_0": pl.Float64,
        "column_1": pl.Float64,
    }
    assert unnamed_floats.rows() == [(0.0, 1.0)]

    unnamed_recast = pl.DataFrame(
        [pl.Series([None]), pl.Series([1.0])],
        columns=[("x", pl.Date), ("y", pl.Boolean)],
    )
    assert unnamed_recast.schema == {"x": pl.Date, "y": pl.Boolean}
    assert unnamed_recast.rows() == [(None, True)]

    # --- single Series ----------------------------------------------------
    single = pl.DataFrame(pl.Series("a", [1, 2, 3]))
    expected = pl.DataFrame({"a": [1, 2, 3]})
    assert single.schema == {"a": pl.Int64}
    assert single.frame_equal(expected)

    single_recast = pl.DataFrame(
        pl.Series("a", [1, 2, 3]), columns=[("a", pl.UInt32)]
    )
    assert single_recast.rows() == [(1,), (2,), (3,)]
    assert single_recast.schema == {"a": pl.UInt32}
def test_sort_by_exprs() -> None:
    """Sorting by an expression must not overwrite columns in the frame.

    The frame is sorted by ``abs(a)``; the values read back must still be the
    original signed values, only reordered.
    """
    frame = pl.DataFrame({"a": [1, 2, -1, -2]})
    sort_key = pl.col("a").abs()
    sorted_column = frame.sort(sort_key).to_series()
    assert sorted_column.to_list() == [1, -1, 2, -2]
def test_init_dict() -> None:
    """Check DataFrame construction from dictionaries.

    Covers empty dicts, empty columns, nested empty sequences, mixed dtypes,
    tuple values, date/datetime columns (from Python objects and from
    integers), and overriding names/dtypes through ``columns``.
    """
    # --- empty dictionary -------------------------------------------------
    empty = pl.DataFrame({})
    assert empty.shape == (0, 0)

    # --- empty dictionary values ------------------------------------------
    empty_cols = pl.DataFrame({"a": [], "b": []})
    assert empty_cols.shape == (0, 2)
    assert empty_cols.schema == {"a": pl.Float32, "b": pl.Float32}

    # An explicit schema applies whether or not the dict names the columns.
    typed_empties = [
        pl.DataFrame(payload, columns={"a": pl.Date, "b": pl.Utf8})
        for payload in ({}, {"a": [], "b": []})
    ]
    for typed in typed_empties:
        assert typed.shape == (0, 2)
        assert typed.schema == {"a": pl.Date, "b": pl.Utf8}

    # --- list of empty list/tuple -----------------------------------------
    nested = pl.DataFrame({"a": [[]], "b": [()]})
    assert nested.schema == {
        "a": pl.List(pl.Float64),
        "b": pl.List(pl.Float64),
    }
    assert nested.rows() == [([], [])]

    # --- mixed dtypes -----------------------------------------------------
    mixed = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    assert mixed.shape == (3, 2)
    assert mixed.columns == ["a", "b"]
    assert mixed.dtypes == [pl.Int64, pl.Float64]

    mixed_recast = pl.DataFrame(
        data={"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]},
        columns=[("a", pl.Int8), ("b", pl.Float32)],
    )
    assert mixed_recast.schema == {"a": pl.Int8, "b": pl.Float32}

    # --- values contained in tuples ---------------------------------------
    with_tuple = pl.DataFrame({"a": (1, 2, 3), "b": [1.0, 2.0, 3.0]})
    assert with_tuple.shape == (3, 2)

    # --- Datetime/Date types (from both python and integer values) --------
    py_datetimes = (
        datetime(2022, 12, 31, 23, 59, 59),
        datetime(2022, 12, 31, 23, 59, 59),
    )
    py_dates = (date(2022, 12, 31), date(2022, 12, 31))
    int_datetimes = [1672531199000000, 1672531199000000]
    int_dates = [19357, 19357]

    # Inferred and explicit schemas (given both py/polars dtypes).
    temporal_cases = (
        (py_dates, py_datetimes, None),
        (py_dates, py_datetimes, [("dt", date), ("dtm", datetime)]),
        (py_dates, py_datetimes, [("dt", pl.Date), ("dtm", pl.Datetime)]),
        (int_dates, int_datetimes, [("dt", date), ("dtm", datetime)]),
        (int_dates, int_datetimes, [("dt", pl.Date), ("dtm", pl.Datetime)]),
    )
    expected_rows = list(zip(py_dates, py_datetimes))
    for dates, datetimes, coldefs in temporal_cases:
        temporal = pl.DataFrame(
            data={"dt": dates, "dtm": datetimes},
            columns=coldefs,
        )
        assert temporal.schema == {"dt": pl.Date, "dtm": pl.Datetime}
        assert temporal.rows() == expected_rows

    # --- overriding dict column names/types -------------------------------
    renamed = pl.DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6]}, columns=["c", "d"]
    )
    assert renamed.columns == ["c", "d"]

    # partial type info (allowed, but mypy doesn't like it ;p)
    partially_typed = pl.DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6]},
        columns=["c", ("d", pl.Int8)],  # type: ignore[arg-type]
    )
    assert partially_typed.schema == {"c": pl.Int64, "d": pl.Int8}

    fully_typed = pl.DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6]},
        columns=[("c", pl.Int8), ("d", pl.Int16)],
    )
    assert fully_typed.schema == {"c": pl.Int8, "d": pl.Int16}

    # ``cleared()`` is expected to keep the schema and hold zero rows.
    cleared = fully_typed.cleared()
    assert fully_typed.schema == cleared.schema
    assert len(cleared) == 0
def test_init_ndarray_deprecated() -> None:
    """A 2D ndarray passed without ``orient`` must emit a deprecation warning.

    While the deprecation is in place the array is read row-wise, so
    ``[[1, 2], [3, 4]]`` produces columns ``[1, 3]`` and ``[2, 4]``.
    """
    with pytest.deprecated_call():
        # 2D array - default to row orientation
        matrix = np.array([[1, 2], [3, 4]])
        built = pl.DataFrame(matrix)
        expected = pl.DataFrame({"column_0": [1, 3], "column_1": [2, 4]})
        assert built.frame_equal(expected)
def test_init_ndarray() -> None:
    """Check DataFrame construction from numpy arrays and row-oriented lists.

    Covers empty/1D/2D inputs, explicit ``orient`` values, schema overrides
    through ``columns``, and the ``ValueError`` cases (shape/orientation
    conflicts, 3D input, invalid ``orient``, numpy flagged as unavailable).
    """
    # --- empty array ------------------------------------------------------
    from_empty = pl.DataFrame(np.array([]))
    assert from_empty.frame_equal(pl.DataFrame())

    # --- 1D array ---------------------------------------------------------
    one_dim = pl.DataFrame(np.array([1, 2, 3]), columns=["a"])
    expected = pl.DataFrame({"a": [1, 2, 3]})
    assert one_dim.frame_equal(expected)

    one_dim_typed = pl.DataFrame(
        np.array([1, 2, 3]), columns=[("a", pl.Int32)]
    )
    as_int32 = pl.col("a").cast(pl.Int32)
    expected = pl.DataFrame({"a": [1, 2, 3]}).with_column(as_int32)
    assert one_dim_typed.frame_equal(expected)

    # --- 2D array, explicit column orientation ----------------------------
    by_column = pl.DataFrame(np.array([[1, 2], [3, 4]]), orient="col")
    expected = pl.DataFrame({"column_0": [1, 2], "column_1": [3, 4]})
    assert by_column.frame_equal(expected)

    # --- nested lists, explicit row orientation ---------------------------
    by_row = pl.DataFrame([[1, 2.0, "a"], [None, None, None]], orient="row")
    expected = pl.DataFrame(
        {
            "column_0": [1, None],
            "column_1": [2.0, None],
            "column_2": ["a", None],
        }
    )
    assert by_row.frame_equal(expected)

    by_row_typed = pl.DataFrame(
        data=[[1, 2.0, "a"], [None, None, None]],
        columns=[("x", pl.Boolean), ("y", pl.Int32), "z"],  # type: ignore[arg-type]
        orient="row",
    )
    assert by_row_typed.rows() == [(True, 2, "a"), (None, None, None)]
    assert by_row_typed.schema == {
        "x": pl.Boolean,
        "y": pl.Int32,
        "z": pl.Utf8,
    }

    # TODO: Uncomment tests below when removing deprecation warning
    # # 2D array - default to column orientation
    # df = pl.DataFrame(np.array([[1, 2], [3, 4]]))
    # truth = pl.DataFrame({"column_0": [1, 2], "column_1": [3, 4]})
    # assert df.frame_equal(truth)

    # # 2D array - row orientation inferred
    # df = pl.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]), columns=["a", "b", "c"])
    # truth = pl.DataFrame({"a": [1, 4], "b": [2, 5], "c": [3, 6]})
    # assert df.frame_equal(truth)

    # # 2D array - column orientation inferred
    # df = pl.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]), columns=["a", "b"])
    # truth = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    # assert df.frame_equal(truth)

    # Shared 2x3 input for the error cases below.
    two_by_three = np.array([[1, 2, 3], [4, 5, 6]])

    # --- 2D array - orientation conflicts with columns --------------------
    with pytest.raises(ValueError):
        pl.DataFrame(two_by_three, columns=["a", "b"], orient="row")
    with pytest.raises(ValueError):
        pl.DataFrame(
            two_by_three,
            columns=[("a", pl.UInt32), ("b", pl.UInt32)],
            orient="row",
        )

    # --- 3D array ---------------------------------------------------------
    cube = np.random.randn(2, 2, 2)
    with pytest.raises(ValueError):
        pl.DataFrame(cube)

    # --- wrong orient value -----------------------------------------------
    with pytest.raises(ValueError):
        pl.DataFrame(
            two_by_three,
            orient="wrong",  # type: ignore[arg-type]
        )

    # --- numpy not available ----------------------------------------------
    # The module-level availability flag is patched to False for this call.
    with patch("polars.internals.frame._NUMPY_AVAILABLE", False), pytest.raises(
        ValueError
    ):
        pl.DataFrame(np.array([1, 2, 3]), columns=["a"])
# Book example: parse a string column into a date column with the lazy API.
# When run as a script, the input frame and the parsed result are rendered to
# text files under book/src/outputs/.
import polars as pl
from polars.lazy import col

# Input frame: ISO-formatted date strings plus an integer column.
df = pl.DataFrame(
    {
        "date": ["2020-01-02", "2020-01-03", "2020-01-04"],
        "index": [1, 2, 3],
    }
)

# Lazy query that parses "date" with the "%Y-%m-%d" format; nothing is
# computed until `.collect()` is called below.
parsed = df.lazy().with_column(
    col("date").str_parse_date(pl.datatypes.Date32, "%Y-%m-%d")
)

if __name__ == "__main__":
    # Write with an explicit encoding so the generated book outputs do not
    # depend on the locale default of the machine that builds them.
    with open(
        "book/src/outputs/how_can_i_parse_dates_0.txt", "w", encoding="utf-8"
    ) as f:
        f.write(str(df))
    with open(
        "book/src/outputs/how_can_i_parse_dates_1.txt", "w", encoding="utf-8"
    ) as f:
        f.write(str(parsed.collect()))
def test_arg_sort() -> None:
    """``arg_sort`` must return the indices that would sort the column.

    For ``[4, 1, 3]`` the ascending order is index 1 (value 1), index 2
    (value 3), then index 0 (value 4).
    """
    df = pl.DataFrame({"a": [4, 1, 3]})
    sort_indices = df[col("a").arg_sort()]["a"]
    # Compare plain Python values: `Series == list` yields a comparison
    # object rather than a single bool, so asserting on it directly would not
    # reliably pin the expected indices.
    assert sort_indices.to_list() == [1, 2, 0]
def test_arange():
    """A lazy ``arange`` can be used as the right-hand side of a filter.

    Column ``a`` is all ones and is compared against ``arange(0, 3)``; the
    test expects exactly two rows to survive the ``>=`` predicate.
    """
    lazy_ones = pl.DataFrame({"a": [1, 1, 1]}).lazy()
    at_least_position = pl.lazy.col("a") >= pl.lazy.arange(0, 3)
    filtered = lazy_ones.filter(at_least_position).collect()
    assert filtered.frame_equal(pl.DataFrame({"a": [1, 1]}))
def test_groupby():
    """Run a lazy groupby/mean over a column that contains a null.

    The visible body only executes the query; it makes no assertion on the
    collected result.
    """
    frame = pl.DataFrame(
        {
            "a": [1.0, None, 3.0, 4.0],
            "groups": ["a", "a", "b", "b"],
        }
    )
    per_group = frame.lazy().groupby("groups")
    out = per_group.agg(pl.mean("a")).collect()