def test_from_to_buffer(df: pl.DataFrame, compressions: List[str]) -> None:
    """Round-trip the frame through an in-memory IPC buffer for each compression codec."""
    for codec in compressions:
        buffer = io.BytesIO()
        df.write_ipc(buffer, compression=codec)  # type: ignore
        buffer.seek(0)
        assert df.frame_equal(pl.read_ipc(buffer))
def test_lazy_concat(df: pl.DataFrame) -> None:
    """Concatenating a lazy frame with itself doubles the row count and matches vstack."""
    rows, cols = df.shape
    result = pl.concat([df.lazy(), df.lazy()]).collect()
    assert result.shape == (rows * 2, cols)
    assert result.frame_equal(df.vstack(df.clone()), null_equal=True)
def test_to_from_buffer(df: pl.DataFrame) -> None:
    """Round-trip the frame through parquet/csv/ipc/json in-memory buffers."""
    df = df.drop("strings_nulls")
    cases = [
        (df.to_parquet, pl.read_parquet, False),
        (df.to_csv, partial(pl.read_csv, parse_dates=True), True),
        (df.to_ipc, pl.read_ipc, False),
        (df.to_json, pl.read_json, True),
    ]
    for writer, reader, text_based in cases:
        buf = io.BytesIO()
        writer(buf)  # type: ignore
        buf.seek(0)
        round_tripped = reader(buf)  # type: ignore
        if text_based:
            # text formats lose Categorical/Time dtypes; restore before comparing
            round_tripped = round_tripped.with_columns(
                [pl.col("cat").cast(pl.Categorical), pl.col("time").cast(pl.Time)]
            )
        assert df.frame_equal(round_tripped)
def test_selection():
    """Exercise dtype checks plus column/mask/index/slice selection forms."""
    df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]})

    # per-column dtypes
    for name, dtype in (("a", Int64), ("b", Float64), ("c", Utf8)):
        assert df[name].dtype == dtype

    # multi-column / boolean-mask / mixed row+column selection shapes
    assert df[["a", "b"]].columns == ["a", "b"]
    assert df[[True, False, True]].height == 2
    assert df[[True, False, True], "b"].shape == (2, 1)
    assert df[[True, False, False], ["a", "b"]].shape == (1, 2)
    assert df[[0, 1], "b"].shape == (2, 1)
    assert df[[2], ["a", "b"]].shape == (1, 2)

    assert df.select_at_idx(0).name == "a"
    assert (df.a == df["a"]).sum() == 3
    assert (df.c == df["a"]).sum() == 0

    # string-slice selection over columns
    assert df[:, "a":"b"].shape == (3, 2)
    assert df[:, "a":"c"].columns == ["a", "b", "c"]

    assert df[1, [2]].frame_equal(pl.DataFrame({"c": ["b"]}))
    assert df[[0, 2], [1]].frame_equal(pl.DataFrame({"b": [1.0, 3.0]}))
def test_from_to_buffer(df: pl.DataFrame, compressions: list[str]) -> None:
    """IPC buffer round-trip preserves the frame for every compression codec."""
    for codec in compressions:
        stream = io.BytesIO()
        df.write_ipc(stream, compression=codec)  # type: ignore[arg-type]
        stream.seek(0)
        assert_frame_equal_local_categoricals(df, pl.read_ipc(stream))
def test_binary_function():
    """map_binary applies a two-series lambda; the result lands in 'binary_function'."""
    df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    result = (
        df.lazy()
        .with_column(map_binary(col("a"), col("b"), lambda lhs, rhs: lhs + rhs))
        .collect()
    )
    assert result["binary_function"] == (result.a + result.b)
def test_from_to_file(io_test_dir: str, example_df: pl.DataFrame, compressions: List[str]) -> None:
    """Write the frame to an avro file and read it back for each compression codec."""
    path = os.path.join(io_test_dir, "small.avro")
    for codec in compressions:
        example_df.write_avro(path, compression=codec)  # type: ignore
        assert example_df.frame_equal(pl.read_avro(str(path)))
def test_to_pandas():
    """Smoke-test arrow/pandas conversion, including frames with shift-introduced nulls."""
    df = get_complete_df()
    df.to_arrow()
    df.to_pandas()
    # shifting introduces nulls; conversion must still succeed
    df.shift(2).to_pandas()
    bool_df = DataFrame({"col": Series([True, False, True])})
    bool_df.shift(2).to_pandas()
def test_set_null():
    """when/then with a null literal produces nulls exactly in the selected rows."""
    df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    result = (
        df.lazy()
        .with_column(
            when(col("a") > 1).then(lit(None)).otherwise(100).alias("foo")
        )
        .collect()
    )
    foo = result["foo"]
    assert foo[0] == 100
    assert foo[1] is None
    assert foo[2] is None
def test_row_tuple():
    """DataFrame.row returns one row as a tuple; negative indices count from the end."""
    df = DataFrame({"a": ["foo", "bar", "2"], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})
    expected_rows = [("foo", 1, 1.0), ("bar", 2, 2.0), ("2", 3, 3.0)]
    assert df.row(0) == expected_rows[0]
    assert df.row(1) == expected_rows[1]
    assert df.row(-1) == expected_rows[-1]
def test_from_to_buffer(example_df: pl.DataFrame, compressions: list[str]) -> None:
    """Avro buffer round-trip preserves the frame for every compression codec."""
    for codec in compressions:
        stream = io.BytesIO()
        example_df.write_avro(stream, compression=codec)  # type: ignore[arg-type]
        stream.seek(0)
        assert example_df.frame_equal(pl.read_avro(stream))
def test_to_from_buffer(df: pl.DataFrame) -> None:
    """JSON round-trip via both binary and text buffers."""
    for stream in (io.BytesIO(), io.StringIO()):
        df.write_json(stream)
        stream.seek(0)
        round_tripped = pl.read_json(stream)
        # JSON drops Categorical/Time dtypes; restore before comparing
        round_tripped = round_tripped.with_columns(
            [pl.col("cat").cast(pl.Categorical), pl.col("time").cast(pl.Time)]
        )
        assert_frame_equal_local_categoricals(df, round_tripped)
def test_lazy():
    """Basic lazy pipeline: with_column + select, plus a when/then/otherwise expression.

    Fixes: the original printed one result (debug leftover) and bound the other
    to an unused variable — neither plan's output was checked at all.
    """
    df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    ldf = df.lazy().with_column(lit(1).alias("foo")).select([col("a"), col("foo")])
    # materialize to verify the plan executes (previously a debug print)
    out = ldf.collect()
    assert out.columns == ["a", "foo"]
    # when/then/otherwise expression must also execute and add the new column
    conditional = (
        df.lazy()
        .with_column(
            when(col("a").gt(lit(2))).then(lit(10)).otherwise(lit(1)).alias("new")
        )
        .collect()
    )
    assert "new" in conditional.columns
def test_from_to_file(io_test_dir: str, df: pl.DataFrame, compressions: List[str]) -> None:
    """IPC file round-trip for every codec; skipped on Windows where we hold an mmap."""
    path = os.path.join(io_test_dir, "small.ipc")
    if os.name == "nt":
        # does not yet work on windows because we hold an mmap?
        return
    for codec in compressions:
        df.write_ipc(path, compression=codec)  # type: ignore
        assert df.frame_equal(pl.read_ipc(str(path)))
def test_to_from_buffer(df: pl.DataFrame) -> None:
    """CSV buffer round-trip; Categorical/Time dtypes are restored after parsing."""
    stream = io.BytesIO()
    df.write_csv(stream)
    stream.seek(0)
    round_tripped = pl.read_csv(stream, parse_dates=True).with_columns(
        [pl.col("cat").cast(pl.Categorical), pl.col("time").cast(pl.Time)]
    )
    assert df.frame_equal(round_tripped)
def test_null_count(df: pl.DataFrame) -> None:
    """null_count yields one row with a per-column null total (0x0 for empty frames).

    Fixes: removed a leftover debug ``print(null_count.rows())``.
    """
    # note: the zero-row and zero-col cases are always passed as explicit examples
    null_count, ncols = df.null_count(), len(df.columns)
    if ncols == 0:
        assert null_count.shape == (0, 0)
    else:
        assert null_count.shape == (1, ncols)
        # each count must equal the number of None values in that column
        for idx, count in enumerate(null_count.rows()[0]):
            assert count == sum(v is None for v in df.select_at_idx(idx).to_list())
def test_from_to_file(io_test_dir: str, df: pl.DataFrame, compressions: list[str]) -> None:
    """IPC file round-trip via both str and Path targets; skipped on Windows (mmap held)."""
    f_ipc = os.path.join(io_test_dir, "small.ipc")
    if os.name == "nt":
        # does not yet work on windows because we hold an mmap?
        return
    for codec in compressions:
        for target in (str(f_ipc), Path(f_ipc)):
            df.write_ipc(target, compression=codec)  # type: ignore[arg-type]
            df_read = pl.read_ipc(target)  # type: ignore[arg-type]
            assert_frame_equal_local_categoricals(df, df_read)
def test_head_tail(fruits_cars: pl.DataFrame) -> None:
    """pl.head / pl.tail agree between expression and eager-series forms."""
    for fn, values in ((pl.head, [1, 2]), (pl.tail, [4, 5])):
        expected = pl.Series("A", values)
        assert fruits_cars.select([fn("A", 2)]).to_series(0).series_equal(expected)
        assert fn(fruits_cars["A"], 2).series_equal(expected)
def test_selection(): df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]}) # column selection by string(s) in first dimension assert df["a"] == [1, 2, 3] assert df["b"] == [1.0, 2.0, 3.0] assert df["c"] == ["a", "b", "c"] # row selection by integers(s) in first dimension assert df[0].frame_equal(pl.DataFrame({"a": [1], "b": [1.0], "c": ["a"]})) assert df[-1].frame_equal(pl.DataFrame({"a": [3], "b": [3.0], "c": ["c"]})) # row, column selection when using two dimensions assert df[:, 0] == [1, 2, 3] assert df[:, 1] == [1.0, 2.0, 3.0] assert df[:2, 2] == ["a", "b"] assert df[[1, 2]].frame_equal( pl.DataFrame({"a": [2, 3], "b": [2.0, 3.0], "c": ["b", "c"]}) ) assert df[[-1, -2]].frame_equal( pl.DataFrame({"a": [3, 2], "b": [3.0, 2.0], "c": ["c", "b"]}) ) assert df[[True, False, True]].frame_equal( pl.DataFrame({"a": [1, 3], "b": [1.0, 3.0], "c": ["a", "c"]}) ) assert df[["a", "b"]].columns == ["a", "b"] assert df[[1, 2], [1, 2]].frame_equal( pl.DataFrame({"b": [2.0, 3.0], "c": ["b", "c"]}) ) assert df[1, 2] == "b" assert df[1, 1] == 2.0 assert df[2, 0] == 3 assert df[[True, False, True], "b"].shape == (2, 1) assert df[[True, False, False], ["a", "b"]].shape == (1, 2) assert df[[0, 1], "b"].shape == (2, 1) assert df[[2], ["a", "b"]].shape == (1, 2) assert df.select_at_idx(0).name == "a" assert (df.a == df["a"]).sum() == 3 assert (df.c == df["a"]).sum() == 0 assert df[:, "a":"b"].shape == (3, 2) assert df[:, "a":"c"].columns == ["a", "b", "c"] expect = pl.DataFrame({"c": ["b"]}) assert df[1, [2]].frame_equal(expect) expect = pl.DataFrame({"b": [1.0, 3.0]}) assert df[[0, 2], [1]].frame_equal(expect) assert df[0, "c"] == "a" assert df[1, "c"] == "b" assert df[2, "c"] == "c" assert df[0, "a"] == 1
def test_to_from_file(io_test_dir: str, df: pl.DataFrame) -> None:
    """CSV file round-trip; dtypes lost in text form are restored before comparing."""
    df = df.drop("strings_nulls")
    path = os.path.join(io_test_dir, "small.csv")
    df.write_csv(path)
    round_tripped = pl.read_csv(path, parse_dates=True).with_columns(
        [pl.col("cat").cast(pl.Categorical), pl.col("time").cast(pl.Time)]
    )
    assert df.frame_equal(round_tripped)
def test_to_from_buffer(df: pl.DataFrame, compressions: list[str]) -> None:
    """Parquet buffer round-trip per compression codec; 'lzo' is expected to fail
    with both the native writer (ArrowError) and pyarrow (OSError)."""
    for compression in compressions:
        if compression == "lzo":
            # lzo compression is not supported now
            with pytest.raises(pl.ArrowError):
                buf = io.BytesIO()
                df.write_parquet(buf, compression=compression)
                buf.seek(0)
                _ = pl.read_parquet(buf)
            # pyarrow surfaces the same unsupported codec as an OSError
            with pytest.raises(OSError):
                buf = io.BytesIO()
                df.write_parquet(buf, compression=compression, use_pyarrow=True)
                buf.seek(0)
                _ = pl.read_parquet(buf)
        else:
            buf = io.BytesIO()
            df.write_parquet(buf, compression=compression)
            buf.seek(0)
            read_df = pl.read_parquet(buf)
            assert_frame_equal_local_categoricals(df, read_df)
    # default compression must round-trip with both the native and pyarrow engines
    for use_pyarrow in [True, False]:
        buf = io.BytesIO()
        df.write_parquet(buf, use_pyarrow=use_pyarrow)
        buf.seek(0)
        read_df = pl.read_parquet(buf, use_pyarrow=use_pyarrow)
        assert_frame_equal_local_categoricals(df, read_df)
def test_quantile(fruits_cars: pl.DataFrame) -> None:
    """Every quantile interpolation strategy agrees between lazy-frame and expression APIs."""
    cases = [
        (0.25, "nearest", 2),
        (0.24, "lower", 1),
        (0.26, "higher", 3),
        (0.24, "midpoint", 1.5),
        (0.24, "linear", 1.96),
    ]
    for q, strategy, expected in cases:
        assert fruits_cars.lazy().quantile(q, strategy).collect()["A"][0] == expected
        assert fruits_cars.select(pl.col("A").quantile(q, strategy))["A"][0] == expected
def test_last(fruits_cars: pl.DataFrame) -> None:
    """LazyFrame.last keeps exactly the final row of the frame."""
    final_row = fruits_cars[(len(fruits_cars) - 1):, :]
    assert fruits_cars.lazy().last().collect().frame_equal(final_row)
def test_head_tail():
    """head/tail return the requested rows and clamp when asked for more than exist."""
    df = DataFrame({"a": range(10), "b": range(10)})
    assert df.head(5).height == 5
    assert df.tail(5).height == 5
    assert not df.head(5).frame_equal(df.tail(5))
    # check if it doesn't fail when out of bounds
    for oversized in (df.head(100), df.tail(100)):
        assert oversized.height == 10
def test_is_between(fruits_cars: pl.DataFrame) -> None:
    """is_between honours every form of the include-bounds argument.

    None means "use the default" (exclusive on both ends); a single bool applies
    to both ends; a two-element list sets [lower, upper] inclusivity separately.
    """
    cases = [
        (None, [False, False, True, False, False]),
        (False, [False, False, True, False, False]),
        ([False, False], [False, False, True, False, False]),
        (True, [False, True, True, True, False]),
        ([True, True], [False, True, True, True, False]),
        ([False, True], [False, False, True, True, False]),
        ([True, False], [False, True, True, False, False]),
    ]
    for bounds, values in cases:
        if bounds is None:
            expr = pl.col("A").is_between(2, 4)
        else:
            expr = pl.col("A").is_between(2, 4, bounds)  # type: ignore[arg-type]
        expected = pl.Series("is_between", values)
        assert fruits_cars.select(expr)["is_between"].series_equal(expected)
def test_set():
    """Column assignment and boolean-mask cell assignment both succeed."""
    np.random.seed(1)
    df = DataFrame(
        {"foo": np.random.rand(10), "bar": np.arange(10), "ham": ["h"] * 10}
    )
    # add a brand-new column by assignment
    df["new"] = np.random.rand(10)
    # overwrite only the rows where "new" exceeds 0.5
    df[df["new"] > 0.5, "new"] = 1
def test_selection():
    """String dtype names, mask/index selection shapes, and column comparisons."""
    df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]})

    # dtypes reported as short strings in this API version
    for name, dtype in (("a", "i64"), ("b", "f64"), ("c", "str")):
        assert df[name].dtype == dtype

    assert df[["a", "b"]].columns == ["a", "b"]
    assert df[[True, False, True]].height == 2
    assert df[[True, False, True], "b"].shape == (2, 1)
    assert df[[True, False, False], ["a", "b"]].shape == (1, 2)
    assert df[[0, 1], "b"].shape == (2, 1)
    assert df[[2], ["a", "b"]].shape == (1, 2)

    assert df.select_idx(0).name == "a"
    assert (df.a == df["a"]).sum() == 3
    assert (df.c == df["a"]).sum() == 0
def test_file_buffer():
    """CSV parses from a BytesIO; parquet on the same bytes raises a clear error."""
    buf = BytesIO(b"1,2,3,4,5,6\n7,8,9,10,11,12")
    df = DataFrame.read_csv(buf, has_headers=False)
    assert df.shape == (2, 6)
    buf.seek(0)
    # check if not fails on TryClone and Length impl in file.rs
    with pytest.raises(RuntimeError) as e:
        df.read_parquet(buf)
    assert "Invalid Parquet file" in str(e.value)
def test_shift(fruits_cars: pl.DataFrame) -> None:
    """shift moves values and fills the vacated slots with nulls.

    Fixes: the positive-shift frame comparison's result was discarded — the
    ``frame_equal`` call was never asserted, so a shift regression passed silently.
    """
    df = pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 3, 4, 5]})
    out = df.select(col("a").shift(1))
    assert out["a"].series_equal(pl.Series("a", [None, 1, 2, 3, 4]), null_equal=True)

    res = fruits_cars.lazy().shift(2).collect()
    expected = pl.DataFrame(
        {
            "A": [None, None, 1, 2, 3],
            "fruits": [None, None, "banana", "banana", "apple"],
            "B": [None, None, 5, 4, 3],
            "cars": [None, None, "beetle", "audi", "beetle"],
        }
    )
    # BUG FIX: previously the comparison result was not asserted
    assert res.frame_equal(expected, null_equal=True)

    # negative shift: the last two rows of every column become null
    res = fruits_cars.lazy().shift(-2).collect()
    for row in [3, 4]:
        for col_idx in range(4):
            assert res[row, col_idx] is None
def test_drop():
    """drop removes a column; drop_in_place removes it and returns it as a Series."""
    data = {"a": [2, 1, 3], "b": ["a", "b", "c"], "c": [1, 2, 3]}
    dropped = DataFrame(data).drop("a")
    assert dropped.shape == (3, 2)
    extracted = DataFrame(data).drop_in_place("a")
    assert extracted.name == "a"