Пример #1
0
def test_from_to_buffer(df: pl.DataFrame, compressions: List[str]) -> None:
    """Round-trip ``df`` through an in-memory IPC buffer per compression."""
    for codec in compressions:
        stream = io.BytesIO()
        df.write_ipc(stream, compression=codec)  # type: ignore
        # Rewind so the reader starts at the first written byte.
        stream.seek(0)
        restored = pl.read_ipc(stream)
        assert df.frame_equal(restored)
Пример #2
0
def test_lazy_concat(df: pl.DataFrame) -> None:
    """Concatenating two lazy copies of ``df`` stacks it on top of itself."""
    n_rows, n_cols = df.shape
    doubled_shape = (n_rows * 2, n_cols)

    lazy_copies = [df.lazy(), df.lazy()]
    out = pl.concat(lazy_copies).collect()
    assert out.shape == doubled_shape

    # The eager equivalent: the frame stacked vertically onto a clone of itself.
    stacked = df.vstack(df.clone())
    assert out.frame_equal(stacked, null_equal=True)
Пример #3
0
def test_to_from_buffer(df: pl.DataFrame) -> None:
    """Write ``df`` to an in-memory buffer in four formats and read each back."""
    df = df.drop("strings_nulls")

    # Each entry: (writer, reader, whether the format is text based).
    round_trips = [
        (df.to_parquet, pl.read_parquet, False),
        (df.to_csv, partial(pl.read_csv, parse_dates=True), True),
        (df.to_ipc, pl.read_ipc, False),
        (df.to_json, pl.read_json, True),
    ]
    for write, read, is_text in round_trips:
        buffer = io.BytesIO()
        write(buffer)  # type: ignore
        buffer.seek(0)

        restored = read(buffer)  # type: ignore
        if is_text:
            # some type information is lost due to text conversion
            casts = [
                pl.col("cat").cast(pl.Categorical),
                pl.col("time").cast(pl.Time),
            ]
            restored = restored.with_columns(casts)
        assert df.frame_equal(restored)
Пример #4
0
def test_selection():
    """Check column dtypes and the supported ``DataFrame`` indexing forms."""
    data = {"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]}
    df = DataFrame(data)

    # One dtype check per column.
    for name, dtype in (("a", Int64), ("b", Float64), ("c", Utf8)):
        assert df[name].dtype == dtype

    # First-dimension selection: a list of column names or a boolean row mask.
    by_names = df[["a", "b"]]
    assert by_names.columns == ["a", "b"]
    by_mask = df[[True, False, True]]
    assert by_mask.height == 2

    # Two-dimensional (rows, columns) selection; only the shape is checked.
    # ``df[rows, cols]`` is the same subscript as ``df[(rows, cols)]``.
    shape_cases = (
        (([True, False, True], "b"), (2, 1)),
        (([True, False, False], ["a", "b"]), (1, 2)),
        (([0, 1], "b"), (2, 1)),
        (([2], ["a", "b"]), (1, 2)),
    )
    for key, expected_shape in shape_cases:
        assert df[key].shape == expected_shape

    first_column = df.select_at_idx(0)
    assert first_column.name == "a"

    # Comparing two Series and summing the result counts the matching rows.
    same_column = df.a == df["a"]
    assert same_column.sum() == 3
    other_column = df.c == df["a"]
    assert other_column.sum() == 0

    # Column ranges given as name slices.
    assert df[:, "a":"b"].shape == (3, 2)
    assert df[:, "a":"c"].columns == ["a", "b", "c"]

    # Columns given by position.
    expect = pl.DataFrame({"c": ["b"]})
    assert df[1, [2]].frame_equal(expect)
    expect = pl.DataFrame({"b": [1.0, 3.0]})
    assert df[[0, 2], [1]].frame_equal(expect)
Пример #5
0
def test_from_to_buffer(df: pl.DataFrame, compressions: list[str]) -> None:
    """Round-trip ``df`` through an in-memory IPC buffer per compression."""
    for codec in compressions:
        stream = io.BytesIO()
        df.write_ipc(stream, compression=codec)  # type: ignore[arg-type]
        # Rewind so the reader starts at the first written byte.
        stream.seek(0)
        restored = pl.read_ipc(stream)
        assert_frame_equal_local_categoricals(df, restored)
Пример #6
0
def test_binary_function():
    """Check that ``map_binary`` combines two columns with a Python function.

    Columns "a" and "b" are added through ``map_binary`` in a lazy query; the
    resulting "binary_function" column must equal ``a + b`` in every row.
    """
    df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    out = (
        df.lazy()
        .with_column(map_binary(col("a"), col("b"), lambda a, b: a + b))
        .collect()
    )
    # Comparing two Series yields a boolean Series rather than a bool, so
    # asserting on it directly never verifies the element values. Count the
    # matching rows instead; the frame has three rows.
    matches = out["binary_function"] == (out.a + out.b)
    assert matches.sum() == 3
Пример #7
0
def test_from_to_file(io_test_dir: str, example_df: pl.DataFrame,
                      compressions: List[str]) -> None:
    """Round-trip ``example_df`` through an Avro file per compression."""
    avro_path = os.path.join(io_test_dir, "small.avro")

    for codec in compressions:
        # The same file is overwritten on every iteration.
        example_df.write_avro(avro_path, compression=codec)  # type: ignore
        restored = pl.read_avro(str(avro_path))
        assert example_df.frame_equal(restored)
Пример #8
0
def test_to_pandas():
    """Smoke test: Arrow and pandas conversions run without raising."""
    complete = get_complete_df()
    complete.to_arrow()
    complete.to_pandas()

    # test shifted df
    shifted = complete.shift(2)
    shifted.to_pandas()

    bool_frame = DataFrame({"col": Series([True, False, True])})
    shifted_bools = bool_frame.shift(2)
    shifted_bools.to_pandas()
Пример #9
0
def test_set_null():
    """``when/then(lit(None))/otherwise`` yields nulls where the predicate holds."""
    df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})

    # Null where a > 1, otherwise the constant 100.
    foo_expr = when(col("a") > 1).then(lit(None)).otherwise(100).alias("foo")
    out = df.lazy().with_column(foo_expr).collect()

    foo = out["foo"]
    assert foo[0] == 100
    assert foo[1] is None
    assert foo[2] is None
Пример #10
0
def test_row_tuple():
    """``DataFrame.row`` returns a row as a tuple, including negative indices."""
    data = {"a": ["foo", "bar", "2"], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}
    df = DataFrame(data)

    # (row index, expected tuple)
    cases = (
        (0, ("foo", 1, 1.0)),
        (1, ("bar", 2, 2.0)),
        (-1, ("2", 3, 3.0)),
    )
    for index, expected in cases:
        assert df.row(index) == expected
Пример #11
0
def test_from_to_buffer(example_df: pl.DataFrame,
                        compressions: list[str]) -> None:
    """Round-trip ``example_df`` through an in-memory Avro buffer."""
    for codec in compressions:
        stream = io.BytesIO()
        example_df.write_avro(stream, compression=codec)  # type: ignore[arg-type]
        # Rewind so the reader starts at the first written byte.
        stream.seek(0)
        restored = pl.read_avro(stream)
        assert example_df.frame_equal(restored)
Пример #12
0
def test_to_from_buffer(df: pl.DataFrame) -> None:
    """Round-trip ``df`` through JSON using a bytes and a text buffer."""
    buffers = (io.BytesIO(), io.StringIO())
    for buffer in buffers:
        df.write_json(buffer)
        buffer.seek(0)
        restored = pl.read_json(buffer)

        # Cast "cat" and "time" back to the dtypes compared against ``df``.
        casts = [
            pl.col("cat").cast(pl.Categorical),
            pl.col("time").cast(pl.Time),
        ]
        restored = restored.with_columns(casts)
        assert_frame_equal_local_categoricals(df, restored)
Пример #13
0
def test_lazy():
    """Smoke test: two small lazy queries collect without raising."""
    df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})

    with_foo = df.lazy().with_column(lit(1).alias("foo"))
    ldf = with_foo.select([col("a"), col("foo")])
    print(ldf.collect())

    # test if it executes
    is_large = col("a").gt(lit(2))
    new_expr = when(is_large).then(lit(10)).otherwise(lit(1)).alias("new")
    new = df.lazy().with_column(new_expr).collect()
Пример #14
0
def test_from_to_file(io_test_dir: str, df: pl.DataFrame,
                      compressions: List[str]) -> None:
    """Round-trip ``df`` through an IPC file per compression (not on Windows)."""
    ipc_path = os.path.join(io_test_dir, "small.ipc")

    # does not yet work on windows because we hold an mmap?
    if os.name == "nt":
        return

    for codec in compressions:
        df.write_ipc(ipc_path, compression=codec)  # type: ignore
        restored = pl.read_ipc(str(ipc_path))
        assert df.frame_equal(restored)
Пример #15
0
def test_to_from_buffer(df: pl.DataFrame) -> None:
    """Round-trip ``df`` through an in-memory CSV buffer."""
    stream = io.BytesIO()
    df.write_csv(stream)
    stream.seek(0)

    restored = pl.read_csv(stream, parse_dates=True)

    # Cast "cat" and "time" back to the dtypes compared against ``df``.
    casts = [
        pl.col("cat").cast(pl.Categorical),
        pl.col("time").cast(pl.Time),
    ]
    restored = restored.with_columns(casts)
    assert df.frame_equal(restored)
Пример #16
0
def test_null_count(df: pl.DataFrame) -> None:
    """``null_count`` returns one row holding the number of nulls per column."""
    # note: the zero-row and zero-col cases are always passed as explicit examples
    null_count = df.null_count()
    ncols = len(df.columns)

    if ncols != 0:
        assert null_count.shape == (1, ncols)
        first_row = null_count.rows()[0]
        for idx, count in enumerate(first_row):
            # Recount the nulls of this column from its Python values.
            values = df.select_at_idx(idx).to_list()
            assert count == sum(v is None for v in values)
    else:
        assert null_count.shape == (0, 0)

    print(null_count.rows())
Пример #17
0
def test_from_to_file(io_test_dir: str, df: pl.DataFrame,
                      compressions: list[str]) -> None:
    """Round-trip ``df`` through an IPC file given as ``str`` and as ``Path``."""
    f_ipc = os.path.join(io_test_dir, "small.ipc")

    # does not yet work on windows because we hold an mmap?
    if os.name == "nt":
        return

    for codec in compressions:
        # Exercise both accepted path representations for the same file.
        for target in (str(f_ipc), Path(f_ipc)):
            df.write_ipc(target, compression=codec)  # type: ignore[arg-type]
            restored = pl.read_ipc(target)  # type: ignore[arg-type]
            assert_frame_equal_local_categoricals(df, restored)
Пример #18
0
def test_head_tail(fruits_cars: pl.DataFrame) -> None:
    """``pl.head`` / ``pl.tail`` agree whether given a column name or a Series."""
    # (function under test, expected values of column "A")
    cases = (
        (pl.head, [1, 2]),
        (pl.tail, [4, 5]),
    )
    for take, values in cases:
        from_expr = fruits_cars.select([take("A", 2)])
        from_series = take(fruits_cars["A"], 2)
        expected = pl.Series("A", values)
        assert from_expr.to_series(0).series_equal(expected)
        assert from_series.series_equal(expected)
Пример #19
0
def test_selection():
    """Check the indexing forms supported by ``DataFrame.__getitem__``.

    Covers column selection by name, row selection by integer, integer list
    and boolean mask, two-dimensional (rows, columns) selection by position
    or name, name slices, and scalar access to a single cell.
    """
    df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]})

    # column selection by string(s) in first dimension
    # A Series compared to a list yields a Series rather than a bool, so the
    # values are converted to a list first to make the assertion meaningful.
    assert df["a"].to_list() == [1, 2, 3]
    assert df["b"].to_list() == [1.0, 2.0, 3.0]
    assert df["c"].to_list() == ["a", "b", "c"]

    # row selection by integers(s) in first dimension
    assert df[0].frame_equal(pl.DataFrame({"a": [1], "b": [1.0], "c": ["a"]}))
    assert df[-1].frame_equal(pl.DataFrame({"a": [3], "b": [3.0], "c": ["c"]}))

    # row, column selection when using two dimensions
    # (same list conversion as above, for the same reason)
    assert df[:, 0].to_list() == [1, 2, 3]
    assert df[:, 1].to_list() == [1.0, 2.0, 3.0]
    assert df[:2, 2].to_list() == ["a", "b"]

    assert df[[1, 2]].frame_equal(
        pl.DataFrame({"a": [2, 3], "b": [2.0, 3.0], "c": ["b", "c"]})
    )
    assert df[[-1, -2]].frame_equal(
        pl.DataFrame({"a": [3, 2], "b": [3.0, 2.0], "c": ["c", "b"]})
    )

    assert df[[True, False, True]].frame_equal(
        pl.DataFrame({"a": [1, 3], "b": [1.0, 3.0], "c": ["a", "c"]})
    )
    assert df[["a", "b"]].columns == ["a", "b"]
    assert df[[1, 2], [1, 2]].frame_equal(
        pl.DataFrame({"b": [2.0, 3.0], "c": ["b", "c"]})
    )
    # scalar access by (row, column) position
    assert df[1, 2] == "b"
    assert df[1, 1] == 2.0
    assert df[2, 0] == 3

    assert df[[True, False, True], "b"].shape == (2, 1)
    assert df[[True, False, False], ["a", "b"]].shape == (1, 2)

    assert df[[0, 1], "b"].shape == (2, 1)
    assert df[[2], ["a", "b"]].shape == (1, 2)
    assert df.select_at_idx(0).name == "a"
    # summing the boolean comparison counts the matching rows
    assert (df.a == df["a"]).sum() == 3
    assert (df.c == df["a"]).sum() == 0
    assert df[:, "a":"b"].shape == (3, 2)
    assert df[:, "a":"c"].columns == ["a", "b", "c"]
    expect = pl.DataFrame({"c": ["b"]})
    assert df[1, [2]].frame_equal(expect)
    expect = pl.DataFrame({"b": [1.0, 3.0]})
    assert df[[0, 2], [1]].frame_equal(expect)
    # scalar access by row position and column name
    assert df[0, "c"] == "a"
    assert df[1, "c"] == "b"
    assert df[2, "c"] == "c"
    assert df[0, "a"] == 1
Пример #20
0
def test_to_from_file(io_test_dir: str, df: pl.DataFrame) -> None:
    """Round-trip ``df`` (without "strings_nulls") through a CSV file."""
    df = df.drop("strings_nulls")

    csv_path = os.path.join(io_test_dir, "small.csv")
    df.write_csv(csv_path)
    restored = pl.read_csv(csv_path, parse_dates=True)

    # Cast "cat" and "time" back to the dtypes compared against ``df``.
    casts = [
        pl.col("cat").cast(pl.Categorical),
        pl.col("time").cast(pl.Time),
    ]
    restored = restored.with_columns(casts)
    assert df.frame_equal(restored)
Пример #21
0
def test_to_from_buffer(df: pl.DataFrame, compressions: list[str]) -> None:
    """Round-trip ``df`` through in-memory Parquet per compression and engine."""
    for codec in compressions:
        if codec != "lzo":
            stream = io.BytesIO()
            df.write_parquet(stream, compression=codec)
            stream.seek(0)
            restored = pl.read_parquet(stream)
            assert_frame_equal_local_categoricals(df, restored)
            continue

        # lzo compression is not supported now
        with pytest.raises(pl.ArrowError):
            stream = io.BytesIO()
            df.write_parquet(stream, compression=codec)
            stream.seek(0)
            _ = pl.read_parquet(stream)

        # The same codec through the pyarrow writer is expected to raise OSError.
        with pytest.raises(OSError):
            stream = io.BytesIO()
            df.write_parquet(stream, compression=codec, use_pyarrow=True)
            stream.seek(0)
            _ = pl.read_parquet(stream)

    # Default compression, written and read with and without pyarrow.
    for with_pyarrow in [True, False]:
        stream = io.BytesIO()
        df.write_parquet(stream, use_pyarrow=with_pyarrow)
        stream.seek(0)
        restored = pl.read_parquet(stream, use_pyarrow=with_pyarrow)
        assert_frame_equal_local_categoricals(df, restored)
Пример #22
0
def test_quantile(fruits_cars: pl.DataFrame) -> None:
    """Quantiles of column "A" match per interpolation, lazily and via expr."""
    # (quantile, interpolation strategy, expected value for column "A")
    cases = (
        (0.25, "nearest", 2),
        (0.24, "lower", 1),
        (0.26, "higher", 3),
        (0.24, "midpoint", 1.5),
        (0.24, "linear", 1.96),
    )
    for q, interpolation, expected in cases:
        via_lazy = fruits_cars.lazy().quantile(q, interpolation).collect()
        assert via_lazy["A"][0] == expected

        via_expr = fruits_cars.select(pl.col("A").quantile(q, interpolation))
        assert via_expr["A"][0] == expected
Пример #23
0
def test_last(fruits_cars: pl.DataFrame) -> None:
    """``LazyFrame.last`` returns the same frame as slicing off the final row."""
    result = fruits_cars.lazy().last().collect()
    last_index = len(fruits_cars) - 1
    expected = fruits_cars[last_index:, :]
    assert result.frame_equal(expected)
Пример #24
0
def test_head_tail():
    """``head`` / ``tail`` return the requested rows and clamp when out of bounds."""
    df = DataFrame({"a": range(10), "b": range(10)})

    for take in (df.head, df.tail):
        assert take(5).height == 5

    top, bottom = df.head(5), df.tail(5)
    assert not top.frame_equal(bottom)

    # check if it doesn't fail when out of bounds
    for take in (df.head, df.tail):
        assert take(100).height == 10
Пример #25
0
def test_is_between(fruits_cars: pl.DataFrame) -> None:
    """``is_between(2, 4, ...)`` on column "A" for each bounds-inclusion form."""
    # (arguments passed to ``is_between``, expected boolean column)
    cases = (
        ((2, 4), [False, False, True, False, False]),
        ((2, 4, False), [False, False, True, False, False]),
        ((2, 4, [False, False]), [False, False, True, False, False]),
        ((2, 4, True), [False, True, True, True, False]),
        ((2, 4, [True, True]), [False, True, True, True, False]),
        ((2, 4, [False, True]), [False, False, True, True, False]),
        ((2, 4, [True, False]), [False, True, True, False, False]),
    )
    for args, expected in cases:
        selected = fruits_cars.select(pl.col("A").is_between(*args))
        result = selected["is_between"]
        assert result.series_equal(  # type: ignore[arg-type]
            pl.Series("is_between", expected)
        )
Пример #26
0
def test_set():
    """Smoke test: column assignment and masked assignment run without raising."""
    np.random.seed(1)
    # Draw "foo" first so the random stream is consumed in a fixed order.
    foo_values = np.random.rand(10)
    data = {"foo": foo_values, "bar": np.arange(10), "ham": ["h"] * 10}
    df = DataFrame(data)

    df["new"] = np.random.rand(10)
    above_half = df["new"] > 0.5
    df[above_half, "new"] = 1
Пример #27
0
def test_selection():
    """Check column dtypes and the supported ``DataFrame`` indexing forms."""
    data = {"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]}
    df = DataFrame(data)

    # One dtype check per column; dtypes are compared against their string names.
    for name, dtype in (("a", "i64"), ("b", "f64"), ("c", "str")):
        assert df[name].dtype == dtype

    # First-dimension selection: a list of column names or a boolean row mask.
    by_names = df[["a", "b"]]
    assert by_names.columns == ["a", "b"]
    by_mask = df[[True, False, True]]
    assert by_mask.height == 2

    # Two-dimensional (rows, columns) selection; only the shape is checked.
    # ``df[rows, cols]`` is the same subscript as ``df[(rows, cols)]``.
    shape_cases = (
        (([True, False, True], "b"), (2, 1)),
        (([True, False, False], ["a", "b"]), (1, 2)),
        (([0, 1], "b"), (2, 1)),
        (([2], ["a", "b"]), (1, 2)),
    )
    for key, expected_shape in shape_cases:
        assert df[key].shape == expected_shape

    first_column = df.select_idx(0)
    assert first_column.name == "a"

    # Comparing two Series and summing the result counts the matching rows.
    same_column = df.a == df["a"]
    assert same_column.sum() == 3
    other_column = df.c == df["a"]
    assert other_column.sum() == 0
Пример #28
0
def test_file_buffer():
    """Read CSV from a bytes buffer, then check Parquet rejects the same bytes."""
    # Constructing the buffer from bytes leaves the position at the start.
    f = BytesIO(b"1,2,3,4,5,6\n7,8,9,10,11,12")
    df = DataFrame.read_csv(f, has_headers=False)
    assert df.shape == (2, 6)

    f.seek(0)
    # check if not fails on TryClone and Length impl in file.rs
    with pytest.raises(RuntimeError) as exc_info:
        df.read_parquet(f)
    message = str(exc_info.value)
    assert "Invalid Parquet file" in message
Пример #29
0
def test_shift(fruits_cars: pl.DataFrame) -> None:
    """Check ``shift`` on an expression and on a lazy frame.

    A positive shift moves values down and fills the leading rows with nulls;
    a negative shift leaves the trailing rows null.
    """
    df = pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 3, 4, 5]})
    out = df.select(col("a").shift(1))
    assert out["a"].series_equal(pl.Series("a", [None, 1, 2, 3, 4]),
                                 null_equal=True)

    res = fruits_cars.lazy().shift(2).collect()

    expected = pl.DataFrame({
        "A": [None, None, 1, 2, 3],
        "fruits": [None, None, "banana", "banana", "apple"],
        "B": [None, None, 5, 4, 3],
        "cars": [None, None, "beetle", "audi", "beetle"],
    })
    # The comparison result must be asserted; a bare ``frame_equal`` call
    # discards its return value and checks nothing.
    assert res.frame_equal(expected, null_equal=True)

    # negative value
    res = fruits_cars.lazy().shift(-2).collect()
    # Shifting up by two leaves the last two of the five rows null everywhere.
    for rows in [3, 4]:
        for cols in range(4):
            assert res[rows, cols] is None
Пример #30
0
def test_drop():
    """``drop`` returns a frame without the column; ``drop_in_place`` returns it."""

    def fresh_frame():
        # Build a new frame each time so the two checks stay independent.
        return DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"], "c": [1, 2, 3]})

    without_a = fresh_frame().drop("a")
    assert without_a.shape == (3, 2)

    removed = fresh_frame().drop_in_place("a")
    assert removed.name == "a"