示例#1
0
def test_init_series() -> None:
    """Check DataFrame construction from Series inputs.

    Covers a list of Series, a tuple of Series, unnamed Series and a single
    Series, each with and without name/dtype overrides via ``columns``.
    """
    expected = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

    # Named Series supplied as a list
    from_list = pl.DataFrame(
        [pl.Series("a", [1, 2, 3]), pl.Series("b", [4, 5, 6])]
    )
    assert from_list.frame_equal(expected)

    # Named Series supplied as a tuple
    from_tuple = pl.DataFrame(
        (pl.Series("a", (1, 2, 3)), pl.Series("b", (4, 5, 6)))
    )
    assert from_tuple.frame_equal(expected)

    # Tuple of Series with names and dtypes overridden through `columns`
    overridden = pl.DataFrame(
        (pl.Series("a", (1, 2, 3)), pl.Series("b", (4, 5, 6))),
        columns=[("x", pl.Float64), ("y", pl.Float64)],
    )
    assert overridden.schema == {"x": pl.Float64, "y": pl.Float64}
    assert overridden.rows() == [(1.0, 4.0), (2.0, 5.0), (3.0, 6.0)]

    # Unnamed Series are expected to get positional default names
    unnamed = pl.DataFrame([pl.Series([1, 2, 3]), pl.Series([4, 5, 6])])
    expected_unnamed = pl.DataFrame(
        [
            pl.Series("column_0", [1, 2, 3]),
            pl.Series("column_1", [4, 5, 6]),
        ]
    )
    assert unnamed.frame_equal(expected_unnamed)

    unnamed_floats = pl.DataFrame([pl.Series([0.0]), pl.Series([1.0])])
    assert unnamed_floats.schema == {
        "column_0": pl.Float64,
        "column_1": pl.Float64,
    }
    assert unnamed_floats.rows() == [(0.0, 1.0)]

    # Unnamed Series with both names and dtypes supplied by `columns`
    retyped = pl.DataFrame(
        [pl.Series([None]), pl.Series([1.0])],
        columns=[("x", pl.Date), ("y", pl.Boolean)],
    )
    assert retyped.schema == {"x": pl.Date, "y": pl.Boolean}
    assert retyped.rows() == [(None, True)]

    # Single Series
    single = pl.DataFrame(pl.Series("a", [1, 2, 3]))
    assert single.schema == {"a": pl.Int64}
    assert single.frame_equal(pl.DataFrame({"a": [1, 2, 3]}))

    single_uint = pl.DataFrame(
        pl.Series("a", [1, 2, 3]),
        columns=[("a", pl.UInt32)],
    )
    assert single_uint.rows() == [(1,), (2,), (3,)]
    assert single_uint.schema == {"a": pl.UInt32}
示例#2
0
def test_sort_by_exprs() -> None:
    """Sorting by an expression must not overwrite the frame's own columns."""
    frame = pl.DataFrame({"a": [1, 2, -1, -2]})

    # Rows are ordered by abs(a); the stored values must keep their sign.
    sorted_frame = frame.sort(pl.col("a").abs())
    values = sorted_frame.to_series().to_list()

    assert values == [1, -1, 2, -2]
示例#3
0
def test_init_dict() -> None:
    """Check DataFrame construction from dict input.

    Covers empty input, nested empty values, mixed dtypes, date/datetime
    columns, and renaming/retyping of columns through ``columns``.
    """
    # Empty dictionary
    empty = pl.DataFrame({})
    assert empty.shape == (0, 0)

    # Empty dictionary/values
    empty_cols = pl.DataFrame({"a": [], "b": []})
    assert empty_cols.shape == (0, 2)
    assert empty_cols.schema == {"a": pl.Float32, "b": pl.Float32}

    # Empty data combined with an explicit name -> dtype mapping
    typed_empties = (
        pl.DataFrame({}, columns={"a": pl.Date, "b": pl.Utf8}),
        pl.DataFrame({"a": [], "b": []}, columns={"a": pl.Date, "b": pl.Utf8}),
    )
    for typed_empty in typed_empties:
        assert typed_empty.shape == (0, 2)
        assert typed_empty.schema == {"a": pl.Date, "b": pl.Utf8}

    # List of empty list/tuple
    nested_empty = pl.DataFrame({"a": [[]], "b": [()]})
    assert nested_empty.schema == {
        "a": pl.List(pl.Float64),
        "b": pl.List(pl.Float64),
    }
    assert nested_empty.rows() == [([], [])]

    # Mixed dtypes
    mixed = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    assert mixed.shape == (3, 2)
    assert mixed.columns == ["a", "b"]
    assert mixed.dtypes == [pl.Int64, pl.Float64]

    mixed_retyped = pl.DataFrame(
        data={"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]},
        columns=[("a", pl.Int8), ("b", pl.Float32)],
    )
    assert mixed_retyped.schema == {"a": pl.Int8, "b": pl.Float32}

    # Values contained in tuples
    from_tuples = pl.DataFrame({"a": (1, 2, 3), "b": [1.0, 2.0, 3.0]})
    assert from_tuples.shape == (3, 2)

    # Datetime/Date types (from both python and integer values)
    py_dates = (date(2022, 12, 31), date(2022, 12, 31))
    py_datetimes = (
        datetime(2022, 12, 31, 23, 59, 59),
        datetime(2022, 12, 31, 23, 59, 59),
    )
    int_dates = [19357, 19357]
    int_datetimes = [1672531199000000, 1672531199000000]

    # test inferred and explicit (given both py/polars dtypes)
    temporal_cases = (
        (py_dates, py_datetimes, None),
        (py_dates, py_datetimes, [("dt", date), ("dtm", datetime)]),
        (py_dates, py_datetimes, [("dt", pl.Date), ("dtm", pl.Datetime)]),
        (int_dates, int_datetimes, [("dt", date), ("dtm", datetime)]),
        (int_dates, int_datetimes, [("dt", pl.Date), ("dtm", pl.Datetime)]),
    )
    # Every case must produce the python date/datetime values as rows.
    expected_rows = list(zip(py_dates, py_datetimes))
    for dates, datetimes, coldefs in temporal_cases:
        temporal = pl.DataFrame(
            data={"dt": dates, "dtm": datetimes},
            columns=coldefs,
        )
        assert temporal.schema == {"dt": pl.Date, "dtm": pl.Datetime}
        assert temporal.rows() == expected_rows

    # Overriding dict column names/types
    renamed = pl.DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6]},
        columns=["c", "d"],
    )
    assert renamed.columns == ["c", "d"]

    # partial type info (allowed, but mypy doesn't like it ;p)
    partly_typed = pl.DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6]},
        columns=["c", ("d", pl.Int8)],  # type: ignore[arg-type]
    )
    assert partly_typed.schema == {"c": pl.Int64, "d": pl.Int8}

    fully_typed = pl.DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6]},
        columns=[("c", pl.Int8), ("d", pl.Int16)],
    )
    assert fully_typed.schema == {"c": pl.Int8, "d": pl.Int16}

    # A cleared frame keeps the schema but holds no rows.
    cleared = fully_typed.cleared()
    assert fully_typed.schema == cleared.schema
    assert len(cleared) == 0
示例#4
0
def test_init_ndarray_deprecated() -> None:
    """A 2D ndarray passed without ``orient`` warns and is read row by row."""
    with pytest.deprecated_call():
        # 2D array - default to row orientation
        matrix = np.array([[1, 2], [3, 4]])
        from_matrix = pl.DataFrame(matrix)
        expected = pl.DataFrame({"column_0": [1, 3], "column_1": [2, 4]})
        assert from_matrix.frame_equal(expected)
示例#5
0
def test_init_ndarray() -> None:
    """Check DataFrame construction from numpy arrays and row-oriented lists.

    Also covers the error paths: column/orientation mismatch, 3D input, an
    invalid ``orient`` value, and numpy being flagged as unavailable.
    """
    # Empty array
    assert pl.DataFrame(np.array([])).frame_equal(pl.DataFrame())

    # 1D array
    named = pl.DataFrame(np.array([1, 2, 3]), columns=["a"])
    assert named.frame_equal(pl.DataFrame({"a": [1, 2, 3]}))

    typed = pl.DataFrame(np.array([1, 2, 3]), columns=[("a", pl.Int32)])
    expected_typed = pl.DataFrame({"a": [1, 2, 3]}).with_column(
        pl.col("a").cast(pl.Int32)
    )
    assert typed.frame_equal(expected_typed)

    # 2D array - default to column orientation
    by_column = pl.DataFrame(np.array([[1, 2], [3, 4]]), orient="col")
    expected_by_column = pl.DataFrame({"column_0": [1, 2], "column_1": [3, 4]})
    assert by_column.frame_equal(expected_by_column)

    # Row-oriented nested lists, including an all-null row
    by_row = pl.DataFrame([[1, 2.0, "a"], [None, None, None]], orient="row")
    expected_by_row = pl.DataFrame(
        {
            "column_0": [1, None],
            "column_1": [2.0, None],
            "column_2": ["a", None],
        }
    )
    assert by_row.frame_equal(expected_by_row)

    by_row_typed = pl.DataFrame(
        data=[[1, 2.0, "a"], [None, None, None]],
        columns=[("x", pl.Boolean), ("y", pl.Int32), "z"],  # type: ignore[arg-type]
        orient="row",
    )
    assert by_row_typed.rows() == [(True, 2, "a"), (None, None, None)]
    assert by_row_typed.schema == {
        "x": pl.Boolean,
        "y": pl.Int32,
        "z": pl.Utf8,
    }

    # TODO: Uncomment tests below when removing deprecation warning
    # # 2D array - default to column orientation
    # df = pl.DataFrame(np.array([[1, 2], [3, 4]]))
    # truth = pl.DataFrame({"column_0": [1, 2], "column_1": [3, 4]})
    # assert df.frame_equal(truth)

    # # 2D array - row orientation inferred
    # df = pl.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]), columns=["a", "b", "c"])
    # truth = pl.DataFrame({"a": [1, 4], "b": [2, 5], "c": [3, 6]})
    # assert df.frame_equal(truth)

    # # 2D array - column orientation inferred
    # df = pl.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]), columns=["a", "b"])
    # truth = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    # assert df.frame_equal(truth)

    # 2D array - orientation conflicts with columns
    with pytest.raises(ValueError):
        pl.DataFrame(
            np.array([[1, 2, 3], [4, 5, 6]]),
            columns=["a", "b"],
            orient="row",
        )
    with pytest.raises(ValueError):
        pl.DataFrame(
            np.array([[1, 2, 3], [4, 5, 6]]),
            columns=[("a", pl.UInt32), ("b", pl.UInt32)],
            orient="row",
        )

    # 3D array
    with pytest.raises(ValueError):
        pl.DataFrame(np.random.randn(2, 2, 2))

    # Wrong orient value
    with pytest.raises(ValueError):
        pl.DataFrame(
            np.array([[1, 2, 3], [4, 5, 6]]),
            orient="wrong",  # type: ignore[arg-type]
        )

    # numpy not available
    with patch("polars.internals.frame._NUMPY_AVAILABLE", False), pytest.raises(
        ValueError
    ):
        pl.DataFrame(np.array([1, 2, 3]), columns=["a"])
示例#6
0
import polars as pl
from polars.lazy import col

# Sample frame: ISO-formatted date strings next to an integer index column.
df = pl.DataFrame({
    "date": ["2020-01-02", "2020-01-03", "2020-01-04"],
    "index": [1, 2, 3],
})

# Lazy query that parses the "date" strings with the %Y-%m-%d format into
# Date32 values; nothing is computed until `.collect()` is called.
parsed = df.lazy().with_column(
    col("date").str_parse_date(pl.datatypes.Date32, "%Y-%m-%d"))

if __name__ == "__main__":
    # str(DataFrame) may contain non-ASCII table-border characters, so the
    # encoding is pinned to UTF-8 instead of relying on the platform's locale
    # default (which can fail to encode them, e.g. cp1252 on Windows).
    with open("book/src/outputs/how_can_i_parse_dates_0.txt", "w",
              encoding="utf-8") as f:
        f.write(str(df))
    with open("book/src/outputs/how_can_i_parse_dates_1.txt", "w",
              encoding="utf-8") as f:
        f.write(str(parsed.collect()))
示例#7
0
def test_arg_sort() -> None:
    """arg_sort on "a" must yield the row indices that sort the column."""
    df = pl.DataFrame({"a": [4, 1, 3]})
    indices = df[col("a").arg_sort()]["a"]
    # Compare as a plain list: `Series == list` is an element-wise comparison
    # that yields a Series, so asserting on it directly does not verify the
    # individual values.
    assert indices.to_list() == [1, 2, 0]
示例#8
0
def test_arange():
    """Filter a lazy frame by comparing column "a" against arange(0, 3)."""
    lazy_frame = pl.DataFrame({"a": [1, 1, 1]}).lazy()
    # Only two of the three rows are expected to satisfy the predicate.
    predicate = pl.lazy.col("a") >= pl.lazy.arange(0, 3)
    filtered = lazy_frame.filter(predicate).collect()
    assert filtered.frame_equal(pl.DataFrame({"a": [1, 1]}))
示例#9
0
def test_groupby():
    # Runs a lazy groupby on "groups" and aggregates the mean of "a"; the
    # rows labelled "a" include a null value.
    df = pl.DataFrame({
        "a": [1.0, None, 3.0, 4.0],
        "groups": ["a", "a", "b", "b"]
    })
    out = df.lazy().groupby("groups").agg(pl.mean("a")).collect()
    # NOTE(review): no assertion on `out` is visible here — confirm whether
    # the test continues past this point or is only meant as a smoke test.