示例#1
0
def test_downsample():
    """Downsample a 20-row Date64 frame in 5-minute buckets and check shapes."""
    # 20 integers spaced 60000 apart (presumably epoch milliseconds, i.e. one
    # value per minute), cast to Date64.
    first_stamp = 946684800000
    spacing = 60000
    stamps = [first_stamp + spacing * offset for offset in range(20)]
    s = pl.Series("datetime", stamps).cast(Date64)
    df = pl.DataFrame({"a": s, "b": s.clone()})

    first_per_bucket = df.downsample("a", rule="minute", n=5).first()
    assert first_per_bucket.shape == (4, 2)

    # OLHC
    aggregations = {"b": ["first", "min", "max", "last"]}
    olhc = df.downsample("a", rule="minute", n=5).agg(aggregations)
    assert olhc.shape == (4, 5)

    # test to_pandas as well.
    as_pandas = df.to_pandas()
    assert as_pandas["a"].dtype == "datetime64[ns]"
示例#2
0
def test_shift(fruits_cars: pl.DataFrame) -> None:
    """Check ``shift`` on a single expression and on a whole lazy frame.

    Covers a positive shift (leading nulls) and a negative shift (trailing
    nulls) against the ``fruits_cars`` fixture.
    """
    df = pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 3, 4, 5]})
    out = df.select(col("a").shift(1))
    assert out["a"].series_equal(pl.Series("a", [None, 1, 2, 3, 4]), null_equal=True)

    res = fruits_cars.lazy().shift(2).collect()

    expected = pl.DataFrame(
        {
            "A": [None, None, 1, 2, 3],
            "fruits": [None, None, "banana", "banana", "apple"],
            "B": [None, None, 5, 4, 3],
            "cars": [None, None, "beetle", "audi", "beetle"],
        }
    )
    # The boolean returned by frame_equal must be asserted; a bare call
    # discards it and the comparison could never fail the test.
    assert res.frame_equal(expected, null_equal=True)

    # negative value
    res = fruits_cars.lazy().shift(-2).collect()
    for rows in [3, 4]:
        for cols in range(4):
            assert res[rows, cols] is None
示例#3
0
def test_datetime_strptime_patterns() -> None:
    """Parse date strings non-strictly without a format; expect one null, in row 0."""
    # note that all should be year first
    raw_values = [
        # NOTE(review): the original spelled this entry as two adjacent
        # literals ("09-05-2019" "2018-09-05") with no comma between them, so
        # Python joined them into this single string. Kept as-is to preserve
        # behavior; confirm whether two separate entries were intended.
        "09-05-20192018-09-05",
        "2018-09-05T04:05:01",
        "2018-09-05T04:24:01.9",
        "2018-09-05T04:24:02.11",
        "2018-09-05T14:24:02.123",
        "2018-09-05T14:24:02.123Z",
        "2019-04-18T02:45:55.555000000",
        "2019-04-18T22:45:55.555123",
    ]
    df = pl.Series("date", raw_values).to_frame()

    parse_expr = pl.col("date").str.strptime(pl.Datetime, fmt=None, strict=False)
    s = df.with_columns([parse_expr.alias("parsed")])["parsed"]

    assert s.null_count() == 1
    assert s[0] is None
示例#4
0
def _handle_columns_arg(
    data: List["PySeries"],
    columns: Optional[Sequence[str]] = None,
    nullable: bool = True,
) -> List["PySeries"]:
    """
    Rename data according to columns argument.
    """
    if columns is None:
        return data
    else:
        if not data:
            return [
                pl.Series(c, None, nullable=nullable).inner() for c in columns
            ]
        elif len(data) == len(columns):
            for i, c in enumerate(columns):
                data[i].rename(c)
            return data
        else:
            raise ValueError(
                "Dimensions of columns arg must match data dimensions.")
示例#5
0
def test_split_exact() -> None:
    """Check ``str.split_exact`` field values and that it yields a Struct dtype."""
    df = pl.DataFrame({"x": ["a_a", None, "b", "c_c"]})

    # Two splits, separator dropped.
    exclusive = pl.col("x").str.split_exact("_", 2, inclusive=False)
    out = df.select([exclusive]).unnest("x")
    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c"],
            "field_1": ["a", None, None, "c"],
            "field_2": pl.Series([None, None, None, None], dtype=pl.Utf8),
        }
    )
    assert out.frame_equal(expected)

    # One split, separator kept on the leading field.
    inclusive = pl.col("x").str.split_exact("_", 1, inclusive=True)
    out = df.select([inclusive]).unnest("x")
    expected = pl.DataFrame(
        {
            "field_0": ["a_", None, "b", "c_"],
            "field_1": ["a", None, None, "c"],
        }
    )
    assert out.frame_equal(expected)

    x_strings = df["x"].str
    assert x_strings.split_exact("_", 1).dtype == pl.Struct
    assert x_strings.split_exact("_", 1, inclusive=False).dtype == pl.Struct
示例#6
0
def test_true_divide() -> None:
    """Check ``/`` on integer Series and expressions, including reflected division."""
    s = pl.Series("a", [1, 2])
    frame = pl.DataFrame([s])

    testing.assert_series_equal(s / 2, pl.Series("a", [0.5, 1.0]))
    halved_expr = frame.select(pl.col("a") / 2)["a"]
    testing.assert_series_equal(halved_expr, pl.Series("a", [0.5, 1.0]))

    # rtruediv
    reflected = pl.DataFrame([s]).select(2 / pl.col("a"))["literal"]
    testing.assert_series_equal(reflected, pl.Series("literal", [2.0, 1.0]))

    # https://github.com/pola-rs/polars/issues/1369
    vals = [3000000000, 2, 3]
    series_result = pl.Series(vals) / 1
    testing.assert_series_equal(series_result, pl.Series(vals, dtype=Float64))

    frame_result = pl.DataFrame({"a": vals}).select([pl.col("a") / 1])["a"]
    testing.assert_series_equal(frame_result, pl.Series("a", vals, dtype=Float64))
示例#7
0
def test_argsort_nulls() -> None:
    """Check null placement for ``argsort`` and ``DataFrame.sort`` via ``nulls_last``."""
    a = pl.Series("a", [1.0, 2.0, 3.0, None, None])

    assert a.argsort(nulls_last=True).to_list() == [0, 1, 2, 4, 3]
    assert a.argsort(nulls_last=False).to_list() == [3, 4, 0, 1, 2]

    nulls_first = a.to_frame().sort(by="a", nulls_last=False).to_series().to_list()
    assert nulls_first == [None, None, 1.0, 2.0, 3.0]

    nulls_at_end = a.to_frame().sort(by="a", nulls_last=True).to_series().to_list()
    assert nulls_at_end == [1.0, 2.0, 3.0, None, None]

    # Sorting by more than one column together with nulls_last must raise.
    with pytest.raises(ValueError):
        a.to_frame().sort(by=["a", "b"], nulls_last=True)
示例#8
0
def test_dtype() -> None:
    """Check inferred and explicitly declared List dtypes and their inner dtypes."""
    # inferred
    a = pl.Series("a", [[1, 2, 3], [2, 5], [6, 7, 8, 9]])
    assert a.dtype == pl.List
    assert a.inner_dtype == pl.Int64
    assert a.dtype.inner == pl.Int64  # type: ignore[attr-defined]

    # explicit
    ints = [1, 2, 3]
    times = [time(10, 30, 45)]
    dates = [date(2022, 12, 31)]
    datetimes = [datetime(2022, 12, 31, 1, 2, 3)]

    df = pl.DataFrame(
        data={"i": [ints], "tm": [times], "dt": [dates], "dtm": [datetimes]},
        columns=[
            ("i", pl.List(pl.Int8)),
            ("tm", pl.List(pl.Time)),
            ("dt", pl.List(pl.Date)),
            ("dtm", pl.List(pl.Datetime)),
        ],
    )

    expected_schema = {
        "i": pl.List(pl.Int8),
        "tm": pl.List(pl.Time),
        "dt": pl.List(pl.Date),
        "dtm": pl.List(pl.Datetime),
    }
    assert df.schema == expected_schema
    assert df.schema["i"].inner == pl.Int8  # type: ignore[attr-defined]

    # A single row whose cells are the original Python lists.
    assert df.rows() == [(ints, times, dates, datetimes)]
示例#9
0
def test_init_only_columns() -> None:
    """Build empty frames from column specs alone and check shape, dtypes and schema."""
    df = pl.DataFrame(columns=["a", "b", "c"])
    truth = pl.DataFrame({name: [] for name in ("a", "b", "c")})
    assert df.shape == (0, 3)
    assert df.frame_equal(truth, null_equal=True)
    assert df.dtypes == [pl.Float32] * 3

    # Validate construction with various flavours of no/empty data
    no_data: Any
    for no_data in (None, {}, []):
        df = pl.DataFrame(
            data=no_data,
            columns=[  # type: ignore[arg-type]
                ("a", pl.Date),
                ("b", pl.UInt64),
                ("c", pl.datatypes.Int8),
                ("d", pl.List(pl.UInt8)),
            ],
        )

        # Reference frame: three empty columns cast to the target dtypes,
        # then the empty list column inserted as the fourth.
        casts = [
            pl.col("a").cast(pl.Date),
            pl.col("b").cast(pl.UInt64),
            pl.col("c").cast(pl.Int8),
        ]
        truth = pl.DataFrame({"a": [], "b": [], "c": []}).with_columns(casts)
        truth.insert_at_idx(3, pl.Series("d", [], pl.List(pl.UInt8)))

        assert df.shape == (0, 4)
        assert df.frame_equal(truth, null_equal=True)
        assert df.dtypes == [pl.Date, pl.UInt64, pl.Int8, pl.List]
        assert df.schema["d"].inner == pl.UInt8  # type: ignore[attr-defined]

        dfe = df.cleared()
        assert (df.schema == dfe.schema) and (dfe.shape == df.shape)
示例#10
0
def test_is_between(fruits_cars: pl.DataFrame) -> None:
    """Check ``is_between(2, 4, ...)`` on column "A" for each bound-inclusion setting."""
    # (positional arguments to is_between, expected boolean mask)
    cases = [
        ((2, 4), [False, False, True, False, False]),
        ((2, 4, False), [False, False, True, False, False]),
        ((2, 4, [False, False]), [False, False, True, False, False]),
        ((2, 4, True), [False, True, True, True, False]),
        ((2, 4, [True, True]), [False, True, True, True, False]),
        ((2, 4, [False, True]), [False, False, True, True, False]),
        ((2, 4, [True, False]), [False, True, True, False, False]),
    ]
    for args, mask in cases:
        result = fruits_cars.select(pl.col("A").is_between(*args))["is_between"]  # type: ignore
        assert result.series_equal(pl.Series("is_between", mask))
示例#11
0
def test_take(fruits_cars: pl.DataFrame) -> None:
    """Check ``take`` inside a window over "fruits" with list, Series, array and literal indices."""
    df = fruits_cars

    # out of bounds error
    with pytest.raises(pl.ComputeError):
        out_of_bounds = col("B").reverse().take([1, 2]).list().over("fruits")
        df.sort("fruits").select([out_of_bounds, "fruits"])

    index_variants = [[0, 1], pl.Series([0, 1]), np.array([0, 1])]
    for index in index_variants:
        taken = col("B").reverse().take(index).list().over("fruits")  # type: ignore
        out = df.sort("fruits").select([taken, "fruits"])

        assert out[0, "B"] == [2, 3]
        assert out[4, "B"] == [1, 4]

    # A literal index is used here; the cells are compared to plain integers.
    single = col("B").reverse().take(pl.lit(1)).list().over("fruits")
    out = df.sort("fruits").select([single, "fruits"])
    assert out[0, "B"] == 3
    assert out[4, "B"] == 4
示例#12
0
def test_dt_datetimes() -> None:
    """Check time-component and epoch accessors in the ``dt`` namespace on two datetimes."""
    raw = pl.Series(["2020-01-01 00:00:00", "2020-02-02 03:20:10"])
    s = raw.str.strptime(pl.Datetime, fmt="%Y-%m-%d %H:%M:%S")

    # hours, minutes, seconds and nanoseconds
    component_cases = [
        ("dt.hour", [0, 3]),
        ("dt.minute", [0, 20]),
        ("dt.second", [0, 10]),
        ("dt.nanosecond", [0, 0]),
    ]
    for accessor, values in component_cases:
        verify_series_and_expr_api(s, pl.Series("", values, dtype=UInt32), accessor)

    # epoch methods
    epoch_cases = [
        ("dt.epoch_days", [18262, 18294], Int32),
        ("dt.epoch_seconds", [1_577_836_800, 1_580_613_610], Int64),
        ("dt.epoch_milliseconds", [1_577_836_800_000, 1_580_613_610_000], Int64),
    ]
    for accessor, values, dtype in epoch_cases:
        verify_series_and_expr_api(s, pl.Series("", values, dtype=dtype), accessor)
示例#13
0
def test_agg_logical() -> None:
    """``max``/``min`` on a Series of dates return the original ``date`` objects."""
    earlier, later = date(2001, 1, 1), date(2002, 1, 1)
    s = pl.Series([earlier, later])
    assert s.max() == later
    assert s.min() == earlier
示例#14
0
def test_asof_join() -> None:
    """Check ``join_asof`` between trades and quotes for strategy, ``by`` and ``tolerance``."""
    fmt = "%F %T%.3f"
    # Every timestamp shares this prefix; only the millisecond part varies.
    prefix = "2016-05-25 13:30:00."

    quote_millis = ["023", "023", "030", "041", "048", "049", "072", "075"]
    quote_dates = [prefix + ms for ms in quote_millis]
    quote_tickers = ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"]
    quotes = pl.DataFrame(
        {
            "dates": pl.Series(quote_dates).str.strptime(pl.Datetime, fmt=fmt),
            "ticker": quote_tickers,
            "bid": [720.5, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
        }
    )

    trade_millis = ["023", "038", "048", "048", "048"]
    trade_dates = [prefix + ms for ms in trade_millis]
    trade_tickers = ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"]
    trades = pl.DataFrame(
        {
            "dates": pl.Series(trade_dates).str.strptime(pl.Datetime, fmt=fmt),
            "ticker": trade_tickers,
            "bid": [51.95, 51.95, 720.77, 720.92, 98.0],
        }
    )

    # Plain backward join: column layout and the left-hand timestamps.
    out = trades.join_asof(quotes, on="dates", strategy="backward")
    assert out.columns == ["dates", "ticker", "bid", "ticker_right", "bid_right"]
    scaled_dates = (out["dates"].cast(int) / 1000).to_list()
    assert scaled_dates == [
        1464183000023,
        1464183000038,
        1464183000048,
        1464183000048,
        1464183000048,
    ]

    forward = trades.join_asof(quotes, on="dates", strategy="forward")
    assert forward["bid_right"].to_list() == [720.5, 51.99, 720.5, 720.5, 720.5]

    # Joins grouped by ticker, in both directions.
    out = trades.join_asof(quotes, on="dates", by="ticker")
    assert out["bid_right"].to_list() == [51.95, 51.97, 720.5, 720.5, None]

    out = quotes.join_asof(trades, on="dates", by="ticker")
    assert out["bid_right"].to_list() == [
        None,
        51.95,
        51.95,
        51.95,
        720.92,
        98.0,
        720.92,
        51.95,
    ]

    # Joins limited by a 5ms tolerance.
    backward_tol = quotes.join_asof(
        trades, on="dates", strategy="backward", tolerance="5ms"
    )
    assert backward_tol["bid_right"].to_list() == [
        51.95, 51.95, None, 51.95, 98.0, 98.0, None, None
    ]

    forward_tol = quotes.join_asof(
        trades, on="dates", strategy="forward", tolerance="5ms"
    )
    assert forward_tol["bid_right"].to_list() == [
        51.95, 51.95, None, None, 720.77, None, None, None
    ]
示例#15
0
def test_series_add_datetime() -> None:
    """Adding a Series of timedeltas to a one-element datetime Series gives the shifted datetimes."""
    deltas = pl.Series([timedelta(10_000), timedelta(20_000), timedelta(30_000)])
    out = pl.Series(
        [datetime(2027, 5, 19), datetime(2054, 10, 4), datetime(2082, 2, 19)]
    )
    result = deltas + pl.Series([datetime(2000, 1, 1)])
    # ``==`` between Series is element-wise and yields a boolean Series;
    # asserting on that Series directly does not check the values, so reduce
    # it with ``all()`` first.
    assert (result == out).all()
示例#16
0
def test_strptime_dates_datetimes() -> None:
    """A date-only and a date-time string both parse to ``datetime`` values."""
    s = pl.Series("date", ["2021-04-22", "2022-01-04 00:00:00"])
    parsed = s.str.strptime(pl.Datetime).to_list()
    expected = [datetime(2021, 4, 22, 0, 0), datetime(2022, 1, 4, 0, 0)]
    assert parsed == expected
示例#17
0
def test_to_arrow() -> None:
    """A Date Series converts to an Arrow array of type ``date32``."""
    raw = pl.Series("dates", ["2022-01-16", "2022-01-17"])
    date_series = raw.str.strptime(pl.Date, "%Y-%m-%d")
    arrow_array = date_series.to_arrow()
    assert arrow_array.type == pa.date32()
示例#18
0
def test_from_numpy() -> None:
    """Build a Series from a ``datetime64[s]`` numpy array and check length and first value."""
    # numpy support is limited; will be stored as object
    seconds = range(100_000, 200_000, 10_000)
    x = np.asarray(seconds, dtype="datetime64[s]")
    s = pl.Series(x)
    assert s[0] == x[0]
    assert len(s) == 10
示例#19
0
def test_add_eager_column():
    """An eager Series wrapped in ``pl.lit`` can be added as a column to a lazy frame."""
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    eager_column = pl.Series("c", [1, 2, 3])
    out = df.lazy().with_column(pl.lit(eager_column)).collect()
    assert out["c"].sum() == 6
示例#20
0
def test_to_pandas_series() -> None:
    """``Series.to_pandas`` yields values equal element-wise to the matching pandas Series."""
    converted = pl.Series("a", [1, 2, 3]).to_pandas()
    reference = pd.Series([1, 2, 3])
    assert (converted == reference).all()
示例#21
0
def test_compare_series_value_mismatch() -> None:
    """``assert_series_equal`` reports a value mismatch for Series with different values."""
    left = pl.Series([1, 2, 3])
    right = pl.Series([2, 3, 4])
    expected_message = "Series are different\n\nValue mismatch"
    with pytest.raises(AssertionError, match=expected_message):
        pl.testing.assert_series_equal(left, right)
示例#22
0
def test_compare_series_nulls_are_equal() -> None:
    """``assert_series_equal`` accepts two Series with nulls in the same positions."""
    values = [1, 2, None]
    # Two independently built Series from equal data.
    left, right = pl.Series(list(values)), pl.Series(list(values))
    pl.testing.assert_series_equal(left, right)
示例#23
0
def test_compare_series_shape_mismatch() -> None:
    """``assert_series_equal`` reports a shape mismatch for Series of different lengths."""
    longer = pl.Series(values=[1, 2, 3, 4], name="srs1")
    shorter = pl.Series(values=[1, 2, 3], name="srs2")
    expected_message = "Series are different\n\nShape mismatch"
    with pytest.raises(AssertionError, match=expected_message):
        pl.testing.assert_series_equal(longer, shorter)
示例#24
0
def test_estimated_size() -> None:
    """A Series and its single-column frame report the same estimated size."""
    a = pl.Series([1, 2, 3])
    series_size = a.estimated_size()
    frame_size = a.to_frame().estimated_size()
    assert series_size == frame_size
示例#25
0
def test_assert_frame_equal_types() -> None:
    """``assert_frame_equal`` raises when given a DataFrame and a Series."""
    frame = pl.DataFrame({"a": [1, 2]})
    series = pl.Series(values=[1, 2], name="a")
    with pytest.raises(AssertionError):
        pl.testing.assert_frame_equal(frame, series)  # type: ignore
示例#26
0
def test_time_zero_3828() -> None:
    """A midnight ``time`` value round-trips through a Time Series."""
    midnight = [time(0)]
    round_tripped = pl.Series(values=midnight, dtype=pl.Time).to_list()
    assert round_tripped == [time(0)]
示例#27
0
def test_respect_dtype_with_series_from_numpy() -> None:
    """An explicit ``dtype`` is kept when a Series is built from a numpy array."""
    source = np.array([1, 2, 3])
    series = pl.Series("foo", source, dtype=pl.UInt32)
    assert series.dtype == pl.UInt32
示例#28
0
def test_time_microseconds_3843() -> None:
    """A ``time`` with a microsecond component round-trips through a Series."""
    in_val = [time(0, 9, 11, 558332)]
    round_tripped = pl.Series(in_val).to_list()
    assert round_tripped == in_val
示例#29
0
def test_rolling_apply():
    """``rolling_apply`` with window 3 gives nulls for the first two rows, then the window's std."""
    s = pl.Series("A", [1.0, 2.0, 9.0, 2.0, 13.0])
    out = s.rolling_apply(window_size=3, function=lambda window: window.std())
    # The first two positions are expected to be null.
    for position in (0, 1):
        assert out[position] is None
    assert out[2] == 4.358898943540674
示例#30
0
def test_year_empty_df() -> None:
    """``dt.year`` on an empty Date column still produces an Int32 column."""
    empty_dates = pl.Series(name="date", dtype=pl.Date)
    df = pl.DataFrame(empty_dates)
    year_frame = df.select(pl.col("date").dt.year())
    assert year_frame.dtypes == [pl.Int32]