def test_downsample():
    """Downsample a 20-row frame in 5-"minute" steps and check pandas export."""
    # Twenty integer timestamps spaced 60_000 apart, cast to Date64.
    first_stamp = 946684800000
    spacing = 60_000
    stamps = pl.Series(
        "datetime",
        [first_stamp + spacing * k for k in range(20)],
    ).cast(Date64)
    stamps_copy = stamps.clone()
    frame = pl.DataFrame({"a": stamps, "b": stamps_copy})

    firsts = frame.downsample("a", rule="minute", n=5).first()
    assert firsts.shape == (4, 2)

    # OLHC
    aggregations = {"b": ["first", "min", "max", "last"]}
    olhc = frame.downsample("a", rule="minute", n=5).agg(aggregations)
    assert olhc.shape == (4, 5)

    # test to_pandas as well.
    as_pandas = frame.to_pandas()
    assert as_pandas["a"].dtype == "datetime64[ns]"
def test_shift(fruits_cars: pl.DataFrame) -> None:
    """Check ``shift`` on a single expression and on a whole lazy frame.

    Covers a positive shift (leading nulls) and a negative shift
    (trailing nulls).
    """
    df = pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 3, 4, 5]})
    out = df.select(col("a").shift(1))
    assert out["a"].series_equal(pl.Series("a", [None, 1, 2, 3, 4]), null_equal=True)

    # positive value: the first two rows of every column become null
    res = fruits_cars.lazy().shift(2).collect()
    expected = pl.DataFrame(
        {
            "A": [None, None, 1, 2, 3],
            "fruits": [None, None, "banana", "banana", "apple"],
            "B": [None, None, 5, 4, 3],
            "cars": [None, None, "beetle", "audi", "beetle"],
        }
    )
    # frame_equal only returns a bool; without the assert the comparison
    # result is thrown away and the check can never fail.
    assert res.frame_equal(expected, null_equal=True)

    # negative value: the last two rows of every column become null
    res = fruits_cars.lazy().shift(-2).collect()
    for rows in [3, 4]:
        for cols in range(4):
            assert res[rows, cols] is None
def test_datetime_strptime_patterns() -> None:
    """Parse mixed-precision datetime strings without an explicit format."""
    # note that all should be year first
    raw_values = [
        # NOTE(review): there is no comma between the next two literals, so
        # Python concatenates them into the single entry
        # "09-05-20192018-09-05". The assertions below (exactly one null, at
        # index 0) appear to rely on that; confirm the intent before
        # splitting them into two entries.
        "09-05-2019" "2018-09-05",
        "2018-09-05T04:05:01",
        "2018-09-05T04:24:01.9",
        "2018-09-05T04:24:02.11",
        "2018-09-05T14:24:02.123",
        "2018-09-05T14:24:02.123Z",
        "2019-04-18T02:45:55.555000000",
        "2019-04-18T22:45:55.555123",
    ]
    frame = pl.Series("date", raw_values).to_frame()

    parse_expr = (
        pl.col("date")
        .str.strptime(pl.Datetime, fmt=None, strict=False)
        .alias("parsed")
    )
    parsed = frame.with_columns([parse_expr])["parsed"]

    assert parsed.null_count() == 1
    assert parsed[0] is None
def _handle_columns_arg(
    data: List["PySeries"],
    columns: Optional[Sequence[str]] = None,
    nullable: bool = True,
) -> List["PySeries"]:
    """
    Apply the names given in ``columns`` to the series in ``data``.

    * ``columns is None``: ``data`` is returned untouched.
    * ``data`` is empty: one new series is built per entry in ``columns``
      (``nullable`` is forwarded to the ``pl.Series`` constructor).
    * lengths match: each series is renamed in place and ``data`` is returned.
    * otherwise: ``ValueError`` is raised.
    """
    # Nothing to rename.
    if columns is None:
        return data

    # No data supplied: create one series per requested name.
    if not data:
        return [pl.Series(name, None, nullable=nullable).inner() for name in columns]

    if len(data) != len(columns):
        raise ValueError("Dimensions of columns arg must match data dimensions.")

    # Renaming mutates the objects held in the caller's list.
    for series, name in zip(data, columns):
        series.rename(name)
    return data
def test_split_exact() -> None:
    """``str.split_exact`` returns a struct that unnests into ``field_i`` columns."""
    df = pl.DataFrame({"x": ["a_a", None, "b", "c_c"]})

    # Separator dropped, two splits requested -> three fields.
    exclusive_expr = pl.col("x").str.split_exact("_", 2, inclusive=False)
    unnested = df.select([exclusive_expr]).unnest("x")
    want = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c"],
            "field_1": ["a", None, None, "c"],
            "field_2": pl.Series([None, None, None, None], dtype=pl.Utf8),
        }
    )
    assert unnested.frame_equal(want)

    # Separator kept, one split requested -> two fields.
    inclusive_expr = pl.col("x").str.split_exact("_", 1, inclusive=True)
    unnested = df.select([inclusive_expr]).unnest("x")
    want = pl.DataFrame(
        {
            "field_0": ["a_", None, "b", "c_"],
            "field_1": ["a", None, None, "c"],
        }
    )
    assert unnested.frame_equal(want)

    # The eager Series API reports a Struct dtype as well.
    column = df["x"]
    assert column.str.split_exact("_", 1).dtype == pl.Struct
    assert column.str.split_exact("_", 1, inclusive=False).dtype == pl.Struct
def test_true_divide() -> None:
    """Dividing integer data with ``/`` yields floating-point results."""
    ints = pl.Series("a", [1, 2])

    # eager Series division
    testing.assert_series_equal(ints / 2, pl.Series("a", [0.5, 1.0]))

    # expression division
    via_expr = pl.DataFrame([ints]).select(pl.col("a") / 2)["a"]
    testing.assert_series_equal(via_expr, pl.Series("a", [0.5, 1.0]))

    # rtruediv
    reflected = pl.DataFrame([ints]).select(2 / pl.col("a"))["literal"]
    testing.assert_series_equal(reflected, pl.Series("literal", [2.0, 1.0]))

    # https://github.com/pola-rs/polars/issues/1369
    vals = [3000000000, 2, 3]
    large = pl.Series(vals)
    testing.assert_series_equal(large / 1, pl.Series(vals, dtype=Float64))

    large_via_expr = pl.DataFrame({"a": vals}).select([pl.col("a") / 1])["a"]
    testing.assert_series_equal(large_via_expr, pl.Series("a", vals, dtype=Float64))
def test_argsort_nulls() -> None:
    """``nulls_last`` controls null placement for ``argsort`` and frame ``sort``."""
    values = pl.Series("a", [1.0, 2.0, 3.0, None, None])

    # Series.argsort
    indices_nulls_last = values.argsort(nulls_last=True).to_list()
    assert indices_nulls_last == [0, 1, 2, 4, 3]
    indices_nulls_first = values.argsort(nulls_last=False).to_list()
    assert indices_nulls_first == [3, 4, 0, 1, 2]

    # DataFrame.sort
    sorted_nulls_first = values.to_frame().sort(by="a", nulls_last=False)
    assert sorted_nulls_first.to_series().to_list() == [None, None, 1.0, 2.0, 3.0]
    sorted_nulls_last = values.to_frame().sort(by="a", nulls_last=True)
    assert sorted_nulls_last.to_series().to_list() == [1.0, 2.0, 3.0, None, None]

    # Sorting by several columns together with nulls_last must raise.
    with pytest.raises(ValueError):
        values.to_frame().sort(by=["a", "b"], nulls_last=True)
def test_dtype() -> None:
    """List dtypes expose their inner dtype, whether inferred or declared."""
    # inferred
    nested = pl.Series("a", [[1, 2, 3], [2, 5], [6, 7, 8, 9]])
    assert nested.dtype == pl.List
    assert nested.inner_dtype == pl.Int64
    assert nested.dtype.inner == pl.Int64  # type: ignore[attr-defined]

    # explicit
    one_time = time(10, 30, 45)
    one_date = date(2022, 12, 31)
    one_datetime = datetime(2022, 12, 31, 1, 2, 3)
    df = pl.DataFrame(
        data={
            "i": [[1, 2, 3]],
            "tm": [[one_time]],
            "dt": [[one_date]],
            "dtm": [[one_datetime]],
        },
        columns=[
            ("i", pl.List(pl.Int8)),
            ("tm", pl.List(pl.Time)),
            ("dt", pl.List(pl.Date)),
            ("dtm", pl.List(pl.Datetime)),
        ],
    )

    declared_schema = {
        "i": pl.List(pl.Int8),
        "tm": pl.List(pl.Time),
        "dt": pl.List(pl.Date),
        "dtm": pl.List(pl.Datetime),
    }
    assert df.schema == declared_schema
    assert df.schema["i"].inner == pl.Int8  # type: ignore[attr-defined]

    only_row = ([1, 2, 3], [one_time], [one_date], [one_datetime])
    assert df.rows() == [only_row]
def test_init_only_columns() -> None:
    """Build empty frames from column names / (name, dtype) pairs only."""
    # Names only: three empty Float32 columns.
    df = pl.DataFrame(columns=["a", "b", "c"])
    truth = pl.DataFrame({"a": [], "b": [], "c": []})

    assert df.shape == (0, 3)
    assert df.frame_equal(truth, null_equal=True)
    assert df.dtypes == [pl.Float32, pl.Float32, pl.Float32]

    # Validate construction with various flavours of no/empty data
    no_data: Any
    for no_data in (None, {}, []):
        df = pl.DataFrame(
            data=no_data,
            columns=[  # type: ignore[arg-type]
                ("a", pl.Date),
                ("b", pl.UInt64),
                ("c", pl.datatypes.Int8),
                ("d", pl.List(pl.UInt8)),
            ],
        )

        # Reference frame: cast the three scalar columns, then add the
        # list column in place at position 3.
        casts = [
            pl.col("a").cast(pl.Date),
            pl.col("b").cast(pl.UInt64),
            pl.col("c").cast(pl.Int8),
        ]
        truth = pl.DataFrame({"a": [], "b": [], "c": []}).with_columns(casts)
        truth.insert_at_idx(3, pl.Series("d", [], pl.List(pl.UInt8)))

        assert df.shape == (0, 4)
        assert df.frame_equal(truth, null_equal=True)
        assert df.dtypes == [pl.Date, pl.UInt64, pl.Int8, pl.List]
        assert df.schema["d"].inner == pl.UInt8  # type: ignore[attr-defined]

        # Clearing keeps both schema and shape.
        dfe = df.cleared()
        assert df.schema == dfe.schema
        assert dfe.shape == df.shape
def test_is_between(fruits_cars: pl.DataFrame) -> None:
    """Check ``is_between(2, 4, ...)`` on column "A" for each bound-inclusion form."""
    # (positional arguments for is_between, expected boolean mask)
    cases = [
        ((2, 4), [False, False, True, False, False]),
        ((2, 4, False), [False, False, True, False, False]),
        ((2, 4, [False, False]), [False, False, True, False, False]),
        ((2, 4, True), [False, True, True, True, False]),
        ((2, 4, [True, True]), [False, True, True, True, False]),
        ((2, 4, [False, True]), [False, False, True, True, False]),
        ((2, 4, [True, False]), [False, True, True, False, False]),
    ]
    for between_args, mask in cases:
        selected = fruits_cars.select(pl.col("A").is_between(*between_args))  # type: ignore
        assert selected["is_between"].series_equal(pl.Series("is_between", mask))
def test_take(fruits_cars: pl.DataFrame) -> None:
    """``take`` inside a window over "fruits": bad indices, sequences, literal."""

    def take_reversed_b(index):  # type: ignore
        # Sort by fruit, then take `index` from reversed "B" within each group.
        windowed = col("B").reverse().take(index).list().over("fruits")
        return fruits_cars.sort("fruits").select([windowed, "fruits"])

    # out of bounds error
    with pytest.raises(pl.ComputeError):
        take_reversed_b([1, 2])

    # list, Series and numpy indices must all give the same answer
    for index in [[0, 1], pl.Series([0, 1]), np.array([0, 1])]:
        out = take_reversed_b(index)
        assert out[0, "B"] == [2, 3]
        assert out[4, "B"] == [1, 4]

    # a literal index
    out = take_reversed_b(pl.lit(1))
    assert out[0, "B"] == 3
    assert out[4, "B"] == 4
def test_dt_datetimes() -> None:
    """Run the ``dt`` component and epoch accessors on two parsed datetimes."""
    raw = pl.Series(["2020-01-01 00:00:00", "2020-02-02 03:20:10"])
    s = raw.str.strptime(pl.Datetime, fmt="%Y-%m-%d %H:%M:%S")

    # (accessor name, expected values, expected dtype)
    checks = [
        # hours, minutes, seconds and nanoseconds
        ("dt.hour", [0, 3], UInt32),
        ("dt.minute", [0, 20], UInt32),
        ("dt.second", [0, 10], UInt32),
        ("dt.nanosecond", [0, 0], UInt32),
        # epoch methods
        ("dt.epoch_days", [18262, 18294], Int32),
        ("dt.epoch_seconds", [1_577_836_800, 1_580_613_610], Int64),
        ("dt.epoch_milliseconds", [1_577_836_800_000, 1_580_613_610_000], Int64),
    ]
    for accessor, values, dtype in checks:
        verify_series_and_expr_api(s, pl.Series("", values, dtype=dtype), accessor)
def test_agg_logical() -> None:
    """``min``/``max`` on a date Series return the matching ``date`` objects."""
    earlier = date(2001, 1, 1)
    later = date(2002, 1, 1)
    series = pl.Series([earlier, later])
    assert series.max() == later
    assert series.min() == earlier
def test_asof_join() -> None:
    """Exercise ``join_asof`` strategies, ``by`` grouping and ``tolerance``."""
    fmt = "%F %T%.3f"

    # Quotes: one timestamp / ticker / bid per row.
    quote_times = [
        "2016-05-25 13:30:00.023",
        "2016-05-25 13:30:00.023",
        "2016-05-25 13:30:00.030",
        "2016-05-25 13:30:00.041",
        "2016-05-25 13:30:00.048",
        "2016-05-25 13:30:00.049",
        "2016-05-25 13:30:00.072",
        "2016-05-25 13:30:00.075",
    ]
    quote_tickers = ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"]
    quotes = pl.DataFrame(
        {
            "dates": pl.Series(quote_times).str.strptime(pl.Datetime, fmt=fmt),
            "ticker": quote_tickers,
            "bid": [720.5, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
        }
    )

    # Trades: same layout, five rows.
    trade_times = [
        "2016-05-25 13:30:00.023",
        "2016-05-25 13:30:00.038",
        "2016-05-25 13:30:00.048",
        "2016-05-25 13:30:00.048",
        "2016-05-25 13:30:00.048",
    ]
    trade_tickers = ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"]
    trades = pl.DataFrame(
        {
            "dates": pl.Series(trade_times).str.strptime(pl.Datetime, fmt=fmt),
            "ticker": trade_tickers,
            "bid": [51.95, 51.95, 720.77, 720.92, 98.0],
        }
    )

    # backward strategy: right-hand columns get a "_right" suffix
    out = trades.join_asof(quotes, on="dates", strategy="backward")
    assert out.columns == ["dates", "ticker", "bid", "ticker_right", "bid_right"]
    scaled_dates = (out["dates"].cast(int) / 1000).to_list()
    assert scaled_dates == [
        1464183000023,
        1464183000038,
        1464183000048,
        1464183000048,
        1464183000048,
    ]

    # forward strategy
    forward = trades.join_asof(quotes, on="dates", strategy="forward")
    assert forward["bid_right"].to_list() == [720.5, 51.99, 720.5, 720.5, 720.5]

    # grouped by ticker, in both directions
    out = trades.join_asof(quotes, on="dates", by="ticker")
    assert out["bid_right"].to_list() == [51.95, 51.97, 720.5, 720.5, None]

    out = quotes.join_asof(trades, on="dates", by="ticker")
    assert out["bid_right"].to_list() == [
        None,
        51.95,
        51.95,
        51.95,
        720.92,
        98.0,
        720.92,
        51.95,
    ]

    # tolerance limits how far a match may be from the left timestamp
    backward_tol = quotes.join_asof(
        trades, on="dates", strategy="backward", tolerance="5ms"
    )
    assert backward_tol["bid_right"].to_list() == [
        51.95,
        51.95,
        None,
        51.95,
        98.0,
        98.0,
        None,
        None,
    ]

    forward_tol = quotes.join_asof(
        trades, on="dates", strategy="forward", tolerance="5ms"
    )
    assert forward_tol["bid_right"].to_list() == [
        51.95,
        51.95,
        None,
        None,
        720.77,
        None,
        None,
        None,
    ]
def test_series_add_datetime() -> None:
    """Adding a one-element datetime Series to durations gives shifted datetimes."""
    deltas = pl.Series([timedelta(10_000), timedelta(20_000), timedelta(30_000)])
    out = pl.Series(
        [datetime(2027, 5, 19), datetime(2054, 10, 4), datetime(2082, 2, 19)]
    )
    result = deltas + pl.Series([datetime(2000, 1, 1)])
    # `assert result == out` would only assert the truthiness of the
    # element-wise comparison Series, not that the values are equal.
    # series_equal reduces the comparison to a single bool.
    assert result.series_equal(out)
def test_strptime_dates_datetimes() -> None:
    """A date-only string and a full datetime string both parse to Datetime."""
    raw = pl.Series("date", ["2021-04-22", "2022-01-04 00:00:00"])
    parsed = raw.str.strptime(pl.Datetime).to_list()
    want = [
        datetime(2021, 4, 22, 0, 0),
        datetime(2022, 1, 4, 0, 0),
    ]
    assert parsed == want
def test_to_arrow() -> None:
    """A ``pl.Date`` Series exports to an Arrow array of type ``date32``."""
    raw = pl.Series("dates", ["2022-01-16", "2022-01-17"])
    date_series = raw.str.strptime(pl.Date, "%Y-%m-%d")
    arrow_array = date_series.to_arrow()
    assert arrow_array.type == pa.date32()
def test_from_numpy() -> None:
    """Build a Series from a numpy ``datetime64[s]`` array."""
    # numpy support is limited; will be stored as object
    seconds = range(100_000, 200_000, 10_000)
    x = np.asarray(seconds, dtype="datetime64[s]")
    series = pl.Series(x)
    assert series[0] == x[0]
    assert len(series) == 10
def test_add_eager_column():
    """An eager Series wrapped in ``pl.lit`` can be added to a lazy frame."""
    frame = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    extra_column = pl.lit(pl.Series("c", [1, 2, 3]))
    collected = frame.lazy().with_column(extra_column).collect()
    assert collected["c"].sum() == 6
def test_to_pandas_series() -> None:
    """``Series.to_pandas`` matches an equivalent pandas Series element-wise."""
    converted = pl.Series("a", [1, 2, 3]).to_pandas()
    reference = pd.Series([1, 2, 3])
    assert (converted == reference).all()
def test_compare_series_value_mismatch() -> None:
    """Differing values make ``assert_series_equal`` report a value mismatch."""
    left = pl.Series([1, 2, 3])
    right = pl.Series([2, 3, 4])
    expected_message = "Series are different\n\nValue mismatch"
    with pytest.raises(AssertionError, match=expected_message):
        pl.testing.assert_series_equal(left, right)
def test_compare_series_nulls_are_equal() -> None:
    """Two Series with a null in the same position compare as equal."""
    left = pl.Series([1, 2, None])
    right = pl.Series([1, 2, None])
    pl.testing.assert_series_equal(left, right)
def test_compare_series_shape_mismatch() -> None:
    """Differing lengths make ``assert_series_equal`` report a shape mismatch."""
    longer = pl.Series(values=[1, 2, 3, 4], name="srs1")
    shorter = pl.Series(values=[1, 2, 3], name="srs2")
    expected_message = "Series are different\n\nShape mismatch"
    with pytest.raises(AssertionError, match=expected_message):
        pl.testing.assert_series_equal(longer, shorter)
def test_estimated_size() -> None:
    """A Series and its single-column frame report the same estimated size."""
    series = pl.Series([1, 2, 3])
    series_size = series.estimated_size()
    frame_size = series.to_frame().estimated_size()
    assert series_size == frame_size
def test_assert_frame_equal_types() -> None:
    """``assert_frame_equal`` rejects a Series passed where a frame is expected."""
    frame = pl.DataFrame({"a": [1, 2]})
    series = pl.Series(values=[1, 2], name="a")
    with pytest.raises(AssertionError):
        pl.testing.assert_frame_equal(frame, series)  # type: ignore
def test_time_zero_3828() -> None:
    """``time(0)`` round-trips through a ``pl.Time`` Series."""
    midnight = time(0)
    series = pl.Series(values=[midnight], dtype=pl.Time)
    assert series.to_list() == [midnight]
def test_respect_dtype_with_series_from_numpy() -> None:
    """An explicit ``dtype`` is honoured when building a Series from numpy."""
    source = np.array([1, 2, 3])
    series = pl.Series("foo", source, dtype=pl.UInt32)
    assert series.dtype == pl.UInt32
def test_time_microseconds_3843() -> None:
    """A ``time`` with a microsecond component round-trips through a Series."""
    in_val = [time(0, 9, 11, 558332)]
    round_tripped = pl.Series(in_val).to_list()
    assert round_tripped == in_val
def test_rolling_apply():
    """``rolling_apply`` with a 3-wide window computing ``std`` per window."""
    series = pl.Series("A", [1.0, 2.0, 9.0, 2.0, 13.0])

    def window_std(window):
        return window.std()

    rolled = series.rolling_apply(window_size=3, function=window_std)

    # The first two positions have no complete window.
    assert rolled[0] is None
    assert rolled[1] is None
    assert rolled[2] == 4.358898943540674
def test_year_empty_df() -> None:
    """``dt.year`` on an empty Date column still yields an Int32 column."""
    empty_dates = pl.Series(name="date", dtype=pl.Date)
    frame = pl.DataFrame(empty_dates)
    years = frame.select(pl.col("date").dt.year())
    assert years.dtypes == [pl.Int32]