def test_is_in():
    df = pl.DataFrame({"a": [1, 2, 3]})
    assert df.select(pl.col("a").is_in([1, 2]))["a"].to_list() == [
        True,
        True,
        False,
    ]


def test_value_counts_expr() -> None:
    df = pl.DataFrame(
        {
            "id": ["a", "b", "b", "c", "c", "c", "d", "d"],
        }
    )
    out = (
        df.select(
            [
                pl.col("id").value_counts(sort=True),
            ]
        )
        .to_series()
        .to_list()
    )
    assert out == [
        {"id": "c", "counts": 3},
        {"id": "b", "counts": 2},
        {"id": "d", "counts": 2},
        {"id": "a", "counts": 1},
    ]


def test_struct_arr_eval() -> None:
    df = pl.DataFrame(
        {
            "col_struct": [
                [{"a": 1, "b": 11}, {"a": 2, "b": 12}, {"a": 1, "b": 11}]
            ]
        }
    )
    assert df.with_column(
        pl.col("col_struct").arr.eval(pl.element().first()).alias("first")
    ).to_dict(False) == {
        "col_struct": [[{"a": 1, "b": 11}, {"a": 2, "b": 12}, {"a": 1, "b": 11}]],
        "first": [[{"a": 1, "b": 11}]],
    }


def test_type_coercion_when_then_otherwise_2806() -> None:
    out = (
        pl.DataFrame({"names": ["foo", "spam", "spam"], "nrs": [1, 2, 3]})
        .select(
            [
                pl.when(pl.col("names") == "spam")
                .then(pl.col("nrs") * 2)
                .otherwise(pl.lit("other"))
                .alias("new_col"),
            ]
        )
        .to_series()
    )
    expected = pl.Series("new_col", ["other", "4", "6"])
    assert out.to_list() == expected.to_list()

    # test it remains float32
    assert (
        pl.Series("a", [1.0, 2.0, 3.0], dtype=pl.Float32)
        .to_frame()
        .select(pl.when(pl.col("a") > 2.0).then(pl.col("a")).otherwise(0.0))
    ).to_series().dtype == pl.Float32


def test_arr_lengths_dispatch() -> None:
    s = pl.Series("a", [[1, 2], [1, 2, 3]])
    testing.assert_series_equal(s.arr.lengths(), pl.Series("a", [2, 3], dtype=UInt32))
    df = pl.DataFrame([s])
    testing.assert_series_equal(
        df.select(pl.col("a").arr.lengths())["a"],
        pl.Series("a", [2, 3], dtype=UInt32),
    )


def test_sqrt_dispatch() -> None:
    s = pl.Series("a", [1, 2])
    testing.assert_series_equal(s.sqrt(), pl.Series("a", [1.0, np.sqrt(2)]))
    df = pl.DataFrame([s])
    testing.assert_series_equal(
        df.select(pl.col("a").sqrt())["a"], pl.Series("a", [1.0, np.sqrt(2)])
    )


def test_categorical_lexical_ordering_after_concat() -> None:
    with pl.StringCache():
        ldf1 = (
            pl.DataFrame(
                [pl.Series("key1", [8, 5]), pl.Series("key2", ["fox", "baz"])]
            )
            .lazy()
            .with_column(
                pl.col("key2").cast(pl.Categorical).cat.set_ordering("lexical")
            )
        )
        ldf2 = (
            pl.DataFrame(
                [
                    pl.Series("key1", [6, 8, 6]),
                    pl.Series("key2", ["fox", "foo", "bar"]),
                ]
            )
            .lazy()
            .with_column(
                pl.col("key2").cast(pl.Categorical).cat.set_ordering("lexical")
            )
        )
        df = (
            pl.concat([ldf1, ldf2])
            .with_column(pl.col("key2").cat.set_ordering("lexical"))
            .collect()
        )
        df.sort(["key1", "key2"])


def test_kurtosis_dispatch() -> None:
    s = pl.Series("a", [1, 2, 3, 2, 2, 3, 0])
    expected = -0.6406250000000004
    assert s.kurtosis() == pytest.approx(expected)
    df = pl.DataFrame([s])
    assert np.isclose(df.select(pl.col("a").kurtosis())["a"][0], expected)


def test_rank_dispatch():
    s = pl.Series("a", [1, 2, 3, 2, 2, 3, 0])
    assert list(s.rank("dense")) == [2, 3, 4, 3, 3, 4, 1]
    df = pl.DataFrame([s])
    assert df.select(pl.col("a").rank("dense"))["a"].to_list() == [2, 3, 4, 3, 3, 4, 1]


def test_map_alias() -> None:
    out = pl.DataFrame({"foo": [1, 2, 3]}).select(
        (pl.col("foo") * 2).map_alias(lambda name: f"{name}{name}")
    )
    expected = pl.DataFrame({"foofoo": [2, 4, 6]})
    assert out.frame_equal(expected)


def test_sort_dates_multiples():
    df = pl.DataFrame(
        [
            pl.Series(
                "date",
                [
                    "2021-01-01 00:00:00",
                    "2021-01-01 00:00:00",
                    "2021-01-02 00:00:00",
                    "2021-01-02 00:00:00",
                    "2021-01-03 00:00:00",
                ],
            ).str.strptime(pl.datatypes.Date64, "%Y-%m-%d %T"),
            pl.Series("values", [5, 4, 3, 2, 1]),
        ]
    )
    expected = [4, 5, 2, 3, 1]

    # date64
    out = df.sort(["date", "values"])
    assert out["values"].to_list() == expected

    # date32
    out = df.with_column(pl.col("date").cast(pl.Date32)).sort(["date", "values"])
    assert out["values"].to_list() == expected


def test_list_concat_supertype() -> None:
    df = pl.DataFrame(
        [pl.Series("a", [1, 2], pl.UInt8), pl.Series("b", [10000, 20000], pl.UInt16)]
    )
    assert df.with_column(pl.concat_list(pl.col(["a", "b"])).alias("concat_list"))[
        "concat_list"
    ].to_list() == [[1, 10000], [2, 20000]]


def test_strict_cast():
    with pytest.raises(RuntimeError):
        pl.Series("a", [2**16]).cast(dtype=pl.Int16, strict=True)
    with pytest.raises(RuntimeError):
        pl.DataFrame({"a": [2**16]}).select([pl.col("a").cast(pl.Int16, strict=True)])


def test_panic():
    # may contain some tests that yielded a panic in polars or arrow
    # https://github.com/pola-rs/polars/issues/1110
    a = pl.DataFrame(
        {
            "col1": ["a"] * 500 + ["b"] * 500,
        }
    )
    a.filter(pl.col("col1") != "b")


def test_diff_dispatch():
    s = pl.Series("a", [1, 2, 3, 2, 2, 3, 0])
    expected = [1, 1, -1, 0, 1, -3]
    assert list(s.diff(null_behavior="drop")) == expected
    df = pl.DataFrame([s])
    assert df.select(pl.col("a").diff())["a"].to_list() == [None, 1, 1, -1, 0, 1, -3]


# parametrization values are assumed: the body only branches on pl.Int32 vs. a float dtype
@pytest.mark.parametrize("dtype", [pl.Int32, pl.Float64])
def test_std(dtype):
    if dtype == pl.Int32:
        values = [1, 2, 3, 4]
    else:
        values = [1.0, 2.0, 3.0, 4.0]
    df = pl.DataFrame(
        [
            pl.Series("groups", ["a", "a", "b", "b"]),
            pl.Series("values", values, dtype=dtype),
        ]
    )

    out = df.select(pl.col("values").std().over("groups"))
    assert np.isclose(out["values"][0], 0.7071067690849304)

    out = df.select(pl.col("values").var().over("groups"))
    assert np.isclose(out["values"][0], 0.5)

    out = df.select(pl.col("values").mean().over("groups"))
    assert np.isclose(out["values"][0], 1.5)


def test_diff_datetime() -> None:
    df = pl.DataFrame(
        {
            "timestamp": ["2021-02-01", "2021-03-1", "2850-04-1"],
            "guild": [1, 2, 3],
            "char": ["a", "a", "b"],
        }
    )
    out = (
        df.with_columns(
            [
                pl.col("timestamp").str.strptime(pl.Date, fmt="%Y-%m-%d"),
            ]
        ).with_columns([pl.col("timestamp").diff().list().over("char")])
    )["timestamp"]
    assert out[0] == out[1]


def test_to_polars_dataframe(report):
    # This relies entirely on Arrow, so as long as those tests work,
    # these should too.
    df = report.to_polars()
    df_csv = pl.read_csv(MOCK_CSV_PATH)
    df_csv = df_csv.with_column(pl.col("day").str.strptime(pl.Date).cast(pl.Datetime))
    assert df.frame_equal(df_csv)


def test_unique_and_drop_stability() -> None:
    # see: 2898
    # the original cause was that we wrote:
    # expr_a = a.unique()
    # expr_a.filter(a.unique().is_not_null())
    # meaning that the a.unique was executed twice, which is an unstable algorithm
    df = pl.DataFrame({"a": [1, None, 1, None]})
    assert df.select(pl.col("a").unique().drop_nulls()).to_series()[0] == 1


def test_upsample() -> None:
    df = pl.DataFrame(
        {
            "time": [
                datetime(2021, 2, 1),
                datetime(2021, 4, 1),
                datetime(2021, 5, 1),
                datetime(2021, 6, 1),
            ],
            "admin": ["Åland", "Netherlands", "Åland", "Netherlands"],
            "test2": [0, 1, 2, 3],
        }
    ).with_column(pl.col("time").dt.with_time_zone("UTC"))

    up = df.upsample(
        time_column="time", every="1mo", by="admin", maintain_order=True
    ).select(pl.all().forward_fill())

    # this print will panic if timezones feature is not activated
    # don't remove
    print(up)

    expected = pl.DataFrame(
        {
            "time": [
                datetime(2021, 2, 1, 0, 0),
                datetime(2021, 3, 1, 0, 0),
                datetime(2021, 4, 1, 0, 0),
                datetime(2021, 5, 1, 0, 0),
                datetime(2021, 4, 1, 0, 0),
                datetime(2021, 5, 1, 0, 0),
                datetime(2021, 6, 1, 0, 0),
            ],
            "admin": [
                "Åland",
                "Åland",
                "Åland",
                "Åland",
                "Netherlands",
                "Netherlands",
                "Netherlands",
            ],
            "test2": [0, 0, 0, 2, 1, 1, 3],
        }
    ).with_column(pl.col("time").dt.with_time_zone("UTC"))

    assert up.frame_equal(expected)


def test_list_concat_dispatch() -> None:
    s0 = pl.Series("a", [[1, 2]])
    s1 = pl.Series("b", [[3, 4, 5]])
    expected = pl.Series("a", [[1, 2, 3, 4, 5]])

    out = s0.arr.concat([s1])
    assert out.series_equal(expected)

    out = s0.arr.concat(s1)
    assert out.series_equal(expected)

    df = pl.DataFrame([s0, s1])
    assert df.select(pl.concat_list(["a", "b"]).alias("a"))["a"].series_equal(expected)
    assert df.select(pl.col("a").arr.concat("b").alias("a"))["a"].series_equal(expected)
    assert df.select(pl.col("a").arr.concat(["b"]).alias("a"))["a"].series_equal(
        expected
    )


def test_skew_dispatch() -> None:
    s = pl.Series("a", [1, 2, 3, 2, 2, 3, 0])
    assert s.skew(True) == pytest.approx(-0.5953924651018018)
    assert s.skew(False) == pytest.approx(-0.7717168360221258)
    df = pl.DataFrame([s])
    assert np.isclose(df.select(pl.col("a").skew(False))["a"][0], -0.7717168360221258)


def test_fold() -> None:
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    out = df.select(
        [
            pl.sum(["a", "b"]),
            pl.max(["a", pl.col("b") ** 2]),
            pl.min(["a", pl.col("b") ** 2]),
        ]
    )
    assert out["sum"].series_equal(pl.Series("sum", [2.0, 4.0, 6.0]))
    assert out["max"].series_equal(pl.Series("max", [1.0, 4.0, 9.0]))
    assert out["min"].series_equal(pl.Series("min", [1.0, 2.0, 3.0]))

    out = df.select(
        pl.fold(acc=lit(0), f=lambda acc, x: acc + x, exprs=pl.col("*")).alias("foo")
    )
    assert out["foo"].to_list() == [2, 4, 6]


def test_apply() -> None:
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    new = df.lazy().with_column(col("a").map(lambda s: s * 2).alias("foo")).collect()
    expected = df.clone()
    expected["foo"] = expected["a"] * 2
    assert new.frame_equal(expected)


def build_compound_synonym_df(compound_file, output_dir):
    # Get metadata file and compound_df
    compound_metadata = pl.read_csv(compound_file, null_values="NA")
    compound_df = pl.from_arrow(
        fread(os.path.join(output_dir, "compound.jay")).to_arrow())
    dataset_df = pl.from_arrow(
        fread(os.path.join(output_dir, "dataset.jay")).to_arrow())

    # Find all columns relevant to tissueid
    compound_cols = [
        col for col in compound_metadata.columns
        if re.match(".*drugid$", col) and col != "unique.drugid"
    ]

    # Read in which datasets we are working with
    dataset_names = os.listdir("procdata")
    clean_dataset_names = [re.sub("_.*$", "", name) for name in dataset_names]
    dataset_regex = re.compile("|".join(clean_dataset_names))

    # Filter the cellid columns to only valid datasets
    compound_columns = [
        name for name in compound_cols if re.match(dataset_regex, name)
    ]

    # Get all unique synonyms and join with cell_df
    compound_meta_long = compound_metadata \
        .melt(id_vars="unique.drugid", value_vars=compound_columns) \
        .drop_nulls() \
        .drop_duplicates() \
        .rename({"value": "compound_name", "variable": "dataset_id"}) \
        .filter(col("compound_name") != "")

    compound_synonym_df = compound_df \
        .join(compound_meta_long, left_on="name", right_on="unique.drugid", how="left") \
        .rename({"id": "compound_id"}) \
        .select(["compound_id", "dataset_id", "compound_name"]) \
        .drop_nulls() \
        .drop_duplicates()

    # Create a map from dataset
    dataset_map = {
        dct["name"]: str(dct["id"])
        for dct in dataset_df.to_pandas().to_dict(orient="records")
    }

    # Regex the dataset identifiers to match the dataset map
    compound_synonym_df["dataset_id"] = compound_synonym_df["dataset_id"] \
        .apply(lambda x: re.sub(r"\.drugid$|[_.].*$", "", x)) \
        .apply(lambda x: re.sub("GDSC2019", "GDSC_v2", x)) \
        .apply(lambda x: re.sub("GDSC1.*$", "GDSC_v1", x)) \
        .apply(lambda x: dataset_map[x]) \
        .cast(pl.Int64)
    compound_synonym_df = compound_synonym_df.drop_duplicates()
    compound_synonym_df["id"] = range(1, compound_synonym_df.shape[0] + 1)

    # Convert to datatable.Frame for memory mapped output file
    df = dt.Frame(compound_synonym_df.to_arrow())
    df.to_jay(os.path.join(output_dir, "compound_synonym.jay"))


def build_tissue_synonym_df(tissue_file, output_dir):
    # Get metadata file and tissue_df (assume that tissue_df is also in output_dir)
    tissue_metadata = pl.read_csv(tissue_file)  # will read NA as string!
    tissue_df = pl.from_arrow(
        fread(os.path.join(output_dir, "tissue.jay")).to_arrow())
    dataset_df = pl.from_arrow(
        fread(os.path.join(output_dir, "dataset.jay")).to_arrow())

    # Find all columns relevant to tissueid
    tissue_cols = [
        col for col in tissue_metadata.columns
        if re.match(".*tissueid$", col) and col != "unique.tissueid"
    ]

    # Read in which datasets we are working with
    dataset_names = os.listdir("procdata")
    clean_dataset_names = [re.sub("_.*$", "", name) for name in dataset_names]
    dataset_regex = re.compile("|".join(clean_dataset_names))

    # Filter the cellid columns to only valid datasets
    tissue_columns = [
        name for name in tissue_cols if re.match(dataset_regex, name)
    ]

    # Get all unique synonyms and join with cell_df
    tissue_meta_long = tissue_metadata \
        .melt(id_vars="unique.tissueid", value_vars=tissue_columns) \
        .drop_nulls() \
        .drop_duplicates() \
        .rename({"value": "tissue_name", "variable": "dataset_id"})

    tissue_synonym_df = tissue_df \
        .join(tissue_meta_long, left_on="name", right_on="unique.tissueid", how="left") \
        .drop("name") \
        .rename({"id": "tissue_id"}) \
        .filter(col("tissue_name") != "") \
        .drop_duplicates() \
        .drop_nulls()

    # Create a map from dataset
    dataset_map = {
        dct["name"]: str(dct["id"])
        for dct in dataset_df.to_pandas().to_dict(orient="records")
    }

    # Regex the dataset identifiers to match the dataset map
    tissue_synonym_df["dataset_id"] = tissue_synonym_df["dataset_id"] \
        .apply(lambda x: re.sub(r"\.cellid$|[_.].*$", "", x)) \
        .apply(lambda x: re.sub("GDSC$", "GDSC_v2", x)) \
        .apply(lambda x: re.sub("GDSC1.*$", "GDSC_v1", x)) \
        .apply(lambda x: dataset_map[x]) \
        .cast(pl.Int64)
    tissue_synonym_df = tissue_synonym_df.drop_duplicates()
    tissue_synonym_df["id"] = range(1, tissue_synonym_df.shape[0] + 1)

    # Convert to datatable.Frame for fast write to disk
    tissue_synonym_dt = dt.Frame(tissue_synonym_df.to_arrow())
    tissue_synonym_dt.to_jay(os.path.join(output_dir, "tissue_synonym.jay"))


def test_is_between(fruits_cars: pl.DataFrame) -> None:
    assert fruits_cars.select(pl.col("A").is_between(2, 4))[
        "is_between"
    ].series_equal(  # type: ignore[arg-type]
        pl.Series("is_between", [False, False, True, False, False])
    )
    assert fruits_cars.select(pl.col("A").is_between(2, 4, False))[
        "is_between"
    ].series_equal(  # type: ignore[arg-type]
        pl.Series("is_between", [False, False, True, False, False])
    )
    assert fruits_cars.select(pl.col("A").is_between(2, 4, [False, False]))[
        "is_between"
    ].series_equal(  # type: ignore[arg-type]
        pl.Series("is_between", [False, False, True, False, False])
    )
    assert fruits_cars.select(pl.col("A").is_between(2, 4, True))[
        "is_between"
    ].series_equal(  # type: ignore[arg-type]
        pl.Series("is_between", [False, True, True, True, False])
    )
    assert fruits_cars.select(pl.col("A").is_between(2, 4, [True, True]))[
        "is_between"
    ].series_equal(  # type: ignore[arg-type]
        pl.Series("is_between", [False, True, True, True, False])
    )
    assert fruits_cars.select(pl.col("A").is_between(2, 4, [False, True]))[
        "is_between"
    ].series_equal(  # type: ignore[arg-type]
        pl.Series("is_between", [False, False, True, True, False])
    )
    assert fruits_cars.select(pl.col("A").is_between(2, 4, [True, False]))[
        "is_between"
    ].series_equal(  # type: ignore[arg-type]
        pl.Series("is_between", [False, True, True, False, False])
    )


def test_windows_not_cached() -> None:
    ldf = (
        pl.DataFrame(
            [
                pl.Series("key", ["a", "a", "b", "b"]),
                pl.Series("val", [2, 2, 1, 3]),
            ]
        )
        .lazy()
        .filter(
            (pl.col("key").cumcount().over("key") == 0)
            | (pl.col("val").shift(1).over("key").is_not_null())
            | (pl.col("val") != pl.col("val").shift(1).over("key"))
        )
    )
    # this might fail if they are cached
    for _ in range(1000):
        ldf.collect()


def test_filter_str():
    # use a str instead of a column expr
    df = pl.DataFrame(
        {
            "time": ["11:11:00", "11:12:00", "11:13:00", "11:14:00"],
            "bools": [True, False, True, False],
        }
    )
    q = df.lazy()

    # last row based on a filter; collect so the query actually runs
    q.filter(pl.col("bools")).select(pl.last("*")).collect()


def test_fold():
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    out = df.lazy().select(pl.sum(["a", "b"])).collect()
    assert out["sum"].series_equal(pl.Series("sum", [2, 4, 6]))

    out = df.select(
        pl.fold(acc=lit(0), f=lambda acc, x: acc + x, exprs=pl.col("*")).alias("foo")
    )
    assert out["foo"].to_list() == [2, 4, 6]