Example #1
def test_is_in():
    df = pl.DataFrame({"a": [1, 2, 3]})
    assert df.select(pl.col("a").is_in([1, 2]))["a"].to_list() == [
        True,
        True,
        False,
    ]
Example #2
def test_value_counts_expr() -> None:
    df = pl.DataFrame({
        "id": ["a", "b", "b", "c", "c", "c", "d", "d"],
    })

    out = df.select([
        pl.col("id").value_counts(sort=True),
    ]).to_series().to_list()
    assert out == [
        {"id": "c", "counts": 3},
        {"id": "b", "counts": 2},
        {"id": "d", "counts": 2},
        {"id": "a", "counts": 1},
    ]
Example #3
def test_struct_arr_eval() -> None:
    df = pl.DataFrame(
        {"col_struct": [[{"a": 1, "b": 11}, {"a": 2, "b": 12}, {"a": 1, "b": 11}]]}
    )
    assert df.with_column(
        pl.col("col_struct").arr.eval(pl.element().first()).alias("first")
    ).to_dict(False) == {
        "col_struct": [[{"a": 1, "b": 11}, {"a": 2, "b": 12}, {"a": 1, "b": 11}]],
        "first": [[{"a": 1, "b": 11}]],
    }
Example #4
def test_type_coercion_when_then_otherwise_2806() -> None:
    out = (pl.DataFrame({
        "names": ["foo", "spam", "spam"],
        "nrs": [1, 2, 3]
    }).select([
        pl.when((pl.col("names") == "spam")).then(
            (pl.col("nrs") * 2)).otherwise(pl.lit("other")).alias("new_col"),
    ]).to_series())
    expected = pl.Series("new_col", ["other", "4", "6"])
    assert out.to_list() == expected.to_list()

    # test it remains float32
    assert (pl.Series(
        "a", [1.0, 2.0, 3.0], dtype=pl.Float32).to_frame().select(
            pl.when(pl.col("a") > 2.0).then(
                pl.col("a")).otherwise(0.0))).to_series().dtype == pl.Float32
Example #5
File: test_series.py Project: ghuls/polars
def test_arr_lengths_dispatch() -> None:
    s = pl.Series("a", [[1, 2], [1, 2, 3]])
    testing.assert_series_equal(s.arr.lengths(), pl.Series("a", [2, 3], dtype=UInt32))
    df = pl.DataFrame([s])
    testing.assert_series_equal(
        df.select(pl.col("a").arr.lengths())["a"], pl.Series("a", [2, 3], dtype=UInt32)
    )
Example #6
File: test_series.py Project: ghuls/polars
def test_sqrt_dispatch() -> None:
    s = pl.Series("a", [1, 2])
    testing.assert_series_equal(s.sqrt(), pl.Series("a", [1.0, np.sqrt(2)]))
    df = pl.DataFrame([s])
    testing.assert_series_equal(
        df.select(pl.col("a").sqrt())["a"], pl.Series("a", [1.0, np.sqrt(2)])
    )
Example #7
def test_categorical_lexical_ordering_after_concat() -> None:
    with pl.StringCache():
        ldf1 = (pl.DataFrame([
            pl.Series("key1", [8, 5]),
            pl.Series("key2", ["fox", "baz"])
        ]).lazy().with_column(
            pl.col("key2").cast(pl.Categorical).cat.set_ordering("lexical")))
        ldf2 = (pl.DataFrame([
            pl.Series("key1", [6, 8, 6]),
            pl.Series("key2", ["fox", "foo", "bar"])
        ]).lazy().with_column(
            pl.col("key2").cast(pl.Categorical).cat.set_ordering("lexical")))
        df = (pl.concat([ldf1, ldf2]).with_column(
            pl.col("key2").cat.set_ordering("lexical")).collect())

        df.sort(["key1", "key2"])
Example #8
File: test_series.py Project: ghuls/polars
def test_kurtosis_dispatch() -> None:
    s = pl.Series("a", [1, 2, 3, 2, 2, 3, 0])
    expected = -0.6406250000000004

    assert s.kurtosis() == pytest.approx(expected)
    df = pl.DataFrame([s])
    assert np.isclose(df.select(pl.col("a").kurtosis())["a"][0], expected)
Example #9
def test_rank_dispatch():
    s = pl.Series("a", [1, 2, 3, 2, 2, 3, 0])

    assert list(s.rank("dense")) == [2, 3, 4, 3, 3, 4, 1]

    df = pl.DataFrame([s])
    assert df.select(pl.col("a").rank("dense"))["a"].to_list() == [2, 3, 4, 3, 3, 4, 1]
Example #10
def test_map_alias() -> None:
    out = pl.DataFrame({"foo": [1, 2, 3]}).select(
        (pl.col("foo") * 2).map_alias(lambda name: f"{name}{name}")
    )

    expected = pl.DataFrame({"foofoo": [2, 4, 6]})
    assert out.frame_equal(expected)
Example #11
def test_sort_dates_multiples():
    df = pl.DataFrame([
        pl.Series(
            "date",
            [
                "2021-01-01 00:00:00",
                "2021-01-01 00:00:00",
                "2021-01-02 00:00:00",
                "2021-01-02 00:00:00",
                "2021-01-03 00:00:00",
            ],
        ).str.strptime(pl.datatypes.Date64, "%Y-%m-%d %T"),
        pl.Series("values", [5, 4, 3, 2, 1]),
    ])

    expected = [4, 5, 2, 3, 1]

    # date64
    out = df.sort(["date", "values"])
    assert out["values"].to_list() == expected

    # date32
    out = df.with_column(pl.col("date").cast(pl.Date32)).sort(
        ["date", "values"])
    assert out["values"].to_list() == expected
Example #12
def test_list_concat_supertype() -> None:
    df = pl.DataFrame(
        [pl.Series("a", [1, 2], pl.UInt8), pl.Series("b", [10000, 20000], pl.UInt16)]
    )
    assert df.with_column(pl.concat_list(pl.col(["a", "b"])).alias("concat_list"))[
        "concat_list"
    ].to_list() == [[1, 10000], [2, 20000]]
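A brief note on why this works: the two columns have different integer widths (`UInt8` and `UInt16`), so `concat_list` first promotes them to a shared supertype wide enough for both, which is why 1 and 10000 can sit in the same list.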
Example #13
def test_strict_cast():
    with pytest.raises(RuntimeError):
        pl.Series("a", [2**16]).cast(dtype=pl.Int16, strict=True)
    with pytest.raises(RuntimeError):
        pl.DataFrame({
            "a": [2**16]
        }).select([pl.col("a").cast(pl.Int16, strict=True)])
Example #14
def test_panic():
    # may contain some tests that yielded a panic in polars or arrow
    # https://github.com/pola-rs/polars/issues/1110
    a = pl.DataFrame({
        "col1": ["a"] * 500 + ["b"] * 500,
    })
    a.filter(pl.col("col1") != "b")
Example #15
def test_diff_dispatch():
    s = pl.Series("a", [1, 2, 3, 2, 2, 3, 0])
    expected = [1, 1, -1, 0, 1, -3]

    assert list(s.diff(null_behavior="drop")) == expected

    df = pl.DataFrame([s])
    assert df.select(pl.col("a").diff())["a"].to_list() == [None, 1, 1, -1, 0, 1, -3]
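Note the two expected lists: with `null_behavior="drop"` the undefined first difference is removed, while the expression version keeps it as a leading `None`.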
Example #16
# NOTE: `dtype` comes from a pytest parametrization trimmed from this excerpt;
# the set below is an assumption consistent with the branches inside the test.
@pytest.mark.parametrize("dtype", [pl.Int32, pl.Float32, pl.Float64])
def test_std(dtype):
    if dtype == pl.Int32:
        values = [1, 2, 3, 4]
    else:
        values = [1.0, 2.0, 3.0, 4.0]
    df = pl.DataFrame([
        pl.Series("groups", ["a", "a", "b", "b"]),
        pl.Series("values", values, dtype=dtype),
    ])

    out = df.select(pl.col("values").std().over("groups"))
    assert np.isclose(out["values"][0], 0.7071067690849304)

    out = df.select(pl.col("values").var().over("groups"))
    assert np.isclose(out["values"][0], 0.5)
    out = df.select(pl.col("values").mean().over("groups"))
    assert np.isclose(out["values"][0], 1.5)
Example #17
def test_diff_datetime() -> None:
    df = pl.DataFrame(
        {
            "timestamp": ["2021-02-01", "2021-03-1", "2850-04-1"],
            "guild": [1, 2, 3],
            "char": ["a", "a", "b"],
        }
    )

    out = (
        df.with_columns(
            [
                pl.col("timestamp").str.strptime(pl.Date, fmt="%Y-%m-%d"),
            ]
        ).with_columns([pl.col("timestamp").diff().list().over("char")])
    )["timestamp"]
    assert out[0] == out[1]
Example #18
def test_to_polars_dataframe(report):
    # This relies entirely on Arrow, so as long as those tests work,
    # these should too.
    df = report.to_polars()
    df_csv = pl.read_csv(MOCK_CSV_PATH)
    df_csv = df_csv.with_column(
        pl.col("day").str.strptime(pl.Date).cast(pl.Datetime))
    assert df.frame_equal(df_csv)
Example #19
def test_unique_and_drop_stability() -> None:
    # see: 2898
    # the original cause was that we wrote:
    # expr_a = a.unique()
    # expr_a.filter(a.unique().is_not_null())
    # meaning that the a.unique was executed twice, which is an unstable algorithm
    df = pl.DataFrame({"a": [1, None, 1, None]})
    assert df.select(pl.col("a").unique().drop_nulls()).to_series()[0] == 1
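The fix implied by the comment: `drop_nulls` is chained onto the same `unique()` expression rather than a second evaluation of it, so the unstable `unique` runs only once and the surviving non-null value is deterministic.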
Example #20
def test_upsample() -> None:
    df = pl.DataFrame(
        {
            "time": [
                datetime(2021, 2, 1),
                datetime(2021, 4, 1),
                datetime(2021, 5, 1),
                datetime(2021, 6, 1),
            ],
            "admin": ["Åland", "Netherlands", "Åland", "Netherlands"],
            "test2": [0, 1, 2, 3],
        }
    ).with_column(pl.col("time").dt.with_time_zone("UTC"))

    up = df.upsample(
        time_column="time", every="1mo", by="admin", maintain_order=True
    ).select(pl.all().forward_fill())
    # this print will panic if timezones feature is not activated
    # don't remove
    print(up)

    expected = pl.DataFrame(
        {
            "time": [
                datetime(2021, 2, 1, 0, 0),
                datetime(2021, 3, 1, 0, 0),
                datetime(2021, 4, 1, 0, 0),
                datetime(2021, 5, 1, 0, 0),
                datetime(2021, 4, 1, 0, 0),
                datetime(2021, 5, 1, 0, 0),
                datetime(2021, 6, 1, 0, 0),
            ],
            "admin": [
                "Åland",
                "Åland",
                "Åland",
                "Åland",
                "Netherlands",
                "Netherlands",
                "Netherlands",
            ],
            "test2": [0, 0, 0, 2, 1, 1, 3],
        }
    ).with_column(pl.col("time").dt.with_time_zone("UTC"))

    assert up.frame_equal(expected)
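Reading the expected frame back: `upsample(every="1mo", by="admin")` inserts the missing calendar months within each group (March and April 2021 for "Åland", May 2021 for "Netherlands") with nulls in the other columns, and `forward_fill` then copies the last seen value downward, which is why "Åland" carries `test2 == 0` from February through April.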
Example #21
def test_list_concat_dispatch() -> None:
    s0 = pl.Series("a", [[1, 2]])
    s1 = pl.Series("b", [[3, 4, 5]])
    expected = pl.Series("a", [[1, 2, 3, 4, 5]])

    out = s0.arr.concat([s1])
    assert out.series_equal(expected)

    out = s0.arr.concat(s1)
    assert out.series_equal(expected)

    df = pl.DataFrame([s0, s1])
    assert df.select(pl.concat_list(["a", "b"]).alias("a"))["a"].series_equal(expected)
    assert df.select(pl.col("a").arr.concat("b").alias("a"))["a"].series_equal(expected)
    assert df.select(pl.col("a").arr.concat(["b"]).alias("a"))["a"].series_equal(
        expected
    )
Example #22
def test_skew_dispatch() -> None:
    s = pl.Series("a", [1, 2, 3, 2, 2, 3, 0])

    assert s.skew(True) == pytest.approx(-0.5953924651018018)
    assert s.skew(False) == pytest.approx(-0.7717168360221258)

    df = pl.DataFrame([s])
    assert np.isclose(df.select(pl.col("a").skew(False))["a"][0], -0.7717168360221258)
Example #23
def test_fold() -> None:
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    out = df.select(
        [
            pl.sum(["a", "b"]),
            pl.max(["a", pl.col("b") ** 2]),
            pl.min(["a", pl.col("b") ** 2]),
        ]
    )
    assert out["sum"].series_equal(pl.Series("sum", [2.0, 4.0, 6.0]))
    assert out["max"].series_equal(pl.Series("max", [1.0, 4.0, 9.0]))
    assert out["min"].series_equal(pl.Series("min", [1.0, 2.0, 3.0]))

    out = df.select(
        pl.fold(acc=lit(0), f=lambda acc, x: acc + x, exprs=pl.col("*")).alias("foo")
    )
    assert out["foo"] == [2, 4, 6]
Example #24
def test_apply() -> None:
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    new = df.lazy().with_column(col("a").map(lambda s: s * 2).alias("foo")).collect()

    expected = df.clone()
    expected["foo"] = expected["a"] * 2

    assert new.frame_equal(expected)
Example #25
def build_compound_synonym_df(compound_file, output_dir):
    # Get metadata file and compound_df
    compound_metadata = pl.read_csv(compound_file, null_values="NA")
    compound_df = pl.from_arrow(
        fread(os.path.join(output_dir, "compound.jay")).to_arrow())
    dataset_df = pl.from_arrow(
        fread(os.path.join(output_dir, "dataset.jay")).to_arrow())

    # Find all columns relevant to drugid
    compound_cols = [
        col for col in compound_metadata.columns
        if re.match(".*drugid$", col) and col != "unique.drugid"
    ]

    # Read in which datasets we are working with
    dataset_names = os.listdir("procdata")
    clean_dataset_names = [re.sub("_.*$", "", name) for name in dataset_names]
    dataset_regex = re.compile("|".join(clean_dataset_names))

    # Filter the drugid columns to only valid datasets
    compound_columns = [
        name for name in compound_cols if re.match(dataset_regex, name)
    ]

    # Get all unique synonyms and join with compound_df
    compound_meta_long = compound_metadata \
        .melt(id_vars="unique.drugid", value_vars=compound_columns) \
        .drop_nulls() \
        .drop_duplicates() \
        .rename({"value": "compound_name", "variable": "dataset_id"}) \
        .filter(col("compound_name") != "")

    compound_synonym_df = compound_df \
        .join(compound_meta_long, left_on="name", right_on="unique.drugid", how="left") \
        .rename({"id": "compound_id"}) \
        .select(["compound_id", "dataset_id", "compound_name"]) \
        .drop_nulls() \
        .drop_duplicates()

    # Create a map from dataset
    dataset_map = {
        dct["name"]: str(dct["id"])
        for dct in dataset_df.to_pandas().to_dict(orient="records")
    }
    # Regex the dataset identifiers to match the dataset map
    compound_synonym_df["dataset_id"] = compound_synonym_df["dataset_id"] \
        .apply(lambda x: re.sub("\.drugid$|[_.].*$", "", x)) \
        .apply(lambda x: re.sub("GDSC2019", "GDSC_v2", x)) \
        .apply(lambda x: re.sub("GDSC1.*$", "GDSC_v1", x)) \
        .apply(lambda x: dataset_map[x]) \
        .cast(pl.Int64)

    compound_synonym_df = compound_synonym_df.drop_duplicates()
    compound_synonym_df["id"] = range(1, compound_synonym_df.shape[0] + 1)

    # Convert to datatable.Frame for memory mapped output file
    df = dt.Frame(compound_synonym_df.to_arrow())
    df.to_jay(os.path.join(output_dir, "compound_synonym.jay"))
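To trace the `dataset_id` normalization above with a concrete (hypothetical) value: a metadata column named "GDSC2019.drugid" first has its ".drugid" suffix stripped by the regex, is then rewritten to "GDSC_v2", and is finally replaced by its numeric id via `dataset_map` before the cast to `pl.Int64`.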
Example #26
def build_tissue_synonym_df(tissue_file, output_dir):
    # Get metadata file and tissue_df (assume that tissue_df is also in output_dir)
    tissue_metadata = pl.read_csv(tissue_file)  # will read NA as string!
    tissue_df = pl.from_arrow(
        fread(os.path.join(output_dir, "tissue.jay")).to_arrow())
    dataset_df = pl.from_arrow(
        fread(os.path.join(output_dir, "dataset.jay")).to_arrow())

    # Find all columns relevant to tissueid
    tissue_cols = [
        col for col in tissue_metadata.columns
        if re.match(".*tissueid$", col) and col != "unique.tissueid"
    ]

    # Read in which datasets we are working with
    dataset_names = os.listdir("procdata")
    clean_dataset_names = [re.sub("_.*$", "", name) for name in dataset_names]
    dataset_regex = re.compile("|".join(clean_dataset_names))

    # Filter the tissueid columns to only valid datasets
    tissue_columns = [
        name for name in tissue_cols if re.match(dataset_regex, name)
    ]

    # Get all unique synonyms and join with tissue_df
    tissue_meta_long = tissue_metadata \
        .melt(id_vars="unique.tissueid", value_vars=tissue_columns) \
        .drop_nulls() \
        .drop_duplicates() \
        .rename({"value": "tissue_name", "variable": "dataset_id"})

    tissue_synonym_df = tissue_df \
        .join(tissue_meta_long, left_on="name", right_on="unique.tissueid", how="left") \
        .drop("name") \
        .rename({"id": "tissue_id"}) \
        .filter(col("tissue_name") != "") \
        .drop_duplicates() \
        .drop_nulls()

    # Create a map from dataset
    dataset_map = {
        dct["name"]: str(dct["id"])
        for dct in dataset_df.to_pandas().to_dict(orient="records")
    }
    # Regex the dataset identifiers to match the dataset map
    tissue_synonym_df["dataset_id"] = tissue_synonym_df["dataset_id"] \
        .apply(lambda x: re.sub("\.cellid$|[_.].*$", "", x)) \
        .apply(lambda x: re.sub("GDSC$", "GDSC_v2", x)) \
        .apply(lambda x: re.sub("GDSC1.*$", "GDSC_v1", x)) \
        .apply(lambda x: dataset_map[x]) \
        .cast(pl.Int64)

    tissue_synonym_df = tissue_synonym_df.drop_duplicates()
    tissue_synonym_df["id"] = range(1, tissue_synonym_df.shape[0] + 1)

    # Convert to datatable.Frame for fast write to disk
    tissue_synonym_dt = dt.Frame(tissue_synonym_df.to_arrow())
    tissue_synonym_dt.to_jay(os.path.join(output_dir, "tissue_synonym.jay"))
Example #27
def test_is_between(fruits_cars: pl.DataFrame) -> None:
    assert fruits_cars.select(pl.col("A").is_between(2, 4))["is_between"].series_equal(
        pl.Series("is_between", [False, False, True, False, False])
    )
    assert fruits_cars.select(pl.col("A").is_between(2, 4, False))["is_between"].series_equal(
        pl.Series("is_between", [False, False, True, False, False])
    )
    assert fruits_cars.select(pl.col("A").is_between(2, 4, [False, False]))["is_between"].series_equal(
        pl.Series("is_between", [False, False, True, False, False])
    )
    assert fruits_cars.select(pl.col("A").is_between(2, 4, True))["is_between"].series_equal(
        pl.Series("is_between", [False, True, True, True, False])
    )
    assert fruits_cars.select(pl.col("A").is_between(2, 4, [True, True]))["is_between"].series_equal(
        pl.Series("is_between", [False, True, True, True, False])
    )
    assert fruits_cars.select(pl.col("A").is_between(2, 4, [False, True]))["is_between"].series_equal(
        pl.Series("is_between", [False, False, True, True, False])
    )
    assert fruits_cars.select(pl.col("A").is_between(2, 4, [True, False]))["is_between"].series_equal(
        pl.Series("is_between", [False, True, True, False, False])
    )
Example #28
File: various.py Project: pola-rs/polars
def test_windows_not_cached() -> None:
    ldf = (
        pl.DataFrame(
            [
                pl.Series("key", ["a", "a", "b", "b"]),
                pl.Series("val", [2, 2, 1, 3]),
            ]
        )
        .lazy()
        .filter(
            (pl.col("key").cumcount().over("key") == 0)
            | (pl.col("val").shift(1).over("key").is_not_null())
            | (pl.col("val") != pl.col("val").shift(1).over("key"))
        )
    )
    # this might fail if they are cached
    for _ in range(1000):
        ldf.collect()
Example #29
def test_filter_str():
    # use a str instead of a column expr
    df = pl.DataFrame({
        "time": ["11:11:00", "11:12:00", "11:13:00", "11:14:00"],
        "bools": [True, False, True, False],
    })
    q = df.lazy()
    # last row based on a filter
    q.filter(pl.col("bools")).select(pl.last("*")).collect()
Example #30
def test_fold():
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    out = df.lazy().select(pl.sum(["a", "b"])).collect()
    assert out["sum"].series_equal(pl.Series("sum", [2, 4, 6]))

    out = df.select(
        pl.fold(acc=lit(0), f=lambda acc, x: acc + x,
                exprs=pl.col("*")).alias("foo"))
    assert out["foo"] == [2, 4, 6]