Example #1
def test_to_from_buffer(df: pl.DataFrame, compressions: list[str]) -> None:
    for compression in compressions:
        if compression == "lzo":
            # lzo compression is not currently supported
            with pytest.raises(pl.ArrowError):
                buf = io.BytesIO()
                df.write_parquet(buf, compression=compression)
                buf.seek(0)
                _ = pl.read_parquet(buf)

            with pytest.raises(OSError):
                buf = io.BytesIO()
                df.write_parquet(buf, compression=compression, use_pyarrow=True)
                buf.seek(0)
                _ = pl.read_parquet(buf)
        else:
            buf = io.BytesIO()
            df.write_parquet(buf, compression=compression)
            buf.seek(0)
            read_df = pl.read_parquet(buf)
            assert_frame_equal_local_categoricals(df, read_df)

    for use_pyarrow in [True, False]:
        buf = io.BytesIO()
        df.write_parquet(buf, use_pyarrow=use_pyarrow)
        buf.seek(0)
        read_df = pl.read_parquet(buf, use_pyarrow=use_pyarrow)
        assert_frame_equal_local_categoricals(df, read_df)
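The helper `assert_frame_equal_local_categoricals` is not shown in the snippet. A minimal sketch of what such a helper might do, assuming its purpose is to compare categorical columns by their string values (the categorical encoding is process-local), could look like this:

import polars as pl

# Sketch only, not the test suite's actual helper: compare categorical
# columns via their string values, everything else directly.
def assert_frame_equal_local_categoricals(df_a: pl.DataFrame, df_b: pl.DataFrame) -> None:
    assert df_a.columns == df_b.columns
    for col in df_a.columns:
        if df_a[col].dtype == pl.Categorical:
            assert df_a[col].cast(pl.Utf8).series_equal(df_b[col].cast(pl.Utf8))
        else:
            assert df_a[col].series_equal(df_b[col], null_equal=True)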
Example #2
def test_parquet_chunks() -> None:
    """
    This failed in https://github.com/pola-rs/polars/issues/545
    """
    # 2**20 and 2**20 + 1 rows; the larger case presumably straddles an
    # internal chunk-size boundary
    cases = [
        1048576,
        1048577,
    ]

    for case in cases:
        f = io.BytesIO()
        # repeat until it has case instances
        df = pd.DataFrame(
            np.tile([1.0, pd.to_datetime("2010-10-10")], [case, 1]),
            columns=["floats", "dates"],
        )
        print(df)

        # write as parquet
        df.to_parquet(f)

        print(f"reading {case} dates with polars...", end="")
        f.seek(0)

        # read it with polars; the original bug made this read fail
        polars_df = pl.read_parquet(f)
        assert polars_df.shape == (case, 2)
Example #3
def test_parquet_datetime() -> None:
    """
    This failed because parquet writers cast datetime to Date
    """
    f = io.BytesIO()
    data = {
        "datetime": [  # unix timestamp in ms
            1618354800000,
            1618354740000,
            1618354680000,
            1618354620000,
            1618354560000,
        ],
        "laf_max":
        [73.1999969482, 71.0999984741, 74.5, 69.5999984741, 69.6999969482],
        "laf_eq": [59.5999984741, 61.0, 62.2999992371, 56.9000015259, 60.0],
    }
    df = pl.DataFrame(data)
    df = df.with_column(df["datetime"].cast(pl.Datetime))

    # todo! test all compressions here
    df.write_parquet(f, use_pyarrow=True, compression="snappy")
    f.seek(0)
    read = pl.read_parquet(f)
    assert read.frame_equal(df)
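Given the regression the docstring mentions, a stricter (hypothetical) variant of the test could also pin the dtype at the end:

    # hypothetical extra check inside the test above: the column stays
    # Datetime instead of being downcast to Date
    assert read["datetime"].dtype == pl.Datetime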
Example #4
def decode(
    self,
    ctx: FlyteContext,
    flyte_value: literals.StructuredDataset,
    current_task_metadata: StructuredDatasetMetadata,
) -> pl.DataFrame:
    # download the (multipart) structured dataset into a local directory
    local_dir = ctx.file_access.get_random_local_directory()
    ctx.file_access.get_data(flyte_value.uri, local_dir, is_multipart=True)
    # the dataset is written as numbered part files; read the first part
    path = f"{local_dir}/00000"
    # if the task declared a column subset, project it while reading
    if current_task_metadata.structured_dataset_type and current_task_metadata.structured_dataset_type.columns:
        columns = [c.name for c in current_task_metadata.structured_dataset_type.columns]
        return pl.read_parquet(path, columns=columns)
    return pl.read_parquet(path)
Example #5
def test_to_from_file(io_test_dir: str, df: pl.DataFrame,
                      compressions: List[str]) -> None:
    f = os.path.join(io_test_dir, "small.parquet")
    for compression in compressions:
        if compression == "lzo":
            # lzo compression is not currently supported
            with pytest.raises(pl.ArrowError):
                df.write_parquet(f, compression=compression)
                _ = pl.read_parquet(f)

            with pytest.raises(OSError):
                df.write_parquet(f, compression=compression, use_pyarrow=True)
                _ = pl.read_parquet(f)
        else:
            df.write_parquet(f, compression=compression)
            read_df = pl.read_parquet(f)
            assert df.frame_equal(read_df)
Example #6
def test_select_projection() -> None:
    df = pl.DataFrame({"a": [1, 2, 3], "b": [True, False, True], "c": ["a", "b", "c"]})
    expected = pl.DataFrame({"b": [True, False, True], "c": ["a", "b", "c"]})
    f = io.BytesIO()
    df.write_parquet(f)
    f.seek(0)

    # project by column index: keep the 2nd and 3rd columns ("b" and "c")
    read_df = pl.read_parquet(f, columns=[1, 2], use_pyarrow=False)
    assert expected.frame_equal(read_df)
Example #7
def _scan_parquet_impl(uri: str, with_columns: list[str] | None) -> pli.DataFrame:
    """
    Take the projected columns and materialize an arrow table.

    Parameters
    ----------
    uri
        Path to the parquet file.
    with_columns
        Column names to project, or None to read all columns.
    """
    import polars as pl

    return pl.read_parquet(uri, with_columns)
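For context, a minimal call might look like this (the file name and column list are hypothetical):

# Hypothetical usage: materialize only the projected column.
df = _scan_parquet_impl("data/small.parquet", with_columns=["floats"])
assert df.columns == ["floats"]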
Example #8
def from_bytes(self, b: bytes, extension=None):
    if extension is None:
        extension = self.default_extension()
    # wrap the raw bytes in a readable buffer
    f = BytesIO(b)
    if extension == "csv":
        return pl.read_csv(f)
    elif extension == "parquet":
        return pl.read_parquet(f)
    raise Exception(
        f"Deserialization: file extension {extension} is not supported by polars data-frame type."
    )
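The parquet branch of this method is equivalent to the following standalone round trip:

from io import BytesIO

import polars as pl

# Write a frame to parquet in memory, then decode it from the raw bytes
# the same way `from_bytes` does.
buf = BytesIO()
pl.DataFrame({"a": [1, 2, 3]}).write_parquet(buf)
df = pl.read_parquet(BytesIO(buf.getvalue()))
assert df.shape == (3, 1)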
Example #9
def test_read_utc_times_parquet() -> None:
    df = pd.DataFrame(
        data={
            "Timestamp": pd.date_range(
                "2022-01-01T00:00+00:00", "2022-01-01T10:00+00:00", freq="H"
            )
        }
    )
    f = io.BytesIO()
    df.to_parquet(f)
    f.seek(0)
    df_in = pl.read_parquet(f)
    # the UTC timestamps come back as naive UTC wall-clock times
    assert df_in["Timestamp"][0] == datetime(2022, 1, 1, 0, 0)
Example #10
def test_nested_parquet() -> None:
    f = io.BytesIO()
    data = [
        {"a": [{"b": 0}]},
        {"a": [{"b": 1}, {"b": 2}]},
    ]
    df = pd.DataFrame(data)
    df.to_parquet(f)
    f.seek(0)

    read = pl.read_parquet(f, use_pyarrow=True)
    assert read.columns == ["a"]
    assert isinstance(read.dtypes[0], pl.datatypes.List)
    assert isinstance(read.dtypes[0].inner, pl.datatypes.Struct)
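As a further, hypothetical sanity check one might assert that the nested values round-trip unchanged:

    # hypothetical follow-up inside the test above
    assert read.to_dicts() == data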
Example #11
def test_to_from_buffer(df: pl.DataFrame, compressions: List[str]) -> None:
    for compression in compressions:
        if compression == "lzo":
            # lzo compression is not currently supported
            with pytest.raises(pl.ArrowError):
                buf = io.BytesIO()
                df.write_parquet(buf, compression=compression)
                buf.seek(0)
                _ = pl.read_parquet(buf)

            with pytest.raises(OSError):
                buf = io.BytesIO()
                df.write_parquet(buf, compression=compression, use_pyarrow=True)
                buf.seek(0)
                _ = pl.read_parquet(buf)
        else:
            buf = io.BytesIO()
            df.write_parquet(buf, compression=compression)
            buf.seek(0)
            read_df = pl.read_parquet(buf)
            assert df.frame_equal(read_df, null_equal=True)
Example #12
def test_nested_dictionary() -> None:
    # a global string cache keeps the categorical encoding consistent
    # between the written and the re-read frame
    with pl.StringCache():
        df = (
            pl.DataFrame({"str": ["A", "B", "A", "B", "C"], "group": [1, 1, 2, 1, 2]})
            .with_column(pl.col("str").cast(pl.Categorical))
            .groupby("group")
            .agg([pl.col("str").list().alias("cat_list")])
        )
        f = io.BytesIO()
        df.write_parquet(f)
        f.seek(0)

        read_df = pl.read_parquet(f)
        assert df.frame_equal(read_df)
Example #13
def recursive_logical_type() -> None:
    df = pl.DataFrame({"str": ["A", "B", "A", "B", "C"], "group": [1, 1, 2, 1, 2]})
    df = df.with_column(pl.col("str").cast(pl.Categorical))

    df_groups = df.groupby("group").agg([pl.col("str").list().alias("cat_list")])
    f = io.BytesIO()
    df_groups.write_parquet(f, use_pyarrow=True)
    f.seek(0)
    read = pl.read_parquet(f, use_pyarrow=True)
    assert read.dtypes == [pl.Int64, pl.List(pl.Categorical)]
    assert read.shape == (2, 2)
Example #14
def test_row_count(foods_parquet: str) -> None:
    df = pl.read_parquet(foods_parquet, row_count_name="row_count")
    assert df["row_count"].to_list() == list(range(27))

    # the row count is assigned when the file is scanned, before the filter
    # runs, so the surviving indices are non-contiguous
    df = (
        pl.scan_parquet(foods_parquet, row_count_name="row_count")
        .filter(pl.col("category") == pl.lit("vegetables"))
        .collect()
    )

    assert df["row_count"].to_list() == [0, 6, 11, 13, 14, 20, 25]

    # with_row_count("foo", 10) adds a second counter starting at 10
    df = (
        pl.scan_parquet(foods_parquet, row_count_name="row_count")
        .with_row_count("foo", 10)
        .filter(pl.col("category") == pl.lit("vegetables"))
        .collect()
    )

    assert df["foo"].to_list() == [10, 16, 21, 23, 24, 30, 35]
Example #15
def test_chunked_round_trip() -> None:
    df1 = pl.DataFrame({"a": [1] * 2, "l": [[1] for _ in range(2)]})
    df2 = pl.DataFrame({"a": [2] * 3, "l": [[2] for _ in range(3)]})

    # vstack appends df2's chunks without rechunking, so df is multi-chunk
    df = df1.vstack(df2)

    f = io.BytesIO()
    df.write_parquet(f)
    f.seek(0)
    assert pl.read_parquet(f).frame_equal(df)
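To make the intent explicit, a hypothetical additional assertion could confirm that the stacked frame really spans multiple chunks before it is written:

    # hypothetical extra check inside the test above
    assert df.n_chunks() > 1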
Example #16
def test_glob_parquet(io_test_dir: str) -> None:
    # the * glob is expanded, so every matching file is read
    path = os.path.join(io_test_dir, "small*.parquet")
    assert pl.read_parquet(path).shape == (3, 16)
    assert pl.scan_parquet(path).collect().shape == (3, 16)
Example #17
def test_null_parquet(io_test_dir: str) -> None:
    file = os.path.join(io_test_dir, "null.parquet")
    df = pl.DataFrame([pl.Series("foo", [], dtype=pl.Int8)])
    df.write_parquet(file)
    out = pl.read_parquet(file)
    assert out.frame_equal(df)
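A hypothetical stricter variant would also pin the dtype, since frame equality over zero rows is easy to satisfy:

    # hypothetical extra check inside the test above: Int8 survives the round trip
    assert out.dtypes == [pl.Int8]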