Example #1
import uuid

from kartothek.core.common_metadata import make_meta
from kartothek.core.testing import get_dataframe_alltypes
from kartothek.io_components.metapartition import MetaPartition

def generate_mp(dataset_metadata=None):
    # Throwaway MetaPartition with a random label and an all-types
    # schema (older multi-table API: table_meta/files).
    return MetaPartition(
        label=uuid.uuid4().hex,
        table_meta={"table": make_meta(get_dataframe_alltypes())},
        files={"table": "fakefile"},
        dataset_metadata=dataset_metadata,
    )
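A minimal usage sketch for this helper (an assumption, not from the source: it presumes `MetaPartition` exposes the constructor arguments as `label` and `files` attributes, as older kartothek releases do):

# Hypothetical test exercising generate_mp; attribute names mirror the
# constructor arguments above.
def test_generate_mp_smoke():
    mp = generate_mp(dataset_metadata={"creator": "unit-test"})
    assert len(mp.label) == 32  # uuid4().hex yields 32 hex characters
    assert mp.files == {"table": "fakefile"}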
Example #2
File: schema.py  Project: xhochy/kartothek
def setup(self, num_schemas, has_na):
    self.df = get_dataframe_alltypes()
    schema = make_meta(self.df, origin="df")
    self.schemas = [deepcopy(schema) for _ in range(num_schemas)]
    if has_na:
        empty_schema = make_meta(self.df[0:0], origin="empty")
        # insert alternating empty schemas
        self.schemas[::2] = [
            deepcopy(empty_schema) for _ in range(len(self.schemas[::2]))
        ]
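This `setup` reads like an asv benchmark fixture; the timed call itself is not part of the snippet. A plausible sketch, assuming kartothek's `validate_compatible` is the function under measurement:

from kartothek.core.common_metadata import validate_compatible

def time_validate_compatible(self, num_schemas, has_na):
    # Unify all prepared schemas; the alternating empty schemas force
    # the null-column reconciliation code path.
    validate_compatible(self.schemas)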
Example #3
import uuid

import pandas.testing as pdt

from kartothek.core.testing import get_dataframe_alltypes
from kartothek.serialization import ParquetSerializer

def test_arrow_compat(arrow_version, reference_store, mocker):
    """
    Test if reading/writing across the supported arrow versions is actually
    compatible

    Generate new reference files by going to the `reference-data/arrow-compat` directory and
    executing `generate_reference.py` or `batch_generate_reference.sh`.
    """

    # Pin kartothek's internal UUID generation to a fixed value so the
    # test output is deterministic across runs.
    uuid_hook = mocker.patch("kartothek.core.uuid._uuid_hook_object")
    uuid_hook.return_value = uuid.UUID(
        bytes=b"\x82\xd6\xc1\x06Z\x08\x11\xe9\x85eJ\x00\x07\xf8\n\x10"
    )

    orig = get_dataframe_alltypes()
    restored = ParquetSerializer().restore_dataframe(
        store=reference_store,
        key=arrow_version + ".parquet",
        date_as_object=True,
    )

    pdt.assert_frame_equal(orig, restored)
Example #4
def test_arrow_compat(arrow_version, reference_store, mocker):
    """
    Test if reading/writing across the supported arrow versions is actually
    compatible

    Generate new reference files with::

        import pyarrow as pa
        ParquetSerializer().store(reference_store, pa.__version__, orig)
    """

    uuid_hook = mocker.patch("kartothek.core.uuid._uuid_hook_object")
    uuid_hook.return_value = uuid.UUID(
        bytes=b"\x82\xd6\xc1\x06Z\x08\x11\xe9\x85eJ\x00\x07\xf8\n\x10")

    orig = get_dataframe_alltypes()
    restored = ParquetSerializer().restore_dataframe(
        store=reference_store,
        key=arrow_version + ".parquet",
        date_as_object=True,
    )
    if arrow_version == "0.14.1" and not ARROW_LARGER_EQ_0141:
        # On older pyarrow the all-null column round-trips as float.
        orig = orig.astype({"null": float})
    pdt.assert_frame_equal(orig, restored)
Example #5
from kartothek.core.testing import get_dataframe_alltypes

def df_all_types():
    return get_dataframe_alltypes()
Example #6
import pandas as pd

from kartothek.core.testing import get_dataframe_alltypes

def test_get_dataframe_alltypes():
    df = get_dataframe_alltypes()
    assert isinstance(df, pd.DataFrame)
    assert not df.empty
    assert "byte" in df.columns
Example #7
#!/usr/bin/env python
import os

import pyarrow as pa
from storefact import get_store_from_url

from kartothek.core.testing import get_dataframe_alltypes
from kartothek.serialization import ParquetSerializer

if __name__ == "__main__":
    ser = ParquetSerializer()
    dir_path = os.path.dirname(os.path.realpath(__file__))
    store = get_store_from_url(f"hfs://{dir_path}")

    df = get_dataframe_alltypes()
    df["byte"] = b"\x82\xd6\xc1\x06Z\x08\x11\xe9\x85eJ\x00\x07\xf8\n\x10"
    ref_file = f"{pa.__version__}"
    ser.store(store, ref_file, df)
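A hedged round-trip check to run right after generating a reference file, mirroring `test_arrow_compat` above (this assumes `store()` returns the full key including the ".parquet" suffix, which the key handling in the other examples suggests):

import pandas.testing as pdt

# Read the freshly written reference file back and compare it to the
# original frame.
key = ser.store(store, ref_file, df)
restored = ser.restore_dataframe(store=store, key=key, date_as_object=True)
pdt.assert_frame_equal(df, restored)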
Example #8
File: schema.py  Project: xhochy/kartothek
def setup(self, num_schemas):
    self.df = get_dataframe_alltypes()
    schema = make_meta(self.df, origin="df")
    self.schemas = [deepcopy(schema) for _ in range(num_schemas)]
Example #9
File: schema.py  Project: xhochy/kartothek
def setup(self):
    self.df = get_dataframe_alltypes()
Example #10
import uuid

from kartothek.core.common_metadata import make_meta
from kartothek.core.testing import get_dataframe_alltypes
from kartothek.io_components.metapartition import MetaPartition

def generate_mp():
    # Newer single-table MetaPartition API: one schema, one file.
    return MetaPartition(
        label=uuid.uuid4().hex,
        schema=make_meta(get_dataframe_alltypes(), origin="alltypes"),
        file="fakefile",
    )