Example #1: commit_dataset with nested metapartitions
def test_commit_dataset_from_nested_metapartition(store):
    """
    Check it is possible to use `commit_dataset` with nested metapartitions as input.
    Original issue: https://github.com/JDASoftwareGroup/kartothek/issues/40
    """

    df = pd.DataFrame({"a": [1, 1, 2, 2], "b": [3, 4, 5, 6]})

    create_empty_dataset_header(
        store=store,
        dataset_uuid="uuid",
        schema=make_meta(df, "table", ["a"]),
        partition_on=["a"],
    )

    partitions = []
    for _ in range(2):
        partitions.append(
            write_single_partition(
                store=store,
                dataset_uuid="uuid",
                data=df,
                partition_on=["a"],
            )
        )

    partition_labels = {mp.label for nested_mp in partitions for mp in nested_mp}
    dm = commit_dataset(store=store,
                        dataset_uuid="uuid",
                        new_partitions=partitions,
                        partition_on=["a"])
    assert dm.partitions.keys() == partition_labels
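These snippets omit their imports. A minimal sketch of what the examples need, assuming kartothek 3.x module paths (exact locations can differ between versions):

import datetime
from collections import OrderedDict

import numpy as np
import pandas as pd
import pyarrow as pa
import pytest
from pandas.testing import assert_frame_equal

from kartothek.core.common_metadata import make_meta, read_schema_metadata
from kartothek.core.dataset import DatasetMetadata
from kartothek.io.eager import (
    commit_dataset,
    create_empty_dataset_header,
    read_table,
    write_single_partition,
)
from kartothek.io_components.metapartition import MetaPartition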
Example #2: write_single_partition with multiple tables
def test_write_single_partition(store_factory, mock_uuid, metadata_version):
    create_empty_dataset_header(
        store=store_factory(),
        table_meta={
            "table1": pd.DataFrame({"col": [1]}),
            "table2": pd.DataFrame({"other_col": ["a"]}),
        },
        dataset_uuid="some_dataset",
        metadata_version=metadata_version,
    )

    new_data = {
        "data": {
            "table1": pd.DataFrame({"col": [1, 2]}),
            "table2": pd.DataFrame({"other_col": ["a", "b"]}),
        }
    }
    keys_in_store = set(store_factory().keys())
    new_mp = write_single_partition(store=store_factory,
                                    dataset_uuid="some_dataset",
                                    data=new_data)

    keys_in_store.add("some_dataset/table1/auto_dataset_uuid.parquet")
    keys_in_store.add("some_dataset/table2/auto_dataset_uuid.parquet")
    assert set(store_factory().keys()) == keys_in_store
    expected_mp = MetaPartition(
        label="auto_dataset_uuid",  # pinned by the mock_uuid fixture
        files={
            "table1": "some_dataset/table1/auto_dataset_uuid.parquet",
            "table2": "some_dataset/table2/auto_dataset_uuid.parquet",
        },
        metadata_version=metadata_version,
        table_meta={
            "table1": make_meta(pd.DataFrame({"col": [1, 2]}), origin="table1"),
            "table2": make_meta(pd.DataFrame({"other_col": ["a", "b"]}), origin="table2"),
        },
    )

    assert new_mp == expected_mp

    # "col" is an integer column, so writing a date value is schema-incompatible.
    new_data["data"]["table1"] = pd.DataFrame(
        {"col": [datetime.date(2010, 1, 1)]}
    )
    with pytest.raises(ValueError):
        write_single_partition(store=store_factory,
                               dataset_uuid="some_dataset",
                               data=new_data)
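Example #2 relies on a mock_uuid fixture that is not shown; it is assumed to pin kartothek's label generation so partition labels come out as "auto_dataset_uuid". A hypothetical sketch; the exact patch target depends on where gen_uuid is imported in your kartothek version:

@pytest.fixture
def mock_uuid(monkeypatch):
    # Hypothetical: force generated partition labels to a fixed string so the
    # parquet file names and the MetaPartition label are predictable.
    monkeypatch.setattr(
        "kartothek.io_components.metapartition.gen_uuid",
        lambda: "auto_dataset_uuid",
    )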
Example #3: initial commit (dict-based data spec)
def test_initial_commit(store):
    dataset_uuid = "dataset_uuid"
    df = pd.DataFrame(OrderedDict([("P", [5]), ("L", [5]), ("TARGET", [5])]))
    dataset = create_empty_dataset_header(
        store=store,
        table_meta={"core": make_meta(df, origin="1")},
        dataset_uuid=dataset_uuid,
        metadata_version=4,
    )
    assert dataset.explicit_partitions is False
    new_data = {"data": {"core": df}}
    new_metapartition = write_single_partition(store=store,
                                               dataset_uuid=dataset.uuid,
                                               data=new_data)

    new_partition = [{
        "label": new_metapartition.label,
        "data": [("core", None)]
    }]
    updated_dataset = commit_dataset(
        store=store,
        dataset_uuid=dataset.uuid,
        new_partitions=new_partition,
        delete_scope=None,
        partition_on=None,
    )
    assert updated_dataset.explicit_partitions is True
    actual = read_table(store=store,
                        table="core",
                        dataset_uuid=updated_dataset.uuid)
    df_expected = pd.DataFrame(
        OrderedDict([("L", [5]), ("P", [5]), ("TARGET", [5])]))

    assert_frame_equal(df_expected, actual)
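The store and store_factory fixtures are also assumed rather than shown; kartothek works against any simplekv-compatible store. A minimal sketch using storefact (fixture names and backend are assumptions):

from functools import partial

import pytest
import storefact


@pytest.fixture
def store(tmpdir):
    # Any simplekv store works; here a local filesystem store.
    return storefact.get_store_from_url("hfs://{}".format(tmpdir))


@pytest.fixture
def store_factory(tmpdir):
    # A zero-argument callable returning the store; several kartothek entry
    # points accept factories so stores can be re-created in worker processes.
    return partial(storefact.get_store_from_url, "hfs://{}".format(tmpdir))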
Example #4: create_empty_dataset_header
def test_create_dataset_header(store, metadata_storage_format, frozen_time):
    table_meta = {"table": make_meta(pd.DataFrame({"col": [1]}), origin="1")}
    new_dataset = create_empty_dataset_header(
        store=store,
        table_meta=table_meta,
        dataset_uuid="new_dataset_uuid",
        metadata_storage_format=metadata_storage_format,
        metadata_version=4,
    )

    expected_dataset = DatasetMetadata(
        uuid="new_dataset_uuid",
        metadata_version=4,
        explicit_partitions=False,
        table_meta=table_meta,
    )
    assert new_dataset == expected_dataset

    storage_keys = list(store.keys())
    assert len(storage_keys) == 2

    loaded = DatasetMetadata.load_from_store(store=store,
                                             uuid="new_dataset_uuid")
    assert loaded == expected_dataset

    # If this read succeeds, the schema file was written as expected.
    read_schema_metadata(dataset_uuid=new_dataset.uuid,
                         store=store,
                         table="table")
Example #5: minimum supported metadata version
def test_create_dataset_header_minimal_version(store, metadata_storage_format):
    # kartothek supports only metadata version 4; older versions are rejected.
    with pytest.raises(NotImplementedError):
        create_empty_dataset_header(
            store=store,
            table_meta={"table": pd.DataFrame({"col": [1]})},
            dataset_uuid="new_dataset_uuid",
            metadata_storage_format=metadata_storage_format,
            metadata_version=3,
        )
    create_empty_dataset_header(
        store=store,
        table_meta={"table": pd.DataFrame({"col": [1]})},
        dataset_uuid="new_dataset_uuid",
        metadata_storage_format=metadata_storage_format,
        metadata_version=4,
    )
Example #6: initial commit (MetaPartition-based API)
def test_initial_commit(store):
    dataset_uuid = "dataset_uuid"
    df = pd.DataFrame(OrderedDict([("P", [5]), ("L", [5]), ("TARGET", [5])]))
    dataset = create_empty_dataset_header(
        store=store,
        schema=make_meta(df, origin="1"),
        dataset_uuid=dataset_uuid,
        metadata_version=4,
    )
    assert dataset.explicit_partitions is False
    new_metapartition = write_single_partition(store=store,
                                               dataset_uuid=dataset.uuid,
                                               data=df)

    updated_dataset = commit_dataset(
        store=store,
        dataset_uuid=dataset.uuid,
        # FIXME: is this breaking and if so, is it expected?
        new_partitions=[new_metapartition],
        delete_scope=None,
        partition_on=None,
    )
    assert updated_dataset.explicit_partitions is True
    actual = read_table(store=store, dataset_uuid=updated_dataset.uuid)
    df_expected = pd.DataFrame(
        OrderedDict([("L", [5]), ("P", [5]), ("TARGET", [5])]))

    assert_frame_equal(df_expected, actual)
Example #7: empty header from a pyarrow schema
def test_create_empty_header_from_pyarrow_schema(store_factory):
    # GH228
    df = pd.DataFrame(
        [{"part": 1, "id": 1, "col1": "abc"}, {"part": 2, "id": 2, "col1": np.nan}]
    )
    dataset_uuid = "sample_ds"
    schema = pa.Schema.from_pandas(df)

    dm = create_empty_dataset_header(
        store=store_factory,
        dataset_uuid=dataset_uuid,
        table_meta={"table": schema},
        partition_on=["part"],
    )

    new_partitions = [
        write_single_partition(
            store=store_factory,
            dataset_uuid=dataset_uuid,
            data=[{"table": df.loc[df["part"] == 1]}],
            partition_on=["part"],
        )
    ]
    assert len(dm.partitions) == 0
    dm = commit_dataset(
        store=store_factory,
        dataset_uuid=dataset_uuid,
        new_partitions=new_partitions,
        partition_on=["part"],
    )

    assert len(dm.partitions) == 1
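As a follow-up, the committed partition can be read back with read_table; a sketch under the same assumptions as the test above (the predicates value is illustrative):

df_read = read_table(
    store=store_factory,
    dataset_uuid="sample_ds",
    table="table",
    predicates=[[("part", "==", 1)]],  # predicate pushdown on the partition column
)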