Example #1
def _rollback_transaction(existing_datasets, new_datasets, store):
    """
    Roll back changes made during the write process.

    Parameters
    ----------
    existing_datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that existed before the write process started.
    new_datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that were created / changed during the write process.
    store: Union[Callable[[], simplekv.KeyValueStore], simplekv.KeyValueStore]
        KV store.
    """
    if callable(store):
        store = store()

    # delete newly created datasets that were not present before the "transaction"
    for ktk_cube_dataset_id in sorted(
            set(new_datasets) - set(existing_datasets)):
        store.delete(
            metadata_key_from_uuid(new_datasets[ktk_cube_dataset_id].uuid))

    # restore the previous state of datasets that existed before the write
    for ktk_cube_dataset_id in sorted(
            set(new_datasets) & set(existing_datasets)):
        ds = existing_datasets[ktk_cube_dataset_id]
        builder = DatasetMetadataBuilder.from_dataset(ds)
        store.put(*builder.to_json())
        store_schema_metadata(schema=ds.schema,
                              dataset_uuid=ds.uuid,
                              store=store,
                              table=ds.table_name)
Example #2
def create_empty_dataset_header(
    store,
    dataset_uuid,
    table_meta,
    partition_on=None,
    metadata=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
):
    """
    Create a dataset header without any partitions. This may be used in combination
    with :func:`~kartothek.io.eager.write_single_partition` to create implicitly partitioned datasets.

    .. note::

        The created dataset will **always** have explicit_partitions==False

    .. warning::

        This function should only be used on very rare occasions. Usually you're better off using
        full end-to-end pipelines.

    Parameters
    ----------
    """
    store = lazy_store(store)()
    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    for table, schema in table_meta.items():
        table_meta[table] = make_meta(schema,
                                      origin=table,
                                      partition_keys=partition_on)
        store_schema_metadata(
            schema=table_meta[table],
            dataset_uuid=dataset_uuid,
            store=store,
            table=table,
        )
    dataset_builder = DatasetMetadataBuilder(
        uuid=dataset_uuid,
        metadata_version=metadata_version,
        partition_keys=partition_on,
        explicit_partitions=False,
        table_meta=table_meta,
    )
    if metadata:
        for key, value in metadata.items():
            dataset_builder.add_metadata(key, value)
    if metadata_storage_format.lower() == "json":
        store.put(*dataset_builder.to_json())
    elif metadata_storage_format.lower() == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format))
    return dataset_builder.to_dataset()
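As the docstring notes, this pairs with :func:`~kartothek.io.eager.write_single_partition`. The sketch below shows that combination under two assumptions that may not hold for every kartothek version: that `table_meta` accepts a pandas DataFrame as the schema source, and that the `data` argument of `write_single_partition` accepts a mapping of table name to DataFrame.

import pandas as pd
from simplekv.memory import DictStore
from kartothek.io.eager import create_empty_dataset_header, write_single_partition

store = DictStore()
df = pd.DataFrame({"part": [1], "payload": ["x"]})

# Write the dataset header first; the schema is derived from an example frame.
# Passing a DataFrame here is an assumption about what make_meta accepts and
# may require an explicit pyarrow schema in other versions.
create_empty_dataset_header(
    store=store,
    dataset_uuid="my_dataset",
    table_meta={"table": df},
)

# Afterwards, partitions can be written one at a time. The shape of ``data``
# is version dependent and is assumed here to be {table_name: DataFrame}.
write_single_partition(
    store=store,
    dataset_uuid="my_dataset",
    data={"table": df},
)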
Example #3
def test_builder_empty_partition_keys(store, metadata_version, frozen_time):
    expected = {
        "dataset_uuid": "uuid",
        "dataset_metadata_version": metadata_version,
        "metadata": {
            "creation_time": TIME_TO_FREEZE_ISO
        },
        "partition_keys": ["L", "P"],
        "partitions": {},
    }

    builder = DatasetMetadataBuilder("uuid",
                                     metadata_version=4,
                                     partition_keys=["L", "P"])
    key, result = builder.to_json()
    result = simplejson.loads(result)
    assert key == "uuid.by-dataset-metadata.json"
    assert result == expected
    result_from_dict = DatasetMetadata.load_from_dict(result, store).to_dict()
    assert result_from_dict == expected
Example #4
def test_builder_msgpack(metadata_version, frozen_time):
    creation_time = TIME_TO_FREEZE_ISO
    expected = {
        "dataset_uuid": "uuid",
        "dataset_metadata_version": metadata_version,
        "metadata": {
            "creation_time": creation_time
        },
        "partitions": {},
    }
    key, result = DatasetMetadataBuilder(
        "uuid", metadata_version=metadata_version).to_msgpack()
    result = msgpack.unpackb(result)
    assert key == "uuid.by-dataset-metadata.msgpack.zstd"
    assert result == expected
Example #5
def test_builder_to_dataset(metadata_version, frozen_time):
    expected = {
        "dataset_uuid": "uuid",
        "dataset_metadata_version": metadata_version,
        "partitions": {
            "part_2": {
                "files": {
                    "core": "uuid/core/part_2.parquet"
                }
            }
        },
        "metadata": {
            "key": "value",
            "creation_time": TIME_TO_FREEZE_ISO
        },
        "indices": {
            "col1": {
                "a": ["part1"],
                "b": ["part2"]
            }
        },
    }

    builder = DatasetMetadataBuilder("uuid", metadata_version=metadata_version)
    part_2 = Partition("part_2", {"core": "uuid/core/part_2.parquet"})
    builder.add_partition("part_2", part_2)
    builder.add_metadata("key", "value")
    builder.add_embedded_index(
        "col1", ExplicitSecondaryIndex("col1", {
            "a": ["part1"],
            "b": ["part2"]
        }))

    result = builder.to_dataset()
    expected_from_dict = DatasetMetadata.from_dict(expected)
    assert result == expected_from_dict
Example #6
def test_builder_empty(explicit_partitions, metadata_version, frozen_time):
    creation_time = TIME_TO_FREEZE_ISO
    expected = {
        "dataset_uuid": "uuid",
        "dataset_metadata_version": metadata_version,
        "metadata": {
            "creation_time": creation_time
        },
    }
    if explicit_partitions:
        expected["partitions"] = {}
    key, result = DatasetMetadataBuilder(
        "uuid",
        metadata_version=metadata_version,
        explicit_partitions=explicit_partitions,
    ).to_json()
    result = simplejson.loads(result)
    assert key == "uuid.by-dataset-metadata.json"
    assert result == expected
Example #7
File: write.py Project: x-malet/kartothek
def store_dataset_from_partitions(
    partition_list,
    store,
    dataset_uuid,
    dataset_metadata=None,
    metadata_merger=None,
    update_dataset=None,
    remove_partitions=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
):
    store = _instantiate_store(store)

    if update_dataset:
        dataset_builder = DatasetMetadataBuilder.from_dataset(update_dataset)
        metadata_version = dataset_builder.metadata_version
    else:
        mp = next(iter(partition_list), None)
        if mp is None:
            raise ValueError(
                "Cannot store empty datasets, partition_list must not be empty if in store mode."
            )

        metadata_version = mp.metadata_version
        dataset_builder = DatasetMetadataBuilder(
            uuid=dataset_uuid,
            metadata_version=metadata_version,
            partition_keys=mp.partition_keys,
        )

    dataset_builder.explicit_partitions = True

    dataset_builder.table_meta = persist_common_metadata(
        partition_list, update_dataset, store, dataset_uuid)

    # We can only check for non-unique partition labels here, and if they occur we
    # fail hard. The resulting dataset may be corrupted, or files may be left in the
    # store without dataset metadata
    partition_labels = partition_labels_from_mps(partition_list)
    non_unique_labels = extract_duplicates(partition_labels)

    if non_unique_labels:
        raise ValueError(
            "The labels {} are duplicated. Dataset metadata was not written.".
            format(", ".join(non_unique_labels)))

    if remove_partitions is None:
        remove_partitions = []

    if metadata_merger is None:
        metadata_merger = combine_metadata

    dataset_builder = update_metadata(dataset_builder, metadata_merger,
                                      partition_list, dataset_metadata)
    dataset_builder = update_partitions(dataset_builder, partition_list,
                                        remove_partitions)
    dataset_builder = update_indices(dataset_builder, store, partition_list,
                                     remove_partitions)
    if metadata_storage_format.lower() == "json":
        store.put(*dataset_builder.to_json())
    elif metadata_storage_format.lower() == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format))
    dataset = dataset_builder.to_dataset()
    return dataset
Example #8
def store_dataset_from_partitions(
    partition_list,
    store: StoreInput,
    dataset_uuid,
    dataset_metadata=None,
    metadata_merger=None,
    update_dataset=None,
    remove_partitions=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
):
    store = ensure_store(store)

    schemas = set()
    if update_dataset:
        dataset_builder = DatasetMetadataBuilder.from_dataset(update_dataset)
        metadata_version = dataset_builder.metadata_version
        table_name = update_dataset.table_name
        schemas.add(update_dataset.schema)
    else:
        mp = next(iter(partition_list), None)

        if mp is None:
            raise ValueError(
                "Cannot store empty datasets, partition_list must not be empty if in store mode."
            )
        table_name = mp.table_name
        metadata_version = mp.metadata_version
        dataset_builder = DatasetMetadataBuilder(
            uuid=dataset_uuid,
            metadata_version=metadata_version,
            partition_keys=mp.partition_keys,
        )

    for mp in partition_list:
        if mp.schema:
            schemas.add(mp.schema)

    dataset_builder.schema = persist_common_metadata(
        schemas=schemas,
        update_dataset=update_dataset,
        store=store,
        dataset_uuid=dataset_uuid,
        table_name=table_name,
    )

    # We can only check for non-unique partition labels here, and if they occur we
    # fail hard. The resulting dataset may be corrupted, or files may be left in the
    # store without dataset metadata
    partition_labels = partition_labels_from_mps(partition_list)

    # This could safely be removed since we no longer allow the user to set this.
    # It has implications on tests if mocks are used
    non_unique_labels = extract_duplicates(partition_labels)

    if non_unique_labels:
        raise ValueError(
            "The labels {} are duplicated. Dataset metadata was not written.".
            format(", ".join(non_unique_labels)))

    if remove_partitions is None:
        remove_partitions = []

    if metadata_merger is None:
        metadata_merger = combine_metadata

    dataset_builder = update_metadata(dataset_builder, metadata_merger,
                                      dataset_metadata)
    dataset_builder = update_partitions(dataset_builder, partition_list,
                                        remove_partitions)
    dataset_builder = update_indices(dataset_builder, store, partition_list,
                                     remove_partitions)
    if metadata_storage_format.lower() == "json":
        store.put(*dataset_builder.to_json())
    elif metadata_storage_format.lower() == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format))
    dataset = dataset_builder.to_dataset()
    return dataset
Example #9
def test_builder_full(metadata_version, frozen_time):
    expected = {
        "dataset_uuid": "uuid",
        "dataset_metadata_version": metadata_version,
        "partitions": {
            "run_id=1/L=1/P=1/part_1": {
                "files": {
                    "core": "uuid/core/run_id=1/L=1/P=1/part_1.parquet",
                    "helper": "uuid/helper/run_id=1/L=1/P=1/part_1.parquet",
                }
            }
        },
        "metadata": {
            "key": "value",
            "creation_time": TIME_TO_FREEZE_ISO
        },
        "indices": {
            "col1": {
                "a": ["run_id=1/L=1/P=1/part_1"],
                "b": ["run_id=2/L=1/P=1/part_1"],
            },
            "col2": "uuid.col2.by-dataset-index.parquet",
        },
        "partition_keys": ["L", "P"],
    }

    builder = DatasetMetadataBuilder("uuid",
                                     metadata_version=metadata_version,
                                     partition_keys=["L", "P"])
    part_2 = Partition(
        label="run_id=1/L=1/P=1/part_1",
        files={
            "core": "uuid/core/run_id=1/L=1/P=1/part_1.parquet",
            "helper": "uuid/helper/run_id=1/L=1/P=1/part_1.parquet",
        },
    )
    builder.add_partition("run_id=1/L=1/P=1/part_1", part_2)
    builder.add_metadata("key", "value")
    builder.add_external_index("col2")
    builder.add_embedded_index(
        "col1",
        ExplicitSecondaryIndex("col1", {
            "a": ["run_id=1/L=1/P=1/part_1"],
            "b": ["run_id=2/L=1/P=1/part_1"]
        }),
    )
    key, result = builder.to_json()
    result = simplejson.loads(result)
    assert key == "uuid.by-dataset-metadata.json"
    assert result == expected
Example #10
def copy_dataset(
    source_dataset_uuid: str,
    store: KeyValueStore,
    target_dataset_uuid: Optional[str] = None,
    target_store: Optional[KeyValueStore] = None,
) -> Dict[str, DatasetMetadata]:
    """
    Copies and optionally renames a dataset, either from one store to another or
    within one store.

    Parameters
    ----------
    source_dataset_uuid: str
        UUID of source dataset
    store: simplekv.KeyValueStore
        Source store
    target_dataset_uuid: Optional[str]
        UUID of the target dataset. May be the same as source_dataset_uuid if store
        and target_store are different. If not given, source_dataset_uuid is used.
    target_store: Optional[simplekv.KeyValueStore]
        Target store. May be the same as store if source_dataset_uuid and
        target_dataset_uuid are different. If not given, the value of the store
        parameter is used.
    """
    if target_dataset_uuid is None:
        target_dataset_uuid = source_dataset_uuid
    if target_store is None:
        target_store = store

    if (source_dataset_uuid == target_dataset_uuid) and (store == target_store):
        raise ValueError(
            "Cannot copy to a dataset with the same UUID within the same store!"
        )

    ds_factory_source = _ensure_factory(
        dataset_uuid=source_dataset_uuid,
        store=store,
        factory=None,
        load_dataset_metadata=True,
    )

    # Create a dict of {source key: target key} entries
    keys = get_dataset_keys(ds_factory_source.dataset_metadata)
    mapped_keys = {
        source_key: source_key.replace(source_dataset_uuid,
                                       target_dataset_uuid)
        for source_key in keys
    }

    # Create a dict of metadata which has to be changed. This is only the
    # <uuid>.by-dataset-metadata.json file

    md_transformed = {
        f"{target_dataset_uuid}{METADATA_BASE_SUFFIX}{METADATA_FORMAT_JSON}":
        DatasetMetadataBuilder.from_dataset(
            ds_factory_source.dataset_metadata).modify_uuid(
                target_dataset_uuid).to_dataset()
    }
    # Copy the keys from one store to another
    copy_rename_keys(mapped_keys, store, target_store, md_transformed)

    return md_transformed
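A minimal usage sketch for copy_dataset follows; it assumes the source store already contains a dataset with UUID "source_uuid" and uses in-memory stores purely for illustration.

from simplekv.memory import DictStore

src_store = DictStore()   # assumed to already hold the dataset "source_uuid"
tgt_store = DictStore()

copied = copy_dataset(
    source_dataset_uuid="source_uuid",
    store=src_store,
    target_dataset_uuid="target_uuid",
    target_store=tgt_store,
)
# ``copied`` maps the rewritten metadata key to the renamed DatasetMetadata.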