def _rollback_transaction(existing_datasets, new_datasets, store):
    """
    Rollback changes made during the write process.

    Parameters
    ----------
    existing_datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that existed before the write process started.
    new_datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that were created / changed during the write process.
    store: Union[Callable[[], simplekv.KeyValueStore], simplekv.KeyValueStore]
        KV store.
    """
    if callable(store):
        store = store()

    # delete newly created datasets that were not present before the "transaction"
    for ktk_cube_dataset_id in sorted(set(new_datasets) - set(existing_datasets)):
        store.delete(metadata_key_from_uuid(new_datasets[ktk_cube_dataset_id].uuid))

    # restore the previous state of datasets that already existed before the write
    for ktk_cube_dataset_id in sorted(set(new_datasets) & set(existing_datasets)):
        ds = existing_datasets[ktk_cube_dataset_id]
        builder = DatasetMetadataBuilder.from_dataset(ds)
        store.put(*builder.to_json())
        store_schema_metadata(
            schema=ds.schema, dataset_uuid=ds.uuid, store=store, table=ds.table_name
        )
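# --- Usage sketch (not part of kartothek): how a cube write path might wrap
# _rollback_transaction. `_write_new_datasets` is a hypothetical placeholder for
# whatever step produces the new/updated dataset metadata.
def _commit_with_rollback(existing_datasets, store):
    new_datasets = {}
    try:
        # placeholder for the actual write/update step
        new_datasets = _write_new_datasets(existing_datasets, store)
    except Exception:
        # undo partially written metadata so the store ends up in its prior state
        _rollback_transaction(existing_datasets, new_datasets, store)
        raise
    return new_datasets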
def create_empty_dataset_header(
    store,
    dataset_uuid,
    table_meta,
    partition_on=None,
    metadata=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
):
    """
    Create a dataset header without any partitions.

    This may be used in combination with
    :func:`~kartothek.io.eager.write_single_partition` to create implicitly
    partitioned datasets.

    .. note::

        The created dataset will **always** have ``explicit_partitions==False``

    .. warning::

        This function should only be used on very rare occasions. Usually you are
        better off using full end-to-end pipelines.

    Parameters
    ----------
    """
    store = lazy_store(store)()
    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    for table, schema in table_meta.items():
        table_meta[table] = make_meta(schema, origin=table, partition_keys=partition_on)
        store_schema_metadata(
            schema=table_meta[table],
            dataset_uuid=dataset_uuid,
            store=store,
            table=table,
        )
    dataset_builder = DatasetMetadataBuilder(
        uuid=dataset_uuid,
        metadata_version=metadata_version,
        partition_keys=partition_on,
        explicit_partitions=False,
        table_meta=table_meta,
    )
    if metadata:
        for key, value in metadata.items():
            dataset_builder.add_metadata(key, value)
    if metadata_storage_format.lower() == "json":
        store.put(*dataset_builder.to_json())
    elif metadata_storage_format.lower() == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format
            )
        )
    return dataset_builder.to_dataset()
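# --- Usage sketch (assumptions: the storefact URL and the example schema are
# illustrative only; `get_store_from_url` is storefact's store factory and
# kartothek's `make_meta` accepts a pandas DataFrame as a schema source).
import pandas as pd
from storefact import get_store_from_url


def _example_create_empty_header():
    store_factory = lambda: get_store_from_url("hfs:///tmp/ktk_example")
    # A small frame is enough to derive the table schema for the empty header.
    schema_df = pd.DataFrame({"L": [1], "P": [1], "value": [1.0]})
    return create_empty_dataset_header(
        store=store_factory,
        dataset_uuid="my_dataset",
        table_meta={"table": schema_df},
        partition_on=["L", "P"],
    )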
def test_builder_empty_partition_keys(store, metadata_version, frozen_time):
    expected = {
        "dataset_uuid": "uuid",
        "dataset_metadata_version": metadata_version,
        "metadata": {"creation_time": TIME_TO_FREEZE_ISO},
        "partition_keys": ["L", "P"],
        "partitions": {},
    }
    builder = DatasetMetadataBuilder(
        "uuid", metadata_version=4, partition_keys=["L", "P"]
    )
    key, result = builder.to_json()
    result = simplejson.loads(result)
    assert key == "uuid.by-dataset-metadata.json"
    assert result == expected

    result_from_dict = DatasetMetadata.load_from_dict(result, store).to_dict()
    assert result_from_dict == expected
def test_builder_msgpack(metadata_version, frozen_time):
    creation_time = TIME_TO_FREEZE_ISO
    expected = {
        "dataset_uuid": "uuid",
        "dataset_metadata_version": metadata_version,
        "metadata": {"creation_time": creation_time},
        "partitions": {},
    }
    key, result = DatasetMetadataBuilder(
        "uuid", metadata_version=metadata_version
    ).to_msgpack()
    result = msgpack.unpackb(result)
    assert key == "uuid.by-dataset-metadata.msgpack.zstd"
    assert result == expected
def test_builder_to_dataset(metadata_version, frozen_time):
    expected = {
        "dataset_uuid": "uuid",
        "dataset_metadata_version": metadata_version,
        "partitions": {"part_2": {"files": {"core": "uuid/core/part_2.parquet"}}},
        "metadata": {"key": "value", "creation_time": TIME_TO_FREEZE_ISO},
        "indices": {"col1": {"a": ["part1"], "b": ["part2"]}},
    }

    builder = DatasetMetadataBuilder("uuid", metadata_version=metadata_version)
    part_2 = Partition("part_2", {"core": "uuid/core/part_2.parquet"})
    builder.add_partition("part_2", part_2)
    builder.add_metadata("key", "value")
    builder.add_embedded_index(
        "col1", ExplicitSecondaryIndex("col1", {"a": ["part1"], "b": ["part2"]})
    )

    result = builder.to_dataset()
    expected_from_dict = DatasetMetadata.from_dict(expected)
    assert result == expected_from_dict
def test_builder_empty(explicit_partitions, metadata_version, frozen_time):
    creation_time = TIME_TO_FREEZE_ISO
    expected = {
        "dataset_uuid": "uuid",
        "dataset_metadata_version": metadata_version,
        "metadata": {"creation_time": creation_time},
    }
    if explicit_partitions:
        expected["partitions"] = {}

    key, result = DatasetMetadataBuilder(
        "uuid",
        metadata_version=metadata_version,
        explicit_partitions=explicit_partitions,
    ).to_json()
    result = simplejson.loads(result)
    assert key == "uuid.by-dataset-metadata.json"
    assert result == expected
def store_dataset_from_partitions(
    partition_list,
    store,
    dataset_uuid,
    dataset_metadata=None,
    metadata_merger=None,
    update_dataset=None,
    remove_partitions=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
):
    store = _instantiate_store(store)

    if update_dataset:
        dataset_builder = DatasetMetadataBuilder.from_dataset(update_dataset)
        metadata_version = dataset_builder.metadata_version
    else:
        mp = next(iter(partition_list), None)
        if mp is None:
            raise ValueError(
                "Cannot store empty datasets, partition_list must not be empty if in store mode."
            )

        metadata_version = mp.metadata_version
        dataset_builder = DatasetMetadataBuilder(
            uuid=dataset_uuid,
            metadata_version=metadata_version,
            partition_keys=mp.partition_keys,
        )

    dataset_builder.explicit_partitions = True

    dataset_builder.table_meta = persist_common_metadata(
        partition_list, update_dataset, store, dataset_uuid
    )

    # We can only check for non-unique partition labels here, and if they occur we
    # fail hard. The resulting dataset may be corrupted or files may be left in the
    # store without dataset metadata.
    partition_labels = partition_labels_from_mps(partition_list)
    non_unique_labels = extract_duplicates(partition_labels)

    if non_unique_labels:
        raise ValueError(
            "The labels {} are duplicated. Dataset metadata was not written.".format(
                ", ".join(non_unique_labels)
            )
        )

    if remove_partitions is None:
        remove_partitions = []

    if metadata_merger is None:
        metadata_merger = combine_metadata

    dataset_builder = update_metadata(
        dataset_builder, metadata_merger, partition_list, dataset_metadata
    )
    dataset_builder = update_partitions(
        dataset_builder, partition_list, remove_partitions
    )
    dataset_builder = update_indices(
        dataset_builder, store, partition_list, remove_partitions
    )

    if metadata_storage_format.lower() == "json":
        store.put(*dataset_builder.to_json())
    elif metadata_storage_format.lower() == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format
            )
        )

    dataset = dataset_builder.to_dataset()
    return dataset
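# --- Reference sketch (assumption): the duplicate-label guard above only needs a
# count over the partition labels. The real `extract_duplicates` helper lives in
# kartothek's utilities and may differ in detail; this stand-in only illustrates
# the intended behaviour.
from collections import Counter
from typing import Iterable, List


def _extract_duplicates_sketch(labels: Iterable[str]) -> List[str]:
    # Return each label that occurs more than once (one entry per duplicated label).
    counts = Counter(labels)
    return [label for label, n in counts.items() if n > 1]


assert _extract_duplicates_sketch(["p1", "p2", "p1"]) == ["p1"]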
def store_dataset_from_partitions(
    partition_list,
    store: StoreInput,
    dataset_uuid,
    dataset_metadata=None,
    metadata_merger=None,
    update_dataset=None,
    remove_partitions=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
):
    store = ensure_store(store)

    schemas = set()
    if update_dataset:
        dataset_builder = DatasetMetadataBuilder.from_dataset(update_dataset)
        metadata_version = dataset_builder.metadata_version
        table_name = update_dataset.table_name
        schemas.add(update_dataset.schema)
    else:
        mp = next(iter(partition_list), None)
        if mp is None:
            raise ValueError(
                "Cannot store empty datasets, partition_list must not be empty if in store mode."
            )

        table_name = mp.table_name
        metadata_version = mp.metadata_version
        dataset_builder = DatasetMetadataBuilder(
            uuid=dataset_uuid,
            metadata_version=metadata_version,
            partition_keys=mp.partition_keys,
        )

    for mp in partition_list:
        if mp.schema:
            schemas.add(mp.schema)

    dataset_builder.schema = persist_common_metadata(
        schemas=schemas,
        update_dataset=update_dataset,
        store=store,
        dataset_uuid=dataset_uuid,
        table_name=table_name,
    )

    # We can only check for non-unique partition labels here, and if they occur we
    # fail hard. The resulting dataset may be corrupted or files may be left in the
    # store without dataset metadata.
    partition_labels = partition_labels_from_mps(partition_list)

    # This check could be safely removed since we no longer allow the user to set
    # the labels directly; removing it would have implications on tests if mocks are used.
    non_unique_labels = extract_duplicates(partition_labels)

    if non_unique_labels:
        raise ValueError(
            "The labels {} are duplicated. Dataset metadata was not written.".format(
                ", ".join(non_unique_labels)
            )
        )

    if remove_partitions is None:
        remove_partitions = []

    if metadata_merger is None:
        metadata_merger = combine_metadata

    dataset_builder = update_metadata(dataset_builder, metadata_merger, dataset_metadata)
    dataset_builder = update_partitions(
        dataset_builder, partition_list, remove_partitions
    )
    dataset_builder = update_indices(
        dataset_builder, store, partition_list, remove_partitions
    )
    if metadata_storage_format.lower() == "json":
        store.put(*dataset_builder.to_json())
    elif metadata_storage_format.lower() == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format
            )
        )
    dataset = dataset_builder.to_dataset()
    return dataset
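# --- Caller-side sketch (hypothetical): the function above is an internal commit
# step. Pipelines first materialize a list of MetaPartition objects (which have
# already written their Parquet files to `store`) and only then hand them over here.
def _commit_dataset(store, dataset_uuid, metapartitions, user_metadata=None):
    # `metapartitions` is assumed to be a non-empty list of kartothek MetaPartition objects.
    return store_dataset_from_partitions(
        partition_list=metapartitions,
        store=store,
        dataset_uuid=dataset_uuid,
        dataset_metadata=user_metadata,
    )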
def test_builder_full(metadata_version, frozen_time):
    expected = {
        "dataset_uuid": "uuid",
        "dataset_metadata_version": metadata_version,
        "partitions": {
            "run_id=1/L=1/P=1/part_1": {
                "files": {
                    "core": "uuid/core/run_id=1/L=1/P=1/part_1.parquet",
                    "helper": "uuid/helper/run_id=1/L=1/P=1/part_1.parquet",
                }
            }
        },
        "metadata": {"key": "value", "creation_time": TIME_TO_FREEZE_ISO},
        "indices": {
            "col1": {
                "a": ["run_id=1/L=1/P=1/part_1"],
                "b": ["run_id=2/L=1/P=1/part_1"],
            },
            "col2": "uuid.col2.by-dataset-index.parquet",
        },
        "partition_keys": ["L", "P"],
    }
    builder = DatasetMetadataBuilder(
        "uuid", metadata_version=metadata_version, partition_keys=["L", "P"]
    )
    part_2 = Partition(
        label="run_id=1/L=1/P=1/part_1",
        files={
            "core": "uuid/core/run_id=1/L=1/P=1/part_1.parquet",
            "helper": "uuid/helper/run_id=1/L=1/P=1/part_1.parquet",
        },
    )
    builder.add_partition("run_id=1/L=1/P=1/part_1", part_2)
    builder.add_metadata("key", "value")
    builder.add_external_index("col2")
    builder.add_embedded_index(
        "col1",
        ExplicitSecondaryIndex(
            "col1",
            {"a": ["run_id=1/L=1/P=1/part_1"], "b": ["run_id=2/L=1/P=1/part_1"]},
        ),
    )
    key, result = builder.to_json()
    result = simplejson.loads(result)
    assert key == "uuid.by-dataset-metadata.json"
    assert result == expected
def copy_dataset(
    source_dataset_uuid: str,
    store: KeyValueStore,
    target_dataset_uuid: Optional[str] = None,
    target_store: Optional[KeyValueStore] = None,
) -> Dict[str, DatasetMetadata]:
    """
    Copies and optionally renames a dataset, either from one store to another or
    within one store.

    Parameters
    ----------
    source_dataset_uuid: str
        UUID of the source dataset
    store: simplekv.KeyValueStore
        Source store
    target_dataset_uuid: Optional[str]
        UUID of the target dataset. May be the same as source_dataset_uuid if
        store and target_store differ. If None, source_dataset_uuid is used.
    target_store: Optional[simplekv.KeyValueStore]
        Target store. May be the same as store if source_dataset_uuid and
        target_dataset_uuid differ. If None, the value of the store parameter is used.
    """
    if target_dataset_uuid is None:
        target_dataset_uuid = source_dataset_uuid
    if target_store is None:
        target_store = store

    if (source_dataset_uuid == target_dataset_uuid) and (store == target_store):
        raise ValueError(
            "Cannot copy to a dataset with the same UUID within the same store!"
        )

    ds_factory_source = _ensure_factory(
        dataset_uuid=source_dataset_uuid,
        store=store,
        factory=None,
        load_dataset_metadata=True,
    )

    # Create a dict of {source key: target key} entries
    keys = get_dataset_keys(ds_factory_source.dataset_metadata)
    mapped_keys = {
        source_key: source_key.replace(source_dataset_uuid, target_dataset_uuid)
        for source_key in keys
    }

    # Create a dict of metadata which has to be changed. This is only the
    # <uuid>.by-dataset-metadata.json file
    md_transformed = {
        f"{target_dataset_uuid}{METADATA_BASE_SUFFIX}{METADATA_FORMAT_JSON}": DatasetMetadataBuilder.from_dataset(
            ds_factory_source.dataset_metadata
        )
        .modify_uuid(target_dataset_uuid)
        .to_dataset()
    }

    # Copy the keys from one store to another
    copy_rename_keys(mapped_keys, store, target_store, md_transformed)

    return md_transformed
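# --- Usage sketch (assumptions: "my_dataset" already exists in the source store;
# the storefact URLs are illustrative only).
from storefact import get_store_from_url


def _example_copy_dataset():
    source_store = get_store_from_url("hfs:///data/ktk_source")
    target_store = get_store_from_url("hfs:///data/ktk_backup")
    # Copy and rename in one step; the result maps the rewritten metadata key to
    # the renamed DatasetMetadata object.
    return copy_dataset(
        source_dataset_uuid="my_dataset",
        store=source_store,
        target_dataset_uuid="my_dataset_copy",
        target_store=target_store,
    )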