Example #1
def _rollback_transaction(existing_datasets, new_datasets, store):
    """
    Roll back changes made during the write process.

    Parameters
    ----------
    existing_datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that existed before the write process started.
    new_datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that were created / changed during the write process.
    store: Union[Callable[[], simplekv.KeyValueStore], simplekv.KeyValueStore]
        KV store.
    """
    if callable(store):
        store = store()

    # delete newly created datasets that were not present before the "transaction"
    for ktk_cube_dataset_id in sorted(
            set(new_datasets) - set(existing_datasets)):
        store.delete(
            metadata_key_from_uuid(new_datasets[ktk_cube_dataset_id].uuid))

    # recover changes of old datasets
    for ktk_cube_dataset_id in sorted(
            set(new_datasets) & set(existing_datasets)):
        ds = existing_datasets[ktk_cube_dataset_id]
        builder = DatasetMetadataBuilder.from_dataset(ds)
        store.put(*builder.to_json())
        store_schema_metadata(schema=ds.schema,
                              dataset_uuid=ds.uuid,
                              store=store,
                              table=ds.table_name)
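
A minimal usage sketch (not part of the snippet above): wrap a write so that a failure restores the previous state. The callable do_write is a hypothetical placeholder that performs the actual write and returns the datasets it created or changed.

def write_with_rollback(existing_datasets, store, do_write):
    new_datasets = {}
    try:
        # `do_write` is assumed to return Dict[str, DatasetMetadata] of the
        # datasets it created or changed during this "transaction".
        new_datasets = do_write(store)
        return new_datasets
    except Exception:
        # Undo newly created datasets and restore overwritten metadata.
        _rollback_transaction(existing_datasets, new_datasets, store)
        raise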
Example #2
def update_indices_from_partitions(partition_list, dataset_metadata_factory):
    """
    Take the indices from a partition list and overwrite all indices in the dataset
    metadata provided by the dataset metadata factory. The updated metadata is also
    persisted to the store. This is used in an additional index build step (by the
    build_dataset_indices__pipeline) which should be run after updating the
    partitions of a dataset.
    """

    dataset_indices = MetaPartition.merge_indices(partition_list)

    indices = persist_indices(
        store=dataset_metadata_factory.store,
        dataset_uuid=dataset_metadata_factory.uuid,
        indices=dataset_indices,
    )

    for column, storage_key in six.iteritems(indices):
        dataset_metadata_factory.indices[column] = ExplicitSecondaryIndex(
            column=column, index_storage_key=storage_key)

    dataset_metadata_factory.store.put(
        naming.metadata_key_from_uuid(dataset_metadata_factory.uuid),
        dataset_metadata_factory.to_json(),
    )
    return dataset_metadata_factory
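
A hedged usage sketch: after the partitions of a dataset have been changed, the helper above rebuilds and persists the secondary indices. The names updated_partitions and factory are assumptions, standing for a list of MetaPartition objects with built indices and a dataset metadata factory that exposes .store, .uuid and .indices as used above.

factory = update_indices_from_partitions(
    partition_list=updated_partitions,
    dataset_metadata_factory=factory,
)
# The returned factory now carries an ExplicitSecondaryIndex per indexed column,
# and the refreshed metadata JSON has been written back to the store.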
Example #3
    def load_from_store(
        uuid: str,
        store: StoreInput,
        load_schema: bool = True,
        load_all_indices: bool = False,
    ) -> "DatasetMetadata":
        """
        Load a dataset from a store.

        Parameters
        ----------
        uuid
            UUID of the dataset.
        store
            Object that implements the .get method for file/object loading.
        load_schema
            Load table schema
        load_all_indices
            Load all registered indices into memory.

        Returns
        -------
        dataset_metadata: :class:`~kartothek.core.dataset.DatasetMetadata`
            Parsed metadata.
        """
        key1 = naming.metadata_key_from_uuid(uuid)
        store = ensure_store(store)
        try:
            value = store.get(key1)
            metadata = load_json(value)
        except KeyError:
            key2 = naming.metadata_key_from_uuid(uuid, format="msgpack")
            try:
                value = store.get(key2)
                metadata = unpackb(value)
            except KeyError:
                raise KeyError(
                    "Dataset does not exist. Tried {} and {}".format(
                        key1, key2))

        ds = DatasetMetadata.load_from_dict(metadata,
                                            store,
                                            load_schema=load_schema)
        if load_all_indices:
            ds = ds.load_all_indices(store)
        return ds
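
A minimal read-path sketch, assuming load_from_store is exposed as a static method on DatasetMetadata and that a dataset named "my_dataset" has already been written; the in-memory DictStore from simplekv is used purely for illustration.

from simplekv.memory import DictStore

store = DictStore()  # assumed to already contain a dataset with UUID "my_dataset"
ds = DatasetMetadata.load_from_store(
    uuid="my_dataset",
    store=store,
    load_schema=True,
    load_all_indices=True,
)
print(ds.uuid, sorted(ds.indices))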
Example #4
    def exists(uuid: str, store: StoreInput) -> bool:
        """
        Check if a dataset exists in a store.

        Parameters
        ----------
        uuid
            UUID of the dataset.
        store
            Object that implements the .get method for file/object loading.

        """
        store = ensure_store(store)
        key = naming.metadata_key_from_uuid(uuid)

        if key in store:
            return True

        key = naming.metadata_key_from_uuid(uuid, format="msgpack")
        return key in store
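
A short sketch of guarding a read on existence, assuming exists and load_from_store are static methods on DatasetMetadata and store is any simplekv store (e.g. the DictStore from the previous sketch). Both the JSON and the msgpack metadata key are checked, as shown above.

if DatasetMetadata.exists("my_dataset", store):
    ds = DatasetMetadata.load_from_store("my_dataset", store)
else:
    raise ValueError("Dataset 'my_dataset' was not found in the store")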
Example #5
def raise_if_dataset_exists(dataset_uuid, store):
    """
    Raise a RuntimeError if metadata for the given dataset UUID already exists in the store.
    """
    try:
        store_instance = _instantiate_store(store)
        for form in ["msgpack", "json"]:
            key = naming.metadata_key_from_uuid(uuid=dataset_uuid, format=form)
            if key in store_instance:
                raise RuntimeError(
                    "Dataset `{}` already exists and overwrite is not permitted!".format(
                        dataset_uuid
                    )
                )
    except KeyError:
        pass
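
A hedged sketch of the intended call pattern before a non-overwriting write; store_factory is an assumed name for whatever _instantiate_store accepts (a simplekv store or a zero-argument factory returning one).

raise_if_dataset_exists(dataset_uuid="my_dataset", store=store_factory)
# If neither the msgpack nor the JSON metadata key is present, execution
# continues and the write may proceed; otherwise a RuntimeError is raised.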
Example #6
    def exists(uuid, store):
        """
        Check if a dataset exists in a store.

        Parameters
        ----------
        uuid: str or unicode
            UUID of the dataset.
        store: Object
            Object that implements the .get method for file/object loading.

        Returns
        -------
        exists: bool
            Whether a metadata file could be found.
        """
        key = naming.metadata_key_from_uuid(uuid)

        if key in store:
            return True

        key = naming.metadata_key_from_uuid(uuid, format="msgpack")
        return key in store
Example #7
    def to_msgpack(self) -> Tuple[str, bytes]:
        """
        Render the dataset to msgpack.

        Returns
        -------
        storage_key: str
            The path where this metadata should be placed in the storage.
        dataset_msgpack: bytes
            The msgpack-serialized metadata of this dataset.
        """
        return (
            naming.metadata_key_from_uuid(self.uuid, format="msgpack"),
            packb(self.to_dict()),
        )
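
A small round-trip sketch, assuming ds is an object that exposes to_msgpack as above, store is a simplekv store, and unpackb is the same msgpack helper used in Example #3.

storage_key, payload = ds.to_msgpack()      # key carries the msgpack naming suffix
store.put(storage_key, payload)
metadata_dict = unpackb(store.get(storage_key))  # back to a plain dict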
Example #8
    def to_json(self):
        """
        Render the dataset to JSON.

        Returns
        -------
        storage_key: str
            The path where this metadata should be placed in the storage.
        dataset_json: bytes
            The rendered, UTF-8 encoded JSON for this dataset.
        """
        return (
            naming.metadata_key_from_uuid(self.uuid),
            simplejson.dumps(self.to_dict()).encode("utf-8"),
        )
Example #9
def delete_top_level_metadata(dataset_factory, *args):
    """
    The additional arguments allow to schedule this function with delayed objects.
    """
    dataset_factory.store.delete(
        metadata_key_from_uuid(dataset_factory.dataset_uuid))
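
Because the extra positional arguments only enforce ordering, the deletion can be chained behind other delayed cleanup tasks. A hedged sketch using dask.delayed; partition_delete_tasks is an assumed list of delayed objects that remove the dataset's remaining files.

import dask

final = dask.delayed(delete_top_level_metadata)(dataset_factory, *partition_delete_tasks)
final.compute()  # the metadata key is deleted only after all cleanup tasks have run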