def store_dataset_from_ddf(
    ddf: dd.DataFrame,
    store: StoreInput,
    dataset_uuid: str,
    table: str = SINGLE_TABLE,
    secondary_indices: Optional[List[str]] = None,
    shuffle: bool = False,
    repartition_ratio: Optional[SupportsFloat] = None,
    num_buckets: int = 1,
    sort_partitions_by: Optional[Union[List[str], str]] = None,
    delete_scope: Optional[Iterable[Mapping[str, str]]] = None,
    metadata: Optional[Mapping] = None,
    df_serializer: Optional[DataFrameSerializer] = None,
    metadata_merger: Optional[Callable] = None,
    metadata_version: int = DEFAULT_METADATA_VERSION,
    partition_on: Optional[List[str]] = None,
    bucket_by: Optional[Union[List[str], str]] = None,
    overwrite: bool = False,
):
    """
    Store a dataset from a dask.dataframe.
    """
    partition_on = normalize_arg("partition_on", partition_on)
    secondary_indices = normalize_arg("secondary_indices", secondary_indices)
    sort_partitions_by = normalize_arg("sort_partitions_by", sort_partitions_by)
    bucket_by = normalize_arg("bucket_by", bucket_by)
    store = normalize_arg("store", store)
    delete_scope = dask.delayed(normalize_arg)("delete_scope", delete_scope)

    if table is None:
        raise TypeError("The parameter `table` is not optional.")

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=None,
        load_dataset_metadata=True,
    )

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    mps = _write_dataframe_partitions(
        ddf=ddf,
        store=store,
        dataset_uuid=dataset_uuid,
        table=table,
        secondary_indices=secondary_indices,
        shuffle=shuffle,
        repartition_ratio=repartition_ratio,
        num_buckets=num_buckets,
        sort_partitions_by=sort_partitions_by,
        df_serializer=df_serializer,
        metadata_version=metadata_version,
        partition_on=partition_on,
        bucket_by=bucket_by,
    )
    return dask.delayed(store_dataset_from_partitions)(
        mps,
        store=ds_factory.store_factory if ds_factory else store,
        dataset_uuid=ds_factory.dataset_uuid if ds_factory else dataset_uuid,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
    )

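
# A minimal usage sketch for the dask.dataframe writer above. The module path
# ``kartothek.io.dask.dataframe`` and the in-memory store URL ``hmemory://`` are
# assumptions taken from typical kartothek setups; any simplekv-compatible store
# factory works the same way.
from functools import partial

import dask.dataframe as dd
import pandas as pd
from storefact import get_store_from_url

from kartothek.io.dask.dataframe import store_dataset_from_ddf

store_factory = partial(get_store_from_url, "hmemory://")  # in-memory store for illustration
ddf = dd.from_pandas(
    pd.DataFrame({"country": ["DE", "US", "DE"], "value": [1, 2, 3]}), npartitions=2
)

# Build the lazy write graph; compute() writes the partition files and then
# commits the dataset metadata in a single final task.
task = store_dataset_from_ddf(
    ddf, store=store_factory, dataset_uuid="ddf_example", partition_on=["country"]
)
dataset_metadata = task.compute()
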
def store_dataframes_as_dataset__iter(
    df_generator,
    store,
    dataset_uuid=None,
    metadata=None,
    partition_on=None,
    df_serializer=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
    secondary_indices=None,
):
    """
    Store `pd.DataFrame` s iteratively as a partitioned dataset with multiple tables (files).

    Useful for datasets which do not fit into memory.

    Parameters
    ----------

    Returns
    -------
    dataset: kartothek.core.dataset.DatasetMetadata
        The stored dataset.
    """
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    new_partitions = []
    for df in df_generator:
        mp = parse_input_to_metapartition(df, metadata_version=metadata_version)

        if partition_on:
            mp = mp.partition_on(partition_on)

        if secondary_indices:
            mp = mp.build_indices(secondary_indices)

        # Store the dataframe, thereby clearing it from the `mp` metapartition
        mp = mp.store_dataframes(
            store=store, dataset_uuid=dataset_uuid, df_serializer=df_serializer
        )

        # Add the `kartothek.io_components.metapartition.MetaPartition` object to the list to track partitions
        new_partitions.append(mp)

    # Store metadata and return the `kartothek.DatasetMetadata` object
    return store_dataset_from_partitions(
        partition_list=new_partitions,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_storage_format=metadata_storage_format,
    )

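
# A minimal usage sketch for the iterator-based writer above. The module path
# ``kartothek.io.iter`` and the ``hmemory://`` store URL are assumptions; each item
# yielded by the generator must be accepted by parse_input_to_metapartition, and a
# plain pandas DataFrame is assumed to work here.
from functools import partial

import pandas as pd
from storefact import get_store_from_url

from kartothek.io.iter import store_dataframes_as_dataset__iter

store_factory = partial(get_store_from_url, "hmemory://")


def chunked_frames():
    # Yield partitions one at a time so only a single chunk is held in memory.
    for day in ("2021-01-01", "2021-01-02"):
        yield pd.DataFrame({"date": [day] * 3, "value": [1, 2, 3]})


dataset = store_dataframes_as_dataset__iter(
    chunked_frames(),
    store=store_factory,
    dataset_uuid="iter_example",
    partition_on=["date"],
)
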
def store_delayed_as_dataset(
    delayed_tasks: List[Delayed],
    store,
    dataset_uuid=None,
    metadata=None,
    df_serializer=None,
    overwrite=False,
    metadata_merger=None,
    metadata_version=naming.DEFAULT_METADATA_VERSION,
    partition_on=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
    table_name: str = SINGLE_TABLE,
    secondary_indices=None,
) -> Delayed:
    """
    Transform and store a list of dictionaries containing
    dataframes to a kartothek dataset in store.

    Parameters
    ----------
    """
    store = lazy_store(store)
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    raise_if_indices_overlap(partition_on, secondary_indices)

    input_to_mps = partial(
        parse_input_to_metapartition,
        metadata_version=metadata_version,
        table_name=table_name,
    )
    mps = map_delayed(input_to_mps, delayed_tasks)

    if partition_on:
        mps = map_delayed(MetaPartition.partition_on, mps, partition_on=partition_on)

    if secondary_indices:
        mps = map_delayed(MetaPartition.build_indices, mps, columns=secondary_indices)

    mps = map_delayed(
        MetaPartition.store_dataframes,
        mps,
        store=store,
        df_serializer=df_serializer,
        dataset_uuid=dataset_uuid,
    )

    return delayed(store_dataset_from_partitions)(
        mps,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        metadata_storage_format=metadata_storage_format,
    )

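
# A minimal usage sketch for the delayed-based writer above. The module path
# ``kartothek.io.dask.delayed`` and the ``hmemory://`` store are assumptions; each
# delayed task just needs to produce input accepted by parse_input_to_metapartition,
# and a plain pandas DataFrame is assumed to work here.
from functools import partial

import pandas as pd
from dask import delayed
from storefact import get_store_from_url

from kartothek.io.dask.delayed import store_delayed_as_dataset

store_factory = partial(get_store_from_url, "hmemory://")


def load_chunk(i):
    # Placeholder for expensive, parallelizable partition creation.
    return pd.DataFrame({"chunk": [i] * 3, "value": [1, 2, 3]})


delayed_tasks = [delayed(load_chunk)(i) for i in range(4)]

graph = store_delayed_as_dataset(
    delayed_tasks,
    store=store_factory,
    dataset_uuid="delayed_example",
    secondary_indices=["chunk"],
)
dataset = graph.compute()
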
def create_empty_dataset_header(
    store,
    dataset_uuid,
    table_meta,
    partition_on=None,
    metadata=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
):
    """
    Create a dataset header without any partitions.

    This may be used in combination with :func:`~kartothek.io.eager.write_single_partition`
    to create implicitly partitioned datasets.

    .. note::

        The created dataset will **always** have ``explicit_partitions==False``

    .. warning::

        This function should only be used on very rare occasions. Usually you're
        better off using full end-to-end pipelines.

    Parameters
    ----------
    """
    store = lazy_store(store)()
    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    for table, schema in table_meta.items():
        table_meta[table] = make_meta(schema, origin=table, partition_keys=partition_on)
        store_schema_metadata(
            schema=table_meta[table],
            dataset_uuid=dataset_uuid,
            store=store,
            table=table,
        )
    dataset_builder = DatasetMetadataBuilder(
        uuid=dataset_uuid,
        metadata_version=metadata_version,
        partition_keys=partition_on,
        explicit_partitions=False,
        table_meta=table_meta,
    )
    if metadata:
        for key, value in metadata.items():
            dataset_builder.add_metadata(key, value)
    if metadata_storage_format.lower() == "json":
        store.put(*dataset_builder.to_json())
    elif metadata_storage_format.lower() == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format
            )
        )
    return dataset_builder.to_dataset()

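
# A hedged sketch for the header-only writer above, meant to be paired with
# write_single_partition. It assumes the function is exposed as
# kartothek.io.eager.create_empty_dataset_header and that ``make_meta`` accepts an
# empty pandas DataFrame as the schema prototype; both are assumptions, not
# guarantees of the library API.
from functools import partial

import pandas as pd
from storefact import get_store_from_url

from kartothek.io.eager import create_empty_dataset_header

store_factory = partial(get_store_from_url, "hmemory://")

# An empty frame carries the column names and dtypes (i.e. the schema) without rows.
schema_df = pd.DataFrame(
    {"country": pd.Series([], dtype=str), "value": pd.Series([], dtype="int64")}
)

dataset = create_empty_dataset_header(
    store=store_factory,
    dataset_uuid="header_only_example",
    table_meta={"table": schema_df},
    partition_on=["country"],
)
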
def store_dataframes_as_dataset(
    store,
    dataset_uuid,
    dfs,
    metadata=None,
    partition_on=None,
    df_serializer=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
):
    """
    Utility function to store a list of dataframes as a partitioned dataset with multiple tables (files).

    Useful for very small datasets where all data fits into memory.

    Parameters
    ----------
    dfs : dict of pd.DataFrame or pd.DataFrame
        The dataframe(s) to be stored. If only a single dataframe is passed, it will be stored as the `core` table.

    Returns
    -------
    The stored dataset
    """
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if isinstance(dfs, dict):
        dfs = {"data": [(table, df) for table, df in dfs.items()]}

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    mp = parse_input_to_metapartition(dfs, metadata_version)

    if partition_on:
        mp = MetaPartition.partition_on(mp, partition_on)

    mps = mp.store_dataframes(
        store=store, dataset_uuid=dataset_uuid, df_serializer=df_serializer
    )

    return store_dataset_from_partitions(
        partition_list=mps,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_storage_format=metadata_storage_format,
    )

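
# A minimal usage sketch for the eager writer above. The module path
# ``kartothek.io.eager`` and the ``hmemory://`` in-memory store are assumptions;
# passing a list with a single DataFrame stores the data in the dataset's default table.
from functools import partial

import pandas as pd
from storefact import get_store_from_url

from kartothek.io.eager import store_dataframes_as_dataset

store_factory = partial(get_store_from_url, "hmemory://")
df = pd.DataFrame({"country": ["DE", "US"], "value": [1, 2]})

dataset = store_dataframes_as_dataset(
    store=store_factory,
    dataset_uuid="eager_example",
    dfs=[df],
    partition_on=["country"],
)
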
def _store_bag_as_dataset_parallel(
    bag: db.Bag,
    store: KeyValueStore,
    cube: Cube,
    ktk_cube_dataset_ids: Iterable[str],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    existing_datasets,
    overwrite: bool = False,
    update: bool = False,
    delete_scopes=None,
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Vendored, simplified and modified version of kartothek's ``store_bag_as_dataset``,
    which cannot be easily used to store datasets in parallel (e.g. from a dict).

    `delete_scopes` is a dictionary mapping the kartothek dataset id to the
    `delete_scope` of the dataset (see `update_dataset_from_partitions` for the
    definition of the single dataset `delete_scope`).
    """
    if (not update) and (not overwrite):
        for ktk_cube_dataset_id in ktk_cube_dataset_ids:
            raise_if_dataset_exists(
                dataset_uuid=cube.ktk_dataset_uuid(ktk_cube_dataset_id), store=store
            )

    mps = bag.map(_multiplex_parse_input_to_metapartition)

    # prepare_data_for_ktk already runs `MetaPartition.partition_on` and
    # `MetaPartition.build_indices`, so this is not required here anymore
    mps = mps.map(_multiplex_store, store=store, cube=cube, df_serializer=df_serializer)

    aggregate = partial(
        _multiplex_store_dataset_from_partitions_flat,
        cube=cube,
        existing_datasets=existing_datasets,
        metadata=metadata,
        store=store,
        update=update,
        delete_scopes=delete_scopes or {},
    )

    return mps.reduction(
        perpartition=list, aggregate=aggregate, split_every=False, out_type=db.Bag
    )

def _store_bag_as_dataset_parallel(
    bag,
    store,
    cube,
    ktk_cube_dataset_ids,
    metadata,
    existing_datasets,
    overwrite=False,
    update=False,
):
    """
    Vendored, simplified and modified version of kartothek's ``store_bag_as_dataset``,
    which cannot be easily used to store datasets in parallel (e.g. from a dict).
    """
    if (not update) and (not overwrite):
        for ktk_cube_dataset_id in ktk_cube_dataset_ids:
            raise_if_dataset_exists(
                dataset_uuid=cube.ktk_dataset_uuid(ktk_cube_dataset_id), store=store
            )

    mps = bag.map(_multiplex_parse_input_to_metapartition)

    # prepare_data_for_ktk already runs `MetaPartition.partition_on` and
    # `MetaPartition.build_indices`, so this is not required here anymore
    mps = mps.map(_multiplex_store, store=store, cube=cube)

    aggregate = partial(
        _multiplex_store_dataset_from_partitions_flat,
        cube=cube,
        existing_datasets=existing_datasets,
        metadata=metadata,
        store=store,
        update=update,
    )

    return mps.reduction(
        perpartition=list, aggregate=aggregate, split_every=False, out_type=db.Bag
    )

def store_delayed_as_dataset(
    delayed_tasks,
    store,
    dataset_uuid=None,
    metadata=None,
    df_serializer=None,
    overwrite=False,
    metadata_merger=None,
    metadata_version=naming.DEFAULT_METADATA_VERSION,
    partition_on=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
    secondary_indices=None,
):
    """
    Transform and store a list of dictionaries containing
    dataframes to a kartothek dataset in store.

    Parameters
    ----------
    delayed_tasks: list of dask.delayed
        Every delayed object represents a partition and should be accepted by
        :func:`~kartothek.io_components.metapartition.parse_input_to_metapartition`

    Returns
    -------
    A dask.delayed dataset object.
    """
    _check_callable(store)

    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    input_to_mps = partial(parse_input_to_metapartition, metadata_version=metadata_version)
    mps = map_delayed(delayed_tasks, input_to_mps)

    if partition_on:
        mps = map_delayed(mps, MetaPartition.partition_on, partition_on=partition_on)

    if secondary_indices:
        mps = map_delayed(mps, MetaPartition.build_indices, columns=secondary_indices)

    mps = map_delayed(
        mps,
        MetaPartition.store_dataframes,
        store=store,
        df_serializer=df_serializer,
        dataset_uuid=dataset_uuid,
    )

    return delayed(store_dataset_from_partitions)(
        mps,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        metadata_storage_format=metadata_storage_format,
    )

def store_bag_as_dataset(
    bag,
    store,
    dataset_uuid=None,
    metadata=None,
    df_serializer=None,
    overwrite=False,
    metadata_merger=None,
    metadata_version=naming.DEFAULT_METADATA_VERSION,
    partition_on=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
    secondary_indices=None,
):
    """
    Transform and store a dask.bag of dictionaries containing
    dataframes to a kartothek dataset in store.

    This is the dask.bag-equivalent of
    :func:`~kartothek.io.dask.delayed.store_delayed_as_dataset`. See there
    for more detailed documentation on the different possible input types.

    Parameters
    ----------
    bag: dask.bag.Bag
        A dask bag containing dictionaries of dataframes or dataframes.
    """
    store = lazy_store(store)
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    raise_if_indices_overlap(partition_on, secondary_indices)

    input_to_mps = partial(parse_input_to_metapartition, metadata_version=metadata_version)
    mps = bag.map(input_to_mps)

    if partition_on:
        mps = mps.map(MetaPartition.partition_on, partition_on=partition_on)

    if secondary_indices:
        mps = mps.map(MetaPartition.build_indices, columns=secondary_indices)

    mps = mps.map(
        MetaPartition.store_dataframes,
        store=store,
        df_serializer=df_serializer,
        dataset_uuid=dataset_uuid,
    )

    aggregate = partial(
        _store_dataset_from_partitions_flat,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        metadata_storage_format=metadata_storage_format,
    )

    return mps.reduction(perpartition=list, aggregate=aggregate, split_every=False)

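
# A minimal usage sketch for the bag-based writer above. The module path
# ``kartothek.io.dask.bag`` and the ``hmemory://`` store are assumptions; each bag
# element must be something parse_input_to_metapartition understands, and a plain
# pandas DataFrame is assumed to work here.
from functools import partial

import dask.bag as db
import pandas as pd
from storefact import get_store_from_url

from kartothek.io.dask.bag import store_bag_as_dataset

store_factory = partial(get_store_from_url, "hmemory://")

bag = db.from_sequence(
    [pd.DataFrame({"part": [i], "value": [i * 10]}) for i in range(3)], npartitions=3
)

# The writer returns a dask.bag reduction; compute() writes the partitions and
# commits the dataset metadata.
dataset = store_bag_as_dataset(
    bag, store=store_factory, dataset_uuid="bag_example", secondary_indices=["part"]
).compute()
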
def test_raise_if_dataset_exists(store_factory, dataset_function):
    raise_if_dataset_exists(dataset_uuid="ThisDoesNotExist", store=store_factory)
    with pytest.raises(RuntimeError):
        raise_if_dataset_exists(dataset_uuid=dataset_function.uuid, store=store_factory)

def store_dataset_from_ddf(
    ddf: dd.DataFrame,
    store: StoreInput,
    dataset_uuid: str,
    table: str = SINGLE_TABLE,
    secondary_indices: Optional[List[str]] = None,
    shuffle: bool = False,
    repartition_ratio: Optional[SupportsFloat] = None,
    num_buckets: int = 1,
    sort_partitions_by: Optional[Union[List[str], str]] = None,
    metadata: Optional[Mapping] = None,
    df_serializer: Optional[DataFrameSerializer] = None,
    metadata_merger: Optional[Callable] = None,
    metadata_version: int = DEFAULT_METADATA_VERSION,
    partition_on: Optional[List[str]] = None,
    bucket_by: Optional[Union[List[str], str]] = None,
    overwrite: bool = False,
):
    """
    Store a dataset from a dask.dataframe.
    """
    # normalization done by normalize_args but mypy doesn't recognize this
    sort_partitions_by = cast(List[str], sort_partitions_by)
    secondary_indices = cast(List[str], secondary_indices)
    bucket_by = cast(List[str], bucket_by)
    partition_on = cast(List[str], partition_on)

    if table is None:
        raise TypeError("The parameter `table` is not optional.")

    ds_factory = _ensure_factory(dataset_uuid=dataset_uuid, store=store, factory=None)

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    mp_ser = _write_dataframe_partitions(
        ddf=ddf,
        store=ds_factory.store_factory,
        dataset_uuid=dataset_uuid,
        table=table,
        secondary_indices=secondary_indices,
        shuffle=shuffle,
        repartition_ratio=repartition_ratio,
        num_buckets=num_buckets,
        sort_partitions_by=sort_partitions_by,
        df_serializer=df_serializer,
        metadata_version=metadata_version,
        partition_on=partition_on,
        bucket_by=bucket_by,
    )
    return mp_ser.reduction(
        chunk=_id,
        aggregate=_commit_store_from_reduction,
        split_every=False,
        token="commit-dataset",
        meta=object,
        aggregate_kwargs={
            "store": ds_factory.store_factory,
            "dataset_uuid": ds_factory.dataset_uuid,
            "dataset_metadata": metadata,
            "metadata_merger": metadata_merger,
        },
    )