def __init__(
    self,
    dataset_uuid: str,
    store_factory: Callable[[], "KeyValueStore"],
    load_schema: bool = True,
    load_all_indices: bool = False,
    load_dataset_metadata: bool = True,
) -> None:
    """
    A dataset factory object which can be used to cache dataset load operations.

    This class should be the primary user entry point when reading datasets.

    Example using the eager backend:

    .. code::

        from functools import partial
        from storefact import get_store_from_url
        from kartothek.io.eager import read_table

        ds_factory = DatasetFactory(
            dataset_uuid="my_test_dataset",
            store_factory=partial(get_store_from_url, store_url),
        )
        df = read_table(factory=ds_factory)

    Parameters
    ----------
    dataset_uuid: str
        The unique identifier for the dataset.
    store_factory: callable
        A callable which creates a KeyValueStore object
    load_schema: bool
        Load the schema information immediately.
    load_all_indices: bool
        Load all indices immediately.
    load_dataset_metadata: bool
        Keep the user metadata in memory
    """
    self._cache_metadata: Optional[DatasetMetadata] = None
    self._cache_store = None

    _check_callable(store_factory)
    self.store_factory = store_factory
    self.dataset_uuid = dataset_uuid
    self.load_schema = load_schema
    self._ds_callable = None
    self.is_loaded = False
    self.load_dataset_metadata = load_dataset_metadata
    self.load_all_indices_flag = load_all_indices
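# Minimal usage sketch for DatasetFactory: the store URL, dataset name and flag
# values below are illustrative assumptions, not part of the original source.
# It assumes a dataset "my_test_dataset" already exists under the given store.
from functools import partial

from storefact import get_store_from_url

from kartothek.core.factory import DatasetFactory
from kartothek.io.eager import read_table

store_url = "hfs:///tmp/kartothek_example_store"  # placeholder location

example_factory = DatasetFactory(
    dataset_uuid="my_test_dataset",
    store_factory=partial(get_store_from_url, store_url),
    load_all_indices=False,       # indices are only loaded when accessed
    load_dataset_metadata=False,  # do not keep user metadata in memory
)
example_df = read_table(factory=example_factory)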
def store_delayed_as_dataset(
    delayed_tasks,
    store,
    dataset_uuid=None,
    metadata=None,
    df_serializer=None,
    overwrite=False,
    metadata_merger=None,
    metadata_version=naming.DEFAULT_METADATA_VERSION,
    partition_on=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
    secondary_indices=None,
):
    """
    Transform and store a list of dictionaries containing
    dataframes to a kartothek dataset in store.

    Parameters
    ----------
    delayed_tasks: list of dask.delayed
        Every delayed object represents a partition and should be accepted by
        :func:`~kartothek.io_components.metapartition.parse_input_to_metapartition`

    Returns
    -------
    A dask.delayed dataset object.
    """
    _check_callable(store)
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    input_to_mps = partial(
        parse_input_to_metapartition, metadata_version=metadata_version
    )
    mps = map_delayed(delayed_tasks, input_to_mps)

    if partition_on:
        mps = map_delayed(mps, MetaPartition.partition_on, partition_on=partition_on)

    if secondary_indices:
        mps = map_delayed(mps, MetaPartition.build_indices, columns=secondary_indices)

    mps = map_delayed(
        mps,
        MetaPartition.store_dataframes,
        store=store,
        df_serializer=df_serializer,
        dataset_uuid=dataset_uuid,
    )

    return delayed(store_dataset_from_partitions)(
        mps,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        metadata_storage_format=metadata_storage_format,
    )
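# Minimal usage sketch for store_delayed_as_dataset; the store URL, dataset
# name and the toy dataframes are assumptions for illustration only. Plain
# dataframes are passed here, which parse_input_to_metapartition also accepts.
from functools import partial

import dask
import pandas as pd
from storefact import get_store_from_url

from kartothek.io.dask.delayed import store_delayed_as_dataset

example_store_factory = partial(
    get_store_from_url, "hfs:///tmp/kartothek_example_store"  # placeholder
)

# One delayed object per partition; each yields a dataframe to be stored.
example_tasks = [
    dask.delayed(pd.DataFrame({"A": [i], "B": ["x"]})) for i in range(3)
]

graph = store_delayed_as_dataset(
    example_tasks,
    store=example_store_factory,
    dataset_uuid="my_delayed_dataset",
    partition_on=["B"],
)
dataset = graph.compute()  # executes the graph and returns the dataset metadata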
def merge_datasets_as_delayed(
    left_dataset_uuid,
    right_dataset_uuid,
    store,
    merge_tasks,
    match_how="exact",
    label_merger=None,
    metadata_merger=None,
):
    """
    A dask.delayed graph to perform the merge of two full kartothek datasets.

    Parameters
    ----------
    left_dataset_uuid : str
        UUID for the left dataset (order does not matter in all merge schemas)
    right_dataset_uuid : str
        UUID for the right dataset (order does not matter in all merge schemas)
    store : callable
        A callable which creates a KeyValueStore object
    match_how : Union[str, Callable]
        Define the partition label matching scheme.
        Available implementations are:

        * left (right) : The left (right) partitions are considered to be
          the base partitions and **all** partitions of the right (left)
          dataset are joined to the left partition. This should only be
          used if one of the datasets contains very few partitions.
        * prefix : The labels of the partitions of the dataset with fewer
          partitions are considered to be prefixes of the labels in the
          other dataset
        * exact : All partition labels of the left dataset need to have
          an exact match in the right dataset
        * callable : A callable with signature func(left, right) which
          returns a boolean to determine if the partitions match

    merge_tasks : List[Dict]
        A list of merge tasks. Each item in this list is a dictionary giving
        explicit instructions for a specific merge.
        Each dict should contain key/values:

        * `left`: The table for the left dataframe
        * `right`: The table for the right dataframe
        * `output_label` : The table for the merged dataframe
        * `merge_func`: A callable with signature
          `merge_func(left_df, right_df, merge_kwargs)` to
          handle the data preprocessing and merging. Default pandas.merge
        * `merge_kwargs` : The kwargs to be passed to the `merge_func`

        Example:

        .. code::

            >>> merge_tasks = [
            ...     {
            ...         "left": "left_dict",
            ...         "right": "right_dict",
            ...         "merge_kwargs": {"kwargs of merge_func": ''},
            ...         "output_label": 'merged_core_data'
            ...     },
            ... ]

    """
    _check_callable(store)

    mps = align_datasets(
        left_dataset_uuid=left_dataset_uuid,
        right_dataset_uuid=right_dataset_uuid,
        store=store,
        match_how=match_how,
    )
    mps = map_delayed(
        mps,
        _load_and_merge_mps,
        store=store,
        label_merger=label_merger,
        metadata_merger=metadata_merger,
        merge_tasks=merge_tasks,
    )
    return mps
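# Minimal usage sketch for merge_datasets_as_delayed; the dataset UUIDs, table
# names, join column and store URL are illustrative assumptions. The merge_task
# keys follow the docstring above and fall back to pandas.merge.
from functools import partial

import dask
from storefact import get_store_from_url

from kartothek.io.dask.delayed import merge_datasets_as_delayed

example_store_factory = partial(
    get_store_from_url, "hfs:///tmp/kartothek_example_store"  # placeholder
)

example_merge_tasks = [
    {
        "left": "table",            # table name in the left dataset
        "right": "table",           # table name in the right dataset
        "output_label": "merged",   # table name of the merged dataframe
        "merge_kwargs": {"how": "inner", "on": ["key"]},  # passed to pandas.merge
    }
]

merged_mps = merge_datasets_as_delayed(
    left_dataset_uuid="left_dataset",
    right_dataset_uuid="right_dataset",
    store=example_store_factory,
    merge_tasks=example_merge_tasks,
    match_how="exact",
)
results = dask.compute(*merged_mps)  # computes the merged partitions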
def store_bag_as_dataset(
    bag,
    store,
    dataset_uuid=None,
    metadata=None,
    df_serializer=None,
    overwrite=False,
    metadata_merger=None,
    metadata_version=naming.DEFAULT_METADATA_VERSION,
    partition_on=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
    secondary_indices=None,
):
    """
    Transform and store a dask.bag of dictionaries containing
    dataframes to a kartothek dataset in store.

    This is the dask.bag-equivalent of :func:`store_delayed_as_dataset`. See there
    for more detailed documentation on the different possible input types.

    Parameters
    ----------
    bag: dask.bag.Bag
        A dask bag containing dictionaries of dataframes or dataframes.

    Returns
    -------
    A dask.bag.Item dataset object.
    """
    _check_callable(store)
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    input_to_mps = partial(
        parse_input_to_metapartition, metadata_version=metadata_version
    )
    mps = bag.map(input_to_mps)

    if partition_on:
        mps = mps.map(MetaPartition.partition_on, partition_on=partition_on)

    if secondary_indices:
        mps = mps.map(MetaPartition.build_indices, columns=secondary_indices)

    mps = mps.map(
        MetaPartition.store_dataframes,
        store=store,
        df_serializer=df_serializer,
        dataset_uuid=dataset_uuid,
    )

    aggregate = partial(
        _store_dataset_from_partitions_flat,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        metadata_storage_format=metadata_storage_format,
    )
    return mps.reduction(perpartition=list, aggregate=aggregate, split_every=False)
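# Minimal usage sketch for store_bag_as_dataset; the store URL, dataset name
# and toy dataframes are assumptions. Each bag element must be something
# parse_input_to_metapartition accepts, e.g. a plain dataframe.
from functools import partial

import dask.bag as db
import pandas as pd
from storefact import get_store_from_url

from kartothek.io.dask.bag import store_bag_as_dataset

example_store_factory = partial(
    get_store_from_url, "hfs:///tmp/kartothek_example_store"  # placeholder
)

example_bag = db.from_sequence(
    [pd.DataFrame({"A": [i], "B": ["x"]}) for i in range(3)],
    npartitions=3,  # one bag partition per dataframe
)

item = store_bag_as_dataset(
    example_bag,
    store=example_store_factory,
    dataset_uuid="my_bag_dataset",
    secondary_indices=["B"],
)
dataset = item.compute()  # triggers the store operation, returns the dataset metadata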