def store_delayed_as_dataset(
    delayed_tasks: List[Delayed],
    store,
    dataset_uuid=None,
    metadata=None,
    df_serializer=None,
    overwrite=False,
    metadata_merger=None,
    metadata_version=naming.DEFAULT_METADATA_VERSION,
    partition_on=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
    table_name: str = SINGLE_TABLE,
    secondary_indices=None,
) -> Delayed:
    """
    Transform and store a list of dictionaries containing
    dataframes as a kartothek dataset in ``store``.

    Parameters
    ----------
    """
    store = lazy_store(store)
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    # Fail early if the dataset already exists and overwriting was not requested.
    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    # Partition columns and secondary index columns must not overlap.
    raise_if_indices_overlap(partition_on, secondary_indices)

    # Normalize each input (dataframe or dict of dataframes) into a MetaPartition.
    input_to_mps = partial(
        parse_input_to_metapartition,
        metadata_version=metadata_version,
        table_name=table_name,
    )
    mps = map_delayed(input_to_mps, delayed_tasks)

    if partition_on:
        mps = map_delayed(MetaPartition.partition_on, mps, partition_on=partition_on)

    if secondary_indices:
        mps = map_delayed(MetaPartition.build_indices, mps, columns=secondary_indices)

    # Persist the dataframes; this also clears them from the metapartitions.
    mps = map_delayed(
        MetaPartition.store_dataframes,
        mps,
        store=store,
        df_serializer=df_serializer,
        dataset_uuid=dataset_uuid,
    )

    # Collect all partitions into a single delayed task that writes the dataset metadata.
    return delayed(store_dataset_from_partitions)(
        mps,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        metadata_storage_format=metadata_storage_format,
    )
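# A minimal usage sketch for ``store_delayed_as_dataset``, kept as a comment so it
# does not run on import. The store URL, dataset UUID, and example data are
# illustrative assumptions; any store factory accepted by ``lazy_store`` works.
#
#     import dask
#     import pandas as pd
#     from functools import partial
#     from storefact import get_store_from_url
#
#     store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_delayed_example")
#     delayed_dfs = [
#         dask.delayed(pd.DataFrame)({"p": ["a", "b"], "x": [1, 2]}),
#         dask.delayed(pd.DataFrame)({"p": ["c", "d"], "x": [3, 4]}),
#     ]
#     task = store_delayed_as_dataset(
#         delayed_dfs,
#         store=store_factory,
#         dataset_uuid="delayed_example",
#         partition_on=["p"],
#     )
#     dataset_metadata = task.compute()  # nothing is written until compute()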
def store_bag_as_dataset(
    bag,
    store,
    dataset_uuid=None,
    metadata=None,
    df_serializer=None,
    overwrite=False,
    metadata_merger=None,
    metadata_version=naming.DEFAULT_METADATA_VERSION,
    partition_on=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
    secondary_indices=None,
):
    """
    Transform and store a dask.bag of dictionaries containing
    dataframes as a kartothek dataset in ``store``.

    This is the dask.bag equivalent of
    :func:`~kartothek.io.dask.delayed.store_delayed_as_dataset`. See there for
    more detailed documentation on the different possible input types.

    Parameters
    ----------
    bag: dask.bag.Bag
        A dask bag containing dictionaries of dataframes or dataframes.
    """
    store = lazy_store(store)
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    # Fail early if the dataset already exists and overwriting was not requested.
    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    # Partition columns and secondary index columns must not overlap.
    raise_if_indices_overlap(partition_on, secondary_indices)

    # Normalize each bag element (dataframe or dict of dataframes) into a MetaPartition.
    input_to_mps = partial(
        parse_input_to_metapartition, metadata_version=metadata_version
    )
    mps = bag.map(input_to_mps)

    if partition_on:
        mps = mps.map(MetaPartition.partition_on, partition_on=partition_on)

    if secondary_indices:
        mps = mps.map(MetaPartition.build_indices, columns=secondary_indices)

    # Persist the dataframes; this also clears them from the metapartitions.
    mps = mps.map(
        MetaPartition.store_dataframes,
        store=store,
        df_serializer=df_serializer,
        dataset_uuid=dataset_uuid,
    )

    # Reduce the bag to a single task that writes the dataset metadata.
    aggregate = partial(
        _store_dataset_from_partitions_flat,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        metadata_storage_format=metadata_storage_format,
    )
    return mps.reduction(perpartition=list, aggregate=aggregate, split_every=False)
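# A minimal usage sketch for ``store_bag_as_dataset``, kept as a comment so it
# does not run on import. The store URL, dataset UUID, and example data are
# illustrative assumptions.
#
#     import dask.bag as db
#     import pandas as pd
#     from functools import partial
#     from storefact import get_store_from_url
#
#     store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_bag_example")
#     bag = db.from_sequence(
#         [
#             pd.DataFrame({"p": ["a"], "x": [1]}),
#             pd.DataFrame({"p": ["b"], "x": [2]}),
#         ],
#         npartitions=2,
#     )
#     task = store_bag_as_dataset(
#         bag,
#         store=store_factory,
#         dataset_uuid="bag_example",
#         secondary_indices=["x"],
#     )
#     dataset_metadata = task.compute()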
def store_dataframes_as_dataset__iter(
    df_generator,
    store,
    dataset_uuid=None,
    metadata=None,
    partition_on=None,
    df_serializer=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
    secondary_indices=None,
):
    """
    Store `pd.DataFrame` s iteratively as a partitioned dataset with multiple tables (files).

    Useful for datasets which do not fit into memory.

    Parameters
    ----------

    Returns
    -------
    dataset: kartothek.core.dataset.DatasetMetadata
        The stored dataset.
    """
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    raise_if_indices_overlap(partition_on, secondary_indices)

    new_partitions = []
    for df in df_generator:
        mp = parse_input_to_metapartition(df, metadata_version=metadata_version)

        if partition_on:
            mp = mp.partition_on(partition_on)

        if secondary_indices:
            mp = mp.build_indices(secondary_indices)

        # Store dataframe, thereby clearing up the dataframe from the `mp` metapartition
        mp = mp.store_dataframes(
            store=store, dataset_uuid=dataset_uuid, df_serializer=df_serializer
        )

        # Add `kartothek.io_components.metapartition.MetaPartition` object to list to track partitions
        new_partitions.append(mp)

    # Store metadata and return `kartothek.DatasetMetadata` object
    return store_dataset_from_partitions(
        partition_list=new_partitions,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_storage_format=metadata_storage_format,
    )
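# A minimal usage sketch for ``store_dataframes_as_dataset__iter``, kept as a
# comment so it does not run on import. The store URL, dataset UUID, and the
# generator below are illustrative assumptions; the point is that only one
# dataframe chunk needs to be in memory at a time.
#
#     import pandas as pd
#     from functools import partial
#     from storefact import get_store_from_url
#
#     store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_iter_example")
#
#     def df_chunks():
#         for start in range(0, 100, 10):
#             yield pd.DataFrame({"p": ["a"] * 10, "x": range(start, start + 10)})
#
#     dataset_metadata = store_dataframes_as_dataset__iter(
#         df_chunks(),
#         store=store_factory,
#         dataset_uuid="iter_example",
#         partition_on=["p"],
#     )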