def garbage_collect_dataset(dataset_uuid=None, store=None, factory=None):
    """
    Remove auxiliary files that are no longer tracked by the dataset.

    These files include indices that are no longer referenced by the metadata
    as well as files in the directories of the tables that are no longer
    referenced. The latter is only applied to static datasets.

    All untracked files are handed to ``delete_files`` in one single call and
    the result of that call is returned.

    Parameters
    ----------
    """
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    # Without a chunk size the dispatcher yields exactly one batch that
    # holds every file to be removed, so a single `next` is sufficient.
    file_batches = dispatch_files_to_gc(
        dataset_uuid=None,
        store_factory=None,
        chunk_size=None,
        factory=dataset_factory,
    )
    everything_to_delete = next(file_batches)
    return delete_files(
        everything_to_delete, store_factory=dataset_factory.store_factory
    )
def validate_partition_keys(
    dataset_uuid,
    store,
    ds_factory,
    default_metadata_version,
    partition_on,
    load_dataset_metadata=True,
):
    """
    Reconcile the requested partition keys with an already existing dataset.

    If neither a factory is given nor the dataset exists in the store, the
    inputs are passed through: no factory, the default metadata version and
    the unmodified ``partition_on``.

    Otherwise the factory is resolved and its metadata version is used. An
    empty ``partition_on`` is replaced by the partition keys of the dataset; a
    non-empty one is wrapped into a list if necessary and must be equal to the
    partition keys of the dataset.

    Returns
    -------
    tuple
        ``(ds_factory, ds_metadata_version, partition_on)``

    Raises
    ------
    ValueError
        If ``partition_on`` differs from the partition keys of the dataset.
    """
    dataset_is_known = ds_factory or DatasetMetadata.exists(
        dataset_uuid, ensure_store(store)
    )
    if not dataset_is_known:
        # Nothing to validate against; hand back the defaults
        return None, default_metadata_version, partition_on

    resolved_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=ds_factory,
        load_dataset_metadata=load_dataset_metadata,
    )
    existing_version = resolved_factory.metadata_version

    if not partition_on:
        # Inherit the partitioning of the stored dataset
        return resolved_factory, existing_version, resolved_factory.partition_keys

    if not isinstance(partition_on, list):
        partition_on = [partition_on]

    if partition_on != resolved_factory.partition_keys:
        raise ValueError(
            "Incompatible set of partition keys encountered. "
            "Input partitioning was `{}` while actual dataset was `{}`".format(
                partition_on, resolved_factory.partition_keys
            )
        )
    return resolved_factory, existing_version, partition_on
def delete_dataset(dataset_uuid=None, store=None, factory=None):
    """
    Delete a dataset and everything belonging to it from the store.

    The dataset may be addressed either by ``dataset_uuid`` and ``store`` or
    by an already constructed ``factory``. The steps run in this order:
    garbage collection of unreferenced files, indices, partition files,
    common metadata, top level metadata.

    Parameters
    ----------
    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        load_schema=False,
        store=_make_callable(store),
        factory=factory,
        load_dataset_metadata=False,
    )

    # Remove possibly unreferenced files
    garbage_collect_dataset(factory=ds_factory)

    # Delete indices first since they do not affect dataset integrity
    delete_indices(dataset_factory=ds_factory)

    for metapartition in dispatch_metapartitions_from_factory(ds_factory):
        metapartition = cast(MetaPartition, metapartition)
        # Take uuid and store from the resolved factory: the raw arguments
        # are both None when the caller only supplied `factory`.
        metapartition.delete_from_store(
            dataset_uuid=ds_factory.dataset_uuid, store=ds_factory.store
        )

    # delete common metadata after partitions
    delete_common_metadata(dataset_factory=ds_factory)

    # Delete the top level metadata file
    delete_top_level_metadata(dataset_factory=ds_factory)
def store_dataset_from_ddf(
    ddf: dd.DataFrame,
    store: StoreInput,
    dataset_uuid: str,
    table: str = SINGLE_TABLE,
    secondary_indices: Optional[List[str]] = None,
    shuffle: bool = False,
    repartition_ratio: Optional[SupportsFloat] = None,
    num_buckets: int = 1,
    sort_partitions_by: Optional[Union[List[str], str]] = None,
    delete_scope: Optional[Iterable[Mapping[str, str]]] = None,
    metadata: Optional[Mapping] = None,
    df_serializer: Optional[DataFrameSerializer] = None,
    metadata_merger: Optional[Callable] = None,
    metadata_version: int = DEFAULT_METADATA_VERSION,
    partition_on: Optional[List[str]] = None,
    bucket_by: Optional[Union[List[str], str]] = None,
    overwrite: bool = False,
):
    """
    Store a dataset from a dask.dataframe.

    The partitions are written through ``_write_dataframe_partitions`` and the
    dataset itself is committed by a delayed call to
    ``store_dataset_from_partitions``, which is what this function returns.

    Raises
    ------
    TypeError
        If ``table`` is ``None``.
    """
    partition_on = normalize_arg("partition_on", partition_on)
    secondary_indices = normalize_arg("secondary_indices", secondary_indices)
    sort_partitions_by = normalize_arg("sort_partitions_by", sort_partitions_by)
    bucket_by = normalize_arg("bucket_by", bucket_by)
    store = normalize_arg("store", store)
    # NOTE(review): this delayed value is not consumed anywhere below
    delete_scope = dask.delayed(normalize_arg)("delete_scope", delete_scope)

    if table is None:
        raise TypeError("The parameter `table` is not optional.")

    existing_factory = _ensure_factory(
        dataset_uuid=dataset_uuid, store=store, factory=None, load_dataset_metadata=True
    )

    # Unless overwriting was requested, an existing dataset must not be touched
    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    written_partitions = _write_dataframe_partitions(
        ddf=ddf,
        store=store,
        dataset_uuid=dataset_uuid,
        table=table,
        secondary_indices=secondary_indices,
        shuffle=shuffle,
        repartition_ratio=repartition_ratio,
        num_buckets=num_buckets,
        sort_partitions_by=sort_partitions_by,
        df_serializer=df_serializer,
        metadata_version=metadata_version,
        partition_on=partition_on,
        bucket_by=bucket_by,
    )

    # Prefer the values carried by the factory, fall back to the raw inputs
    if existing_factory:
        target_store = existing_factory.store_factory
        target_uuid = existing_factory.dataset_uuid
    else:
        target_store = store
        target_uuid = dataset_uuid

    commit_dataset = dask.delayed(store_dataset_from_partitions)
    return commit_dataset(
        written_partitions,
        store=target_store,
        dataset_uuid=target_uuid,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
    )
def hash_dataset(
    store: Optional[StoreInput] = None,
    dataset_uuid: Optional[str] = None,
    subset=None,
    group_key=None,
    table: str = SINGLE_TABLE,
    predicates: Optional[PredicatesType] = None,
    factory: Optional[DatasetFactory] = None,
) -> dd.Series:
    """
    Calculate a partition wise, or group wise, hash of the dataset.

    .. note::

        We do not guarantee the hash values to remain constant across versions.

    Example output::

        Assuming a dataset with two unique values in column `P` this gives

        >>> hash_dataset(factory=dataset_with_index_factory,group_key=["P"]).compute()
        ... P
        ... 1    11462879952839863487
        ... 2    12568779102514529673
        ... dtype: uint64

    Parameters
    ----------
    subset
        If provided, only take these columns into account when hashing the dataset
    group_key
        If provided, calculate hash per group instead of per partition
    """
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    # When hashing per group, the grouping columns have to be read in
    # addition to the requested subset.
    if subset and group_key:
        columns_to_read = sorted(set(subset) | set(group_key))
    else:
        columns_to_read = subset

    ddf = read_dataset_as_ddf(
        table=table,
        predicates=predicates,
        factory=dataset_factory,
        columns=columns_to_read,
        dates_as_object=True,
    )

    if group_key:
        packed = pack_payload(ddf, group_key=group_key)
        grouped_hashes = packed.groupby(group_key).apply(
            _unpack_hash, unpack_meta=ddf._meta, subset=subset, meta="uint64"
        )
        return grouped_hashes.astype("uint64")

    partition_hashes = ddf.map_partitions(_hash_partition, meta="uint64")
    return partition_hashes.astype("uint64")
def build_dataset_indices(store, dataset_uuid, columns, factory=None):
    """
    Function which builds a :class:`~kartothek.core.index.ExplicitSecondaryIndex`.

    Every partition of the dataset is read, the indices for ``columns`` are
    computed on it and the collected indices are written to the dataset. The
    dataset partitions itself are not mutated.

    Parameters
    ----------
    """
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    # Build the index per partition and drop the dataframes right away
    # so that only the index information is kept in memory.
    indexed_partitions = [
        partition.build_indices(columns=columns).remove_dataframes()
        for partition in read_dataset_as_metapartitions__iterator(
            factory=dataset_factory
        )
    ]

    return update_indices_from_partitions(
        indexed_partitions, dataset_metadata_factory=dataset_factory
    )
def delete_dataset__delayed(dataset_uuid=None, store=None, factory=None):
    """
    Build a delayed task which deletes a dataset from the store.

    The dataset may be addressed either by ``dataset_uuid`` and ``store`` or
    by an already constructed ``factory``. Garbage collection and the deletion
    of the partition files are inputs of the final ``_delete_tl_metadata``
    task, which is returned.

    Parameters
    ----------
    """
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_schema=False,
        load_dataset_metadata=False,
    )

    gc = garbage_collect_dataset__delayed(factory=dataset_factory)

    mps = dispatch_metapartitions_from_factory(dataset_factory)

    # Passing this delayed value into the partition deletion makes those
    # tasks depend on `_delete_all_additional_metadata`.
    # NOTE(review): presumably that helper returns the dataset uuid - confirm.
    delayed_dataset_uuid = delayed(_delete_all_additional_metadata)(
        dataset_factory=dataset_factory
    )

    mps = map_delayed(
        mps,
        MetaPartition.delete_from_store,
        # Use the store of the resolved factory: the raw `store` argument is
        # None when the caller only supplied `factory`.
        store=dataset_factory.store_factory,
        dataset_uuid=delayed_dataset_uuid,
    )

    return delayed(_delete_tl_metadata)(dataset_factory, mps, gc)
def garbage_collect_dataset__delayed(
    dataset_uuid: Optional[str] = None,
    store: StoreInput = None,
    chunk_size: int = 100,
    factory=None,
) -> List[Delayed]:
    """
    Remove auxiliary files that are no longer tracked by the dataset.

    These files include indices that are no longer referenced by the metadata
    as well as files in the directories of the tables that are no longer
    referenced. The latter is only applied to static datasets.

    One deletion job is created per chunk of files.

    Parameters
    ----------
    chunk_size
        Number of files that should be deleted in a single job.
    """
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    file_chunks = dispatch_files_to_gc(
        dataset_uuid=None,
        store_factory=None,
        chunk_size=chunk_size,
        factory=dataset_factory,
    )

    deletion_jobs = map_delayed(
        delete_files, file_chunks, store_factory=dataset_factory.store_factory
    )
    return list(deletion_jobs)
def read_dataset_as_dataframes(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    predicates=None,
    factory=None,
):
    """
    Read a dataset as a list of dataframes.

    Every element of the list corresponds to a physical partition and is the
    ``data`` attribute of the metapartition that was read for it.

    Parameters
    ----------

    Returns
    -------
    List[pandas.DataFrame]
        Returns a list of pandas.DataFrame. One element per partition

    Examples
    --------
    Dataset in store contains two partitions with two files each

    .. code ::

        >>> import storefact
        >>> from kartothek.io.eager import read_dataset_as_dataframes

        >>> store = storefact.get_store_from_url('s3://bucket_with_dataset')

        >>> dfs = read_dataset_as_dataframes('dataset_uuid', store, 'core')

    """
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=_make_callable(store),
        factory=factory,
        load_dataset_metadata=True,
    )

    # All read options are forwarded untouched to the metapartition reader
    read_options = dict(
        tables=tables,
        columns=columns,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        categoricals=categoricals,
        label_filter=label_filter,
        dates_as_object=dates_as_object,
        predicates=predicates,
    )
    metapartitions = read_dataset_as_metapartitions(
        factory=dataset_factory, **read_options
    )
    return [metapartition.data for metapartition in metapartitions]
def read_dataset_as_metapartitions__iterator(
    dataset_uuid=None,
    store=None,
    columns=None,
    predicate_pushdown_to_io=True,
    categoricals=None,
    dates_as_object: bool = True,
    predicates=None,
    factory=None,
    dispatch_by=None,
):
    """
    A Python iterator to retrieve a dataset from store where each
    partition is loaded as a :class:`~kartothek.io_components.metapartition.MetaPartition`.

    If ``dispatch_by`` is given, every dispatched item is treated as a
    collection of metapartitions which are loaded one by one and concatenated
    into a single metapartition before it is yielded.

    .. seealso:

        :func:`~kartothek.io_components.read.read_dataset_as_dataframes__iterator`

    Parameters
    ----------

    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid, store=store, factory=factory,
    )

    mps = dispatch_metapartitions_from_factory(
        ds_factory, predicates=predicates, dispatch_by=dispatch_by,
    )

    # One set of load options for both branches. Previously the dispatch
    # branch did not forward `dates_as_object`, so the value given by the
    # caller was ignored whenever `dispatch_by` was used.
    load_options = dict(
        store=ds_factory.store,
        columns=columns,
        categoricals=categoricals,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        dates_as_object=dates_as_object,
        predicates=predicates,
    )

    for mp in mps:
        if dispatch_by is not None:
            mp = MetaPartition.concat_metapartitions(
                [mp_inner.load_dataframes(**load_options) for mp_inner in mp]
            )
        else:
            mp = cast(MetaPartition, mp)
            mp = mp.load_dataframes(**load_options)
        yield mp
def read_dataset_as_ddf(
    dataset_uuid=None,
    store=None,
    table=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    predicates=None,
    factory=None,
):
    """
    Retrieve a single table from a dataset as partition-individual :class:`~dask.dataframe.DataFrame` instance.

    Please take care when using categoricals with Dask. For index columns, this function will construct dataset
    wide categoricals. For all other columns, Dask will determine the categories on a partition level and will
    need to merge them when shuffling data.

    Parameters
    ----------
    """
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    # A per-table column mapping is narrowed down to the requested table
    if isinstance(columns, dict):
        columns = columns[table]

    frame_meta = _get_dask_meta_for_dataset(
        dataset_factory, table, columns, categoricals, dates_as_object
    )

    # Without an explicit selection, read every column known to the meta
    if columns is None:
        columns = list(frame_meta.columns)

    partition_tasks = read_table_as_delayed(
        factory=dataset_factory,
        table=table,
        columns=columns,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        categoricals={table: categoricals},
        label_filter=label_filter,
        dates_as_object=dates_as_object,
        predicates=predicates,
    )
    return dd.from_delayed(partition_tasks, meta=frame_meta)
def build_dataset_indices__bag(
    store, dataset_uuid, columns, partition_size=None, factory=None
):
    """
    Function which builds a :class:`~kartothek.core.index.ExplicitSecondaryIndex`.

    This function loads the dataset, computes the requested indices and writes
    the indices to the dataset. The dataset partitions itself are not mutated.
    Only the tables which contain at least one of the requested columns are
    loaded, restricted to those columns.

    Parameters
    ----------
    partition_size: Optional[int]
        Dask bag partition size. Use a larger numbers to decrease scheduler load and overhead, use smaller numbers for a
        fine-grained scheduling and better resilience against worker errors.

    Returns
    -------
    A dask.delayed computation object.
    """
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    # Map every table to the requested columns it actually contains and
    # leave out the tables which contain none of them.
    columns_per_table = {}
    for table, table_schema in dataset_factory.table_meta.items():
        matching_columns = set(columns) & set(table_schema.names)
        if matching_columns:
            columns_per_table[table] = matching_columns

    metapartitions = dispatch_metapartitions_from_factory(dataset_factory)

    bag = db.from_sequence(seq=metapartitions, partition_size=partition_size)
    bag = bag.map(
        MetaPartition.load_dataframes,
        store=dataset_factory.store_factory,
        tables=list(columns_per_table.keys()),
        columns=columns_per_table,
    )
    bag = bag.map(MetaPartition.build_indices, columns=columns)
    bag = bag.map(MetaPartition.remove_dataframes)

    # Gather all metapartitions into one bag partition so that the index
    # update is applied to the complete list at once.
    gathered = bag.reduction(list, list, split_every=False, out_type=db.Bag)
    gathered = gathered.flatten().map_partitions(list)
    return gathered.map_partitions(
        update_indices_from_partitions, dataset_metadata_factory=dataset_factory
    )
def build_dataset_indices__bag(
    store: Optional[StoreInput],
    dataset_uuid: Optional[str],
    columns: Sequence[str],
    partition_size: Optional[int] = None,
    factory: Optional[DatasetFactory] = None,
) -> Delayed:
    """
    Function which builds a :class:`~kartothek.core.index.ExplicitSecondaryIndex`.

    The dataset is loaded, the requested indices are computed and then written
    to the dataset. The dataset partitions itself are not mutated. Tables that
    contain none of the requested columns are not loaded.

    Parameters
    ----------
    """
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    # Requested columns present in each table; tables without a match
    # are dropped from the mapping.
    columns_per_table = {}
    for table_name, schema in dataset_factory.table_meta.items():
        present = set(columns) & set(schema.names)
        if present:
            columns_per_table[table_name] = present

    partitions = dispatch_metapartitions_from_factory(dataset_factory)

    loaded = db.from_sequence(seq=partitions, partition_size=partition_size).map(
        MetaPartition.load_dataframes,
        store=dataset_factory.store_factory,
        tables=list(columns_per_table.keys()),
        columns=columns_per_table,
    )
    indexed = loaded.map(MetaPartition.build_indices, columns=columns)
    slim = indexed.map(MetaPartition.remove_dataframes)

    # Collapse everything into a single bag partition before updating
    collected = slim.reduction(list, list, split_every=False, out_type=db.Bag)
    as_one_list = collected.flatten().map_partitions(list)
    return as_one_list.map_partitions(
        update_indices_from_partitions, dataset_metadata_factory=dataset_factory
    )
def dispatch_files_to_gc(dataset_uuid, store_factory, chunk_size, factory):
    """
    Yield the store keys of a dataset that are no longer referenced.

    Two kinds of keys are collected:

    * keys below ``<dataset_uuid>/indices/`` that are not the storage key of
      any index of the dataset
    * only if the dataset has explicit partitions: keys below
      ``<dataset_uuid>/<table>/`` that are neither a partition file nor the
      table metadata file

    Parameters
    ----------
    chunk_size
        If ``None``, all keys are yielded as one single list. Otherwise lists
        of at most ``chunk_size`` keys are yielded.
    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store_factory,
        factory=factory,
        load_dataset_metadata=False,
    )

    dataset_uuid = dataset_uuid or ds_factory.uuid

    # Start with everything in the index directory and discard what is
    # still in use. We only consider the indices that are saved as
    # explicit indices, i.e. those which have a storage key.
    index_prefix = "{dataset_uuid}/indices/".format(dataset_uuid=dataset_uuid)
    orphaned_index_keys = set(ds_factory.store.iter_keys(prefix=index_prefix))
    orphaned_index_keys.difference_update(
        index.index_storage_key
        for index in ds_factory.indices.values()
        if index.index_storage_key
    )

    orphaned_table_keys = set()
    if ds_factory.explicit_partitions:
        referenced_keys = {
            file_key
            for partition in ds_factory.partitions.values()
            for file_key in partition.files.values()
        }
        for table in ds_factory.tables:
            table_prefix = "{dataset_uuid}/{table}/".format(
                dataset_uuid=dataset_uuid, table=table
            )
            # The table metadata file must survive the clean up
            referenced_keys.add(table_prefix + TABLE_METADATA_FILE)
            for stored_key in ds_factory.store.iter_keys(prefix=table_prefix):
                orphaned_table_keys.add(stored_key)
        orphaned_table_keys -= referenced_keys

    files_to_remove = list(orphaned_index_keys | orphaned_table_keys)

    if chunk_size is None:
        yield files_to_remove
        return

    for start in range(0, len(files_to_remove), chunk_size):
        yield files_to_remove[start : start + chunk_size]
def garbage_collect_dataset__delayed(dataset_uuid=None, store=None, chunk_size=100, factory=None):
    """
    Remove auxiliary files that are no longer tracked by the dataset.

    These files include indices that are no longer referenced by the metadata
    as well as files in the directories of the tables that are no longer
    referenced. The latter is only applied to static datasets.

    Parameters
    ----------
    dataset_uuid: basestring
        The UUID of the dataset to be deleted
    store: callable
        A function returning a KeyValueStore.
    chunk_size: int
        Number of files that should be deleted in a single job.

    Returns
    -------
    tasks: list of dask.delayed
    """
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    file_chunks = dispatch_files_to_gc(
        dataset_uuid=None,
        store_factory=None,
        chunk_size=chunk_size,
        factory=dataset_factory,
    )

    # One delayed deletion job per chunk of files
    tasks = []
    for chunk in file_chunks:
        job = delayed(delete_files)(
            chunk, store_factory=dataset_factory.store_factory
        )
        tasks.append(job)
    return tasks
def build_dataset_indices(store, dataset_uuid, columns, factory=None):
    """
    Function which builds a :class:`~kartothek.core.index.ExplicitSecondaryIndex`.

    This function loads the dataset, computes the requested indices and writes
    the indices to the dataset. The dataset partitions itself are not mutated.
    Only the tables that contain at least one of the requested columns are
    loaded, restricted to those columns.

    Parameters
    ----------
    """
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    # Requested columns present in each table; tables without a match
    # are not part of the mapping.
    columns_per_table = {}
    for table, table_schema in dataset_factory.table_meta.items():
        matching_columns = set(columns) & set(table_schema.names)
        if matching_columns:
            columns_per_table[table] = matching_columns

    # Load, index and strip the dataframes in one go so that no loaded
    # data outlives the processing of its partition.
    indexed_partitions = [
        partition.load_dataframes(
            store=dataset_factory.store,
            tables=list(columns_per_table.keys()),
            columns=columns_per_table,
        )
        .build_indices(columns=columns)
        .remove_dataframes()
        for partition in dispatch_metapartitions_from_factory(dataset_factory)
    ]

    return update_indices_from_partitions(
        indexed_partitions, dataset_metadata_factory=dataset_factory
    )
def read_table(
    dataset_uuid=None,
    store=None,
    table=SINGLE_TABLE,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    predicates=None,
    factory=None,
):
    """
    A utility function to load a single table with multiple partitions as a single dataframe in one go.
    Mostly useful for smaller tables or datasets where all partitions fit into memory.

    The order of partitions is not guaranteed to be stable in the resulting dataframe.

    Parameters
    ----------

    Returns
    -------
    pandas.DataFrame
        Returns a pandas.DataFrame holding the data of the requested columns

    Raises
    ------
    TypeError
        If ``table`` is not a string.

    Examples
    --------
    Dataset in store contains two partitions with two files each

    .. code ::

        >>> import storefact
        >>> from kartothek.io.eager import read_table

        >>> store = storefact.get_store_from_url('s3://bucket_with_dataset')

        >>> df = read_table('dataset_uuid', store, 'core')

    """
    if concat_partitions_on_primary_index is not False:
        warnings.warn(
            "The keyword `concat_partitions_on_primary_index` is deprecated and will be removed in the next major release.",
            DeprecationWarning,
        )

    if not isinstance(table, str):
        raise TypeError("Argument `table` needs to be a string")

    columns = _check_compatible_list(table, columns, "columns")
    categoricals = _check_compatible_list(table, categoricals, "categoricals")

    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=_make_callable(store),
        factory=factory,
        load_dataset_metadata=False,
    )
    partition_frames = read_dataset_as_dataframes(
        tables=[table],
        columns=columns,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        categoricals=categoricals,
        label_filter=label_filter,
        dates_as_object=dates_as_object,
        predicates=predicates,
        factory=dataset_factory,
    )

    # An empty frame built from the table schema goes last into the
    # concatenation; it also serves as the reference for the column order.
    if columns is not None:
        schema_columns = columns[table]
    else:
        schema_columns = None
    schema_frame = empty_dataframe_from_schema(
        schema=dataset_factory.table_meta[table], columns=schema_columns,
    )

    frames = [frames_by_table[table] for frames_by_table in partition_frames]
    frames.append(schema_frame)

    # require meta 4 otherwise, can't construct types/columns
    if categoricals:
        frames = align_categories(frames, categoricals[table])

    combined = pd.concat(frames, ignore_index=True, sort=False)

    # ensure column order
    if len(schema_frame.columns) > 0:
        combined = combined.reindex(schema_frame.columns, copy=False, axis=1)

    return combined
def read_dataset_as_delayed_metapartitions(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
):
    """
    A collection of dask.delayed objects to retrieve a dataset from store where each
    partition is loaded as a :class:`~kartothek.io_components.metapartition.MetaPartition`.

    .. seealso:

        :func:`~kartothek.io.dask.read_dataset_as_delayed`

    Parameters
    ----------

    """
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )
    factory_store = dataset_factory.store_factory

    dispatched = dispatch_metapartitions_from_factory(
        dataset_factory=dataset_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
    )

    # Both loading strategies receive exactly the same options
    load_options = dict(
        store=factory_store,
        tables=tables,
        columns=columns,
        categoricals=categoricals,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        dates_as_object=dates_as_object,
        predicates=predicates,
    )
    if concat_partitions_on_primary_index or dispatch_by:
        loaded = _load_and_concat_metapartitions(dispatched, **load_options)
    else:
        loaded = map_delayed(
            dispatched, MetaPartition.load_dataframes, **load_options
        )

    index_categories = _maybe_get_categoricals_from_index(
        dataset_factory, categoricals
    )
    if not index_categories:
        return loaded

    # One casting function per table with index categories; the mapping
    # uses `_identity` as default factory for all other tables.
    table_functions = defaultdict(_identity)
    for table, cats in index_categories.items():
        table_functions[table] = partial(
            _cast_categorical_to_index_cat, categories=cats
        )
    return map_delayed(loaded, MetaPartition.apply, table_functions, type_safe=True)
def read_dataset_as_metapartitions_bag(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
    partition_size=None,
    dispatch_metadata=True,
):
    """
    Retrieve dataset as `dask.bag.Bag` of `MetaPartition` objects.

    A ``DeprecationWarning`` is emitted if the dataset has more than one table.

    Parameters
    ----------

    Returns
    -------
    dask.bag.Bag:
        A dask.bag object containing the metapartions.
    """
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )

    if len(dataset_factory.tables) > 1:
        warnings.warn(
            "Trying to read a dataset with multiple internal tables. This functionality will be removed in the next "
            "major release. If you require a multi tabled data format, we recommend to switch to the kartothek Cube "
            "functionality. "
            "https://kartothek.readthedocs.io/en/stable/guide/cube/kartothek_cubes.html",
            DeprecationWarning,
        )

    factory_store = dataset_factory.store_factory
    dispatched = dispatch_metapartitions_from_factory(
        dataset_factory=dataset_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
        dispatch_metadata=dispatch_metadata,
    )
    bag = db.from_sequence(dispatched, partition_size=partition_size)

    # Grouped dispatching needs the loader which also concatenates the
    # group members; everything else is identical for both loaders.
    needs_concat = concat_partitions_on_primary_index or dispatch_by is not None
    if needs_concat:
        loader = _load_and_concat_metapartitions_inner
    else:
        loader = MetaPartition.load_dataframes
    bag = bag.map(
        loader,
        store=factory_store,
        tables=tables,
        columns=columns,
        categoricals=categoricals,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        dates_as_object=dates_as_object,
        predicates=predicates,
    )

    index_categories = _maybe_get_categoricals_from_index(
        dataset_factory, categoricals
    )
    if not index_categories:
        return bag

    # One casting function per table with index categories; the mapping
    # uses `_identity` as default factory for all other tables.
    table_functions = defaultdict(_identity)
    for table, cats in index_categories.items():
        table_functions[table] = partial(
            _cast_categorical_to_index_cat, categories=cats
        )
    return bag.map(MetaPartition.apply, table_functions, type_safe=True)
def read_dataset_as_metapartitions_bag(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
    partition_size=None,
):
    """
    Retrieve dataset as `dask.bag` of `MetaPartition` objects.

    Parameters
    ----------

    Returns
    -------
    A dask.bag object containing the metapartions.
    """
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )
    factory_store = dataset_factory.store_factory

    dispatched = dispatch_metapartitions_from_factory(
        dataset_factory=dataset_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
    )
    bag = db.from_sequence(dispatched, partition_size=partition_size)

    # The load options are the same no matter which loader is used
    load_options = dict(
        store=factory_store,
        tables=tables,
        columns=columns,
        categoricals=categoricals,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        dates_as_object=dates_as_object,
        predicates=predicates,
    )
    if concat_partitions_on_primary_index or dispatch_by:
        bag = bag.map(_load_and_concat_metapartitions_inner, **load_options)
    else:
        bag = bag.map(MetaPartition.load_dataframes, **load_options)

    index_categories = _maybe_get_categoricals_from_index(
        dataset_factory, categoricals
    )
    if index_categories:
        # One casting function per table with index categories; the mapping
        # uses `_identity` as default factory for all other tables.
        table_functions = defaultdict(_identity)
        for table, cats in index_categories.items():
            table_functions[table] = partial(
                _cast_categorical_to_index_cat, categories=cats
            )
        bag = bag.map(MetaPartition.apply, table_functions, type_safe=True)
    return bag
def read_dataset_as_metapartitions(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
    dispatch_metadata=True,
):
    """
    Read a dataset as a list of :class:`kartothek.io_components.metapartition.MetaPartition`.

    Every element of the list corresponds to a physical partition. A
    ``DeprecationWarning`` is emitted if the dataset has more than one table.

    Parameters
    ----------

    Returns
    -------
    List[kartothek.io_components.metapartition.MetaPartition]
        The metapartitions produced by the iterator based reader, collected
        into a list

    Examples
    --------
    Dataset in store contains two partitions with two files each

    .. code ::

        >>> import storefact
        >>> from kartothek.io.eager import read_dataset_as_metapartitions

        >>> store = storefact.get_store_from_url('s3://bucket_with_dataset')

        >>> list_mps = read_dataset_as_metapartitions('dataset_uuid', store, 'core')

    """
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    if len(dataset_factory.tables) > 1:
        warnings.warn(
            "Trying to read a dataset with multiple internal tables. This functionality will be removed in the next "
            "major release. If you require a multi tabled data format, we recommend to switch to the kartothek Cube "
            "functionality. "
            "https://kartothek.readthedocs.io/en/stable/guide/cube/kartothek_cubes.html",
            DeprecationWarning,
        )

    from .iter import read_dataset_as_metapartitions__iterator

    # The iterator does the actual reading; all options are passed through
    iterator_options = dict(
        tables=tables,
        columns=columns,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        categoricals=categoricals,
        label_filter=label_filter,
        dates_as_object=dates_as_object,
        predicates=predicates,
        factory=dataset_factory,
        dispatch_by=dispatch_by,
        dispatch_metadata=dispatch_metadata,
    )
    return list(read_dataset_as_metapartitions__iterator(**iterator_options))
def read_dataset_as_delayed_metapartitions(
    dataset_uuid=None,
    store=None,
    columns=None,
    predicate_pushdown_to_io=True,
    categoricals: Optional[Sequence[str]] = None,
    dates_as_object: bool = True,
    predicates=None,
    factory=None,
    dispatch_by=None,
):
    """
    A collection of dask.delayed objects to retrieve a dataset from store where each
    partition is loaded as a :class:`~kartothek.io_components.metapartition.MetaPartition`.

    .. seealso:

        :func:`~kartothek.io.dask.read_dataset_as_delayed`

    Parameters
    ----------

    """
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid, store=store, factory=factory,
    )
    factory_store = dataset_factory.store_factory

    dispatched = dispatch_metapartitions_from_factory(
        dataset_factory=dataset_factory, predicates=predicates, dispatch_by=dispatch_by,
    )

    # Both loading strategies receive exactly the same options
    load_options = dict(
        store=factory_store,
        columns=columns,
        categoricals=categoricals,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        dates_as_object=dates_as_object,
        predicates=predicates,
    )
    if dispatch_by is not None:
        loaded = _load_and_concat_metapartitions(dispatched, **load_options)
    else:
        loaded = map_delayed(
            MetaPartition.load_dataframes, dispatched, **load_options
        )

    index_categories = _maybe_get_categoricals_from_index(
        dataset_factory, categoricals
    )
    if index_categories:
        # Cast the categoricals of every metapartition to the categories
        # obtained from the index.
        cast_to_index_categories = partial(  # type: ignore
            _cast_categorical_to_index_cat, categories=index_categories
        )
        apply_cast = partial(  # type: ignore
            MetaPartition.apply, func=cast_to_index_categories, type_safe=True,
        )
        loaded = map_delayed(apply_cast, loaded)

    return list(loaded)
def read_dataset_as_ddf(
    dataset_uuid=None,
    store=None,
    table=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    predicates=None,
    factory=None,
    dask_index_on=None,
):
    """
    Retrieve a single table from a dataset as partition-individual :class:`~dask.dataframe.DataFrame` instance.

    Please take care when using categoricals with Dask. For index columns, this function will construct dataset
    wide categoricals. For all other columns, Dask will determine the categories on a partition level and will
    need to merge them when shuffling data.

    Parameters
    ----------
    dask_index_on: str
        Reconstruct (and set) a dask index on the provided index column.

        For details on performance, see also `dispatch_by`

    Raises
    ------
    TypeError
        If ``dask_index_on`` is given but is not a string.
    """
    index_column_is_invalid = dask_index_on is not None and not isinstance(
        dask_index_on, str
    )
    if index_column_is_invalid:
        raise TypeError(
            f"The paramter `dask_index_on` must be a string but got {type(dask_index_on)}"
        )

    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    # A per-table column mapping is narrowed down to the requested table
    if isinstance(columns, dict):
        columns = columns[table]

    frame_meta = _get_dask_meta_for_dataset(
        dataset_factory, table, columns, categoricals, dates_as_object
    )

    # Without an explicit selection, read every column known to the meta
    if columns is None:
        columns = list(frame_meta.columns)

    partition_tasks = read_table_as_delayed(
        factory=dataset_factory,
        table=table,
        columns=columns,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        categoricals={table: categoricals},
        label_filter=label_filter,
        dates_as_object=dates_as_object,
        predicates=predicates,
        dispatch_by=dask_index_on,
    )

    if not dask_index_on:
        return dd.from_delayed(partition_tasks, meta=frame_meta)

    # The sorted observed index values serve as divisions; the last value
    # is repeated to close the final division.
    # NOTE(review): presumably one partition is dispatched per observed
    # value, so that the number of divisions fits - confirm.
    divisions = dataset_factory.indices[dask_index_on].observed_values()
    divisions.sort()
    divisions = list(divisions)
    divisions.append(divisions[-1])

    frame = dd.from_delayed(partition_tasks, meta=frame_meta, divisions=divisions)
    return frame.set_index(dask_index_on, divisions=divisions, sorted=True)
def read_dataset_as_delayed_metapartitions(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
    dispatch_metadata=True,
):
    """
    Build a list of dask.delayed objects, each of which loads one partition
    of the dataset as a
    :class:`~kartothek.io_components.metapartition.MetaPartition`.

    .. seealso:: :func:`~kartothek.io.dask.read_dataset_as_delayed`

    Parameters
    ----------
    dataset_uuid, store, factory, load_dataset_metadata
        Passed to ``_ensure_factory`` to resolve the dataset factory.
    concat_partitions_on_primary_index, label_filter, dispatch_by, dispatch_metadata
        Passed to ``dispatch_metapartitions_from_factory``. If
        `concat_partitions_on_primary_index` is set or `dispatch_by` is not
        ``None``, the dispatched items are loaded and concatenated via
        ``_load_and_concat_metapartitions``.
    tables, columns, categoricals, predicate_pushdown_to_io, dates_as_object
        Loading options handed to the load step of every partition.
    predicates
        Used both for dispatching and for loading.

    Returns
    -------
    list
        The delayed metapartitions.
    """
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )

    if len(dataset_factory.tables) > 1:
        warnings.warn(
            "Trying to read a dataset with multiple internal tables. This functionality will be removed in the next "
            "major release. If you require a multi tabled data format, we recommend to switch to the kartothek Cube "
            "functionality. \n"
            "https://kartothek.readthedocs.io/en/stable/guide/cube/kartothek_cubes.html",
            DeprecationWarning,
        )

    delayed_mps = dispatch_metapartitions_from_factory(
        dataset_factory=dataset_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
        dispatch_metadata=dispatch_metadata,
    )

    # Both load paths take exactly the same options; the store handed to the
    # delayed tasks is the factory's store factory, not the `store` argument.
    load_options = dict(
        store=dataset_factory.store_factory,
        tables=tables,
        columns=columns,
        categoricals=categoricals,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        dates_as_object=dates_as_object,
        predicates=predicates,
    )
    if concat_partitions_on_primary_index or dispatch_by is not None:
        delayed_mps = _load_and_concat_metapartitions(delayed_mps, **load_options)
    else:
        delayed_mps = map_delayed(
            MetaPartition.load_dataframes, delayed_mps, **load_options
        )

    index_categories = _maybe_get_categoricals_from_index(
        dataset_factory, categoricals
    )
    if index_categories:
        # Tables without an explicit entry fall back to `_identity`.
        casters = defaultdict(_identity)
        for table, cats in index_categories.items():
            casters[table] = partial(_cast_categorical_to_index_cat, categories=cats)
        delayed_mps = map_delayed(
            partial(MetaPartition.apply, func=casters, type_safe=True), delayed_mps
        )

    return list(delayed_mps)
def read_dataset_as_ddf(
    dataset_uuid=None,
    store=None,
    table=SINGLE_TABLE,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    predicates=None,
    factory=None,
    dask_index_on=None,
    dispatch_by=None,
):
    """
    Load one table of a dataset as a :class:`~dask.dataframe.DataFrame`
    built from one delayed object per dispatched partition.

    Please take care when using categoricals with Dask. For index columns,
    this function will construct dataset wide categoricals. For all other
    columns, Dask will determine the categories on a partition level and
    will need to merge them when shuffling data.

    Parameters
    ----------
    dataset_uuid, store, factory
        Passed to ``_ensure_factory`` to resolve the dataset factory.
    table
        Name of the table to read. Also used to pick the entry of `columns`
        when `columns` is given as a dictionary.
    columns
        Columns to read. A dictionary is reduced to ``columns[table]``;
        ``None`` selects every column of the computed dask meta.
    dask_index_on: str
        Reconstruct (and set) a dask index on the provided index column.
        Cannot be used in conjunction with `dispatch_by`.
    dispatch_by
        Forwarded to ``read_table_as_delayed`` when `dask_index_on` is not
        set; otherwise `dask_index_on` is forwarded in its place.

    Raises
    ------
    TypeError
        If `dask_index_on` is neither ``None`` nor a string.
    ValueError
        If `dask_index_on` is given together with a non-empty `dispatch_by`.
    """
    if not (dask_index_on is None or isinstance(dask_index_on, str)):
        raise TypeError(
            f"The paramter `dask_index_on` must be a string but got {type(dask_index_on)}"
        )

    conflicting_dispatch = (
        dask_index_on is not None
        and dispatch_by is not None
        and len(dispatch_by) > 0
    )
    if conflicting_dispatch:
        raise ValueError(
            "`read_dataset_as_ddf` got parameters `dask_index_on` and `dispatch_by`. "
            "Note that `dispatch_by` can only be used if `dask_index_on` is None."
        )

    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    if len(dataset_factory.tables) > 1:
        warnings.warn(
            "Trying to read a dataset with multiple internal tables. This functionality will be removed in the next "
            "major release. If you require a multi tabled data format, we recommend to switch to the kartothek Cube "
            "functionality. \n"
            "https://kartothek.readthedocs.io/en/stable/guide/cube/kartothek_cubes.html",
            DeprecationWarning,
        )

    # A per-table column mapping is narrowed down to the requested table.
    selected_columns = columns[table] if isinstance(columns, dict) else columns
    meta = _get_dask_meta_for_dataset(
        dataset_factory, table, selected_columns, categoricals, dates_as_object
    )
    if selected_columns is None:
        selected_columns = list(meta.columns)

    delayed_partitions = read_table_as_delayed(
        factory=dataset_factory,
        table=table,
        columns=selected_columns,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        categoricals={table: categoricals},
        label_filter=label_filter,
        dates_as_object=dates_as_object,
        predicates=predicates,
        # A requested dask index takes the place of `dispatch_by`.
        dispatch_by=dask_index_on or dispatch_by,
    )

    if not dask_index_on:
        return dd.from_delayed(delayed_partitions, meta=meta)

    # Divisions are the sorted observed index values, with the last value
    # repeated once at the end.
    observed = dataset_factory.indices[dask_index_on].observed_values()
    observed.sort()
    divisions = list(observed)
    divisions.append(divisions[-1])
    ddf = dd.from_delayed(delayed_partitions, meta=meta, divisions=divisions)
    return ddf.set_index(dask_index_on, divisions=divisions, sorted=True)
def collect_dataset_metadata(
    store: Optional[StoreInput] = None,
    dataset_uuid: Optional[str] = None,
    table_name: str = SINGLE_TABLE,
    predicates: Optional[PredicatesType] = None,
    frac: float = 1.0,
    factory: Optional[DatasetFactory] = None,
) -> dd.DataFrame:
    """
    Gather the parquet metadata of a dataset, optionally from a random
    sample of its partitions.

    .. warning::
        If the size of the partitions is not evenly distributed, e.g. some
        partitions might be larger than others, the metadata returned is not
        a good approximation for the whole dataset metadata.

    .. warning::
        Using the `frac` parameter is not encouraged for a small number of
        total partitions.

    Parameters
    ----------
    store, dataset_uuid, factory
        Passed to ``_ensure_factory`` to resolve the dataset factory.
    table_name
        Table for which ``MetaPartition.get_parquet_metadata`` is evaluated.
    predicates
        Kartothek predicates to apply filters on the data for which to
        gather statistics.

        .. warning::
            Filtering will only be applied for predicates on indices. The
            evaluation of the predicates will therefore only return an
            approximate result.

    frac
        Fraction of the total number of partitions to use for gathering
        statistics. `frac == 1.0` will use all partitions. At least one
        partition is always used when the dataset has any.

    Returns
    -------
    dask.dataframe.DataFrame:
        A dask.DataFrame containing the following information about dataset
        statistics:

        * `partition_label`: File name of the parquet file, unique to each physical partition.
        * `row_group_id`: Index of the row groups within one parquet file.
        * `row_group_compressed_size`: Byte size of the data within one row group.
        * `row_group_uncompressed_size`: Byte size (uncompressed) of the data within one row group.
        * `number_rows_total`: Total number of rows in one parquet file.
        * `number_row_groups`: Number of row groups in one parquet file.
        * `serialized_size`: Serialized size of the parquet file.
        * `number_rows_per_row_group`: Number of rows per row group.

        If no metapartition is dispatched, an empty frame with the metadata
        schema is returned.

    Raises
    ------
    ValueError
        If `frac` is not within ``(0.0, 1.0]``.
    """
    if not 0.0 < frac <= 1.0:
        raise ValueError(
            f"Invalid value for parameter `frac`: {frac}."
            "Please make sure to provide a value larger than 0.0 and smaller than or equal to 1.0 ."
        )

    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )
    metapartitions = list(
        dispatch_metapartitions_from_factory(dataset_factory, predicates=predicates)
    )

    if not metapartitions:
        # Nothing to inspect: hand back an empty, correctly typed frame.
        empty = pd.DataFrame(columns=_METADATA_SCHEMA.keys())
        empty = empty.astype(_METADATA_SCHEMA)
        return dd.from_pandas(empty, npartitions=1)

    # Random sample of the partitions; never fewer than one.
    random.shuffle(metapartitions)
    sample_size = max(1, int(len(metapartitions) * frac))
    sampled = metapartitions[:sample_size]

    delayed_stats = [
        dask.delayed(MetaPartition.get_parquet_metadata)(
            mp, store=dataset_factory.store_factory, table_name=table_name
        )
        for mp in sampled
    ]
    return dd.from_delayed(delayed_stats, meta=_METADATA_SCHEMA)
def read_dataset_as_metapartitions__iterator(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
):
    """
    A Python iterator to retrieve a dataset from store where each partition
    is loaded as a
    :class:`~kartothek.io_components.metapartition.MetaPartition`.

    .. seealso:: :func:`~kartothek.io_components.read.read_dataset_as_dataframes__iterator`

    Parameters
    ----------
    dataset_uuid, store, factory, load_dataset_metadata
        Passed to ``_ensure_factory`` to resolve the dataset factory.
    concat_partitions_on_primary_index, label_filter
        Passed to ``dispatch_metapartitions_from_factory``. When
        `concat_partitions_on_primary_index` is set, every dispatched item is
        iterated as a group of metapartitions that are loaded individually
        and concatenated afterwards.
    tables, columns, categoricals, predicate_pushdown_to_io, dates_as_object
        Loading options passed to ``MetaPartition.load_dataframes``.
    predicates
        Used both for dispatching and for loading.

    Yields
    ------
    The loaded (and, if requested, concatenated) metapartitions, one at a
    time.
    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )
    store = ds_factory.store

    mps = dispatch_metapartitions_from_factory(
        ds_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
    )

    # One set of loading options for both branches. Previously the concat
    # branch omitted `dates_as_object`, so the argument was silently ignored
    # whenever partitions were concatenated.
    load_kwargs = dict(
        store=store,
        tables=tables,
        columns=columns,
        categoricals=categoricals,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        dates_as_object=dates_as_object,
        predicates=predicates,
    )

    for mp in mps:
        if concat_partitions_on_primary_index:
            mp = MetaPartition.concat_metapartitions(
                [mp_inner.load_dataframes(**load_kwargs) for mp_inner in mp]
            )
        else:
            mp = mp.load_dataframes(**load_kwargs)
        yield mp
def read_dataset_as_metapartitions__iterator(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
    dispatch_metadata=True,
):
    """
    A Python iterator to retrieve a dataset from store where each partition
    is loaded as a
    :class:`~kartothek.io_components.metapartition.MetaPartition`.

    .. seealso:: :func:`~kartothek.io_components.read.read_dataset_as_dataframes__iterator`

    Parameters
    ----------
    dataset_uuid, store, factory, load_dataset_metadata
        Passed to ``_ensure_factory`` to resolve the dataset factory.
    concat_partitions_on_primary_index, label_filter, dispatch_by, dispatch_metadata
        Passed to ``dispatch_metapartitions_from_factory``. When
        `concat_partitions_on_primary_index` is set or `dispatch_by` is not
        ``None``, every dispatched item is iterated as a group of
        metapartitions that are loaded individually and concatenated
        afterwards.
    tables, columns, categoricals, predicate_pushdown_to_io, dates_as_object
        Loading options passed to ``MetaPartition.load_dataframes``.
    predicates
        Used both for dispatching and for loading.

    Yields
    ------
    The loaded (and, if requested, concatenated) metapartitions, one at a
    time.
    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )

    if len(ds_factory.tables) > 1:
        warnings.warn(
            "Trying to read a dataset with multiple internal tables. This functionality will be removed in the next "
            "major release. If you require a multi tabled data format, we recommend to switch to the kartothek Cube "
            "functionality. "
            "https://kartothek.readthedocs.io/en/stable/guide/cube/kartothek_cubes.html",
            DeprecationWarning,
        )

    store = ds_factory.store

    mps = dispatch_metapartitions_from_factory(
        ds_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
        dispatch_metadata=dispatch_metadata,
    )

    # One set of loading options for both branches. Previously the concat
    # branch omitted `dates_as_object`, so the argument was silently ignored
    # whenever partitions were concatenated or dispatched by index.
    load_kwargs = dict(
        store=store,
        tables=tables,
        columns=columns,
        categoricals=categoricals,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        dates_as_object=dates_as_object,
        predicates=predicates,
    )

    for mp in mps:
        if concat_partitions_on_primary_index or dispatch_by is not None:
            mp = MetaPartition.concat_metapartitions(
                [mp_inner.load_dataframes(**load_kwargs) for mp_inner in mp]
            )
        else:
            mp = cast(MetaPartition, mp)
            mp = mp.load_dataframes(**load_kwargs)
        yield mp
def read_dataset_as_dataframes(
    dataset_uuid: Optional[str] = None,
    store=None,
    tables: Optional[List[str]] = None,
    columns: Dict[str, List[str]] = None,
    concat_partitions_on_primary_index: bool = False,
    predicate_pushdown_to_io: bool = True,
    categoricals: Dict[str, List[str]] = None,
    label_filter: Callable = None,
    dates_as_object: bool = False,
    predicates: Optional[List[List[Tuple[str, str, Any]]]] = None,
    factory: Optional[DatasetFactory] = None,
    dispatch_by: Optional[List[str]] = None,
) -> List[pd.DataFrame]:
    """
    Eagerly read a dataset and return the loaded data of every partition.

    The dataset factory is resolved with its metadata loaded, the partitions
    are read through ``read_dataset_as_metapartitions`` (with
    ``dispatch_metadata=False``) and the ``data`` attribute of each resulting
    metapartition is collected.

    Parameters
    ----------
    dataset_uuid, store, factory
        Passed to ``_ensure_factory`` to resolve the dataset factory.
    tables, columns, concat_partitions_on_primary_index, predicate_pushdown_to_io, categoricals, label_filter, dates_as_object, predicates, dispatch_by
        Forwarded unchanged to ``read_dataset_as_metapartitions``.

    Returns
    -------
    List[pandas.DataFrame]
        Returns a list of pandas.DataFrame. One element per partition

    Examples
    --------
    Dataset in store contains two partitions with two files each

    .. code ::

        >>> import storefact
        >>> from kartothek.io.eager import read_dataset_as_dataframes

        >>> store = storefact.get_store_from_url('s3://bucket_with_dataset')

        >>> dfs = read_dataset_as_dataframes('dataset_uuid', store, 'core')

    """
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=True,
    )

    metapartitions = read_dataset_as_metapartitions(
        factory=dataset_factory,
        tables=tables,
        columns=columns,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        categoricals=categoricals,
        label_filter=label_filter,
        dates_as_object=dates_as_object,
        predicates=predicates,
        dispatch_by=dispatch_by,
        dispatch_metadata=False,
    )

    # Only the loaded data is of interest to the caller.
    loaded = []
    for metapartition in metapartitions:
        loaded.append(metapartition.data)
    return loaded
def read_dataset_as_metapartitions(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
    dispatch_metadata=True,
):
    """
    Read a dataset as a list of
    :class:`kartothek.io_components.metapartition.MetaPartition`.

    This is the eager counterpart of
    ``read_dataset_as_metapartitions__iterator``: the iterator is created
    and fully consumed into a list.

    Parameters
    ----------
    dataset_uuid, store, factory
        Passed to ``_ensure_factory`` to resolve the dataset factory.
    tables, columns, concat_partitions_on_primary_index, predicate_pushdown_to_io, categoricals, label_filter, dates_as_object, predicates, dispatch_by
        Forwarded unchanged to ``read_dataset_as_metapartitions__iterator``.
    dispatch_metadata
        Forwarded unchanged to ``read_dataset_as_metapartitions__iterator``.
        Accepted here because ``read_dataset_as_dataframes`` calls this
        function with ``dispatch_metadata=False``.

    Returns
    -------
    List[kartothek.io_components.metapartition.MetaPartition]
        The metapartitions yielded by the iterator, collected into a list.

    Examples
    --------
    Dataset in store contains two partitions with two files each

    .. code ::

        >>> import storefact
        >>> from kartothek.io.eager import read_dataset_as_metapartitions

        >>> store = storefact.get_store_from_url('s3://bucket_with_dataset')

        >>> list_mps = read_dataset_as_metapartitions('dataset_uuid', store, 'core')

    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    # Imported inside the function, as in the original implementation.
    from .iter import read_dataset_as_metapartitions__iterator

    ds_iter = read_dataset_as_metapartitions__iterator(
        tables=tables,
        columns=columns,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        categoricals=categoricals,
        label_filter=label_filter,
        dates_as_object=dates_as_object,
        predicates=predicates,
        factory=ds_factory,
        dispatch_by=dispatch_by,
        dispatch_metadata=dispatch_metadata,
    )
    return list(ds_iter)