def hash_dataset(
    store: Optional[StoreInput] = None,
    dataset_uuid: Optional[str] = None,
    subset=None,
    group_key=None,
    table: str = SINGLE_TABLE,
    predicates: Optional[PredicatesType] = None,
    factory: Optional[DatasetFactory] = None,
) -> dd.Series:
    """
    Calculate a partition-wise, or group-wise, hash of the dataset.

    .. note::

        We do not guarantee that the hash values remain constant across versions.

    Example output::

        Assuming a dataset with two unique values in column `P` this gives

        >>> hash_dataset(factory=dataset_with_index_factory, group_key=["P"]).compute()
        P
        1    11462879952839863487
        2    12568779102514529673
        dtype: uint64

    Parameters
    ----------
    subset
        If provided, only take these columns into account when hashing the dataset.
    group_key
        If provided, calculate the hash per group instead of per partition.
    """
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    columns = subset
    if subset and group_key:
        columns = sorted(set(subset) | set(group_key))
    ddf = read_dataset_as_ddf(
        table=table,
        predicates=predicates,
        factory=dataset_factory,
        columns=columns,
        dates_as_object=True,
    )
    if not group_key:
        return ddf.map_partitions(_hash_partition, meta="uint64").astype("uint64")
    else:
        ddf2 = pack_payload(ddf, group_key=group_key)
        return (
            ddf2.groupby(group_key)
            .apply(_unpack_hash, unpack_meta=ddf._meta, subset=subset, meta="uint64")
            .astype("uint64")
        )
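
# Usage sketch (not part of the library): hash two datasets group-wise over the
# same column subset and compare the results. ``store_factory``, the dataset
# UUIDs ``"ds_left"``/``"ds_right"`` and the column names ``"P"``/``"X"`` are
# hypothetical placeholders for an existing store (factory), dataset UUIDs and
# columns in your data.
def _example_compare_dataset_hashes(store_factory):
    left = hash_dataset(
        store=store_factory,
        dataset_uuid="ds_left",
        subset=["P", "X"],
        group_key=["P"],
    ).compute()
    right = hash_dataset(
        store=store_factory,
        dataset_uuid="ds_right",
        subset=["P", "X"],
        group_key=["P"],
    ).compute()
    # Identical content per group yields identical uint64 hashes.
    return left.sort_index().equals(right.sort_index())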
def test_pack_payload_roundtrip(df_all_types, num_group_cols):
    group_key = list(df_all_types.columns[-num_group_cols:])
    df_all_types = dd.from_pandas(df_all_types, npartitions=2)
    pdt.assert_frame_equal(
        df_all_types.compute(),
        unpack_payload(
            pack_payload(df_all_types, group_key=group_key),
            unpack_meta=df_all_types._meta,
        ).compute(),
    )
def test_pack_payload_empty(df_all_types):
    # For tiny (empty or single-row) dataframes packing may even add a few
    # bytes, so here we only check that pack/unpack round-trips cleanly.
    df_empty = dd.from_pandas(df_all_types.iloc[:0], npartitions=1)
    group_key = [df_all_types.columns[-1]]
    pdt.assert_frame_equal(
        df_empty.compute(),
        unpack_payload(
            pack_payload(df_empty, group_key=group_key),
            unpack_meta=df_empty._meta,
        ).compute(),
    )
def test_pack_payload(df_all_types):
    # Packing only pays off beyond a handful of rows; for a single-row
    # dataframe it actually adds a few bytes. Concatenate copies so the size
    # reduction becomes measurable.
    df = dd.from_pandas(
        pd.concat([df_all_types] * 10, ignore_index=True), npartitions=3
    )

    size_before = df.memory_usage(deep=True).sum()

    packed_df = pack_payload(df, group_key=list(df.columns[-2:]))

    size_after = packed_df.memory_usage(deep=True).sum()

    assert (size_after < size_before).compute()
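
# Illustration only: the size reduction asserted above comes from collapsing
# all payload columns of a group into one serialized bytes blob. A minimal
# sketch of that idea on a plain pandas DataFrame, assuming only
# ``distributed.protocol.serialize_bytes`` (this is a simplification, not
# kartothek's actual ``pack_payload`` implementation). The reverse direction
# would deserialize each blob with ``deserialize_bytes`` and re-attach the
# group key columns.
def _pack_payload_sketch(df, group_key):
    from distributed.protocol import serialize_bytes

    payload_cols = [c for c in df.columns if c not in group_key]
    # One row per group: the group key columns plus a single bytes column
    # holding the serialized payload.
    return (
        df.groupby(group_key)
        .apply(lambda grp: serialize_bytes(grp[payload_cols]))
        .rename("__payload__")
        .reset_index()
    )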
def shuffle_store_dask_partitions(
    ddf: dd.DataFrame,
    table: str,
    secondary_indices: Optional[Union[str, Sequence[str]]],
    metadata_version: int,
    partition_on: List[str],
    store_factory: StoreFactory,
    df_serializer: Optional[DataFrameSerializer],
    dataset_uuid: str,
    num_buckets: int,
    sort_partitions_by: List[str],
    bucket_by: Sequence[str],
) -> da.Array:
    """
    Perform a dataset update with dask reshuffling to control partitioning.

    The shuffle operation will perform the following steps

    1. Pack payload data

       Payload data is serialized and compressed into a single byte value using
       ``distributed.protocol.serialize_bytes``, see also ``pack_payload``.

    2. Apply bucketing

       Hash the column subset ``bucket_by`` and distribute the hashes into
       ``num_buckets`` bins/buckets. Internally every bucket is identified by an
       integer and we will create one physical file for every bucket ID. The
       bucket ID is not exposed to the user and is dropped after the shuffle,
       before the store. This is done since, at the moment, we do not want to
       guarantee that the hash function remains stable.

    3. Perform shuffle (dask.DataFrame.groupby.apply)

       The groupby key will be the combination of ``partition_on`` fields and
       the hash bucket ID. This will create a physical file for every unique
       tuple in ``partition_on + bucket_ID``. The function which is applied to
       the dataframe will perform all necessary subtasks for storage of the
       dataset (partition_on, index calculation, etc.).

    4. Unpack data (within the apply-function)

       After the shuffle, the first step is to unpack the payload data since
       the follow-up tasks will require the full dataframe.

    5. Pre-storage processing and parquet serialization

       We apply important pre-storage processing like sorting the data and
       applying the final partitioning (at this point there should be only one
       group in the payload data, but using ``MetaPartition.partition_on``
       guarantees that the data structures kartothek expects are created).

       After the preprocessing is done, the data is serialized and stored as
       parquet. The applied function will return an (empty) MetaPartition with
       indices and metadata which will then be used to commit the dataset.

    Returns
    -------
    A dask.Array holding relevant MetaPartition objects as values
    """
    if ddf.npartitions == 0:
        return ddf

    group_cols = partition_on.copy()

    if num_buckets is None:
        raise ValueError("``num_buckets`` must not be None when shuffling data.")

    meta = ddf._meta
    meta[_KTK_HASH_BUCKET] = np.uint64(0)
    ddf = ddf.map_partitions(_hash_bucket, bucket_by, num_buckets, meta=meta)
    group_cols.append(_KTK_HASH_BUCKET)

    unpacked_meta = ddf._meta

    ddf = pack_payload(ddf, group_key=group_cols)
    ddf = ddf.groupby(by=group_cols)
    ddf = ddf.apply(
        partial(
            _unpack_store_partition,
            secondary_indices=secondary_indices,
            sort_partitions_by=sort_partitions_by,
            table=table,
            dataset_uuid=dataset_uuid,
            partition_on=partition_on,
            store_factory=store_factory,
            df_serializer=df_serializer,
            metadata_version=metadata_version,
            unpacked_meta=unpacked_meta,
        ),
        meta=("MetaPartition", "object"),
    )
    return ddf
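
# Illustration only: step 2 of the docstring above assigns every row to one of
# ``num_buckets`` hash buckets based on the ``bucket_by`` columns. A minimal
# sketch of that bucketing idea using ``pandas.util.hash_pandas_object`` (a
# simplification, not kartothek's internal ``_hash_bucket``; the column name
# ``"_bucket_id"`` is a placeholder):
def _hash_bucket_sketch(df, bucket_by, num_buckets):
    import numpy as np
    import pandas as pd

    # Row-wise uint64 hash of the bucketing columns, ignoring the index.
    hashes = pd.util.hash_pandas_object(df[bucket_by], index=False)
    df = df.copy()
    # The bucket ID becomes part of the groupby key for the shuffle and is
    # dropped again after the shuffle, before storing.
    df["_bucket_id"] = (hashes % num_buckets).astype(np.uint64)
    return df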