def test_update_dataset_from_ddf_empty(store_factory, shuffle):
    with pytest.raises(ValueError, match="Cannot store empty datasets"):
        update_dataset_from_ddf(
            dask.dataframe.from_delayed([], meta=(("a", int),)),
            store_factory,
            dataset_uuid="output_dataset_uuid",
            table="core",
            shuffle=shuffle,
            partition_on=["a"],
        ).compute()
def test_update_dataset_from_ddf_empty(store_factory, shuffle):
    with pytest.raises(ValueError) as exc_info:
        update_dataset_from_ddf(
            dask.dataframe.from_delayed([], meta=(("a", int),)),
            store_factory,
            dataset_uuid="output_dataset_uuid",
            table="core",
            shuffle=shuffle,
            partition_on=["a"],
        ).compute()

    assert str(exc_info.value) in [
        "Cannot store empty datasets",  # dask <= 2021.5.0
        "Cannot store empty datasets, partition_list must not be empty if in store mode.",  # dask > 2021.5.0 + shuffle == True
        "No data left to save outside partition columns",  # dask > 2021.5.0 + shuffle == False
    ]
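The accepted messages above track changes in the error text across dask releases. A minimal sketch of how the expected message could be selected explicitly instead of listing all variants, assuming `packaging` is installed (the helper name is illustrative, not part of the tests above):

import dask
from packaging.version import parse as parse_version


def expected_empty_dataset_message(shuffle):
    # Illustrative helper: pick the error text the test expects, based on the
    # installed dask version and the shuffle flag.
    if parse_version(dask.__version__) <= parse_version("2021.5.0"):
        return "Cannot store empty datasets"
    if shuffle:
        return (
            "Cannot store empty datasets, partition_list must not be empty "
            "if in store mode."
        )
    return "No data left to save outside partition columns"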
def partition_filt(self, filt, chunk_dfs=True):
    """Write partitioned dataset using kartothek"""
    if chunk_dfs:
        for i, df in enumerate(self.iter_df_chunks(filt)):
            if df is not None:
                print(f"... ...ktk repartitioning {self.dataset} ({filt}, chunk {i + 1})")
                graph = update_dataset_from_ddf(df, **self.ktk_kwargs)
                graph.compute()
    else:
        df = self.get_df(self.dataIds_by_filter[filt], self.filenames_by_filter[filt])
        if df is not None:
            print(f"... ...ktk repartitioning {self.dataset} ({filt})")
            graph = update_dataset_from_ddf(df, **self.ktk_kwargs)
            graph.compute()
def partition_filt(self, filt, chunk_dfs=False):
    """Write partitioned dataset using kartothek"""
    for i, df in enumerate(self.iter_df_chunks(filt, chunk_dfs=chunk_dfs)):
        if df is not None:
            print(f"... ...ktk repartitioning {self.dataset} ({filt}, chunk {i + 1})")
            graph = update_dataset_from_ddf(df, **self.ktk_kwargs)
            graph.compute()
def partition(self, chunk_by_filter=True, chunk_dfs=True):
    if chunk_by_filter:
        for filt in self.filters:
            self.partition_filt(filt, chunk_dfs=chunk_dfs)
    else:
        df = self.get_df(self.dataIds, self.filenames)
        print(f"... ...ktk repartitioning {self.dataset}")
        graph = update_dataset_from_ddf(df, **self.ktk_kwargs)
        graph.compute()
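`self.ktk_kwargs` is never shown in these snippets; it is assumed to collect the keyword arguments forwarded to `update_dataset_from_ddf`. A minimal sketch of what such a dict could contain, with placeholder values echoing the wrapper docstring further below (`writable_store`, `c_date`, and the bucket count are illustrative assumptions):

# Illustrative only: a possible set of keyword arguments forwarded to
# update_dataset_from_ddf via **self.ktk_kwargs.
writable_store = ...  # placeholder: a simplekv-compatible store instance
ktk_kwargs = dict(
    store=lambda: writable_store,        # zero-argument factory returning the store
    dataset_uuid="output_dataset_uuid",  # target dataset
    table="table",                       # table name within the dataset
    partition_on=["c_date"],             # primary partition column(s)
    shuffle=True,                        # shuffle rows into buckets per partition value
    num_buckets=4,                       # required when shuffle=True
)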
def test_update_shuffle_no_partition_on(store_factory, bucket_by):
    df = pd.DataFrame(
        {
            "range": np.arange(10),
            "range_duplicated": np.repeat(np.arange(2), 5),
            "random": np.random.randint(0, 100, 10),
        }
    )
    ddf = dd.from_pandas(df, npartitions=10)

    with pytest.raises(
        ValueError, match="``num_buckets`` must not be None when shuffling data."
    ):
        update_dataset_from_ddf(
            ddf,
            store_factory,
            dataset_uuid="output_dataset_uuid",
            table="table",
            shuffle=True,
            num_buckets=None,
            bucket_by=bucket_by,
        ).compute()

    res_default = update_dataset_from_ddf(
        ddf,
        store_factory,
        dataset_uuid="output_dataset_uuid_default",
        table="table",
        shuffle=True,
        bucket_by=bucket_by,
    ).compute()
    assert len(res_default.partitions) == 1

    res = update_dataset_from_ddf(
        ddf,
        store_factory,
        dataset_uuid="output_dataset_uuid",
        table="table",
        shuffle=True,
        num_buckets=2,
        bucket_by=bucket_by,
    ).compute()
    assert len(res.partitions) == 2
def update_dataset_from_ddf(self, ddf, **kwargs):
    """Thin wrapper around kartothek's ``update_dataset_from_ddf``.

    Typical keyword arguments::

        partition_on=["c_date"],
        num_buckets=num_buckets,
        shuffle=True,
        delete_scope=delete_scope
    """
    return update_dataset_from_ddf(
        ddf,
        store=lambda: self.writable_store,
        dataset_uuid=self.dataset_uuid,
        table=self.table,
        **kwargs,
    )
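A hedged usage sketch for this wrapper; `writer` (an instance of the class owning the method), `ddf` (a dask DataFrame), and the bucket count are placeholders for illustration:

# Illustrative call through the wrapper defined above.
graph = writer.update_dataset_from_ddf(
    ddf,
    partition_on=["c_date"],
    num_buckets=4,
    shuffle=True,
)
graph.compute()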
def test_delayed_as_delete_scope(store_factory, df_all_types):
    # Check that delayed objects are allowed as delete scope.
    tasks = update_dataset_from_ddf(
        dd.from_pandas(df_all_types, npartitions=1),
        store_factory,
        dataset_uuid="output_dataset_uuid",
        table="core",
        delete_scope=dask.delayed(_return_none)(),
    )
    s = pickle.dumps(tasks, pickle.HIGHEST_PROTOCOL)
    tasks = pickle.loads(s)
    tasks.compute()
def _update_dataset(partitions, secondary_indices=None, *args, **kwargs):
    if any(partitions):
        table_name = next(iter(dict(partitions[0]["data"]).keys()))
        delayed_partitions = [
            dask.delayed(_unwrap_partition)(part) for part in partitions
        ]
        partitions = dd.from_delayed(delayed_partitions)
    else:
        table_name = "core"
        partitions = None

    return update_dataset_from_ddf(
        partitions,
        *args,
        table=table_name,
        secondary_indices=secondary_indices,
        **kwargs,
    ).compute()
def _update_dataset(partitions, *args, **kwargs):
    if any(partitions):
        table_name = next(iter(dict(partitions[0]["data"]).keys()))
        delayed_partitions = [
            dask.delayed(_unwrap_partition)(part) for part in partitions
        ]
        partitions = dd.from_delayed(delayed_partitions)
    else:
        table_name = "core"
        partitions = None

    ddf = update_dataset_from_ddf(partitions, *args, table=table_name, **kwargs)

    s = pickle.dumps(ddf, pickle.HIGHEST_PROTOCOL)
    ddf = pickle.loads(s)
    return ddf.compute()
def _update_dataset(partitions, *args, **kwargs):
    # TODO: Simplify once parse_input_to_metapartition is removed / obsolete
    if isinstance(partitions, pd.DataFrame):
        partitions = dd.from_pandas(partitions, npartitions=1)
    elif partitions is not None:
        delayed_partitions = [dask.delayed(_id)(part) for part in partitions]
        partitions = dd.from_delayed(delayed_partitions)
    else:
        partitions = None

    ddf = update_dataset_from_ddf(partitions, *args, **kwargs)

    s = pickle.dumps(ddf, pickle.HIGHEST_PROTOCOL)
    ddf = pickle.loads(s)
    return ddf.compute()
def _update_dataset(partitions, *args, **kwargs):
    # TODO: fix the parsing below to adapt for all supported formats
    # (see: parse_input_to_metapartition)
    if any(partitions):
        table_name = next(iter(dict(partitions[0]["data"]).keys()))
        delayed_partitions = [
            dask.delayed(_unwrap_partition)(part) for part in partitions
        ]
        partitions = dd.from_delayed(delayed_partitions)
    else:
        table_name = "core"
        partitions = None

    ddf = update_dataset_from_ddf(partitions, *args, table=table_name, **kwargs)

    s = pickle.dumps(ddf, pickle.HIGHEST_PROTOCOL)
    ddf = pickle.loads(s)
    return ddf.compute()
def _update_dataset(partitions, *args, **kwargs):
    # TODO: Simplify once parse_input_to_metapartition is removed / obsolete
    if isinstance(partitions, pd.DataFrame):
        if "table" not in kwargs:
            kwargs["table"] = "core"
        partitions = dd.from_pandas(partitions, npartitions=1)
    elif any(partitions):
        kwargs["table"] = next(iter(dict(partitions[0]["data"]).keys()))
        delayed_partitions = [
            dask.delayed(_unwrap_partition)(part) for part in partitions
        ]
        partitions = dd.from_delayed(delayed_partitions)
    else:
        kwargs["table"] = "core"
        partitions = None

    ddf = update_dataset_from_ddf(partitions, *args, **kwargs)

    s = pickle.dumps(ddf, pickle.HIGHEST_PROTOCOL)
    ddf = pickle.loads(s)
    return ddf.compute()
def _update_dataset(partitions, *args, **kwargs):
    # TODO: Simplify once parse_input_to_metapartition is removed / obsolete
    if isinstance(partitions, pd.DataFrame):
        partitions = dd.from_pandas(partitions, npartitions=1)
    elif partitions is not None:
        delayed_partitions = [dask.delayed(_id)(part) for part in partitions]
        partitions = dd.from_delayed(delayed_partitions)
    else:
        partitions = None

    # Replace the `table_name` keyword argument with `table` to enable shared test
    # code via the `bound_update_dataset` fixture
    if "table_name" in kwargs:
        kwargs["table"] = kwargs["table_name"]
        del kwargs["table_name"]

    ddf = update_dataset_from_ddf(partitions, *args, **kwargs)

    s = pickle.dumps(ddf, pickle.HIGHEST_PROTOCOL)
    ddf = pickle.loads(s)
    return ddf.compute()
def dataset(store, dataset_uuid):
    df = pd.DataFrame(
        {
            "A": np.array([1, 2, 3, 4], dtype="int32"),
            "B": [
                pd.Timestamp("2002-01-01"),
                pd.Timestamp("2002-01-02"),
                pd.Timestamp("2002-01-03"),
                pd.Timestamp("2002-01-04"),
            ],
            "C": pd.Series(1, index=list(range(4)), dtype="double"),
            "D": ["test", "train", "test", "prod"],
        }
    )
    ddf = dd.from_pandas(df, npartitions=2)
    delayed = update_dataset_from_ddf(
        ddf,
        store=lambda: store,
        dataset_uuid=dataset_uuid,
        table="table",
        partition_on=["B"],
    )
    delayed.compute()
    yield
    for k in store.keys(prefix=dataset_uuid):
        store.delete(k)
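A hedged sketch of reading this fixture's dataset back, partition by partition; it assumes the iterator backend is importable from `kartothek.io.iter` and reuses the fixture's `store` and `dataset_uuid` names:

from kartothek.io.iter import read_dataset_as_dataframes__iterator

# Illustrative read-back: iterate over the stored partitions and inspect the
# DataFrame registered under the "table" table written by the fixture above.
for data_dct in read_dataset_as_dataframes__iterator(
    dataset_uuid=dataset_uuid, store=lambda: store
):
    print(data_dct["table"].head())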
def test_update_shuffle_buckets(
    store_factory,
    metadata_version,
    unique_primaries,
    unique_secondaries,
    num_buckets,
    repartition,
    npartitions,
    bucket_by,
):
    """
    Assert that certain properties always hold for the output dataset,
    regardless of how the input data is distributed.

    Properties to assert:

    * Every partition has a unique value for its corresponding primary key
    * The number of partitions is at least one per unique partition value and
      at most ``num_buckets`` per primary partition value
    * If a column is requested to be sorted, it is monotonic within each partition
    """
    primaries = np.arange(unique_primaries)
    secondary = np.arange(unique_secondaries)
    num_rows = 100
    primaries = np.repeat(primaries, np.ceil(num_rows / unique_primaries))[:num_rows]
    secondary = np.repeat(secondary, np.ceil(num_rows / unique_secondaries))[:num_rows]

    # ensure that there is an unsorted column uncorrelated to the primary and
    # secondary columns which can be sorted later on per partition
    unsorted_column = np.repeat(np.arange(100 / 10), 10)
    np.random.shuffle(unsorted_column)
    np.random.shuffle(primaries)
    np.random.shuffle(secondary)

    df = pd.DataFrame(
        {"primary": primaries, "secondary": secondary, "sorted_column": unsorted_column}
    )
    secondary_indices = ["secondary"]
    expected_num_indices = 2  # one for the primary partition column, one secondary index
    if bucket_by:
        secondary_indices.append(bucket_by)
        expected_num_indices = 3

    # shuffle all rows; properties of the result should be reproducible
    df = df.sample(frac=1).reset_index(drop=True)
    ddf = dd.from_pandas(df, npartitions=npartitions)

    dataset_comp = update_dataset_from_ddf(
        ddf,
        store_factory,
        dataset_uuid="output_dataset_uuid",
        table="core",
        secondary_indices=secondary_indices,
        shuffle=True,
        bucket_by=bucket_by,
        repartition_ratio=repartition,
        num_buckets=num_buckets,
        sort_partitions_by="sorted_column",
        default_metadata_version=metadata_version,
        partition_on=["primary"],
    )

    s = pickle.dumps(dataset_comp, pickle.HIGHEST_PROTOCOL)
    dataset_comp = pickle.loads(s)

    dataset = dataset_comp.compute()
    dataset = dataset.load_all_indices(store_factory())

    assert len(dataset.partitions) <= num_buckets * unique_primaries
    assert len(dataset.partitions) >= unique_primaries

    assert len(dataset.indices) == expected_num_indices

    assert set(dataset.indices["primary"].index_dct.keys()) == set(
        range(unique_primaries)
    )
    assert all(
        len(x) <= num_buckets for x in dataset.indices["primary"].index_dct.values()
    )

    assert set(dataset.indices["secondary"].index_dct.keys()) == set(
        range(unique_secondaries)
    )

    assert set(dataset.table_meta["core"].names) == {
        "primary",
        "secondary",
        "sorted_column",
    }

    factory = DatasetFactory("output_dataset_uuid", store_factory)
    factory.load_all_indices()

    if bucket_by:
        ind_df = factory.get_indices_as_dataframe(["primary", bucket_by])
        assert not ind_df.duplicated().any()

    for data_dct in read_dataset_as_dataframes__iterator(
        dataset_uuid=dataset.uuid, store=store_factory
    ):
        df = data_dct["core"]
        assert len(df.primary.unique()) == 1
        assert df.sorted_column.is_monotonic

    # update the dataset
    # do not use partition_on since it should be inferred from the existing dataset
    tasks = update_dataset_from_ddf(
        ddf,
        store_factory,
        dataset_uuid="output_dataset_uuid",
        table="core",
        shuffle=True,
        repartition_ratio=repartition,
        num_buckets=num_buckets,
        sort_partitions_by="sorted_column",
        default_metadata_version=metadata_version,
        bucket_by=bucket_by,
    )

    s = pickle.dumps(tasks, pickle.HIGHEST_PROTOCOL)
    tasks = pickle.loads(s)

    updated_dataset = tasks.compute()

    assert len(updated_dataset.partitions) == 2 * len(dataset.partitions)

    # Not allowed to use a different partition_on
    with pytest.raises(
        ValueError, match="Incompatible set of partition keys encountered."
    ):
        update_dataset_from_ddf(
            ddf,
            store_factory,
            dataset_uuid="output_dataset_uuid",
            table="core",
            shuffle=True,
            repartition_ratio=repartition,
            partition_on=["sorted_column"],
            num_buckets=num_buckets,
            sort_partitions_by="sorted_column",
            default_metadata_version=metadata_version,
        )

    # Not allowed to update with indices which do not yet exist in the dataset
    with pytest.raises(ValueError, match="indices"):
        update_dataset_from_ddf(
            ddf,
            store_factory,
            dataset_uuid="output_dataset_uuid",
            table="core",
            shuffle=True,
            partition_on=["primary"],
            repartition_ratio=repartition,
            secondary_indices=["sorted_column"],
            num_buckets=num_buckets,
            sort_partitions_by="sorted_column",
            default_metadata_version=metadata_version,
        )

    # Check that delayed objects are allowed as delete scope.
    tasks = update_dataset_from_ddf(
        None,
        store_factory,
        dataset_uuid="output_dataset_uuid",
        table="core",
        shuffle=True,
        repartition_ratio=repartition,
        num_buckets=num_buckets,
        sort_partitions_by="sorted_column",
        default_metadata_version=metadata_version,
        delete_scope=dask.delayed(_return_none)(),
        bucket_by=bucket_by,
    )

    s = pickle.dumps(tasks, pickle.HIGHEST_PROTOCOL)
    tasks = pickle.loads(s)
    tasks.compute()