def time_store_dataset_from_partitions(self, num_partitions, max_depth, num_leafs):
    store_dataset_from_partitions(
        partition_list=self.partitions,
        store=self.store,
        dataset_uuid=self.dataset_uuid,
        dataset_metadata=self.user_dataset_metadata,
    )

def test_collect_dataset_metadata_empty_dataset_mp(store_factory):
    mp = MetaPartition(label="cluster_1")
    store_dataset_from_partitions(
        partition_list=[mp], store=store_factory, dataset_uuid="dataset_uuid"
    )
    df_stats = collect_dataset_metadata(
        store=store_factory, dataset_uuid="dataset_uuid", table_name="table"
    ).compute()

    expected = pd.DataFrame(columns=_METADATA_SCHEMA.keys())
    expected = expected.astype(_METADATA_SCHEMA)
    pd.testing.assert_frame_equal(expected, df_stats, check_index_type=False)

def test_align_datasets_prefix__equal_number_of_partitions(
    dataset, evaluation_dataset, store_session
):
    """
    Test a scenario where the simple prefix-match algorithm did not find any
    matches when both datasets have an equal number of partitions.
    """
    # Create a reference dataset which matches the problem (equal number of
    # partitions and suitable for prefix matching)
    mp = MetaPartition(label="cluster_1_1", metadata_version=dataset.metadata_version)
    mp2 = MetaPartition(label="cluster_2_1", metadata_version=dataset.metadata_version)
    metapartitions = [mp, mp2]
    store_dataset_from_partitions(
        partition_list=metapartitions,
        dataset_uuid="reference_dataset_uuid",
        store=store_session,
    )

    generator = align_datasets(
        left_dataset_uuid=dataset.uuid,
        right_dataset_uuid="reference_dataset_uuid",
        store=store_session,
        match_how="prefix",
    )
    assert isinstance(generator, types.GeneratorType)
    list_metapartitions = list(generator)

    # Two separate cluster groups (e.g. cluster_1*)
    assert len(list_metapartitions) == 2

    mp_list = list_metapartitions[0]
    assert len(mp_list) == 2
    mp_list = list_metapartitions[1]
    assert len(mp_list) == 2

    # Test sorting of datasets by length, i.e. the order of dataframes differs
    generator = align_datasets(
        left_dataset_uuid=evaluation_dataset.uuid,
        right_dataset_uuid=dataset.uuid,
        store=store_session,
        match_how="prefix",
    )
    list_metapartitions = list(generator)
    mp_list = list_metapartitions[0]

def test_store_dataset_from_partitions(meta_partitions_files_only, store, frozen_time):
    dataset = store_dataset_from_partitions(
        partition_list=meta_partitions_files_only,
        dataset_uuid="dataset_uuid",
        store=store,
        dataset_metadata={"some": "metadata"},
    )

    expected_metadata = {"some": "metadata", "creation_time": TIME_TO_FREEZE_ISO}

    assert dataset.metadata == expected_metadata
    assert sorted(dataset.partitions.values(), key=lambda x: x.label) == sorted(
        [mp.partition for mp in meta_partitions_files_only], key=lambda x: x.label
    )
    assert dataset.uuid == "dataset_uuid"

    store_files = list(store.keys())
    # Dataset metadata: 1 file
    expected_number_files = 1
    # common metadata for v4 datasets
    expected_number_files += 1
    assert len(store_files) == expected_number_files

    # Ensure the dataset can be loaded properly
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    assert dataset == stored_dataset

def evaluation_dataset(meta_partitions_evaluation_files_only, store_session):
    with cm_frozen_time(TIME_TO_FREEZE):
        return store_dataset_from_partitions(
            partition_list=meta_partitions_evaluation_files_only,
            dataset_uuid="evaluation_uuid",
            store=store_session,
        )

def update_dataset_from_partitions(
    partition_list,
    store_factory,
    dataset_uuid,
    ds_factory,
    delete_scope,
    metadata,
    metadata_merger,
):
    store = ensure_store(store_factory)

    if ds_factory:
        # The dataset already exists: load its indices and determine which of
        # the existing partitions fall into the delete scope.
        ds_factory = ds_factory.load_all_indices()
        remove_partitions = _get_partitions(ds_factory, delete_scope)

        # Partition indices are derived from the partition labels and are
        # rebuilt on write, so drop them before merging indices into the update.
        index_columns = list(ds_factory.indices.keys())
        for column in index_columns:
            index = ds_factory.indices[column]
            if isinstance(index, PartitionIndex):
                del ds_factory.indices[column]
    else:
        # Dataset does not exist yet.
        remove_partitions = []

    new_dataset = store_dataset_from_partitions(
        partition_list=partition_list,
        store=store,
        dataset_uuid=dataset_uuid,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        update_dataset=ds_factory,
        remove_partitions=remove_partitions,
    )

    return new_dataset

def store_dataframes_as_dataset__iter(
    df_generator,
    store,
    dataset_uuid=None,
    metadata=None,
    partition_on=None,
    df_serializer=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
    secondary_indices=None,
):
    """
    Store `pd.DataFrame` objects iteratively as a partitioned dataset with
    multiple tables (files).

    Useful for datasets which do not fit into memory.

    Parameters
    ----------

    Returns
    -------
    dataset: kartothek.core.dataset.DatasetMetadata
        The stored dataset.
    """
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    new_partitions = []
    for df in df_generator:
        mp = parse_input_to_metapartition(df, metadata_version=metadata_version)

        if partition_on:
            mp = mp.partition_on(partition_on)

        if secondary_indices:
            mp = mp.build_indices(secondary_indices)

        # Store dataframe, thereby clearing up the dataframe from the `mp` metapartition
        mp = mp.store_dataframes(
            store=store, dataset_uuid=dataset_uuid, df_serializer=df_serializer
        )

        # Add `kartothek.io_components.metapartition.MetaPartition` object to list
        # to track partitions
        new_partitions.append(mp)

    # Store metadata and return `kartothek.DatasetMetadata` object
    return store_dataset_from_partitions(
        partition_list=new_partitions,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_storage_format=metadata_storage_format,
    )

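# --- Minimal usage sketch for store_dataframes_as_dataset__iter (defined above). ---
# Everything below is illustrative: the in-memory simplekv DictStore stands in for
# the storefact-backed stores kartothek is normally used with, and the dataset
# UUID, generator, and column names are made-up assumptions, not part of the
# original source.

import pandas as pd
from simplekv.memory import DictStore


def _example_iterative_store():
    store = DictStore()  # hypothetical in-memory stand-in for a real store

    def generate_chunks():
        # Yield one DataFrame per partition; in practice these would be read lazily
        # (e.g. chunk-wise from a database or large file).
        for day in range(3):
            yield pd.DataFrame({"day": [day] * 4, "value": range(4)})

    dataset = store_dataframes_as_dataset__iter(
        df_generator=generate_chunks(),
        store=store,
        dataset_uuid="example_uuid",  # hypothetical UUID
        partition_on=["day"],  # split each chunk by the "day" column
        metadata={"source": "sketch"},
    )
    # The returned DatasetMetadata lists one partition per stored chunk/value of "day".
    return sorted(dataset.partitions)
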
def _multiplex_store_dataset_from_partitions_flat(
    mpss, cube, metadata, update, store, existing_datasets
):
    # Regroup the multiplexed metapartitions by their ktk_cube dataset id.
    dct = defaultdict(list)
    for sublist in mpss:
        for mp in sublist:
            for k, v in mp.items():
                dct[k].append(v)

    result = {}
    for k, v in dct.items():
        if update:
            ds_factory = metadata_factory_from_dataset(
                existing_datasets[k], with_schema=True, store=store
            )
            result[k] = update_dataset_from_partitions(
                v,
                dataset_uuid=cube.ktk_dataset_uuid(k),
                delete_scope=[],
                ds_factory=ds_factory,
                metadata=metadata[k],
                metadata_merger=None,
                store_factory=store,
            )
        else:
            result[k] = store_dataset_from_partitions(
                v,
                dataset_metadata=metadata[k],
                dataset_uuid=cube.ktk_dataset_uuid(k),
                metadata_merger=None,
                metadata_storage_format=KTK_CUBE_METADATA_STORAGE_FORMAT,
                store=store,
            )

    # list required for dask.bag
    return [result]

def dataset_function(meta_partitions_files_only_function, store):
    """
    Create a proper kartothek dataset in store with two partitions
    """
    with cm_frozen_time(TIME_TO_FREEZE):
        return store_dataset_from_partitions(
            partition_list=meta_partitions_files_only_function,
            dataset_uuid="dataset_uuid",
            store=store,
            dataset_metadata={"dataset": "metadata"},
        )

def dataset_alternative_table_name(
    meta_partitions_files_only_alternative_table_name, store_factory
):
    """
    Create a proper kartothek dataset in store with two partitions
    """
    with cm_frozen_time(TIME_TO_FREEZE):
        return store_dataset_from_partitions(
            partition_list=meta_partitions_files_only_alternative_table_name,
            dataset_uuid="dataset_uuid_alternative_name",
            store=store_factory(),
            dataset_metadata={"dataset": "metadata"},
        )

def store_dataframes_as_dataset(
    store,
    dataset_uuid,
    dfs,
    metadata=None,
    partition_on=None,
    df_serializer=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
):
    """
    Utility function to store a list of dataframes as a partitioned dataset
    with multiple tables (files).

    Useful for very small datasets where all data fits into memory.

    Parameters
    ----------
    dfs : dict of pd.DataFrame or pd.DataFrame
        The dataframe(s) to be stored. If only a single dataframe is passed,
        it will be stored as the `core` table.

    Returns
    -------
    The stored dataset
    """
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if isinstance(dfs, dict):
        dfs = {"data": [(table, df) for table, df in dfs.items()]}

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    mp = parse_input_to_metapartition(dfs, metadata_version)

    if partition_on:
        mp = MetaPartition.partition_on(mp, partition_on)

    mps = mp.store_dataframes(
        store=store, dataset_uuid=dataset_uuid, df_serializer=df_serializer
    )

    return store_dataset_from_partitions(
        partition_list=mps,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_storage_format=metadata_storage_format,
    )

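# --- Minimal usage sketch for store_dataframes_as_dataset (defined above). ---
# A hedged, illustrative call showing the dict-of-tables input and partition_on:
# the simplekv DictStore, the dataset UUID, the table names ("core"/"aux") and the
# column names are assumptions made for this sketch, not taken from the original.

import pandas as pd
from simplekv.memory import DictStore


def _example_small_dataset():
    store = DictStore()  # hypothetical in-memory stand-in for a real store

    core = pd.DataFrame({"country": ["DE", "DE", "US"], "value": [1, 2, 3]})
    aux = pd.DataFrame({"country": ["DE", "US"], "note": ["a", "b"]})

    dataset = store_dataframes_as_dataset(
        store=store,
        dataset_uuid="small_example_uuid",  # hypothetical UUID
        dfs={"core": core, "aux": aux},  # one entry per table of the dataset
        partition_on=["country"],  # both tables are split by the "country" column
        metadata={"source": "sketch"},
    )
    # One partition per distinct "country" value; each partition carries both tables.
    return dataset.uuid, sorted(dataset.partitions)
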
def dataset_partition_keys(meta_partitions_dataframe, store_session_factory):
    """
    Create a proper kartothek dataset in store with two partitions
    """
    with cm_frozen_time(TIME_TO_FREEZE):
        new_mps = []
        for mp in meta_partitions_dataframe:
            new_mps.append(mp.partition_on(["P"]))
        new_mps = _store_metapartitions(new_mps, store_session_factory())
        return store_dataset_from_partitions(
            partition_list=new_mps,
            dataset_uuid="dataset_uuid_partition_keys",
            store=store_session_factory(),
            dataset_metadata={"dataset": "metadata"},
        )

def _store_dataset_from_partitions_flat(mpss, *args, **kwargs):
    # Flatten the nested lists of metapartitions before committing the dataset.
    return store_dataset_from_partitions(
        [mp for sublist in mpss for mp in sublist], *args, **kwargs
    )

def test_store_dataset_from_partitions_update(store, metadata_version, frozen_time):
    mp1 = MetaPartition(
        label="cluster_1",
        data=pd.DataFrame({"p": [1]}),
        file="1.parquet",
        indices={"p": ExplicitSecondaryIndex("p", index_dct={1: ["cluster_1"]})},
        metadata_version=metadata_version,
    )
    mp2 = MetaPartition(
        label="cluster_2",
        data=pd.DataFrame({"p": [2]}),
        file="2.parquet",
        indices={"p": ExplicitSecondaryIndex("p", index_dct={2: ["cluster_2"]})},
        metadata_version=metadata_version,
    )
    dataset = store_dataset_from_partitions(
        partition_list=[mp1, mp2],
        dataset_uuid="dataset_uuid",
        store=store,
        dataset_metadata={"dataset": "metadata"},
    )
    dataset = dataset.load_index("p", store)

    mp3 = MetaPartition(
        label="cluster_3",
        data=pd.DataFrame({"p": [3]}),
        file="3.parquet",
        indices={"p": ExplicitSecondaryIndex("p", index_dct={3: ["cluster_3"]})},
        metadata_version=metadata_version,
    )

    dataset_updated = store_dataset_from_partitions(
        partition_list=[mp3],
        dataset_uuid="dataset_uuid",
        store=store,
        dataset_metadata={"extra": "metadata"},
        update_dataset=dataset,
        remove_partitions=["cluster_1"],
    )
    dataset_updated = dataset_updated.load_index("p", store)

    expected_metadata = {"dataset": "metadata", "extra": "metadata"}
    expected_metadata["creation_time"] = TIME_TO_FREEZE_ISO

    assert dataset_updated.metadata == expected_metadata
    assert list(dataset.partitions) == ["cluster_1", "cluster_2"]
    assert list(dataset_updated.partitions) == ["cluster_2", "cluster_3"]
    assert dataset_updated.partitions["cluster_3"] == mp3.partition
    assert dataset_updated.uuid == "dataset_uuid"

    store_files = list(store.keys())
    # 1 dataset metadata file and 1 index file
    # note: the update writes a new index file but due to frozen_time this gets
    # the same name as the previous one and overwrites it.
    expected_number_files = 2
    # common metadata for v4 datasets (1 table)
    expected_number_files += 1
    assert len(store_files) == expected_number_files

    assert dataset.indices["p"].index_dct == {1: ["cluster_1"], 2: ["cluster_2"]}
    assert dataset_updated.indices["p"].index_dct == {
        2: ["cluster_2"],
        3: ["cluster_3"],
    }

    # Ensure the dataset can be loaded properly
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    stored_dataset = stored_dataset.load_index("p", store)
    assert dataset_updated == stored_dataset

def _commit_store_from_reduction(df_mps, **kwargs):
    # Collect the non-null metapartitions from the reduction result frame
    # before committing them as a single dataset.
    partitions = pd.Series(df_mps.values.flatten()).dropna()
    return store_dataset_from_partitions(
        partition_list=partitions,
        **kwargs,
    )