def test_align_datasets_exact(dataset, evaluation_dataset, store_session):
    """Exact matching: mismatched labels raise, a self-join pairs each partition."""
    # The two fixture datasets have different label sets, so an exact
    # match cannot be established -> RuntimeError on consumption.
    with pytest.raises(RuntimeError):
        list(
            align_datasets(
                left_dataset_uuid=dataset.uuid,
                right_dataset_uuid=evaluation_dataset.uuid,
                store=store_session,
                match_how="exact",
            )
        )

    # Aligning a dataset with itself must pair every partition with its twin.
    aligned = align_datasets(
        left_dataset_uuid=dataset.uuid,
        right_dataset_uuid=dataset.uuid,
        store=store_session,
        match_how="exact",
    )
    assert isinstance(aligned, types.GeneratorType)

    groups = list(aligned)
    # Two separate cluster_groups (e.g. cluster_1*)
    assert len(groups) == 2
    for group, expected_label in zip(groups, ["cluster_1", "cluster_2"]):
        assert len(group) == 2, [mp.label for mp in group]
        assert [mp.label for mp in group] == [expected_label, expected_label]
def test_align_datasets_prefix(dataset, evaluation_dataset, store_session):
    """Prefix matching groups partitions by shared label prefix, regardless of order."""
    aligned = align_datasets(
        left_dataset_uuid=dataset.uuid,
        right_dataset_uuid=evaluation_dataset.uuid,
        store=store_session,
        match_how="prefix",
    )
    assert isinstance(aligned, types.GeneratorType)

    groups = list(aligned)
    # Two separate cluster_groups (e.g. cluster_1*)
    assert len(groups) == 2
    for group in groups[:2]:
        assert len(group) == 3, [mp.label for mp in group]

    # Test sorting of datasets by length, i.e. order of dataframes is different
    aligned = align_datasets(
        left_dataset_uuid=evaluation_dataset.uuid,
        right_dataset_uuid=dataset.uuid,
        store=store_session,
        match_how="prefix",
    )
    groups = list(aligned)
    group = groups[0]
def test_align_datasets_prefix__equal_number_of_partitions(
    dataset, evaluation_dataset, store_session
):
    """
    Test a scenario where the simple prefix match algorithm didn't find any
    matches in case of equal number of partitions in both datasets.
    """
    # Create a reference dataset which matches the problem (equal number of
    # partitions and suitable for prefix matching)
    reference_mps = [
        MetaPartition(label="cluster_1_1", metadata_version=dataset.metadata_version),
        MetaPartition(label="cluster_2_1", metadata_version=dataset.metadata_version),
    ]
    store_dataset_from_partitions(
        partition_list=reference_mps,
        dataset_uuid="reference_dataset_uuid",
        store=store_session,
    )

    aligned = align_datasets(
        left_dataset_uuid=dataset.uuid,
        right_dataset_uuid="reference_dataset_uuid",
        store=store_session,
        match_how="prefix",
    )
    assert isinstance(aligned, types.GeneratorType)

    groups = list(aligned)
    # Two separate cluster_groups (e.g. cluster_1*)
    assert len(groups) == 2
    for group in groups[:2]:
        assert len(group) == 2

    # Test sorting of datasets by length, i.e. order of dataframes is different
    aligned = align_datasets(
        left_dataset_uuid=evaluation_dataset.uuid,
        right_dataset_uuid=dataset.uuid,
        store=store_session,
        match_how="prefix",
    )
    groups = list(aligned)
    group = groups[0]
def test_align_datasets_right(dataset, evaluation_dataset, store_session):
    """Right matching: every right-side partition is joined with all left partitions."""
    aligned = align_datasets(
        left_dataset_uuid=dataset.uuid,
        right_dataset_uuid=evaluation_dataset.uuid,
        store=store_session,
        match_how="right",
    )
    assert isinstance(aligned, types.GeneratorType)

    groups = list(aligned)
    # One group per right-side (evaluation) partition.
    assert len(groups) == len(evaluation_dataset.partitions)

    # Each group starts with one evaluation partition followed by all
    # partitions of the left dataset.
    expected_per_group = [
        ["cluster_1_1", "cluster_1", "cluster_2"],
        ["cluster_1_2", "cluster_1", "cluster_2"],
        ["cluster_2_1", "cluster_1", "cluster_2"],
        ["cluster_2_2", "cluster_1", "cluster_2"],
    ]
    for idx, expected in enumerate(expected_per_group):
        group = groups[idx]
        assert len(group) == 3, [mp.label for mp in group]
        assert [mp.label for mp in group] == expected
def merge_datasets_as_delayed(
    left_dataset_uuid,
    right_dataset_uuid,
    store,
    merge_tasks,
    match_how="exact",
    label_merger=None,
    metadata_merger=None,
):
    """
    A dask.delayed graph to perform the merge of two full kartothek datasets.

    Parameters
    ----------
    left_dataset_uuid : str
        UUID for left dataset (order does not matter in all merge schemas)
    right_dataset_uuid : str
        UUID for right dataset (order does not matter in all merge schemas)
    store : Callable
        Factory returning the KV store used to read both datasets and to
        write the merged result; must be callable (validated below).
    merge_tasks : List[Dict]
        A list of merge tasks. Each item in this list is a dictionary giving
        explicit instructions for a specific merge.

        Each dict should contain key/values:

        * `left`: The table for the left dataframe
        * `right`: The table for the right dataframe
        * 'output_label' : The table for the merged dataframe
        * `merge_func`: A callable with signature
          `merge_func(left_df, right_df, merge_kwargs)` to handle the data
          preprocessing and merging. Default pandas.merge
        * 'merge_kwargs' : The kwargs to be passed to the `merge_func`

        Example:

        .. code::

            >>> merge_tasks = [
            ...     {
            ...         "left": "left_dict",
            ...         "right": "right_dict",
            ...         "merge_kwargs": {"kwargs of merge_func": ''},
            ...         "output_label": 'merged_core_data'
            ...     },
            ... ]

    match_how : Union[str, Callable]
        Define the partition label matching scheme.
        Available implementations are:

        * left (right) : The left (right) partitions are considered to be
          the base partitions and **all** partitions of the right (left)
          dataset are joined to the left partition. This should only be
          used if one of the datasets contain very few partitions.
        * prefix : The labels of the partitions of the dataset with fewer
          partitions are considered to be the prefixes to the right dataset
        * exact : All partition labels of the left dataset need to have an
          exact match in the right dataset
        * callable : A callable with signature func(left, right) which
          returns a boolean to determine if the partitions match

    label_merger : Optional[Callable]
        Callable used to derive the label of a merged partition from the
        labels of its inputs (passed through to the merge step).
    metadata_merger : Optional[Callable]
        Callable used to combine the metadata of the matched partitions
        (passed through to the merge step).

    Returns
    -------
    The delayed objects produced by mapping the load-and-merge step over
    the aligned metapartition groups.
    """
    # Fail fast on a non-callable store before building the graph.
    _check_callable(store)

    # Pair up the partitions of both datasets according to `match_how`.
    mps = align_datasets(
        left_dataset_uuid=left_dataset_uuid,
        right_dataset_uuid=right_dataset_uuid,
        store=store,
        match_how=match_how,
    )
    # Wrap each aligned group in a delayed task that loads and merges it.
    mps = map_delayed(
        mps,
        _load_and_merge_mps,
        store=store,
        label_merger=label_merger,
        metadata_merger=metadata_merger,
        merge_tasks=merge_tasks,
    )
    return mps