Example #1
def time_store_dataset_from_partitions(self, num_partitions, max_depth, num_leafs):
    store_dataset_from_partitions(
        partition_list=self.partitions,
        store=self.store,
        dataset_uuid=self.dataset_uuid,
        dataset_metadata=self.user_dataset_metadata,
    )
Example #2
def test_collect_dataset_metadata_empty_dataset_mp(store_factory):
    mp = MetaPartition(label="cluster_1")
    store_dataset_from_partitions(partition_list=[mp],
                                  store=store_factory,
                                  dataset_uuid="dataset_uuid")

    df_stats = collect_dataset_metadata(store=store_factory,
                                        dataset_uuid="dataset_uuid",
                                        table_name="table").compute()

    expected = pd.DataFrame(columns=_METADATA_SCHEMA.keys())
    expected = expected.astype(_METADATA_SCHEMA)
    pd.testing.assert_frame_equal(expected, df_stats, check_index_type=False)
Example #3
def test_align_datasets_prefix__equal_number_of_partitions(
        dataset, evaluation_dataset, store_session):
    """
    Test a scenario where the simple prefix match algorithm previously did
    not find any matches when both datasets have an equal number of
    partitions.
    """

    # Create a reference dataset which matches the problem (equal number of
    # partitions and suitable for prefix matching)
    mp = MetaPartition(label="cluster_1_1",
                       metadata_version=dataset.metadata_version)
    mp2 = MetaPartition(label="cluster_2_1",
                        metadata_version=dataset.metadata_version)
    metapartitions = [mp, mp2]
    store_dataset_from_partitions(
        partition_list=metapartitions,
        dataset_uuid="reference_dataset_uuid",
        store=store_session,
    )

    generator = align_datasets(
        left_dataset_uuid=dataset.uuid,
        right_dataset_uuid="reference_dataset_uuid",
        store=store_session,
        match_how="prefix",
    )
    assert isinstance(generator, types.GeneratorType)
    list_metapartitions = list(generator)

    # Two separate cluster_groups (e.g. cluster_1*)
    assert len(list_metapartitions) == 2

    mp_list = list_metapartitions[0]

    assert len(mp_list) == 2

    mp_list = list_metapartitions[1]
    assert len(mp_list) == 2

    # Test sorting of datasets by length, i.e. order of dataframes is different
    generator = align_datasets(
        left_dataset_uuid=evaluation_dataset.uuid,
        right_dataset_uuid=dataset.uuid,
        store=store_session,
        match_how="prefix",
    )
    list_metapartitions = list(generator)
    mp_list = list_metapartitions[0]
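
For orientation, the "prefix" match used above groups a label such as cluster_1_1 with the partition whose label (cluster_1) is its prefix, which yields the two groups of two metapartitions the test asserts. A minimal standalone sketch of that grouping idea (not kartothek's actual align_datasets implementation; the helper name is made up):

def group_by_prefix(short_labels, long_labels):
    # Illustration only: group each longer label under the shorter label
    # that is its prefix, keeping the shorter label in the group as well.
    groups = {}
    for short in sorted(short_labels):
        groups[short] = [short] + [
            label for label in sorted(long_labels) if label.startswith(short)
        ]
    return groups


print(group_by_prefix(["cluster_1", "cluster_2"], ["cluster_1_1", "cluster_2_1"]))
# {'cluster_1': ['cluster_1', 'cluster_1_1'], 'cluster_2': ['cluster_2', 'cluster_2_1']}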
Example #4
def test_store_dataset_from_partitions(meta_partitions_files_only, store,
                                       frozen_time):
    dataset = store_dataset_from_partitions(
        partition_list=meta_partitions_files_only,
        dataset_uuid="dataset_uuid",
        store=store,
        dataset_metadata={"some": "metadata"},
    )

    expected_metadata = {
        "some": "metadata",
        "creation_time": TIME_TO_FREEZE_ISO
    }

    assert dataset.metadata == expected_metadata
    assert sorted(dataset.partitions.values(),
                  key=lambda x: x.label) == sorted(
                      [mp.partition for mp in meta_partitions_files_only],
                      key=lambda x: x.label)
    assert dataset.uuid == "dataset_uuid"

    store_files = list(store.keys())
    # Dataset metadata: 1 file
    expected_number_files = 1
    # common metadata for v4 datasets
    expected_number_files += 1
    assert len(store_files) == expected_number_files

    # Ensure the dataset can be loaded properly
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    assert dataset == stored_dataset
Example #5
def evaluation_dataset(meta_partitions_evaluation_files_only, store_session):
    with cm_frozen_time(TIME_TO_FREEZE):
        return store_dataset_from_partitions(
            partition_list=meta_partitions_evaluation_files_only,
            dataset_uuid="evaluation_uuid",
            store=store_session,
        )
Example #6
def update_dataset_from_partitions(
    partition_list,
    store_factory,
    dataset_uuid,
    ds_factory,
    delete_scope,
    metadata,
    metadata_merger,
):
    store = ensure_store(store_factory)

    if ds_factory:
        ds_factory = ds_factory.load_all_indices()
        remove_partitions = _get_partitions(ds_factory, delete_scope)

        index_columns = list(ds_factory.indices.keys())
        for column in index_columns:
            index = ds_factory.indices[column]
            if isinstance(index, PartitionIndex):
                del ds_factory.indices[column]
    else:
        # Dataset does not exist yet.
        remove_partitions = []

    new_dataset = store_dataset_from_partitions(
        partition_list=partition_list,
        store=store,
        dataset_uuid=dataset_uuid,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        update_dataset=ds_factory,
        remove_partitions=remove_partitions,
    )

    return new_dataset
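
A hedged usage sketch for the helper above: seed a small dataset, persist one additional metapartition, and commit it through update_dataset_from_partitions. The store setup, the DatasetFactory construction, and the module paths in the imports are assumptions for illustration and should be checked against the installed kartothek version:

from functools import partial
from tempfile import mkdtemp

import pandas as pd
from storefact import get_store_from_url
from kartothek.core.factory import DatasetFactory  # import path assumed
from kartothek.io.eager import store_dataframes_as_dataset
from kartothek.io_components.metapartition import parse_input_to_metapartition
from kartothek.io_components.write import update_dataset_from_partitions  # import path assumed

# Filesystem-backed store factory so that repeated factory calls see the same
# data (a plain in-memory store would be recreated on every call).
store_factory = partial(get_store_from_url, "hfs://" + mkdtemp())

# Seed an initial dataset so there is something to update.
store_dataframes_as_dataset(
    store=store_factory(),
    dataset_uuid="dataset_uuid",
    dfs=pd.DataFrame({"P": [1], "x": [10]}),
)

# Build and persist the metapartition that should be added.
mp = parse_input_to_metapartition(pd.DataFrame({"P": [2], "x": [20]}),
                                  metadata_version=4)
mp = mp.store_dataframes(store=store_factory(), dataset_uuid="dataset_uuid")

ds_factory = DatasetFactory(dataset_uuid="dataset_uuid",
                            store_factory=store_factory)

updated = update_dataset_from_partitions(
    partition_list=[mp],
    store_factory=store_factory,
    dataset_uuid="dataset_uuid",
    ds_factory=ds_factory,
    delete_scope=[],              # nothing removed in this sketch
    metadata={"updated": "yes"},
    metadata_merger=None,
)
print(sorted(updated.partitions))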
Example #7
def store_dataframes_as_dataset__iter(
    df_generator,
    store,
    dataset_uuid=None,
    metadata=None,
    partition_on=None,
    df_serializer=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
    secondary_indices=None,
):
    """
    Store `pd.DataFrame`s iteratively as a partitioned dataset with multiple tables (files).

    Useful for datasets which do not fit into memory.

    Parameters
    ----------

    Returns
    -------
    dataset: kartothek.core.dataset.DatasetMetadata
        The stored dataset.

    """

    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    new_partitions = []
    for df in df_generator:
        mp = parse_input_to_metapartition(df,
                                          metadata_version=metadata_version)

        if partition_on:
            mp = mp.partition_on(partition_on)

        if secondary_indices:
            mp = mp.build_indices(secondary_indices)

        # Store dataframe, thereby clearing up the dataframe from the `mp` metapartition
        mp = mp.store_dataframes(store=store,
                                 dataset_uuid=dataset_uuid,
                                 df_serializer=df_serializer)

        # Add `kartothek.io_components.metapartition.MetaPartition` object to list to track partitions
        new_partitions.append(mp)

    # Store metadata and return `kartothek.DatasetMetadata` object
    return store_dataset_from_partitions(
        partition_list=new_partitions,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_storage_format=metadata_storage_format,
    )
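
A possible way to drive the iterator variant defined above, shown as a sketch: the in-memory store from storefact and the generator contents are assumptions, not part of the original example:

import pandas as pd
from storefact import get_store_from_url  # assumed store backend for this sketch


def df_chunks():
    # Yield a few small DataFrames; in practice these would be produced
    # lazily, e.g. while reading a large file chunk by chunk.
    for day in range(3):
        yield pd.DataFrame({"day": [day, day], "value": [1, 2]})


store = get_store_from_url("hmemory://")  # a single in-memory store instance
dataset = store_dataframes_as_dataset__iter(
    df_chunks(),
    store=store,
    dataset_uuid="daily_values",
    partition_on=["day"],
)
print(sorted(dataset.partitions))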
Example #8
def _multiplex_store_dataset_from_partitions_flat(mpss, cube, metadata, update,
                                                  store, existing_datasets):
    dct = defaultdict(list)
    for sublist in mpss:
        for mp in sublist:
            for k, v in mp.items():
                dct[k].append(v)

    result = {}
    for k, v in dct.items():
        if update:
            ds_factory = metadata_factory_from_dataset(existing_datasets[k],
                                                       with_schema=True,
                                                       store=store)
            result[k] = update_dataset_from_partitions(
                v,
                dataset_uuid=cube.ktk_dataset_uuid(k),
                delete_scope=[],
                ds_factory=ds_factory,
                metadata=metadata[k],
                metadata_merger=None,
                store_factory=store,
            )
        else:
            result[k] = store_dataset_from_partitions(
                v,
                dataset_metadata=metadata[k],
                dataset_uuid=cube.ktk_dataset_uuid(k),
                metadata_merger=None,
                metadata_storage_format=KTK_CUBE_METADATA_STORAGE_FORMAT,
                store=store,
            )

    # list required for dask.bag
    return [result]
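
The reshaping at the top of this helper is easy to miss: as the nested loops suggest, mpss is a list of lists of dicts that map a dataset id to a metapartition, and the defaultdict regroups them into one list of metapartitions per dataset id. A tiny standalone illustration, with strings standing in for MetaPartition objects and made-up dataset ids:

from collections import defaultdict

mpss = [
    [{"seed": "mp_a", "enrich": "mp_b"}],
    [{"seed": "mp_c"}],
]

dct = defaultdict(list)
for sublist in mpss:
    for mp in sublist:
        for k, v in mp.items():
            dct[k].append(v)

print(dict(dct))
# {'seed': ['mp_a', 'mp_c'], 'enrich': ['mp_b']}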
Example #9
def dataset_function(meta_partitions_files_only_function, store):
    """
    Create a proper kartothek dataset in store with two partitions

    """
    with cm_frozen_time(TIME_TO_FREEZE):
        return store_dataset_from_partitions(
            partition_list=meta_partitions_files_only_function,
            dataset_uuid="dataset_uuid",
            store=store,
            dataset_metadata={"dataset": "metadata"},
        )
Example #10
def dataset_alternative_table_name(
        meta_partitions_files_only_alternative_table_name, store_factory):
    """
    Create a proper kartothek dataset in store with two partitions
    """
    with cm_frozen_time(TIME_TO_FREEZE):
        return store_dataset_from_partitions(
            partition_list=meta_partitions_files_only_alternative_table_name,
            dataset_uuid="dataset_uuid_alternative_name",
            store=store_factory(),
            dataset_metadata={"dataset": "metadata"},
        )
Example #11
def store_dataframes_as_dataset(
    store,
    dataset_uuid,
    dfs,
    metadata=None,
    partition_on=None,
    df_serializer=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
):
    """
    Utility function to store a list of dataframes as a partitioned dataset with multiple tables (files).

    Useful for very small datasets where all data fits into memory.

    Parameters
    ----------
    dfs : dict of pd.DataFrame or pd.DataFrame
        The dataframe(s) to be stored. If only a single dataframe is passed, it will be stored as the `core` table.

    Returns
    -------
    The stored dataset

    """
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if isinstance(dfs, dict):
        dfs = {"data": [(table, df) for table, df in dfs.items()]}

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    mp = parse_input_to_metapartition(dfs, metadata_version)

    if partition_on:
        mp = mp.partition_on(partition_on)

    mps = mp.store_dataframes(store=store,
                              dataset_uuid=dataset_uuid,
                              df_serializer=df_serializer)

    return store_dataset_from_partitions(
        partition_list=mps,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_storage_format=metadata_storage_format,
    )
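
A hedged usage sketch for the eager variant above: two tables written in one call and partitioned on a shared column. The storefact in-memory store and the table/column names are illustrative assumptions:

import pandas as pd
from storefact import get_store_from_url  # assumed store backend for this sketch

store = get_store_from_url("hmemory://")

core = pd.DataFrame({"country": ["DE", "US"], "value": [1, 2]})
extra = pd.DataFrame({"country": ["DE", "US"], "note": ["a", "b"]})

dataset = store_dataframes_as_dataset(
    store=store,
    dataset_uuid="countries",
    dfs={"core": core, "extra": extra},
    partition_on=["country"],
    metadata={"source": "sketch"},
)
print(dataset.uuid, sorted(dataset.partitions))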
Example #12
def dataset_partition_keys(meta_partitions_dataframe, store_session_factory):
    """
    Create a proper kartothek dataset in store with two partitions

    """
    with cm_frozen_time(TIME_TO_FREEZE):
        new_mps = []
        for mp in meta_partitions_dataframe:
            new_mps.append(mp.partition_on(["P"]))
        new_mps = _store_metapartitions(new_mps, store_session_factory())

        return store_dataset_from_partitions(
            partition_list=new_mps,
            dataset_uuid="dataset_uuid_partition_keys",
            store=store_session_factory(),
            dataset_metadata={"dataset": "metadata"},
        )
Example #13
def _store_dataset_from_partitions_flat(mpss, *args, **kwargs):
    return store_dataset_from_partitions(
        [mp for sublist in mpss for mp in sublist], *args, **kwargs)
Example #14
def test_store_dataset_from_partitions_update(store, metadata_version,
                                              frozen_time):
    mp1 = MetaPartition(
        label="cluster_1",
        data=pd.DataFrame({"p": [1]}),
        file="1.parquet",
        indices={
            "p": ExplicitSecondaryIndex("p", index_dct={1: ["cluster_1"]})
        },
        metadata_version=metadata_version,
    )
    mp2 = MetaPartition(
        label="cluster_2",
        data=pd.DataFrame({"p": [2]}),
        file="2.parquet",
        indices={
            "p": ExplicitSecondaryIndex("p", index_dct={2: ["cluster_2"]})
        },
        metadata_version=metadata_version,
    )
    dataset = store_dataset_from_partitions(
        partition_list=[mp1, mp2],
        dataset_uuid="dataset_uuid",
        store=store,
        dataset_metadata={"dataset": "metadata"},
    )
    dataset = dataset.load_index("p", store)

    mp3 = MetaPartition(
        label="cluster_3",
        data=pd.DataFrame({"p": [3]}),
        file="3.parquet",
        indices={
            "p": ExplicitSecondaryIndex("p", index_dct={3: ["cluster_3"]})
        },
        metadata_version=metadata_version,
    )

    dataset_updated = store_dataset_from_partitions(
        partition_list=[mp3],
        dataset_uuid="dataset_uuid",
        store=store,
        dataset_metadata={"extra": "metadata"},
        update_dataset=dataset,
        remove_partitions=["cluster_1"],
    )
    dataset_updated = dataset_updated.load_index("p", store)
    expected_metadata = {"dataset": "metadata", "extra": "metadata"}

    expected_metadata["creation_time"] = TIME_TO_FREEZE_ISO

    assert dataset_updated.metadata == expected_metadata
    assert list(dataset.partitions) == ["cluster_1", "cluster_2"]
    assert list(dataset_updated.partitions) == ["cluster_2", "cluster_3"]
    assert dataset_updated.partitions["cluster_3"] == mp3.partition
    assert dataset_updated.uuid == "dataset_uuid"

    store_files = list(store.keys())
    # 1 dataset metadata file and 1 index file
    # note: the update writes a new index file but due to frozen_time this gets
    # the same name as the previous one and overwrites it.
    expected_number_files = 2
    # common metadata for v4 datasets (1 table)
    expected_number_files += 1
    assert len(store_files) == expected_number_files

    assert dataset.indices["p"].index_dct == {
        1: ["cluster_1"],
        2: ["cluster_2"]
    }
    assert dataset_updated.indices["p"].index_dct == {
        2: ["cluster_2"],
        3: ["cluster_3"],
    }

    # Ensure the dataset can be loaded properly
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    stored_dataset = stored_dataset.load_index("p", store)
    assert dataset_updated == stored_dataset
Example #15
def _commit_store_from_reduction(df_mps, **kwargs):
    partitions = pd.Series(df_mps.values.flatten()).dropna()
    return store_dataset_from_partitions(
        partition_list=partitions,
        **kwargs,
    )
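
The flatten/dropna step above turns the two-dimensional result of a reduction into a flat partition list, dropping empty cells (presumably None/NaN padding) before committing. A small pandas-only illustration with placeholder strings instead of MetaPartition objects:

import pandas as pd

# Placeholder strings standing in for MetaPartition objects; one empty cell.
df_mps = pd.DataFrame([["mp_a", "mp_b"], ["mp_c", None]])

partitions = pd.Series(df_mps.values.flatten()).dropna()
print(list(partitions))
# ['mp_a', 'mp_b', 'mp_c']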