Example #1
File: discover.py  Project: lr4d/kartothek
def discover_datasets_unchecked(
    uuid_prefix: str,
    store: Union[Callable[[], KeyValueStore], KeyValueStore],
    filter_ktk_cube_dataset_ids: Optional[Union[str, Iterable[str]]] = None,
) -> Dict[str, DatasetMetadata]:
    """
    Get all known datasets that may belong to a given cube w/o applying any checks.

    .. warning::
        The results are not checked for validity. Found datasets may be incompatible w/ the given cube. Use
        :meth:`check_datasets` to check the results, or go for :meth:`discover_datasets` in the first place.

    Parameters
    ----------
    uuid_prefix
        Dataset UUID prefix.
    store
        KV store.
    filter_ktk_cube_dataset_ids
        Optional selection of datasets to include.

    Returns
    -------
    datasets: Dict[str, DatasetMetadata]
        All discovered datasets. Empty Dict if no dataset is found
    """
    if callable(store):
        store = store()
    filter_ktk_cube_dataset_ids = converter_str_set_optional(
        filter_ktk_cube_dataset_ids)
    prefix = uuid_prefix + KTK_CUBE_UUID_SEPERATOR

    names = _discover_dataset_meta_files(prefix, store)

    if filter_ktk_cube_dataset_ids is not None:
        names = {
            name
            for name in names
            if name[len(prefix):] in filter_ktk_cube_dataset_ids
        }

    result = {}
    # sorted iteration for deterministic error messages in case DatasetMetadata.load_from_store fails
    for name in sorted(names):
        try:
            result[name[len(prefix):]] = DatasetMetadata.load_from_store(
                uuid=name,
                store=store,
                load_schema=True,
                load_all_indices=False)
        except KeyError as e:
            _logger.warning(
                'Ignore dataset "{name}" due to KeyError: {e}'.format(
                    name=name, e=e))

    return result
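
A minimal usage sketch for the function above (not taken from the project; the import path and the in-memory store are assumptions):

# Usage sketch -- assumptions: the import path below matches your kartothek
# version (adjust it to the discover module shown above), and simplekv's
# in-memory DictStore is available.
from simplekv.memory import DictStore

from kartothek.api.discover import discover_datasets_unchecked  # assumed path

store = DictStore()  # empty in-memory KV store

# With no datasets written yet, discovery simply returns an empty dict.
assert discover_datasets_unchecked("my_cube", store) == {}

# A zero-argument store factory and a filter (str or iterable of str) are
# accepted as well.
assert discover_datasets_unchecked(
    "my_cube", lambda: store, filter_ktk_cube_dataset_ids=["seed", "enrich"]
) == {}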
Example #2
def discover_datasets(
    cube: Cube,
    store: StoreInput,
    filter_ktk_cube_dataset_ids: Optional[Union[str, Iterable[str]]] = None,
) -> Dict[str, DatasetMetadata]:
    """
    Get all known datasets that belong to a given cube.

    Parameters
    ----------
    cube
        Cube specification.
    store
        KV store.
    filter_ktk_cube_dataset_ids
        Optional selection of datasets to include.

    Returns
    -------
    datasets: Dict[str, DatasetMetadata]
        All discovered datasets.

    Raises
    ------
    ValueError
        In case no valid cube could be discovered.
    """
    filter_ktk_cube_dataset_ids = converter_str_set_optional(
        filter_ktk_cube_dataset_ids
    )
    result = discover_datasets_unchecked(
        cube.uuid_prefix, store, filter_ktk_cube_dataset_ids
    )
    if filter_ktk_cube_dataset_ids is not None:
        if isinstance(filter_ktk_cube_dataset_ids, str):
            filter_ktk_cube_dataset_ids = {filter_ktk_cube_dataset_ids}
        else:
            filter_ktk_cube_dataset_ids = set(filter_ktk_cube_dataset_ids)
        missing = filter_ktk_cube_dataset_ids - set(result.keys())
        if missing:
            raise ValueError(
                "Could not find the following requested datasets: {missing}".format(
                    missing=", ".join(sorted(missing))
                )
            )
    check_datasets(result, cube)

    return result
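
A sketch of the failure mode documented above (an assumption, not from the project): requesting a dataset id that the cube does not contain surfaces as the ValueError built from the ``missing`` set.

# Sketch -- `cube` and `store` are assumed to describe an already-written cube
# that contains a "seed" dataset but no "enrich" dataset.
try:
    discover_datasets(cube, store, filter_ktk_cube_dataset_ids="enrich")
except ValueError as exc:
    # e.g. "Could not find the following requested datasets: enrich"
    print("discovery failed: {}".format(exc))

# A single str is accepted here because converter_str_set_optional (see the
# test in example #5) normalizes it to a frozenset of str before the set
# difference against result.keys().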
Example #3
def discover_datasets(cube, store, filter_ktk_cube_dataset_ids=None):
    """
    Get all known datasets that belong to a given cube.

    Parameters
    ----------
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: Union[Callable[[], simplekv.KeyValueStore], simplekv.KeyValueStore]
        KV store.
    filter_ktk_cube_dataset_ids: Union[None, str, Iterable[str]]
        Optional selection of datasets to include.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        All discovered datasets.

    Raises
    ------
    ValueError
        In case no valid cube could be discovered.
    """
    filter_ktk_cube_dataset_ids = converter_str_set_optional(
        filter_ktk_cube_dataset_ids)
    result = discover_datasets_unchecked(cube.uuid_prefix, store,
                                         filter_ktk_cube_dataset_ids)
    if filter_ktk_cube_dataset_ids is not None:
        missing = filter_ktk_cube_dataset_ids - set(result.keys())
        if missing:
            raise ValueError(
                "Could not find the following requested datasets: {missing}".
                format(missing=", ".join(sorted(missing))))
    check_datasets(result, cube)

    return result
Example #4
def prepare_metapartitions_for_removal_action(cube, store, conditions,
                                              ktk_cube_dataset_ids,
                                              existing_datasets):
    """
    Prepare MetaPartitions to express removal of the given data range from the cube.

    The MetaPartition must still be written using ``mp.store_dataframes(...)`` and added to the Dataset using a
    kartothek update method.

    Parameters
    ----------
    cube: kartothek.core.cube.cube.Cube
        Cube spec.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Store.
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied, optional. Defaults to "entire cube".
    ktk_cube_dataset_ids: Optional[Union[Iterable[str], str]]
        Ktk_cube dataset IDs to apply the remove action to, optional. Defaults to "all".
    existing_datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Existing datasets.

    Returns
    -------
    metapartitions: Dict[str, Tuple[kartothek.core.dataset.DatasetMetadata,
            kartothek.io_components.metapartition.MetaPartition, List[Dict[str, Any]]]]
        MetaPartitions that should be written and used to update the kartothek datasets, as well as the
        ``delete_scope`` for kartothek.
    """
    conditions = Conjunction(conditions)
    conditions_split = conditions.split_by_column()
    if set(conditions_split.keys()) - set(cube.partition_columns):
        raise ValueError(
            "Can only remove partitions with conditions concerning cubes physical partition columns."
        )

    ktk_cube_dataset_ids = converter_str_set_optional(ktk_cube_dataset_ids)
    if ktk_cube_dataset_ids is not None:
        unknown_dataset_ids = ktk_cube_dataset_ids - set(
            existing_datasets.keys())
        if unknown_dataset_ids:
            raise ValueError("Unknown ktk_cube_dataset_ids: {}".format(
                ", ".join(sorted(unknown_dataset_ids))))
    else:
        ktk_cube_dataset_ids = set(existing_datasets.keys())

    metapartitions = {}
    for ktk_cube_dataset_id in ktk_cube_dataset_ids:
        ds = existing_datasets[ktk_cube_dataset_id]
        ds = ds.load_partition_indices()
        mp = _prepare_mp_empty(ds)

        if not ds.partition_keys:
            # no partition keys --> delete all
            delete_scope = [{}]
        else:
            df_partitions = get_partition_dataframe(dataset=ds, cube=cube)
            df_partitions = df_partitions.drop_duplicates()
            local_condition = reduce(
                lambda a, b: a & b,
                (cond for col, cond in conditions_split.items()
                 if col in df_partitions.columns),
                Conjunction([]),
            )
            df_partitions = local_condition.filter_df(df_partitions)

            delete_scope = df_partitions.to_dict(orient="records")

        metapartitions[ktk_cube_dataset_id] = (ds, mp, delete_scope)

    return metapartitions
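
A sketch of how the result might be produced and consumed (an assumption, not from the project; ``cube``, ``store``, and ``existing_datasets`` are presumed to come from a prior discover step, and the import path for the condition helper ``C`` is assumed):

from kartothek.core.cube.conditions import C  # assumed import path

# Restrict removal to a single value of a physical partition column; passing
# ktk_cube_dataset_ids=None targets every dataset of the cube.
parts = prepare_metapartitions_for_removal_action(
    cube=cube,
    store=store,
    conditions=C("country") == "DE",
    ktk_cube_dataset_ids=None,
    existing_datasets=existing_datasets,
)

for ktk_cube_dataset_id, (ds, mp, delete_scope) in parts.items():
    # As the docstring notes, mp still has to be written via
    # mp.store_dataframes(...) and the dataset updated with delete_scope
    # through a kartothek update method.
    ...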
Example #5
def test_str_set_optional(param, expected):
    actual = converter_str_set_optional(param)
    assert actual == expected
    if actual is not None:
        assert isinstance(actual, frozenset)
        assert all(isinstance(x, str) for x in actual)
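
A plausible parametrization for the test above (a sketch, not taken from the project; the expected values are inferred from the assertions and from how the filter arguments are used in examples #1-#3):

import pytest

@pytest.mark.parametrize(
    "param,expected",
    [
        (None, None),                                  # optional input passes through
        ("foo", frozenset({"foo"})),                   # a single str becomes a one-element frozenset
        (["foo", "bar"], frozenset({"foo", "bar"})),   # iterables are converted to frozensets of str
    ],
)
def test_str_set_optional(param, expected):
    actual = converter_str_set_optional(param)
    assert actual == expected
    if actual is not None:
        assert isinstance(actual, frozenset)
        assert all(isinstance(x, str) for x in actual)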