def discover_datasets_unchecked(
    uuid_prefix: str,
    store: Union[Callable[[], KeyValueStore], KeyValueStore],
    filter_ktk_cube_dataset_ids: Optional[Union[str, Iterable[str]]] = None,
) -> Dict[str, DatasetMetadata]:
    """
    Get all known datasets that may belong to a given cube w/o applying any checks.

    .. warning::
        The results are not checked for validity. Found datasets may be incompatible w/ the given cube. Use
        :meth:`check_datasets` to check the results, or go for :meth:`discover_datasets` in the first place.

    Parameters
    ----------
    uuid_prefix
        Dataset UUID prefix.
    store
        KV store.
    filter_ktk_cube_dataset_ids
        Optional selection of datasets to include.

    Returns
    -------
    datasets: Dict[str, DatasetMetadata]
        All discovered datasets. Empty dict if no dataset is found.
    """
    if callable(store):
        store = store()
    filter_ktk_cube_dataset_ids = converter_str_set_optional(
        filter_ktk_cube_dataset_ids
    )
    prefix = uuid_prefix + KTK_CUBE_UUID_SEPERATOR

    names = _discover_dataset_meta_files(prefix, store)

    if filter_ktk_cube_dataset_ids is not None:
        names = {
            name for name in names if name[len(prefix):] in filter_ktk_cube_dataset_ids
        }

    result = {}
    # sorted iteration for deterministic error messages in case DatasetMetadata.load_from_store fails
    for name in sorted(names):
        try:
            result[name[len(prefix):]] = DatasetMetadata.load_from_store(
                uuid=name, store=store, load_schema=True, load_all_indices=False
            )
        except KeyError as e:
            _logger.warning(
                'Ignore dataset "{name}" due to KeyError: {e}'.format(name=name, e=e)
            )

    return result
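
# Illustrative usage sketch (not part of the library code above): discovering the
# datasets of a cube prefix without validity checks.  The store URL and the
# "my_cube" prefix are assumptions made up for this example; storefact's
# get_store_from_url is just one common way to obtain a simplekv store, but any
# KeyValueStore or store factory works.
def _example_discover_datasets_unchecked():
    from storefact import get_store_from_url

    store = get_store_from_url("hfs:///tmp/cube_data")  # hypothetical local store
    # Restrict discovery to the "enrich" dataset; a plain string is accepted and
    # normalized by converter_str_set_optional.
    datasets = discover_datasets_unchecked(
        uuid_prefix="my_cube",
        store=store,
        filter_ktk_cube_dataset_ids="enrich",
    )
    # Keys are the ktk_cube dataset IDs (the part after the UUID separator),
    # values are DatasetMetadata objects; no compatibility checks were applied.
    return datasets
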
def discover_datasets(
    cube: Cube,
    store: StoreInput,
    filter_ktk_cube_dataset_ids: Optional[Union[str, Iterable[str]]] = None,
) -> Dict[str, DatasetMetadata]:
    """
    Get all known datasets that belong to a given cube.

    Parameters
    ----------
    cube
        Cube specification.
    store
        KV store.
    filter_ktk_cube_dataset_ids
        Optional selection of datasets to include.

    Returns
    -------
    datasets: Dict[str, DatasetMetadata]
        All discovered datasets.

    Raises
    ------
    ValueError
        In case no valid cube could be discovered.
    """
    filter_ktk_cube_dataset_ids = converter_str_set_optional(
        filter_ktk_cube_dataset_ids
    )
    result = discover_datasets_unchecked(
        cube.uuid_prefix, store, filter_ktk_cube_dataset_ids
    )
    if filter_ktk_cube_dataset_ids is not None:
        if isinstance(filter_ktk_cube_dataset_ids, str):
            filter_ktk_cube_dataset_ids = {filter_ktk_cube_dataset_ids}
        else:
            filter_ktk_cube_dataset_ids = set(filter_ktk_cube_dataset_ids)
        missing = filter_ktk_cube_dataset_ids - set(result.keys())
        if missing:
            raise ValueError(
                "Could not find the following requested datasets: {missing}".format(
                    missing=", ".join(sorted(missing))
                )
            )
    check_datasets(result, cube)
    return result
def discover_datasets(cube, store, filter_ktk_cube_dataset_ids=None):
    """
    Get all known datasets that belong to a given cube.

    Parameters
    ----------
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: Union[Callable[[], simplekv.KeyValueStore], simplekv.KeyValueStore]
        KV store.
    filter_ktk_cube_dataset_ids: Union[None, str, Iterable[str]]
        Optional selection of datasets to include.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        All discovered datasets.

    Raises
    ------
    ValueError
        In case no valid cube could be discovered.
    """
    filter_ktk_cube_dataset_ids = converter_str_set_optional(
        filter_ktk_cube_dataset_ids
    )
    result = discover_datasets_unchecked(
        cube.uuid_prefix, store, filter_ktk_cube_dataset_ids
    )
    if filter_ktk_cube_dataset_ids is not None:
        missing = filter_ktk_cube_dataset_ids - set(result.keys())
        if missing:
            raise ValueError(
                "Could not find the following requested datasets: {missing}".format(
                    missing=", ".join(sorted(missing))
                )
            )
    check_datasets(result, cube)
    return result
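
# Illustrative usage sketch (not part of the library code above) that applies to
# either variant of discover_datasets.  The cube definition, column names, and
# store URL are assumptions made up for this example.
def _example_discover_datasets():
    from kartothek.core.cube.cube import Cube
    from storefact import get_store_from_url

    cube = Cube(
        dimension_columns=["city"],
        partition_columns=["country"],
        uuid_prefix="my_cube",
    )
    store = get_store_from_url("hfs:///tmp/cube_data")  # hypothetical local store
    # Raises ValueError if a requested dataset is missing or the discovered
    # datasets are not compatible with the cube specification.
    return discover_datasets(
        cube, store, filter_ktk_cube_dataset_ids={"seed", "enrich"}
    )
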
def prepare_metapartitions_for_removal_action(
    cube, store, conditions, ktk_cube_dataset_ids, existing_datasets
):
    """
    Prepare MetaPartition to express removal of given data range from cube.

    The MetaPartition must still be written using ``mp.store_dataframes(...)`` and added to the Dataset using a
    kartothek update method.

    Parameters
    ----------
    cube: kartothek.core.cube.cube.Cube
        Cube spec.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Store.
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied, optional. Defaults to "entire cube".
    ktk_cube_dataset_ids: Optional[Union[Iterable[str], str]]
        Ktk_cube dataset IDs to apply the remove action to, optional. Defaults to "all".
    existing_datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Existing datasets.

    Returns
    -------
    metapartitions: Dict[str, Tuple[kartothek.core.dataset.DatasetMetadata, kartothek.io_components.metapartition.MetaPartition, List[Dict[str, Any]]]]
        MetaPartitions that should be written and updated to the kartothek datasets as well as the ``delete_scope``
        for kartothek.
    """
    conditions = Conjunction(conditions)

    conditions_split = conditions.split_by_column()
    if set(conditions_split.keys()) - set(cube.partition_columns):
        raise ValueError(
            "Can only remove partitions with conditions concerning the cube's physical partition columns."
        )

    ktk_cube_dataset_ids = converter_str_set_optional(ktk_cube_dataset_ids)
    if ktk_cube_dataset_ids is not None:
        unknown_dataset_ids = ktk_cube_dataset_ids - set(existing_datasets.keys())
        if unknown_dataset_ids:
            raise ValueError(
                "Unknown ktk_cube_dataset_ids: {}".format(
                    ", ".join(sorted(unknown_dataset_ids))
                )
            )
    else:
        ktk_cube_dataset_ids = set(existing_datasets.keys())

    metapartitions = {}
    for ktk_cube_dataset_id in ktk_cube_dataset_ids:
        ds = existing_datasets[ktk_cube_dataset_id]
        ds = ds.load_partition_indices()
        mp = _prepare_mp_empty(ds)

        if not ds.partition_keys:
            # no partition keys --> delete all
            delete_scope = [{}]
        else:
            df_partitions = get_partition_dataframe(dataset=ds, cube=cube)
            df_partitions = df_partitions.drop_duplicates()

            local_condition = reduce(
                lambda a, b: a & b,
                (
                    cond
                    for col, cond in conditions_split.items()
                    if col in df_partitions.columns
                ),
                Conjunction([]),
            )
            df_partitions = local_condition.filter_df(df_partitions)

            delete_scope = df_partitions.to_dict(orient="records")

        metapartitions[ktk_cube_dataset_id] = (ds, mp, delete_scope)

    return metapartitions
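
# Illustrative usage sketch (not part of the library code above): building the
# delete scope for removing one partition from every dataset of a cube.  The
# cube, store URL, and condition value are assumptions made up for this example;
# the import path of the condition helper ``C`` is assumed from kartothek's cube
# condition module.
def _example_prepare_removal():
    from kartothek.core.cube.conditions import C
    from kartothek.core.cube.cube import Cube
    from storefact import get_store_from_url

    cube = Cube(
        dimension_columns=["city"],
        partition_columns=["country"],
        uuid_prefix="my_cube",
    )
    store = get_store_from_url("hfs:///tmp/cube_data")  # hypothetical local store
    existing_datasets = discover_datasets(cube, store)

    # Conditions may only reference physical partition columns ("country" here).
    metapartitions = prepare_metapartitions_for_removal_action(
        cube=cube,
        store=store,
        conditions=C("country") == "DE",
        ktk_cube_dataset_ids=None,  # None --> apply to all existing datasets
        existing_datasets=existing_datasets,
    )
    # Each value is (DatasetMetadata, empty MetaPartition, delete_scope); the
    # MetaPartition still has to be stored and the datasets updated with the
    # returned delete_scope.
    return metapartitions
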
def test_str_set_optional(param, expected):
    actual = converter_str_set_optional(param)
    assert actual == expected
    if actual is not None:
        assert isinstance(actual, frozenset)
        assert all(isinstance(x, str) for x in actual)
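
# Illustrative parametrization sketch (not part of the original test module).
# The parameter/expected pairs below are assumptions inferred from how
# converter_str_set_optional is used in the discovery code: None passes through,
# while a single string or an iterable of strings becomes a frozenset of strings.
import pytest


@pytest.mark.parametrize(
    "param,expected",
    [
        (None, None),
        ("enrich", frozenset({"enrich"})),
        (["seed", "enrich"], frozenset({"seed", "enrich"})),
    ],
)
def test_str_set_optional_example(param, expected):
    actual = converter_str_set_optional(param)
    assert actual == expected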