def test_ensure_store(store_input_types):
    store = ensure_store(store_input_types)
    assert isinstance(store, KeyValueStore)

    value = b"value"
    key = "key"
    store.put(key, value)
    assert value == store.get(key)

    assert store is ensure_store(store)
def _discover_dataset_meta_files(prefix: str, store: StoreInput) -> Set[str]:
    """
    Get meta file names for all datasets.

    Parameters
    ----------
    prefix
        Key prefix to search under.
    store
        KV store.

    Returns
    -------
    names: Set[str]
        The meta file names.
    """
    store = ensure_store(store)

    names = {
        name[: -len(METADATA_BASE_SUFFIX + suffix)]
        for name in store.iter_prefixes(delimiter="/", prefix=prefix)
        for suffix in [METADATA_FORMAT_JSON, METADATA_FORMAT_MSGPACK]
        if name.endswith(METADATA_BASE_SUFFIX + suffix)
    }
    return names
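# A minimal sketch of the key-naming convention the function above relies on.
# The suffix values are assumptions based on kartothek's metadata naming
# (verify against kartothek.core.naming in your version): a dataset announces
# itself via a key such as "<name>.by-dataset-metadata.json", and stripping
# that suffix recovers the dataset name.
_base_suffix = ".by-dataset-metadata"  # assumed value of METADATA_BASE_SUFFIX
_json_suffix = ".json"                 # assumed value of METADATA_FORMAT_JSON

_example_key = "my_cube++seed" + _base_suffix + _json_suffix
assert _example_key[: -len(_base_suffix + _json_suffix)] == "my_cube++seed"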
def validate_schema_compatible(self, store: StoreInput, dataset_uuid: str) -> "MetaPartition":
    """
    Validate that the currently held DataFrames match the schema of the existing dataset.

    Parameters
    ----------
    store
        If it is a function, the result of calling it must be a KeyValueStore.
    dataset_uuid
        The dataset UUID the partition will be assigned to.
    """
    # Load the reference meta of the existing dataset. Using the built-in
    # `load_all_table_meta` would not help here: it would be a no-op since we
    # have already loaded the meta from the input DataFrame.
    store = ensure_store(store)
    reference_meta = read_schema_metadata(
        dataset_uuid=dataset_uuid, store=store, table=self.table_name
    )
    try:
        validate_compatible([self.schema, reference_meta])
    except ValueError as e:
        raise ValueError(
            f"Schemas for dataset '{dataset_uuid}' are not compatible!\n\n{e}"
        )

    return self
def load(self, store: StoreInput):
    """
    Load an external index into memory. Returns a new index object that
    contains the index dictionary. Returns itself if the index is internal
    or an already loaded index.

    Parameters
    ----------
    store
        Object that implements the .get method for file/object loading.

    Returns
    -------
    index: [kartothek.core.index.ExplicitSecondaryIndex]
    """
    if self.loaded:
        return self

    store = ensure_store(store)
    index_buffer = store.get(self.index_storage_key)
    index_dct, column_type = _parquet_bytes_to_dict(self.column, index_buffer)

    return ExplicitSecondaryIndex(
        column=self.column,
        index_dct=index_dct,
        dtype=column_type,
        index_storage_key=self.index_storage_key,
        normalize_dtype=False,
    )
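# Hedged round-trip sketch for ExplicitSecondaryIndex: persist an index to an
# in-memory store, then load it back via the `load` method above. Import
# paths and the in-memory store URL are assumptions based on the kartothek /
# storefact layout; adjust to your installed versions.
from storefact import get_store_from_url
from kartothek.core.index import ExplicitSecondaryIndex

_store = get_store_from_url("memory://")
_idx = ExplicitSecondaryIndex(
    column="country", index_dct={"DE": ["part_1"], "US": ["part_2"]}
)
_key = _idx.store(_store, dataset_uuid="example_uuid")  # writes the parquet index
_restored = ExplicitSecondaryIndex(column="country", index_storage_key=_key).load(_store)
assert _restored.loaded and set(_restored.index_dct) == {"DE", "US"}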
def validate_partition_keys(
    dataset_uuid,
    store,
    ds_factory,
    default_metadata_version,
    partition_on,
    load_dataset_metadata=True,
):
    if ds_factory or DatasetMetadata.exists(dataset_uuid, ensure_store(store)):
        ds_factory = _ensure_factory(
            dataset_uuid=dataset_uuid,
            store=store,
            factory=ds_factory,
            load_dataset_metadata=load_dataset_metadata,
        )

        ds_metadata_version = ds_factory.metadata_version
        if partition_on:
            if not isinstance(partition_on, list):
                partition_on = [partition_on]
            if partition_on != ds_factory.partition_keys:
                raise ValueError(
                    "Incompatible set of partition keys encountered. "
                    "Input partitioning was `{}` while actual dataset was `{}`".format(
                        partition_on, ds_factory.partition_keys
                    )
                )
        else:
            partition_on = ds_factory.partition_keys
    else:
        ds_factory = None
        ds_metadata_version = default_metadata_version
    return ds_factory, ds_metadata_version, partition_on
def update_dataset_from_partitions(
    partition_list,
    store_factory,
    dataset_uuid,
    ds_factory,
    delete_scope,
    metadata,
    metadata_merger,
):
    store = ensure_store(store_factory)

    if ds_factory:
        ds_factory = ds_factory.load_all_indices()
        remove_partitions = _get_partitions(ds_factory, delete_scope)

        index_columns = list(ds_factory.indices.keys())
        for column in index_columns:
            index = ds_factory.indices[column]
            if isinstance(index, PartitionIndex):
                del ds_factory.indices[column]
    else:
        # Dataset does not exist yet.
        remove_partitions = []

    new_dataset = store_dataset_from_partitions(
        partition_list=partition_list,
        store=store,
        dataset_uuid=dataset_uuid,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        update_dataset=ds_factory,
        remove_partitions=remove_partitions,
    )

    return new_dataset
def write_partition(
    partition_df: MetaPartitionInput,
    secondary_indices: List[str],
    sort_partitions_by: List[str],
    dataset_uuid: str,
    partition_on: List[str],
    store_factory: StoreFactory,
    df_serializer: Optional[DataFrameSerializer],
    metadata_version: int,
    dataset_table_name: str = SINGLE_TABLE,
) -> MetaPartition:
    """
    Write a dataframe to store, performing all necessary preprocessing tasks
    like partitioning, bucketing (NotImplemented), indexing, etc. in the correct order.
    """
    store = ensure_store(store_factory)

    # I don't have access to the group values
    mps = parse_input_to_metapartition(
        partition_df,
        metadata_version=metadata_version,
        table_name=dataset_table_name,
    )

    if sort_partitions_by:
        mps = mps.apply(partial(sort_values_categorical, columns=sort_partitions_by))
    if partition_on:
        mps = mps.partition_on(partition_on)
    if secondary_indices:
        mps = mps.build_indices(secondary_indices)
    return mps.store_dataframes(
        store=store, dataset_uuid=dataset_uuid, df_serializer=df_serializer
    )
def discover_datasets_unchecked(
    uuid_prefix: str,
    store: StoreInput,
    filter_ktk_cube_dataset_ids: Optional[Union[str, Iterable[str]]] = None,
) -> Dict[str, DatasetMetadata]:
    """
    Get all known datasets that may belong to a given cube w/o applying any checks.

    .. warning::
        The results are not checked for validity. Found datasets may be
        incompatible w/ the given cube. Use :meth:`check_datasets` to check
        the results, or go for :meth:`discover_datasets` in the first place.

    Parameters
    ----------
    uuid_prefix
        Dataset UUID prefix.
    store
        KV store.
    filter_ktk_cube_dataset_ids
        Optional selection of datasets to include.

    Returns
    -------
    datasets: Dict[str, DatasetMetadata]
        All discovered datasets. Empty dict if no dataset is found.
    """
    store = ensure_store(store)

    filter_ktk_cube_dataset_ids = converter_str_set_optional(
        filter_ktk_cube_dataset_ids
    )
    prefix = uuid_prefix + KTK_CUBE_UUID_SEPERATOR

    names = _discover_dataset_meta_files(prefix, store)

    if filter_ktk_cube_dataset_ids is not None:
        names = {
            name
            for name in names
            if name[len(prefix):] in filter_ktk_cube_dataset_ids
        }

    # Sorted iteration for deterministic error messages in case
    # DatasetMetadata.load_from_store fails.
    result = {}
    for name in sorted(names):
        try:
            result[name[len(prefix):]] = DatasetMetadata.load_from_store(
                uuid=name, store=store, load_schema=True, load_all_indices=False
            )
        except KeyError as e:
            _logger.warning(
                'Ignoring dataset "{name}" due to KeyError: {e}'.format(name=name, e=e)
            )

    return result
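# Hedged illustration of the prefix/filter logic above. The separator value is
# an assumption for KTK_CUBE_UUID_SEPERATOR ("++" in kartothek's cube naming);
# discovered names such as "my_cube++seed" map to ktk_cube dataset ids such as
# "seed".
_seperator = "++"  # assumed value of KTK_CUBE_UUID_SEPERATOR
_prefix = "my_cube" + _seperator
_names = {"my_cube++seed", "my_cube++enrich"}
_filter_ids = {"seed"}
assert {n[len(_prefix):] for n in _names if n[len(_prefix):] in _filter_ids} == {"seed"}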
def store(self, store: StoreInput, dataset_uuid: str) -> str:
    """
    Store the index as a parquet file.

    If compatible, the new key name will be the name stored under the
    attribute `index_storage_key`. If this attribute is None, a new key will
    be generated of the format

        `{dataset_uuid}/indices/{column}/{timestamp}.by-dataset-index.parquet`

    where the timestamp has nanosecond accuracy and is created upon Index
    object initialization.

    Parameters
    ----------
    store
    dataset_uuid
    """
    storage_key = None
    store = ensure_store(store)

    if (
        self.index_storage_key is not None
        and dataset_uuid
        and dataset_uuid in self.index_storage_key
    ):
        storage_key = self.index_storage_key
    if storage_key is None:
        storage_key = "{dataset_uuid}/indices/{column}/{timestamp}{suffix}".format(
            dataset_uuid=dataset_uuid,
            suffix=naming.EXTERNAL_INDEX_SUFFIX,
            column=quote(self.column),
            timestamp=quote(self.creation_time.isoformat()),
        )

    # The arrow representation of index_dct requires a large amount of memory
    # because strings are duplicated and flattened into the buffer. To avoid a
    # high peak memory usage, split the index_dct into chunks and only convert
    # one chunk at a time to arrow.
    parts_iter = partition_all(10_000, self.index_dct.items())

    # Get the first table explicitly because its schema is required for ParquetWriter.
    try:
        table = _index_dct_to_table(dict(next(parts_iter)), self.column, self.dtype)
    except StopIteration:
        # index_dct was empty, just pass it entirely
        table = _index_dct_to_table(self.index_dct, self.column, self.dtype)

    buf = pa.BufferOutputStream()
    with pq.ParquetWriter(buf, schema=table.schema) as writer:
        writer.write_table(table)
        del table
        for part in parts_iter:
            writer.write_table(
                _index_dct_to_table(dict(part), self.column, self.dtype)
            )

    store.put(storage_key, buf.getvalue().to_pybytes())
    return storage_key
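# Small sketch of the chunking used above: `partition_all` from toolz splits
# the index dict into fixed-size chunks so that only one chunk is converted to
# an Arrow table at a time, which keeps peak memory bounded.
from toolz import partition_all

_index_dct = {i: ["partition_{}".format(i % 3)] for i in range(25)}
_chunks = list(partition_all(10, _index_dct.items()))
assert [len(c) for c in _chunks] == [10, 10, 5]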
def raise_if_dataset_exists(dataset_uuid, store):
    try:
        store_instance = ensure_store(store)
        for form in ["msgpack", "json"]:
            key = naming.metadata_key_from_uuid(uuid=dataset_uuid, format=form)
            if key in store_instance:
                raise RuntimeError(
                    "Dataset `{}` already exists and overwrite is not permitted!".format(
                        dataset_uuid
                    )
                )
    except KeyError:
        pass
def load_schema(self, store: StoreInput, dataset_uuid: str) -> "MetaPartition":
    """
    Loads all table metadata in memory and stores it under the `tables` attribute.
    """
    if self.schema is None:
        store = ensure_store(store)
        self.schema = read_schema_metadata(
            dataset_uuid=dataset_uuid, store=store, table=self.table_name
        )
    return self
def _initialize_store_for_metapartition(method, method_args, method_kwargs):
    for store_variable in ["store", "storage"]:
        if store_variable in method_kwargs:
            method_kwargs[store_variable] = ensure_store(method_kwargs[store_variable])
        else:
            method = cast(object, method)
            args = inspect.getfullargspec(method).args

            if store_variable in args:
                ix = args.index(store_variable)
                # Reduce the index since the argspec and method_args start
                # counting differently due to `self`.
                ix -= 1
                instantiated_store = ensure_store(method_args[ix])
                new_args = []
                for ix_method, arg in enumerate(method_args):
                    if ix_method != ix:
                        new_args.append(arg)
                    else:
                        new_args.append(instantiated_store)
                method_args = tuple(new_args)

    return method_args, method_kwargs
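# Hypothetical sketch of how the helper above is intended to be used: a
# decorator that normalizes the `store`/`storage` argument of a MetaPartition
# method before the actual call. The decorator name is illustrative and not
# part of the library API.
from functools import wraps

def _normalize_store_argument(method):
    @wraps(method)
    def _wrapper(self, *method_args, **method_kwargs):
        method_args, method_kwargs = _initialize_store_for_metapartition(
            method, method_args, method_kwargs
        )
        return method(self, *method_args, **method_kwargs)

    return _wrapper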
def load_from_store(
    uuid: str,
    store: StoreInput,
    load_schema: bool = True,
    load_all_indices: bool = False,
) -> "DatasetMetadata":
    """
    Load a dataset from storage.

    Parameters
    ----------
    uuid
        UUID of the dataset.
    store
        Object that implements the .get method for file/object loading.
    load_schema
        Load table schema.
    load_all_indices
        Load all registered indices into memory.

    Returns
    -------
    dataset_metadata: :class:`~kartothek.core.dataset.DatasetMetadata`
        Parsed metadata.
    """
    key1 = naming.metadata_key_from_uuid(uuid)
    store = ensure_store(store)
    try:
        value = store.get(key1)
        metadata = load_json(value)
    except KeyError:
        key2 = naming.metadata_key_from_uuid(uuid, format="msgpack")
        try:
            value = store.get(key2)
            metadata = unpackb(value)
        except KeyError:
            raise KeyError(
                "Dataset does not exist. Tried {} and {}".format(key1, key2)
            )

    ds = DatasetMetadata.load_from_dict(metadata, store, load_schema=load_schema)
    if load_all_indices:
        ds = ds.load_all_indices(store)
    return ds
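# Hedged usage sketch for `load_from_store`: look up dataset metadata by UUID,
# relying on the JSON -> msgpack key fallback implemented above. Assumes a
# dataset named "example_uuid" was previously written to the store; the store
# URL and import paths are illustrative.
from storefact import get_store_from_url
from kartothek.core.dataset import DatasetMetadata

_store = get_store_from_url("hfs:///tmp/ktk_data")  # illustrative location
if DatasetMetadata.exists("example_uuid", _store):
    _dm = DatasetMetadata.load_from_store("example_uuid", _store, load_schema=True)
    print(_dm.uuid, sorted(_dm.partitions))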
def storage_keys(uuid: str, store: StoreInput) -> List[str]:
    """
    Retrieve all keys that belong to the given dataset.

    Parameters
    ----------
    uuid
        UUID of the dataset.
    store
        Object that implements the .iter_keys method for key retrieval.
    """
    store = ensure_store(store)
    start_markers = ["{}.".format(uuid), "{}/".format(uuid)]
    return list(
        sorted(
            k
            for k in store.iter_keys(uuid)
            if any(k.startswith(marker) for marker in start_markers)
        )
    )
def get_parquet_metadata(self, store: StoreInput) -> pd.DataFrame:
    """
    Retrieve the parquet metadata for the MetaPartition.
    Especially relevant for calculating dataset statistics.

    Parameters
    ----------
    store
        A factory function providing a KeyValueStore.

    Returns
    -------
    pd.DataFrame
        A DataFrame with relevant parquet metadata.
    """
    store = ensure_store(store)

    data = {}
    with store.open(self.file) as fd:  # type: ignore
        pq_metadata = pa.parquet.ParquetFile(fd).metadata

        data = {
            "partition_label": self.label,
            "serialized_size": pq_metadata.serialized_size,
            "number_rows_total": pq_metadata.num_rows,
            "number_row_groups": pq_metadata.num_row_groups,
            "row_group_id": [],
            "number_rows_per_row_group": [],
            "row_group_compressed_size": [],
            "row_group_uncompressed_size": [],
        }
        for rg_ix in range(pq_metadata.num_row_groups):
            rg = pq_metadata.row_group(rg_ix)
            data["row_group_id"].append(rg_ix)
            data["number_rows_per_row_group"].append(rg.num_rows)
            data["row_group_compressed_size"].append(rg.total_byte_size)
            data["row_group_uncompressed_size"].append(
                sum(
                    rg.column(col_ix).total_uncompressed_size
                    for col_ix in range(rg.num_columns)
                )
            )

    df = pd.DataFrame(data=data, columns=_METADATA_SCHEMA.keys())
    df = df.astype(_METADATA_SCHEMA)
    return df
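# Standalone sketch of the per-row-group statistics gathered above, using
# plain pyarrow on a local file instead of a kartothek store; the file path is
# illustrative.
import pyarrow as pa
import pyarrow.parquet as pq

pq.write_table(
    pa.table({"x": list(range(100))}), "/tmp/example.parquet", row_group_size=40
)
_md = pq.ParquetFile("/tmp/example.parquet").metadata
_rows_per_rg = [_md.row_group(i).num_rows for i in range(_md.num_row_groups)]
assert sum(_rows_per_rg) == _md.num_rows == 100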
def exists(uuid: str, store: StoreInput) -> bool:
    """
    Check whether a dataset exists in the given storage.

    Parameters
    ----------
    uuid
        UUID of the dataset.
    store
        Object that implements the .get method for file/object loading.
    """
    store = ensure_store(store)
    key = naming.metadata_key_from_uuid(uuid)
    if key in store:
        return True

    key = naming.metadata_key_from_uuid(uuid, format="msgpack")
    return key in store
def persist_indices(
    store: StoreInput, dataset_uuid: str, indices: Dict[str, IndexBase]
) -> Dict[str, str]:
    store = ensure_store(store)
    output_filenames = {}
    for column, index in indices.items():
        # backwards compat
        if isinstance(index, dict):
            legacy_storage_key = "{dataset_uuid}.{column}{suffix}".format(
                dataset_uuid=dataset_uuid,
                column=column,
                suffix=naming.EXTERNAL_INDEX_SUFFIX,
            )
            index = ExplicitSecondaryIndex(
                column=column,
                index_dct=index,
                index_storage_key=legacy_storage_key,
            )
        elif isinstance(index, PartitionIndex):
            continue
        index = cast(ExplicitSecondaryIndex, index)
        output_filenames[column] = index.store(store=store, dataset_uuid=dataset_uuid)
    return output_filenames
def write_partition(
    partition_df: MetaPartitionInput,
    secondary_indices: Optional[InferredIndices],
    sort_partitions_by: Optional[Union[str, Sequence[str]]],
    dataset_uuid: str,
    partition_on: Optional[Union[str, Sequence[str]]],
    store_factory: StoreFactory,
    df_serializer: Optional[DataFrameSerializer],
    metadata_version: int,
    dataset_table_name: Optional[str] = None,
) -> MetaPartition:
    """
    Write a dataframe to store, performing all necessary preprocessing tasks
    like partitioning, bucketing (NotImplemented), indexing, etc. in the correct order.
    """
    store = ensure_store(store_factory)
    parse_input: MetaPartitionInput
    if isinstance(partition_df, pd.DataFrame) and dataset_table_name:
        parse_input = [{"data": {dataset_table_name: partition_df}}]
    else:
        parse_input = partition_df
    # Delete reference to enable release after partition_on; before index build
    del partition_df

    # I don't have access to the group values
    mps = parse_input_to_metapartition(
        parse_input,
        metadata_version=metadata_version,
        expected_secondary_indices=secondary_indices,
    )

    if sort_partitions_by:
        mps = mps.apply(partial(sort_values_categorical, columns=sort_partitions_by))
    if partition_on:
        mps = mps.partition_on(partition_on)
    if secondary_indices:
        mps = mps.build_indices(secondary_indices)
    return mps.store_dataframes(
        store=store, dataset_uuid=dataset_uuid, df_serializer=df_serializer
    )
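# Hedged sketch of the store-factory pattern that `write_partition` relies on:
# distributed workers receive a picklable zero-argument callable instead of an
# open store connection, and `ensure_store` opens the store at call time
# (compare the factory test further below). The URL is illustrative.
from functools import partial
from storefact import get_store_from_url

store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_data")  # picklable
# ensure_store(store_factory) would then open the KeyValueStore inside the worker.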
def align_datasets(
    left_dataset_uuid: str,
    right_dataset_uuid: str,
    store: StoreInput,
    match_how: Union[Literal["exact", "prefix", "all"], Callable] = "exact",
) -> Generator[List[MetaPartition], None, None]:
    """
    Determine dataset partition alignment.

    Parameters
    ----------
    left_dataset_uuid
        UUID of the left dataset.
    right_dataset_uuid
        UUID of the right dataset.
    store
        KV store (or a factory returning one).
    match_how
        Partition label matching scheme, either one of the named schemes or a
        callable ``func(left_label, right_label) -> bool``.

    Yields
    ------
    List
    """
    store = ensure_store(store)
    left_dataset = DatasetMetadata.load_from_store(uuid=left_dataset_uuid, store=store)
    right_dataset = DatasetMetadata.load_from_store(
        uuid=right_dataset_uuid, store=store
    )

    metadata_version = left_dataset.metadata_version

    # Loop over the dataset with fewer partitions, treating its keys as
    # partition label prefixes
    if (
        callable(match_how)
        or match_how == "left"
        or (
            match_how == "prefix"
            and len(list(left_dataset.partitions.keys())[0])
            < len(list(right_dataset.partitions.keys())[0])
        )
    ):
        first_dataset = left_dataset
        second_dataset = right_dataset
    else:
        first_dataset = right_dataset
        second_dataset = left_dataset
    # The del statements are here to reduce confusion below
    del left_dataset
    del right_dataset

    # For every partition in the 'small' dataset, at least one partition match
    # needs to be found in the larger dataset.
    available_partitions = list(second_dataset.partitions.items())
    partition_stack = available_partitions[:]

    # TODO: write a test which protects against the following scenario!!
    # Sort the partition labels by length of the labels, starting with the
    # labels which are the longest. This way we prevent label matching for
    # similar partitions, e.g. cluster_100 and cluster_1. This, of course,
    # works only as long as the internal loop removes elements which were
    # matched already (here improperly called stack)
    for l_1 in sorted(first_dataset.partitions, key=len, reverse=True):
        p_1 = first_dataset.partitions[l_1]
        res = [
            MetaPartition.from_partition(
                partition=p_1, metadata_version=metadata_version
            )
        ]
        for parts in available_partitions:
            l_2, p_2 = parts
            if callable(match_how) and not match_how(l_1, l_2):
                continue
            if match_how == "exact" and l_1 != l_2:
                continue
            elif match_how == "prefix" and not l_2.startswith(l_1):
                LOGGER.debug("rejecting (%s, %s)", l_1, l_2)
                continue

            LOGGER.debug(
                "Found alignment between partitions (%s, %s) and (%s, %s)",
                first_dataset.uuid,
                p_1.label,
                second_dataset.uuid,
                p_2.label,
            )
            res.append(
                MetaPartition.from_partition(
                    partition=p_2, metadata_version=metadata_version
                )
            )

            # In exact or prefix matching schemes, it is expected to only
            # find one partition alignment. In this case, reduce the size of
            # the inner loop.
            if match_how in ["exact", "prefix"]:
                partition_stack.remove((l_2, p_2))
        # Need to copy, otherwise remove will alter the loop iterator
        available_partitions = partition_stack[:]

        if len(res) == 1:
            raise RuntimeError(
                "No matching partition for {} in dataset {} found".format(
                    p_1, first_dataset
                )
            )
        yield res
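# Small sketch of the longest-first sorting concern noted in the comment
# above: under prefix matching, visiting "cluster_100" before "cluster_1"
# prevents the shorter label from greedily claiming the longer label's partner.
_labels = ["cluster_1", "cluster_100"]
assert sorted(_labels, key=len, reverse=True) == ["cluster_100", "cluster_1"]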
def test_ensure_store_returns_same_store():
    store = get_store_from_url("memory://")
    assert ensure_store(lambda: store) is store
def delete_from_store(self, dataset_uuid: Any, store: StoreInput) -> "MetaPartition":
    store = ensure_store(store)
    # Delete data first
    store.delete(self.file)
    return self.copy(file=None, data=None)
def store_dataset_from_partitions(
    partition_list,
    store: StoreInput,
    dataset_uuid,
    dataset_metadata=None,
    metadata_merger=None,
    update_dataset=None,
    remove_partitions=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
):
    store = ensure_store(store)

    schemas = set()
    if update_dataset:
        dataset_builder = DatasetMetadataBuilder.from_dataset(update_dataset)
        metadata_version = dataset_builder.metadata_version
        table_name = update_dataset.table_name
        schemas.add(update_dataset.schema)
    else:
        mp = next(iter(partition_list), None)
        if mp is None:
            raise ValueError(
                "Cannot store empty datasets, partition_list must not be empty if in store mode."
            )
        table_name = mp.table_name
        metadata_version = mp.metadata_version
        dataset_builder = DatasetMetadataBuilder(
            uuid=dataset_uuid,
            metadata_version=metadata_version,
            partition_keys=mp.partition_keys,
        )

    for mp in partition_list:
        if mp.schema:
            schemas.add(mp.schema)

    dataset_builder.schema = persist_common_metadata(
        schemas=schemas,
        update_dataset=update_dataset,
        store=store,
        dataset_uuid=dataset_uuid,
        table_name=table_name,
    )

    # We can only check for non-unique partition labels here; if they occur we
    # fail hard. The resulting dataset may be corrupted, or files may be left
    # in the store without dataset metadata.
    partition_labels = partition_labels_from_mps(partition_list)

    # This could be safely removed since we no longer allow the user to set
    # this. It has implications on tests if mocks are used.
    non_unique_labels = extract_duplicates(partition_labels)

    if non_unique_labels:
        raise ValueError(
            "The labels {} are duplicated. Dataset metadata was not written.".format(
                ", ".join(non_unique_labels)
            )
        )

    if remove_partitions is None:
        remove_partitions = []

    if metadata_merger is None:
        metadata_merger = combine_metadata

    dataset_builder = update_metadata(dataset_builder, metadata_merger, dataset_metadata)
    dataset_builder = update_partitions(
        dataset_builder, partition_list, remove_partitions
    )
    dataset_builder = update_indices(
        dataset_builder, store, partition_list, remove_partitions
    )
    if metadata_storage_format.lower() == "json":
        store.put(*dataset_builder.to_json())
    elif metadata_storage_format.lower() == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format
            )
        )
    dataset = dataset_builder.to_dataset()
    return dataset
def store_dataset_from_partitions(
    partition_list,
    store: StoreInput,
    dataset_uuid,
    dataset_metadata=None,
    metadata_merger=None,
    update_dataset=None,
    remove_partitions=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
):
    store = ensure_store(store)

    if update_dataset:
        dataset_builder = DatasetMetadataBuilder.from_dataset(update_dataset)
        metadata_version = dataset_builder.metadata_version
    else:
        mp = next(iter(partition_list), None)
        if mp is None:
            raise ValueError(
                "Cannot store empty datasets, partition_list must not be empty if in store mode."
            )
        metadata_version = mp.metadata_version
        dataset_builder = DatasetMetadataBuilder(
            uuid=dataset_uuid,
            metadata_version=metadata_version,
            partition_keys=mp.partition_keys,
        )

    dataset_builder.explicit_partitions = True

    dataset_builder.table_meta = persist_common_metadata(
        partition_list, update_dataset, store, dataset_uuid
    )

    # We can only check for non-unique partition labels here; if they occur we
    # fail hard. The resulting dataset may be corrupted, or files may be left
    # in the store without dataset metadata.
    partition_labels = partition_labels_from_mps(partition_list)
    non_unique_labels = extract_duplicates(partition_labels)

    if non_unique_labels:
        raise ValueError(
            "The labels {} are duplicated. Dataset metadata was not written.".format(
                ", ".join(non_unique_labels)
            )
        )

    if remove_partitions is None:
        remove_partitions = []

    if metadata_merger is None:
        metadata_merger = combine_metadata

    dataset_builder = update_metadata(
        dataset_builder, metadata_merger, partition_list, dataset_metadata
    )
    dataset_builder = update_partitions(
        dataset_builder, partition_list, remove_partitions
    )
    dataset_builder = update_indices(
        dataset_builder, store, partition_list, remove_partitions
    )
    if metadata_storage_format.lower() == "json":
        store.put(*dataset_builder.to_json())
    elif metadata_storage_format.lower() == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format
            )
        )
    dataset = dataset_builder.to_dataset()
    return dataset