def test_create_dataset_header(store, metadata_storage_format, frozen_time):
    table_meta = {"table": make_meta(pd.DataFrame({"col": [1]}), origin="1")}

    new_dataset = create_empty_dataset_header(
        store=store,
        table_meta=table_meta,
        dataset_uuid="new_dataset_uuid",
        metadata_storage_format=metadata_storage_format,
        metadata_version=4,
    )

    expected_dataset = DatasetMetadata(
        uuid="new_dataset_uuid",
        metadata_version=4,
        explicit_partitions=False,
        table_meta=table_meta,
    )
    assert new_dataset == expected_dataset

    storage_keys = list(store.keys())
    assert len(storage_keys) == 2

    loaded = DatasetMetadata.load_from_store(store=store, uuid="new_dataset_uuid")
    assert loaded == expected_dataset

    # If the read succeeds, the schema is written
    read_schema_metadata(dataset_uuid=new_dataset.uuid, store=store, table="table")
def validate_schema_compatible(
    self, store: StoreInput, dataset_uuid: str
) -> "MetaPartition":
    """
    Validates that the currently held DataFrames match the schema of the existing dataset.

    Parameters
    ----------
    store
        If it is a function, the result of calling it must be a KeyValueStore.
    dataset_uuid
        The dataset UUID the partition will be assigned to.
    """
    # Load the reference schema of the existing dataset. Using the built-in
    # `load_all_table_meta` would not help here: it would be a no-op since we
    # have already derived the schema from the input DataFrame.
    store = ensure_store(store)
    reference_meta = read_schema_metadata(
        dataset_uuid=dataset_uuid, store=store, table=self.table_name
    )
    try:
        validate_compatible([self.schema, reference_meta])
    except ValueError as e:
        raise ValueError(
            f"Schemas for dataset '{dataset_uuid}' are not compatible!\n\n{e}"
        )

    return self
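A minimal sketch of the check this method performs, with the store read stripped out. The import path is an assumption; the snippets here only show the call signatures of make_meta and validate_compatible.

import pandas as pd

# Assumed import location; adjust to wherever make_meta/validate_compatible live.
from kartothek.core.common_metadata import make_meta, validate_compatible

# Schemas derived from two int64 frames are compatible: validation passes.
schema_a = make_meta(pd.DataFrame({"col": [1]}), origin="existing")
schema_b = make_meta(pd.DataFrame({"col": [2]}), origin="new")
validate_compatible([schema_a, schema_b])

# A conflicting type for the same column (int64 vs. string) raises ValueError,
# which validate_schema_compatible re-raises with the dataset UUID in the message.
schema_c = make_meta(pd.DataFrame({"col": ["text"]}), origin="conflicting")
try:
    validate_compatible([schema_a, schema_c])
except ValueError as exc:
    print(exc)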
def persist_common_metadata(
    schemas: Iterable[SchemaWrapper],
    update_dataset: Optional[DatasetFactory],
    store: KeyValueStore,
    dataset_uuid: str,
    table_name: str,
):
    if not schemas:
        return None
    schemas_set = set(schemas)
    del schemas

    if update_dataset:
        schemas_set.add(
            read_schema_metadata(
                dataset_uuid=dataset_uuid, store=store, table=table_name
            )
        )

    schemas_sorted = sorted(schemas_set, key=lambda s: sorted(s.origin))

    try:
        result = validate_compatible(schemas_sorted)
    except ValueError as e:
        raise ValueError(
            "Schemas for dataset '{dataset_uuid}' are not compatible!\n\n{e}".format(
                dataset_uuid=dataset_uuid, e=e
            )
        )

    if result:
        store_schema_metadata(
            schema=result, dataset_uuid=dataset_uuid, store=store, table=table_name
        )
    return result
def test_schema_roundtrip(df_all_types, store):
    expected_meta = make_meta(df_all_types, origin="df_all_types")
    store_schema_metadata(
        expected_meta, dataset_uuid="dataset_uuid", store=store, table="table"
    )
    result = read_schema_metadata(
        dataset_uuid="dataset_uuid", store=store, table="table"
    )
    assert result == expected_meta
def load_schema(self, store: StoreInput, dataset_uuid: str) -> "MetaPartition":
    """
    Loads the table schema from the store and caches it under the `schema` attribute.
    """
    if self.schema is None:
        store = ensure_store(store)
        self.schema = read_schema_metadata(
            dataset_uuid=dataset_uuid, store=store, table=self.table_name
        )
    return self
def persist_common_metadata(partition_list, update_dataset, store, dataset_uuid):
    # hash the schemas for quick equality check with possible false negatives
    # (e.g. other pandas version or null schemas)
    tm_dct = defaultdict(set)
    for mp in partition_list:
        for tab, tm in mp.table_meta.items():
            tm_dct[tab].add(tm)

    if update_dataset:
        if set(tm_dct.keys()) and set(update_dataset.tables) != set(tm_dct.keys()):
            raise ValueError(
                (
                    "Input partitions for update have different tables than dataset:\n"
                    "Input partition tables: {}\n"
                    "Tables of existing dataset: {}"
                ).format(set(tm_dct.keys()), update_dataset.tables)
            )
        for table in update_dataset.tables:
            tm_dct[table].add(
                read_schema_metadata(
                    dataset_uuid=dataset_uuid, store=store, table=table
                )
            )

    result = {}

    # sort tables and schemas to have reproducible error messages
    for table in sorted(tm_dct.keys()):
        schemas = sorted(tm_dct[table], key=lambda s: sorted(s.origin))
        try:
            result[table] = validate_compatible(schemas)
        except ValueError as e:
            raise ValueError(
                "Schemas for table '{table}' of dataset '{dataset_uuid}' are not compatible!\n\n{e}".format(
                    table=table, dataset_uuid=dataset_uuid, e=e
                )
            )

    validate_shared_columns(list(result.values()))

    for table, schema in result.items():
        store_schema_metadata(
            schema=schema, dataset_uuid=dataset_uuid, store=store, table=table
        )
    return result
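A hedged sketch of the per-table validation loop above with the store interaction removed, assuming the same make_meta/validate_compatible helpers (and import path) as in the other snippets.

import pandas as pd

from kartothek.core.common_metadata import make_meta, validate_compatible  # assumed path

# Two partitions contribute schemas for "core"; one contributes "extension".
tm_dct = {
    "core": {
        make_meta(pd.DataFrame({"col": [1]}), origin="partition_1"),
        make_meta(pd.DataFrame({"col": [2]}), origin="partition_2"),
    },
    "extension": {make_meta(pd.DataFrame({"other": [1.0]}), origin="partition_1")},
}

result = {}
for table in sorted(tm_dct.keys()):
    # Sorting by origin keeps error messages reproducible, as in the function above.
    schemas = sorted(tm_dct[table], key=lambda s: sorted(s.origin))
    result[table] = validate_compatible(schemas)  # raises ValueError on conflicts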
def test_compat_old_rw_path(df_all_types, store):
    # strip down the DF since some column types weren't supported before anyway
    df = df_all_types[
        [
            c
            for c in df_all_types.columns
            if (
                not c.startswith("array_")  # array types (always null)
                and c != "unicode"  # unicode type (always null)
                and "8" not in c  # 8 bit types are cast to 64 bit
                and "16" not in c  # 16 bit types are cast to 64 bit
                and "32" not in c  # 32 bit types are cast to 64 bit
            )
        ]
    ]
    expected_meta = make_meta(df, origin="df")

    # old schema write path
    old_meta = dask_make_meta(df)
    pa_table = pa.Table.from_pandas(old_meta)
    buf = pa.BufferOutputStream()
    pq.write_table(pa_table, buf, version="2.0")
    key_old = _get_common_metadata_key("dataset_uuid_old", "table")
    store.put(key_old, buf.getvalue().to_pybytes())

    actual_meta = read_schema_metadata(
        dataset_uuid="dataset_uuid_old", store=store, table="table"
    )
    validate_compatible([actual_meta, expected_meta])

    store_schema_metadata(
        schema=make_meta(df, origin="df"),
        dataset_uuid="dataset_uuid_new",
        store=store,
        table="table",
    )
    key_new = _get_common_metadata_key("dataset_uuid_new", "table")
    actual_df = ParquetSerializer.restore_dataframe(key=key_new, store=store)
    actual_df["date"] = actual_df["date"].dt.date
    pdt.assert_frame_equal(actual_df, old_meta)
def load_from_dict(
    dct: Dict, store: StoreInput, load_schema: bool = True
) -> "DatasetMetadata":
    """
    Load dataset metadata from a dictionary and resolve any external includes.

    Parameters
    ----------
    dct
    store
        Object that implements the .get method for file/object loading.
    load_schema
        Load table schema
    """
    # Use copy here to get an OrderedDict
    metadata = copy.copy(dct)
    if "metadata" not in metadata:
        metadata["metadata"] = OrderedDict()

    metadata_version = dct[naming.METADATA_VERSION_KEY]
    dataset_uuid = dct[naming.UUID_KEY]
    explicit_partitions = "partitions" in metadata
    storage_keys = None
    if not explicit_partitions:
        storage_keys = DatasetMetadata.storage_keys(dataset_uuid, store)
        partitions = _load_partitions_from_filenames(
            store=store,
            storage_keys=storage_keys,
            metadata_version=metadata_version,
        )
        metadata["partitions"] = partitions

    if metadata["partitions"]:
        tables = [tab for tab in list(metadata["partitions"].values())[0]["files"]]
    else:
        table_set = set()
        if storage_keys is None:
            storage_keys = DatasetMetadata.storage_keys(dataset_uuid, store)
        for key in storage_keys:
            if key.endswith(naming.TABLE_METADATA_FILE):
                table_set.add(key.split("/")[1])
        tables = list(table_set)

    table_meta = {}
    if load_schema:
        for table in tables:
            table_meta[table] = read_schema_metadata(
                dataset_uuid=dataset_uuid, store=store, table=table
            )
    metadata["table_meta"] = table_meta

    if "partition_keys" not in metadata:
        metadata["partition_keys"] = _get_partition_keys_from_partitions(
            metadata["partitions"]
        )

    return DatasetMetadata.from_dict(
        metadata, explicit_partitions=explicit_partitions
    )
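A small, store-free sketch of the table discovery in the else branch above: when no explicit partitions are listed, table names are taken from the second path segment of every key ending with the common-metadata file name. The "_common_metadata" literal stands in for naming.TABLE_METADATA_FILE and is an assumption, as is the example key layout.

# Storage keys as DatasetMetadata.storage_keys might return them (layout assumed).
storage_keys = [
    "my_uuid/core/_common_metadata",
    "my_uuid/core/part_key=1/data.parquet",
    "my_uuid/extension/_common_metadata",
]

# Mirrors the else branch above: keep the table segment of every
# "<uuid>/<table>/<file>" key whose file name is the table metadata file.
tables = sorted(
    {key.split("/")[1] for key in storage_keys if key.endswith("_common_metadata")}
)
assert tables == ["core", "extension"]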