def test_read_table_meta(store):
    """Schemas stored per table are all loaded into ``table_meta``.

    Two tables are registered for one partition; after ``load_from_dict``
    the dataset must expose both schemas keyed by table name.
    """
    meta_dct = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "dataset_uuid",
        "partitions": {
            "location_id=1/part_1": {
                "files": {
                    "table1": "dataset_uuid/table1/location_id=1/part_1.parquet",
                    "table2": "dataset_uuid/table2/location_id=1/part_1.parquet",
                }
            }
        },
    }
    frame_one = pd.DataFrame(
        {"location_id": pd.Series([1], dtype=int), "x": pd.Series([True], dtype=bool)}
    )
    frame_two = pd.DataFrame(
        {"location_id": pd.Series([1], dtype=int), "y": pd.Series([1.0], dtype=float)}
    )
    schema_one = make_meta(frame_one, origin="1")
    schema_two = make_meta(frame_two, origin="2")
    # Persist both table schemas so load_from_dict can pick them up.
    store_schema_metadata(schema_one, "dataset_uuid", store, "table1")
    store_schema_metadata(schema_two, "dataset_uuid", store, "table2")

    dmd = DatasetMetadata.load_from_dict(meta_dct, store)

    assert dmd.table_meta == {"table1": schema_one, "table2": schema_two}
def test_dynamic_partitions_multiple_indices(store):
    """
    Do not specify partitions in metadata, but read them dynamically from store
    """
    suffix = "suffix"
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    partition0_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-0"), ("product", "P-0")],
        "{}.parquet".format(suffix),
    )
    partition1_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-1"), ("product", "P-0")],
        "{}.parquet".format(suffix),
    )
    metadata = {"dataset_metadata_version": 4, "dataset_uuid": dataset_uuid}
    # Name the two partition labels once instead of re-formatting them in
    # every expected structure below.
    label0 = "location=L-0/product=P-0/{}".format(suffix)
    label1 = "location=L-1/product=P-0/{}".format(suffix)
    expected_partitions = {
        label0: {"files": {"core": partition0_core}},
        label1: {"files": {"core": partition1_core}},
    }
    expected_indices = {
        "location": {"L-0": [label0], "L-1": [label1]},
        "product": {"P-0": [label0, label1]},
    }
    store.put(partition0_core, b"test")
    store.put(partition1_core, b"test")
    store_schema_metadata(
        make_meta(pd.DataFrame({"location": ["L-0"], "product": ["P-0"]}), origin="1"),
        dataset_uuid,
        store,
        "core",
    )

    dmd = DatasetMetadata.load_from_dict(metadata, store)
    dmd = dmd.load_partition_indices()
    dmd_dict = dmd.to_dict()

    assert dmd_dict["partitions"] == expected_partitions
    # Sorting may differ in the index list. This is ok for runtime
    # but does produce flaky tests thus sort them.
    sorted_result = {
        column: {label: sorted(x) for label, x in index.items()}
        for column, index in dmd_dict["indices"].items()
    }
    assert sorted_result == expected_indices
def test_load_partition_indices_no_files(store):
    """A partition with an empty ``files`` mapping yields no indices."""
    meta_dct = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "dataset_uuid",
        "partitions": {"p1": {"files": {}}},
    }
    dataset = DatasetMetadata.load_from_dict(meta_dct, store)
    dataset = dataset.load_partition_indices()
    # No files means there is nothing to build partition indices from.
    assert not dataset.indices
def test_dynamic_partitions_quote(store, metadata_version):
    """
    Do not specify partitions in metadata, but read them dynamically from store
    """
    dataset_uuid = "uuid-namespace-attribute12_underscored"
    # Partition values contain non-ASCII and escaped characters, so the
    # storage keys are percent-encoded while the index labels stay raw.
    partition0_core = create_partition_key(
        dataset_uuid, "core", [("location", "München")], "data.parquet"
    )
    partition1_core = create_partition_key(
        dataset_uuid, "core", [("location", "å\\ øß")], "data.parquet"
    )
    metadata = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": dataset_uuid,
    }
    expected_partitions = {
        "location=M%C3%BCnchen/data": {"files": {"core": partition0_core}},
        "location=%C3%A5%5C%20%C3%B8%C3%9F/data": {"files": {"core": partition1_core}},
    }
    expected_indices = {
        "location": {
            "München": ["location=M%C3%BCnchen/data"],
            "å\\ øß": ["location=%C3%A5%5C%20%C3%B8%C3%9F/data"],
        }
    }
    store.put(partition0_core, b"test")
    store.put(partition1_core, b"test")
    store_schema_metadata(
        make_meta(pd.DataFrame({"location": ["L-0"]}), origin="1"),
        dataset_uuid,
        store,
        "core",
    )

    dmd = DatasetMetadata.load_from_dict(metadata, store)
    dmd = dmd.load_partition_indices()
    dmd_dict = dmd.to_dict()

    assert dmd_dict["partitions"] == expected_partitions
    assert dmd_dict["indices"] == expected_indices
def test_builder_empty_partition_keys(store, metadata_version, frozen_time):
    """A builder with partition keys but no partitions round-trips cleanly.

    Serializes via ``to_json`` and re-loads via ``load_from_dict``; both
    representations must match the expected dict.
    """
    expected = {
        "dataset_uuid": "uuid",
        "dataset_metadata_version": metadata_version,
        "metadata": {"creation_time": TIME_TO_FREEZE_ISO},
        "partition_keys": ["L", "P"],
        "partitions": {},
    }
    # BUG FIX: the builder previously hard-coded metadata_version=4 while
    # ``expected`` uses the ``metadata_version`` fixture — the assertions
    # below would fail for any fixture value other than 4. Pass the fixture
    # value through instead.
    builder = DatasetMetadataBuilder(
        "uuid", metadata_version=metadata_version, partition_keys=["L", "P"]
    )
    key, result = builder.to_json()
    result = simplejson.loads(result)
    assert key == "uuid.by-dataset-metadata.json"
    assert result == expected

    result_from_dict = DatasetMetadata.load_from_dict(result, store).to_dict()
    assert result_from_dict == expected
def test_read_table_meta_single_table(store):
    """The stored schema of a single-table dataset is exposed as ``schema``.

    BUG FIX: this function was previously also named ``test_read_table_meta``,
    duplicating the earlier test of that name in this module. Python binds the
    last definition, so the earlier test was silently shadowed and never
    collected by pytest. Renamed so both tests run.
    """
    meta_dct = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "dataset_uuid",
        "partitions": {
            "location_id=1/part_1": {
                "files": {
                    "table1": "dataset_uuid/table1/location_id=1/part_1.parquet"
                }
            }
        },
    }
    df1 = pd.DataFrame(
        {"location_id": pd.Series([1], dtype=int), "x": pd.Series([True], dtype=bool)}
    )
    schema1 = make_meta(df1, origin="1")
    store_schema_metadata(schema1, "dataset_uuid", store, "table1")

    dmd = DatasetMetadata.load_from_dict(meta_dct, store)

    assert dmd.schema == schema1
def test_dynamic_partitions_with_garbage(store):
    """
    In case there are unknown files, dataset and indices still load correctly
    """
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    partition_suffix = "suffix"
    partition0_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-0"), ("product", "P-0")],
        "{}.parquet".format(partition_suffix),
    )
    partition1_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-1"), ("product", "P-0")],
        "{}.parquet".format(partition_suffix),
    )
    metadata = {"dataset_metadata_version": 4, "dataset_uuid": dataset_uuid}
    expected_partitions = {
        "location=L-0/product=P-0/{}".format(partition_suffix): {
            "files": {"core": partition0_core}
        },
        "location=L-1/product=P-0/{}".format(partition_suffix): {
            "files": {"core": partition1_core}
        },
    }
    expected_indices = {
        "location": {
            "L-0": ["location=L-0/product=P-0/{}".format(partition_suffix)],
            "L-1": ["location=L-1/product=P-0/{}".format(partition_suffix)],
        },
        "product": {
            "P-0": [
                "location=L-0/product=P-0/{}".format(partition_suffix),
                "location=L-1/product=P-0/{}".format(partition_suffix),
            ]
        },
    }
    store.put(partition0_core, b"test")
    store.put(partition1_core, b"test")
    store_schema_metadata(
        make_meta(
            pd.DataFrame({"location": ["L-0"], "product": ["P-0"]}), origin="1"
        ),
        dataset_uuid,
        store,
        "core",
    )
    # the following files are garbage and should not interfere with the
    # indices and/or partitions
    for suffix in ["", ".json", ".msgpack", ".my_own_file_format"]:
        store.put("this_should_not_exist{}".format(suffix), b"ignore me")
        store.put(
            "{}/this_should_not_exist{}".format(dataset_uuid, suffix), b"ignore me"
        )
        store.put(
            "{}/{}/this_should_not_exist{}".format(dataset_uuid, "core", suffix),
            b"ignore me",
        )
        store.put(
            "{}/{}/location=L-0/this_should_not_exist{}".format(
                dataset_uuid, "core", suffix
            ),
            b"ignore me",
        )

    dmd = DatasetMetadata.load_from_dict(metadata, store)
    dmd = dmd.load_partition_indices()
    dmd_dict = dmd.to_dict()
    # BUG FIX: the assert statement was split mid-expression across two
    # physical lines ("... == " / "expected_partitions"), which is a
    # SyntaxError as written; the statement is rejoined here.
    assert dmd_dict["partitions"] == expected_partitions
    # Sorting may differ in the index list. This is ok for runtime
    # but does produce flaky tests thus sort them.
    sorted_result = {
        column: {label: sorted(x) for label, x in index.items()}
        for column, index in dmd_dict["indices"].items()
    }
    assert sorted_result == expected_indices