def test_dynamic_partitions_multiple_indices(store):
    """
    Do not specify partitions in metadata, but read them dynamically from store
    """
    suffix = "suffix"
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    partition0_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-0"), ("product", "P-0")],
        "{}.parquet".format(suffix),
    )
    partition1_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-1"), ("product", "P-0")],
        "{}.parquet".format(suffix),
    )
    metadata = {"dataset_metadata_version": 4, "dataset_uuid": dataset_uuid}
    expected_partitions = {
        "location=L-0/product=P-0/{}".format(suffix): {
            "files": {"core": partition0_core}
        },
        "location=L-1/product=P-0/{}".format(suffix): {
            "files": {"core": partition1_core}
        },
    }
    expected_indices = {
        "location": {
            "L-0": ["location=L-0/product=P-0/{}".format(suffix)],
            "L-1": ["location=L-1/product=P-0/{}".format(suffix)],
        },
        "product": {
            "P-0": [
                "location=L-0/product=P-0/{}".format(suffix),
                "location=L-1/product=P-0/{}".format(suffix),
            ]
        },
    }
    store.put(partition0_core, b"test")
    store.put(partition1_core, b"test")
    store_schema_metadata(
        make_meta(pd.DataFrame({"location": ["L-0"], "product": ["P-0"]}), origin="1"),
        dataset_uuid,
        store,
        "core",
    )

    dmd = DatasetMetadata.load_from_dict(metadata, store)
    dmd = dmd.load_partition_indices()
    dmd_dict = dmd.to_dict()

    assert dmd_dict["partitions"] == expected_partitions
    # The order of the labels inside each index list is not guaranteed at
    # runtime; that is fine for the library but makes the test flaky, so sort
    # before comparing.
    sorted_result = {
        column: {label: sorted(x) for label, x in index.items()}
        for column, index in dmd_dict["indices"].items()
    }
    assert sorted_result == expected_indices
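
# A minimal sketch (illustrative only, not the library implementation) of the
# index derivation that ``load_partition_indices`` performs above: partition
# labels have the layout "column=value/.../filename", so the secondary indices
# can be rebuilt by splitting each label. The helper name is hypothetical.
def _indices_from_labels(labels):
    indices = {}
    for label in labels:
        # every path segment except the trailing filename is a "column=value" pair
        for segment in label.split("/")[:-1]:
            column, value = segment.split("=", 1)
            indices.setdefault(column, {}).setdefault(value, []).append(label)
    return indices
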
def test_dynamic_partitions_quote(store, metadata_version):
    """
    Do not specify partitions in metadata, but read them dynamically from store
    """
    dataset_uuid = "uuid-namespace-attribute12_underscored"
    partition0_core = create_partition_key(
        dataset_uuid, "core", [("location", "München")], "data.parquet"
    )
    partition1_core = create_partition_key(
        dataset_uuid, "core", [("location", "å\\ øß")], "data.parquet"
    )
    metadata = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": dataset_uuid,
    }
    expected_partitions = {
        "location=M%C3%BCnchen/data": {"files": {"core": partition0_core}},
        "location=%C3%A5%5C%20%C3%B8%C3%9F/data": {"files": {"core": partition1_core}},
    }
    expected_indices = {
        "location": {
            "München": ["location=M%C3%BCnchen/data"],
            "å\\ øß": ["location=%C3%A5%5C%20%C3%B8%C3%9F/data"],
        }
    }
    store.put(partition0_core, b"test")
    store.put(partition1_core, b"test")
    store_schema_metadata(
        make_meta(pd.DataFrame({"location": ["L-0"]}), origin="1"),
        dataset_uuid,
        store,
        "core",
    )

    dmd = DatasetMetadata.load_from_dict(metadata, store)
    dmd = dmd.load_partition_indices()
    dmd_dict = dmd.to_dict()

    assert dmd_dict["partitions"] == expected_partitions
    assert dmd_dict["indices"] == expected_indices
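
# Sanity sketch for the quoted labels above (illustrative, not collected by
# pytest): the expected labels are plain percent-encoding of the UTF-8 bytes of
# each value, reproducible with urllib.parse.quote. Whether the library calls
# exactly this function is an assumption; the byte-level encoding matches.
def _quote_examples():
    from urllib.parse import quote

    assert quote("München", safe="") == "M%C3%BCnchen"
    assert quote("å\\ øß", safe="") == "%C3%A5%5C%20%C3%B8%C3%9F"
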
def test_dynamic_partitions(store):
    """
    Do not specify partitions in metadata, but read them dynamically from store
    """
    partition_suffix = "suffix"
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    partition0_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-0")],
        "{}.parquet".format(partition_suffix),
    )
    partition1_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-1")],
        "{}.parquet".format(partition_suffix),
    )
    partition0_ext = create_partition_key(
        dataset_uuid,
        "extension",
        [("location", "L-0")],
        "{}.parquet".format(partition_suffix),
    )
    partition1_ext = create_partition_key(
        dataset_uuid,
        "extension",
        [("location", "L-1")],
        "{}.parquet".format(partition_suffix),
    )
    metadata = {"dataset_metadata_version": 4, "dataset_uuid": dataset_uuid}
    expected_partitions = {
        "location=L-0/{}".format(partition_suffix): {
            "files": {"core": partition0_core, "extension": partition0_ext}
        },
        "location=L-1/{}".format(partition_suffix): {
            "files": {"core": partition1_core, "extension": partition1_ext}
        },
    }
    expected_indices = {
        "location": {
            "L-0": ["location=L-0/{}".format(partition_suffix)],
            "L-1": ["location=L-1/{}".format(partition_suffix)],
        }
    }

    # store the dataset metadata and two partitions for each of the two tables
    store.put(
        "{}{}.json".format(dataset_uuid, naming.METADATA_BASE_SUFFIX),
        simplejson.dumps(metadata).encode("utf-8"),
    )
    store.put(partition0_core, b"test")
    store.put(partition1_core, b"test")
    store.put(partition0_ext, b"test")
    store.put(partition1_ext, b"test")
    store_schema_metadata(
        make_meta(
            pd.DataFrame({"location": ["L-0/{}".format(partition_suffix)]}),
            origin="stored",
        ),
        dataset_uuid,
        store,
        "core",
    )

    # instantiate schemas and write the per-table metadata
    core_schema = make_meta(
        pd.DataFrame(
            {
                "column_0": pd.Series([1], dtype=int),
                "column_1": pd.Series([1], dtype=int),
                "location": pd.Series(["str"]),
            }
        ),
        origin="core",
    )
    extension_schema = make_meta(
        pd.DataFrame(
            {
                "column_77": pd.Series([1], dtype=int),
                "column_78": pd.Series([1], dtype=int),
                "location": pd.Series(["str"]),
            }
        ),
        origin="extension",
    )
    store_schema_metadata(core_schema, dataset_uuid, store, "core")
    store_schema_metadata(extension_schema, dataset_uuid, store, "extension")

    # load the metadata so that it picks up the table metadata written above
    dmd = DatasetMetadata.load_from_store(dataset_uuid, store)
    dmd = dmd.load_partition_indices()
    dmd_dict = dmd.to_dict()

    assert dmd_dict["partitions"] == expected_partitions
    assert dmd_dict["indices"] == expected_indices
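
# Sketch of the grouping step the test exercises: storage keys have the shape
# "<dataset_uuid>/<table>/<column=value/...>/<filename>", so files of several
# tables ("core" and "extension" above) that share the same index values
# collapse into a single partition. The helper below is illustrative only and
# assumes the label is the key path minus uuid, table, and file extension.
def _group_keys_by_partition(keys, dataset_uuid):
    partitions = {}
    for key in keys:
        uuid, table, remainder = key.split("/", 2)
        if uuid != dataset_uuid:
            continue
        label = remainder.rsplit(".", 1)[0]  # drop the file extension
        partitions.setdefault(label, {"files": {}})["files"][table] = key
    return partitions
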
def test_create_partition_key():
    key = create_partition_key(
        "my-uuid", "testtable", [("index1", "value1"), ("index2", "value2")]
    )
    assert key == "my-uuid/testtable/index1=value1/index2=value2/data"
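
# The trailing "data" is the default filename; the other tests in this module
# pass an explicit fourth argument instead. A hedged sketch of the expected
# shape with an explicit filename (assumed from the layout asserted above,
# not collected by pytest):
def _create_partition_key_with_filename():
    key = create_partition_key(
        "my-uuid", "testtable", [("index1", "value1")], "part-0.parquet"
    )
    assert key == "my-uuid/testtable/index1=value1/part-0.parquet"
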
def test_dynamic_partitions_with_garbage(store):
    """
    Even if unknown (garbage) files are present, the dataset and its indices
    still load correctly.
    """
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    partition_suffix = "suffix"
    partition0_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-0"), ("product", "P-0")],
        "{}.parquet".format(partition_suffix),
    )
    partition1_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-1"), ("product", "P-0")],
        "{}.parquet".format(partition_suffix),
    )
    metadata = {"dataset_metadata_version": 4, "dataset_uuid": dataset_uuid}
    expected_partitions = {
        "location=L-0/product=P-0/{}".format(partition_suffix): {
            "files": {"core": partition0_core}
        },
        "location=L-1/product=P-0/{}".format(partition_suffix): {
            "files": {"core": partition1_core}
        },
    }
    expected_indices = {
        "location": {
            "L-0": ["location=L-0/product=P-0/{}".format(partition_suffix)],
            "L-1": ["location=L-1/product=P-0/{}".format(partition_suffix)],
        },
        "product": {
            "P-0": [
                "location=L-0/product=P-0/{}".format(partition_suffix),
                "location=L-1/product=P-0/{}".format(partition_suffix),
            ]
        },
    }
    store.put(partition0_core, b"test")
    store.put(partition1_core, b"test")
    store_schema_metadata(
        make_meta(pd.DataFrame({"location": ["L-0"], "product": ["P-0"]}), origin="1"),
        dataset_uuid,
        store,
        "core",
    )

    # the following files are garbage and must not interfere with the indices
    # and/or partitions
    for suffix in ["", ".json", ".msgpack", ".my_own_file_format"]:
        store.put("this_should_not_exist{}".format(suffix), b"ignore me")
        store.put(
            "{}/this_should_not_exist{}".format(dataset_uuid, suffix), b"ignore me"
        )
        store.put(
            "{}/{}/this_should_not_exist{}".format(dataset_uuid, "core", suffix),
            b"ignore me",
        )
        store.put(
            "{}/{}/location=L-0/this_should_not_exist{}".format(
                dataset_uuid, "core", suffix
            ),
            b"ignore me",
        )

    dmd = DatasetMetadata.load_from_dict(metadata, store)
    dmd = dmd.load_partition_indices()
    dmd_dict = dmd.to_dict()

    assert dmd_dict["partitions"] == expected_partitions
    # The order of the labels inside each index list is not guaranteed at
    # runtime; that is fine for the library but makes the test flaky, so sort
    # before comparing.
    sorted_result = {
        column: {label: sorted(x) for label, x in index.items()}
        for column, index in dmd_dict["indices"].items()
    }
    assert sorted_result == expected_indices
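
# A sketch of a filtering rule that is consistent with the garbage keys above
# (an assumption, not necessarily the library's exact rule): only ".parquet"
# files whose path segments between the table name and the filename all have
# the "column=value" form are treated as partition files.
def _looks_like_partition_file(key, dataset_uuid):
    segments = key.split("/")
    return (
        len(segments) >= 4
        and segments[0] == dataset_uuid
        and all("=" in segment for segment in segments[2:-1])
        and segments[-1].endswith(".parquet")
    )
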