# Imports assumed for the examples below (`store`, `metadata_version`, and
# `frozen_time` are pytest fixtures; TIME_TO_FREEZE_ISO and the
# create_partition_key helper, which builds storage keys of the form
# "<uuid>/<table>/<col>=<quoted value>/.../<filename>", come from
# kartothek's test suite):
import pandas as pd
import simplejson

from kartothek.core.common_metadata import make_meta, store_schema_metadata
from kartothek.core.dataset import DatasetMetadata, DatasetMetadataBuilder


def test_read_table_meta(store):
    meta_dct = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "dataset_uuid",
        "partitions": {
            "location_id=1/part_1": {
                "files": {
                    "table1":
                    "dataset_uuid/table1/location_id=1/part_1.parquet",
                    "table2":
                    "dataset_uuid/table2/location_id=1/part_1.parquet",
                }
            }
        },
    }
    df1 = pd.DataFrame({
        "location_id": pd.Series([1], dtype=int),
        "x": pd.Series([True], dtype=bool)
    })
    df2 = pd.DataFrame({
        "location_id": pd.Series([1], dtype=int),
        "y": pd.Series([1.0], dtype=float)
    })
    schema1 = make_meta(df1, origin="1")
    schema2 = make_meta(df2, origin="2")
    store_schema_metadata(schema1, "dataset_uuid", store, "table1")
    store_schema_metadata(schema2, "dataset_uuid", store, "table2")

    dmd = DatasetMetadata.load_from_dict(meta_dct, store)

    actual = dmd.table_meta
    expected = {"table1": schema1, "table2": schema2}
    assert actual == expected
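
# Side note (a sketch, not part of the test above): store_schema_metadata
# persists each table's schema in the store, by convention under
# "<dataset_uuid>/<table>/_common_metadata", which is what load_from_dict
# reads back. Assumes storefact's extended-keyspace in-memory store, as
# kartothek's own test fixtures use.
def sketch_schema_storage_location():
    from storefact import get_store_from_url

    sketch_store = get_store_from_url("hmemory://")  # throwaway in-memory store
    schema = make_meta(pd.DataFrame({"location_id": [1]}), origin="sketch")
    store_schema_metadata(schema, "dataset_uuid", sketch_store, "table1")
    # Expected, assuming the convention above:
    # ['dataset_uuid/table1/_common_metadata']
    return sorted(sketch_store.keys())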
Example #2
def test_dynamic_partitions_multiple_indices(store):
    """
    Partitions are not specified in the metadata but are discovered
    dynamically from the store. (See the illustrative key-parsing sketch
    after this test.)
    """
    suffix = "suffix"
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    partition0_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-0"), ("product", "P-0")],
        "{}.parquet".format(suffix),
    )
    partition1_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-1"), ("product", "P-0")],
        "{}.parquet".format(suffix),
    )
    metadata = {"dataset_metadata_version": 4, "dataset_uuid": dataset_uuid}
    expected_partitions = {
        "location=L-0/product=P-0/{}".format(suffix): {
            "files": {"core": partition0_core}
        },
        "location=L-1/product=P-0/{}".format(suffix): {
            "files": {"core": partition1_core}
        },
    }
    expected_indices = {
        "location": {
            "L-0": ["location=L-0/product=P-0/{}".format(suffix)],
            "L-1": ["location=L-1/product=P-0/{}".format(suffix)],
        },
        "product": {
            "P-0": [
                "location=L-0/product=P-0/{}".format(suffix),
                "location=L-1/product=P-0/{}".format(suffix),
            ]
        },
    }

    store.put(partition0_core, b"test")
    store.put(partition1_core, b"test")
    store_schema_metadata(
        make_meta(pd.DataFrame({"location": ["L-0"], "product": ["P-0"]}), origin="1"),
        dataset_uuid,
        store,
        "core",
    )

    dmd = DatasetMetadata.load_from_dict(metadata, store)
    dmd = dmd.load_partition_indices()
    dmd_dict = dmd.to_dict()
    assert dmd_dict["partitions"] == expected_partitions
    # The ordering within each index list may differ between runs. That is
    # fine at runtime but makes the test flaky, so sort before comparing.
    sorted_result = {
        column: {label: sorted(x) for label, x in index.items()}
        for column, index in dmd_dict["indices"].items()
    }
    assert sorted_result == expected_indices
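
# Illustration only (a hypothetical helper, not kartothek's internal
# parser): how a storage key such as the ones created above decomposes
# into the partition label and the secondary index entries asserted in
# expected_indices.
from urllib.parse import unquote


def parse_partition_key_sketch(key):
    # "<uuid>/<table>/<col>=<val>/.../<filename>.parquet"
    dataset_uuid, table, *index_parts, filename = key.split("/")
    indices = [tuple(unquote(v) for v in part.split("=", 1)) for part in index_parts]
    label = "/".join(index_parts + [filename.rsplit(".", 1)[0]])
    return dataset_uuid, table, indices, label


# parse_partition_key_sketch(
#     "uuid+namespace-attribute12_underscored/core/location=L-0/product=P-0/suffix.parquet"
# ) returns:
# ("uuid+namespace-attribute12_underscored", "core",
#  [("location", "L-0"), ("product", "P-0")],
#  "location=L-0/product=P-0/suffix")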
Example #3
def test_load_partition_indices_no_files(store):
    meta_dct = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "dataset_uuid",
        "partitions": {
            "p1": {
                "files": {}
            }
        },
    }
    dmd = DatasetMetadata.load_from_dict(meta_dct, store)
    dmd = dmd.load_partition_indices()
    assert len(dmd.indices) == 0
Example #4
def test_dynamic_partitions_quote(store, metadata_version):
    """
    Partitions are not specified in the metadata but are discovered
    dynamically from the store; non-ASCII and reserved characters in the
    values are percent-encoded in the storage keys. (See the quoting
    sketch after this test.)
    """
    dataset_uuid = "uuid-namespace-attribute12_underscored"
    partition0_core = create_partition_key(dataset_uuid, "core",
                                           [("location", "München")],
                                           "data.parquet")
    partition1_core = create_partition_key(dataset_uuid, "core",
                                           [("location", "å\\ øß")],
                                           "data.parquet")
    metadata = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": dataset_uuid,
    }
    expected_partitions = {
        "location=M%C3%BCnchen/data": {
            "files": {
                "core": partition0_core
            }
        },
        "location=%C3%A5%5C%20%C3%B8%C3%9F/data": {
            "files": {
                "core": partition1_core
            }
        },
    }
    expected_indices = {
        "location": {
            "München": ["location=M%C3%BCnchen/data"],
            "å\\ øß": ["location=%C3%A5%5C%20%C3%B8%C3%9F/data"],
        }
    }

    store.put(partition0_core, b"test")
    store.put(partition1_core, b"test")
    store_schema_metadata(
        make_meta(pd.DataFrame({"location": ["L-0"]}), origin="1"),
        dataset_uuid,
        store,
        "core",
    )

    dmd = DatasetMetadata.load_from_dict(metadata, store)
    dmd = dmd.load_partition_indices()

    dmd_dict = dmd.to_dict()
    assert dmd_dict["partitions"] == expected_partitions
    assert dmd_dict["indices"] == expected_indices
Example #5
def test_builder_empty_partition_keys(store, metadata_version, frozen_time):
    expected = {
        "dataset_uuid": "uuid",
        "dataset_metadata_version": metadata_version,
        "metadata": {
            "creation_time": TIME_TO_FREEZE_ISO
        },
        "partition_keys": ["L", "P"],
        "partitions": {},
    }

    builder = DatasetMetadataBuilder(
        "uuid", metadata_version=metadata_version, partition_keys=["L", "P"]
    )
    key, result = builder.to_json()
    result = simplejson.loads(result)
    assert key == "uuid.by-dataset-metadata.json"
    assert result == expected
    result_from_dict = DatasetMetadata.load_from_dict(result, store).to_dict()
    assert result_from_dict == expected
Example #6
def test_read_table_meta(store):
    meta_dct = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "dataset_uuid",
        "partitions": {
            "location_id=1/part_1": {
                "files": {
                    "table1":
                    "dataset_uuid/table1/location_id=1/part_1.parquet"
                }
            }
        },
    }
    df1 = pd.DataFrame({
        "location_id": pd.Series([1], dtype=int),
        "x": pd.Series([True], dtype=bool)
    })
    schema1 = make_meta(df1, origin="1")
    store_schema_metadata(schema1, "dataset_uuid", store, "table1")

    dmd = DatasetMetadata.load_from_dict(meta_dct, store)

    assert dmd.schema == schema1
Example #7
def test_dynamic_partitions_with_garbage(store):
    """
    Even when unknown (garbage) files are present in the store, the dataset
    and its indices still load correctly. (See the key-shape sketch after
    this test.)
    """
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    partition_suffix = "suffix"
    partition0_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-0"), ("product", "P-0")],
        "{}.parquet".format(partition_suffix),
    )
    partition1_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-1"), ("product", "P-0")],
        "{}.parquet".format(partition_suffix),
    )
    metadata = {"dataset_metadata_version": 4, "dataset_uuid": dataset_uuid}
    expected_partitions = {
        "location=L-0/product=P-0/{}".format(partition_suffix): {
            "files": {
                "core": partition0_core
            }
        },
        "location=L-1/product=P-0/{}".format(partition_suffix): {
            "files": {
                "core": partition1_core
            }
        },
    }
    expected_indices = {
        "location": {
            "L-0": ["location=L-0/product=P-0/{}".format(partition_suffix)],
            "L-1": ["location=L-1/product=P-0/{}".format(partition_suffix)],
        },
        "product": {
            "P-0": [
                "location=L-0/product=P-0/{}".format(partition_suffix),
                "location=L-1/product=P-0/{}".format(partition_suffix),
            ]
        },
    }

    store.put(partition0_core, b"test")
    store.put(partition1_core, b"test")
    store_schema_metadata(
        make_meta(pd.DataFrame({"location": ["L-0"], "product": ["P-0"]}), origin="1"),
        dataset_uuid,
        store,
        "core",
    )

    # The following keys are garbage and must not interfere with the
    # indices or partitions.
    for suffix in ["", ".json", ".msgpack", ".my_own_file_format"]:
        store.put("this_should_not_exist{}".format(suffix), b"ignore me")
        store.put("{}/this_should_not_exist{}".format(dataset_uuid, suffix),
                  b"ignore me")
        store.put(
            "{}/{}/this_should_not_exist{}".format(dataset_uuid, "core",
                                                   suffix),
            b"ignore me",
        )
        store.put(
            "{}/{}/location=L-0/this_should_not_exist{}".format(
                dataset_uuid, "core", suffix),
            b"ignore me",
        )

    dmd = DatasetMetadata.load_from_dict(metadata, store)
    dmd = dmd.load_partition_indices()
    dmd_dict = dmd.to_dict()
    assert dmd_dict["partitions"] == expected_partitions
    # The ordering within each index list may differ between runs. That is
    # fine at runtime but makes the test flaky, so sort before comparing.
    sorted_result = {
        column: {label: sorted(x) for label, x in index.items()}
        for column, index in dmd_dict["indices"].items()
    }
    assert sorted_result == expected_indices
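
# Illustration only (a hypothetical predicate, not kartothek's actual
# implementation): the garbage keys written above fail a simple shape
# check of the form "<uuid>/<table>/<col>=<val>/.../<file>.parquet",
# which is why they cannot surface as partitions or index entries.
def looks_like_partition_key_sketch(key, dataset_uuid, table):
    parts = key.split("/")
    return (
        len(parts) >= 4
        and parts[0] == dataset_uuid
        and parts[1] == table
        and all("=" in part for part in parts[2:-1])
        and parts[-1].endswith(".parquet")
    )


_uuid = "uuid+namespace-attribute12_underscored"
assert looks_like_partition_key_sketch(
    _uuid + "/core/location=L-0/product=P-0/suffix.parquet", _uuid, "core"
)
assert not looks_like_partition_key_sketch("this_should_not_exist.json", _uuid, "core")
assert not looks_like_partition_key_sketch(
    _uuid + "/core/location=L-0/this_should_not_exist", _uuid, "core"
)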