예제 #1
0
def test_invalid_uuid():
    """
    Check that ``DatasetMetadata.from_dict`` rejects the UUIDs ``"uuid."`` and ``"mañana"``.

    Each candidate UUID is placed into an otherwise identical version-4 metadata
    dict, and loading that dict is expected to raise ``ValueError``.
    """
    for rejected_uuid in ("uuid.", "mañana"):
        payload = {
            "dataset_metadata_version": 4,
            "dataset_uuid": rejected_uuid,
            "partitions": {"part_1": {"files": {"core": "file.parquet"}}},
        }
        with pytest.raises(ValueError):
            DatasetMetadata.from_dict(payload)
예제 #2
0
def test_existing_indices_are_added_when_missing_in_cube():
    """
    Indices that already exist in the datasets must be added to the validated cube.

    The cube only declares ``i1`` as an index column, while the "source" dataset
    additionally carries an ``i2`` index. After ``ensure_valid_cube_indices`` the
    cube's ``index_columns`` are expected to be ``{"i1", "i2"}``.
    """
    # NOTE(review): the seed dataset uses FakeExtraTableMetadata here, whereas the
    # sibling tests use FakeSeedTableMetadata for "source" -- confirm this is intended.
    seed_spec = {
        "dataset_uuid": "source",
        "dataset_metadata_version": 4,
        "schema": FakeExtraTableMetadata(),
        "partition_keys": ["p"],
        "indices": {
            column: {"1": ["part_1"]} for column in ("d1", "d2", "i1", "i2")
        },
    }
    seed_dataset = DatasetMetadata.from_dict(seed_spec)

    extra_spec = {
        "dataset_uuid": "extra",
        "dataset_metadata_version": 4,
        "schema": FakeExtraTableMetadata(),
        "partition_keys": ["p"],
        "indices": {"i1": {"1": ["part_1"]}},
    }
    extra_dataset = DatasetMetadata.from_dict(extra_spec)

    cube = Cube(
        dimension_columns=["d1", "d2"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i1"],
    )

    datasets = {"source": seed_dataset, "extra": extra_dataset}
    validated_cube = ensure_valid_cube_indices(datasets, cube)

    assert validated_cube.index_columns == {"i1", "i2"}
예제 #3
0
def test_no_indices_are_suppressed_when_they_already_exist():
    """
    Indices marked as suppressed in the cube must not stay suppressed when the
    dataset already contains them.

    The cube asks to suppress the indices on ``d1`` and ``d2``, but the "source"
    dataset already has both. After ``ensure_valid_cube_indices`` the cube's
    ``suppress_index_on`` is expected to be an empty ``frozenset``.
    """
    seed_spec = {
        "dataset_uuid": "source",
        "dataset_metadata_version": 4,
        "schema": FakeSeedTableMetadata(),
        "partition_keys": ["p"],
        "indices": {column: {"1": ["part_1"]} for column in ("d1", "d2", "i1")},
    }
    seed_dataset = DatasetMetadata.from_dict(seed_spec)

    extra_spec = {
        "dataset_uuid": "extra",
        "dataset_metadata_version": 4,
        "schema": FakeExtraTableMetadata(),
        "partition_keys": ["p"],
        "indices": {"i1": {"1": ["part_1"]}},
    }
    extra_dataset = DatasetMetadata.from_dict(extra_spec)

    cube = Cube(
        dimension_columns=["d1", "d2"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        suppress_index_on=["d1", "d2"],
    )

    datasets = {"source": seed_dataset, "extra": extra_dataset}
    validated_cube = ensure_valid_cube_indices(datasets, cube)

    assert validated_cube.suppress_index_on == frozenset()
예제 #4
0
def test_cube_with_valid_indices_is_not_modified_by_validation():
    """
    A cube whose indices are already valid must come back from
    `ensure_valid_cube_indices` equal to the cube that was passed in.
    """
    seed_spec = {
        "dataset_uuid": "source",
        "dataset_metadata_version": 4,
        "schema": FakeSeedTableMetadata(),
        "partition_keys": ["p"],
        "indices": {column: {"1": ["part_1"]} for column in ("d1", "d2", "i1")},
    }
    seed_dataset = DatasetMetadata.from_dict(seed_spec)

    extra_spec = {
        "dataset_uuid": "extra",
        "dataset_metadata_version": 4,
        "schema": FakeExtraTableMetadata(),
        "partition_keys": ["p"],
        "indices": {"i1": {"1": ["part_1"]}},
    }
    extra_dataset = DatasetMetadata.from_dict(extra_spec)

    cube = Cube(
        dimension_columns=["d1", "d2"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i1"],
    )

    datasets = {"source": seed_dataset, "extra": extra_dataset}
    validated_cube = ensure_valid_cube_indices(datasets, cube)

    assert validated_cube == cube
예제 #5
0
def test_raises_when_cube_defines_index_not_in_dataset():
    """
    A `ValueError` must be raised when the cube declares an index column that none
    of the datasets provides.

    The cube asks for an index on ``i2``, while the datasets only carry indices on
    ``d1``, ``d2`` and ``i1``.
    """
    seed_spec = {
        "dataset_uuid": "source",
        "dataset_metadata_version": 4,
        "schema": FakeSeedTableMetadata(),
        "partition_keys": ["p"],
        "indices": {column: {"1": ["part_1"]} for column in ("d1", "d2", "i1")},
    }
    seed_dataset = DatasetMetadata.from_dict(seed_spec)

    extra_spec = {
        "dataset_uuid": "extra",
        "dataset_metadata_version": 4,
        "schema": FakeExtraTableMetadata(),
        "partition_keys": ["p"],
        "indices": {"i1": {"1": ["part_1"]}},
    }
    extra_dataset = DatasetMetadata.from_dict(extra_spec)

    cube = Cube(
        dimension_columns=["d1", "d2"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i2"],
    )

    datasets = {"source": seed_dataset, "extra": extra_dataset}
    with pytest.raises(ValueError):
        ensure_valid_cube_indices(datasets, cube)
예제 #6
0
def test_complicated_uuid():
    """
    Check that a UUID mixing ``+``, ``-``, digits and ``_`` is accepted.

    The test passes as long as ``DatasetMetadata.from_dict`` does not raise.
    """
    single_partition = {"part_1": {"files": {"core": "file.parquet"}}}
    payload = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "uuid+namespace-attribute12_underscored",
        "partitions": single_partition,
    }
    DatasetMetadata.from_dict(payload)
예제 #7
0
def test_load_all_indices(store, metadata_version):
    """
    Check which indices are present after ``load_all_indices``.

    The dataset is built with an embedded ``product_id`` index and is given a
    schema containing a ``location_id`` column. After loading, exactly two
    indices are expected: ``product_id`` as an ``ExplicitSecondaryIndex`` and
    ``location_id`` as a ``PartitionIndex``.
    """
    dataset_spec = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": "uuid+namespace-attribute12_underscored",
        "partitions": {
            "location_id=1/part_1": {
                "files": {
                    "core_data": "dataset_uuid/table/location_id=1/part_1.parquet"
                }
            }
        },
        "indices": {
            "product_id": {value: ["part_1"] for value in ("1", "2", "100", "34")}
        },
    }
    dataset = DatasetMetadata.from_dict(dataset_spec)

    location_frame = pd.DataFrame({"location_id": pd.Series([1], dtype=int)})
    dataset.schema = make_meta(location_frame, origin="core")

    dataset = dataset.load_all_indices(store)

    expected_index_types = {
        "product_id": ExplicitSecondaryIndex,
        "location_id": PartitionIndex,
    }
    for column, index_type in expected_index_types.items():
        assert column in dataset.indices
        assert isinstance(dataset.indices[column], index_type)

    assert len(dataset.indices) == 2
예제 #8
0
def test_load_indices_embedded(metadata_version):
    """
    Check ``load_index`` on a dataset whose index is embedded in the metadata.

    With ``store=None``, asking for an unknown column is expected to raise
    ``KeyError``, while asking for the embedded ``product_id`` column returns a
    dataset that still lists that index.
    """
    embedded_index = {value: ["part_1"] for value in ("1", "2", "100", "34")}
    dataset_spec = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": "uuid+namespace-attribute12_underscored",
        "partitions": {"part_1": {"files": {"core_data": "file.parquest"}}},
        "indices": {"product_id": embedded_index},
    }
    dataset = DatasetMetadata.from_dict(dataset_spec)
    assert "product_id" in dataset.indices

    with pytest.raises(KeyError):
        dataset.load_index("not there", store=None)

    reloaded = dataset.load_index("product_id", store=None)
    assert "product_id" in reloaded.indices
예제 #9
0
def test_raise_multitable(metadata_version):
    """
    Check that metadata whose partition lists two tables cannot be loaded.

    ``DatasetMetadata.from_dict`` is expected to raise a ``RuntimeError`` whose
    message matches the multi-table pattern below.
    """
    two_table_files = {"tableA": "file.parquet", "tableB": "file.parquet"}
    dataset_spec = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": "uuid",
        "metadata": {},
        "partitions": {"part_1": {"files": two_table_files}},
        "partition_keys": [],
    }
    message_pattern = r"Dataset uuid has tables.*but read support for multi tabled dataset was dropped with kartothek 4\.0\."

    with pytest.raises(RuntimeError, match=message_pattern):
        DatasetMetadata.from_dict(dataset_spec)
예제 #10
0
def test_roundtrip_no_metadata(metadata_version, frozen_time):
    """
    Check that ``from_dict`` followed by ``to_dict`` reproduces the input dict.

    The only metadata entry supplied is ``creation_time``. ``frozen_time`` is a
    fixture defined outside this test; it is requested but not used directly
    (presumably it pins the clock -- confirm against the fixture definition).
    """
    original = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": "uuid",
        "metadata": {"creation_time": "2000-01-01 01:01:01"},
        "partition_keys": [],
        "partitions": {"part_1": {"files": {"core": "file.parquet"}}},
    }
    dataset = DatasetMetadata.from_dict(original)
    roundtripped = dataset.to_dict()
    assert original == roundtripped
예제 #11
0
def test_builder_to_dataset(metadata_version, frozen_time):
    """
    Check that ``DatasetMetadataBuilder.to_dataset`` equals the dataset loaded
    from the equivalent dict.

    The builder receives one partition, one metadata entry and one embedded
    index; the reference dict spells out the same content plus a
    ``creation_time`` of ``TIME_TO_FREEZE_ISO``.
    """
    core_file = "uuid/core/part_2.parquet"

    builder = DatasetMetadataBuilder("uuid", metadata_version=metadata_version)
    builder.add_partition("part_2", Partition("part_2", {"core": core_file}))
    builder.add_metadata("key", "value")
    col1_index = ExplicitSecondaryIndex("col1", {"a": ["part1"], "b": ["part2"]})
    builder.add_embedded_index("col1", col1_index)
    built = builder.to_dataset()

    reference_spec = {
        "dataset_uuid": "uuid",
        "dataset_metadata_version": metadata_version,
        "partitions": {"part_2": {"files": {"core": "uuid/core/part_2.parquet"}}},
        "metadata": {"key": "value", "creation_time": TIME_TO_FREEZE_ISO},
        "indices": {"col1": {"a": ["part1"], "b": ["part2"]}},
    }
    reference = DatasetMetadata.from_dict(reference_spec)

    assert built == reference
예제 #12
0
def test_roundtrip_empty(metadata_version):
    """
    Check that a dataset built from only a UUID and a metadata version equals
    itself after a ``to_dict`` / ``from_dict`` round trip.
    """
    empty_dataset = DatasetMetadata(
        uuid="dataset_uuid",
        metadata_version=metadata_version,
    )
    serialized = empty_dataset.to_dict()
    restored = empty_dataset.from_dict(serialized)
    assert empty_dataset == restored