# Example #1
def test_dataset_get_indices_as_dataframe_duplicates():
    """Index values covering multiple partitions expand into one row per
    (partition, value) combination in the index dataframe."""
    dataset = DatasetMetadata(
        "some_uuid",
        indices={
            "l_external_code": ExplicitSecondaryIndex(
                "l_external_code",
                {"1": ["part1", "part2"], "2": ["part1", "part2"]},
            ),
            "p_external_code": ExplicitSecondaryIndex(
                "p_external_code",
                {"1": ["part1"], "2": ["part2"]},
            ),
        },
    )
    # Each partition appears once per matching (p, l) value pair.
    expected = pd.DataFrame(
        OrderedDict(
            [
                ("p_external_code", ["1", "1", "2", "2"]),
                ("l_external_code", ["1", "2", "1", "2"]),
            ]
        ),
        index=pd.Index(["part1", "part1", "part2", "part2"], name="partition"),
    )
    pdt.assert_frame_equal(dataset.get_indices_as_dataframe(), expected)
# Example #2
def test_create_dataset_header(store, metadata_storage_format, frozen_time):
    """An empty dataset header can be created, stored, and loaded back,
    and the table schema is persisted alongside it."""
    table_meta = {"table": make_meta(pd.DataFrame({"col": [1]}), origin="1")}
    created = create_empty_dataset_header(
        store=store,
        table_meta=table_meta,
        dataset_uuid="new_dataset_uuid",
        metadata_storage_format=metadata_storage_format,
        metadata_version=4,
    )

    expected = DatasetMetadata(
        uuid="new_dataset_uuid",
        metadata_version=4,
        explicit_partitions=False,
        table_meta=table_meta,
    )
    assert created == expected

    # Exactly two objects must have been written to the store.
    assert len(list(store.keys())) == 2

    reloaded = DatasetMetadata.load_from_store(store=store, uuid="new_dataset_uuid")
    assert reloaded == expected

    # If the read succeeds, the schema is written
    read_schema_metadata(dataset_uuid=created.uuid, store=store, table="table")
# Example #3
def test_copy(frozen_time):
    """copy() yields an equal but distinct object; keyword overrides
    replace the corresponding attribute on the copy."""
    original = DatasetMetadata(
        uuid="uuid",
        partitions={"partition_label": {"files": {}}},
        metadata={"some": "metadata"},
        indices={
            "column": ExplicitSecondaryIndex(
                column="column", index_dct={1: ["partition_label"]}
            )
        },
        explicit_partitions=True,
        partition_keys=["P", "L"],
    )

    clone = original.copy()
    # Equal in content ...
    assert clone == original
    # ... but a different instance.
    assert clone is not original

    clone = original.copy(metadata={"new": "metadata"})
    assert clone is not original
    assert clone.metadata == {
        "new": "metadata",
        # The DatasetMetadata constructor ensures that the creation time
        # is always present.
        "creation_time": "2000-01-01T01:01:01.000001",
    }
# Example #4
def test_roundtrip_empty_with_store(store, metadata_version):
    """Serializing an empty dataset into the store and loading it back
    produces an equal dataset."""
    dataset_uuid = "dataset_uuid"
    dataset = DatasetMetadata(uuid=dataset_uuid, metadata_version=metadata_version)
    key = "{}.by-dataset-metadata.json".format(dataset_uuid)
    payload = simplejson.dumps(dataset.to_dict()).encode("utf-8")
    store.put(key, payload)
    assert dataset == DatasetMetadata.load_from_store(dataset_uuid, store)
# Example #5
def test_roundtrip_empty(metadata_version):
    """A to_dict/from_dict round-trip preserves an empty dataset."""
    dataset = DatasetMetadata(uuid="dataset_uuid", metadata_version=metadata_version)
    round_tripped = dataset.from_dict(dataset.to_dict())
    assert dataset == round_tripped