def test_dataset_get_indices_as_dataframe_duplicates():
    """Indices sharing partitions must be expanded into one row per
    (partition, index-value) combination, indexed by partition label."""
    dataset = DatasetMetadata(
        "some_uuid",
        indices={
            "l_external_code": ExplicitSecondaryIndex(
                "l_external_code",
                {"1": ["part1", "part2"], "2": ["part1", "part2"]},
            ),
            "p_external_code": ExplicitSecondaryIndex(
                "p_external_code",
                {"1": ["part1"], "2": ["part2"]},
            ),
        },
    )

    # Column order matters for the frame comparison, hence OrderedDict.
    columns = OrderedDict()
    columns["p_external_code"] = ["1", "1", "2", "2"]
    columns["l_external_code"] = ["1", "2", "1", "2"]
    expected_frame = pd.DataFrame(
        columns,
        index=pd.Index(["part1", "part1", "part2", "part2"], name="partition"),
    )

    actual_frame = dataset.get_indices_as_dataframe()
    pdt.assert_frame_equal(actual_frame, expected_frame)
def test_create_dataset_header(store, metadata_storage_format, frozen_time):
    """An empty dataset header written to the store must round-trip and
    also persist the table schema alongside the metadata file."""
    schema_by_table = {"table": make_meta(pd.DataFrame({"col": [1]}), origin="1")}

    created = create_empty_dataset_header(
        store=store,
        table_meta=schema_by_table,
        dataset_uuid="new_dataset_uuid",
        metadata_storage_format=metadata_storage_format,
        metadata_version=4,
    )

    reference = DatasetMetadata(
        uuid="new_dataset_uuid",
        metadata_version=4,
        explicit_partitions=False,
        table_meta=schema_by_table,
    )
    assert created == reference

    # Exactly two keys are expected: the metadata file and the schema file.
    assert len(list(store.keys())) == 2

    reloaded = DatasetMetadata.load_from_store(store=store, uuid="new_dataset_uuid")
    assert reloaded == reference

    # If the read succeeds, the schema is written
    read_schema_metadata(dataset_uuid=created.uuid, store=store, table="table")
def test_copy(frozen_time):
    """``copy()`` must yield an equal but distinct object; keyword
    overrides must replace the corresponding attribute wholesale."""
    original = DatasetMetadata(
        uuid="uuid",
        partitions={"partition_label": {"files": {}}},
        metadata={"some": "metadata"},
        indices={
            "column": ExplicitSecondaryIndex(
                column="column", index_dct={1: ["partition_label"]}
            )
        },
        explicit_partitions=True,
        partition_keys=["P", "L"],
    )

    duplicate = original.copy()
    # Equal in value ...
    assert duplicate == original
    # ... yet a separate object.
    assert duplicate is not original

    overridden = original.copy(metadata={"new": "metadata"})
    assert overridden is not original
    assert overridden.metadata == {
        "new": "metadata",
        # The DatasetMetadata constructor ensure that the creation time is
        # always present.
        "creation_time": "2000-01-01T01:01:01.000001",
    }
def test_roundtrip_empty_with_store(store, metadata_version):
    """A dataset serialized to JSON and placed in the store by hand must be
    loadable via ``load_from_store`` and compare equal to the original."""
    uuid = "dataset_uuid"
    original = DatasetMetadata(uuid=uuid, metadata_version=metadata_version)

    key = "{}.by-dataset-metadata.json".format(uuid)
    payload = simplejson.dumps(original.to_dict()).encode("utf-8")
    store.put(key, payload)

    restored = DatasetMetadata.load_from_store(uuid, store)
    assert original == restored
def test_roundtrip_empty(metadata_version):
    """``to_dict``/``from_dict`` must round-trip an otherwise empty dataset."""
    original = DatasetMetadata(uuid="dataset_uuid", metadata_version=metadata_version)
    serialized = original.to_dict()
    restored = original.from_dict(serialized)
    assert original == restored