def test_invalid_uuid():
    expected = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "uuid.",
        "partitions": {"part_1": {"files": {"core": "file.parquet"}}},
    }
    with pytest.raises(ValueError):
        DatasetMetadata.from_dict(expected)

    expected = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "mañana",
        "partitions": {"part_1": {"files": {"core": "file.parquet"}}},
    }
    with pytest.raises(ValueError):
        DatasetMetadata.from_dict(expected)


def test_dataset_get_indices_as_dataframe_duplicates():
    ds = DatasetMetadata(
        "some_uuid",
        indices={
            "l_external_code": ExplicitSecondaryIndex(
                "l_external_code", {"1": ["part1", "part2"], "2": ["part1", "part2"]}
            ),
            "p_external_code": ExplicitSecondaryIndex(
                "p_external_code", {"1": ["part1"], "2": ["part2"]}
            ),
        },
    )
    expected = pd.DataFrame(
        OrderedDict(
            [
                ("p_external_code", ["1", "1", "2", "2"]),
                ("l_external_code", ["1", "2", "1", "2"]),
            ]
        ),
        index=pd.Index(["part1", "part1", "part2", "part2"], name="partition"),
    )
    result = ds.get_indices_as_dataframe()
    pdt.assert_frame_equal(result, expected)


def test_conditions(driver, function_store, existing_cube):
    parts_source1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("source"), function_store()
        ).partitions
    )
    parts_enrich1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("enrich"), function_store()
        ).partitions
    )
    parts_source_to_delete = {part for part in parts_source1 if "p=0" not in part}

    result = driver(
        cube=existing_cube,
        store=function_store,
        ktk_cube_dataset_ids=["source"],
        conditions=C("p") > 0,
    )

    assert set(result.keys()) == {"source", "enrich"}

    ds_source = result["source"]
    ds_enrich = result["enrich"]

    parts_source2 = set(ds_source.partitions)
    parts_enrich2 = set(ds_enrich.partitions)

    assert parts_enrich1 == parts_enrich2
    assert parts_source1 - parts_source_to_delete == parts_source2


def test_fail_wrong_types(driver, function_store):
    """
    Might catch nasty pandas and other type bugs.
    """
    df_source = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0.0, 1.0, 2.0, 3.0], "p": [0, 0, 1, 1], "v2": [20, 21, 22, 23]}
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    with pytest.raises(MultiTableCommitAborted) as exc_info:
        driver(
            data={"source": df_source, "enrich": df_enrich},
            cube=cube,
            store=function_store,
        )
    cause = exc_info.value.__cause__
    assert isinstance(cause, ValueError)
    assert 'Found incompatible entries for column "x"' in str(cause)

    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())


def test_append_partitions(driver, function_store, existing_cube):
    partitions_source_1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("source"), function_store()
        ).partitions.keys()
    )
    partitions_enrich_1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("enrich"), function_store()
        ).partitions.keys()
    )

    df_source = pd.DataFrame(
        {
            "x": [0, 1, 2, 3],
            "p": [0, 0, 1, 1],
            "v1": [20, 21, 22, 23],
            "i1": [20, 21, 22, 23],
        }
    )

    result = driver(
        data={"source": df_source}, cube=existing_cube, store=function_store
    )

    assert set(result.keys()) == {"source"}

    ds_source = result["source"]
    ds_enrich = DatasetMetadata.load_from_store(
        existing_cube.ktk_dataset_uuid("enrich"), function_store()
    )

    partitions_source_2 = set(ds_source.partitions.keys())
    partitions_enrich_2 = set(ds_enrich.partitions.keys())

    assert len(partitions_source_2) > len(partitions_source_1)
    assert partitions_source_1.issubset(partitions_source_2)
    assert partitions_enrich_2 == partitions_enrich_1


def test_fail_partition_on_4(driver, function_store):
    df_source = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v2": [20, 21, 22, 23]}
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )

    with pytest.raises(
        ValueError, match="Unspecified but provided partition columns in enrich: p"
    ):
        driver(
            data={"source": df_source, "enrich": df_enrich},
            cube=cube,
            store=function_store,
            partition_on={"enrich": []},
        )

    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())


def test_fail_nondistinc_payload(driver, function_store):
    """
    This would lead to problems during the query phase.
    """
    df_source = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [20, 21, 22, 23]}
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    with pytest.raises(MultiTableCommitAborted) as exc_info:
        driver(
            data={"source": df_source, "enrich": df_enrich},
            cube=cube,
            store=function_store,
        )
    cause = exc_info.value.__cause__
    assert isinstance(cause, ValueError)
    assert "Found columns present in multiple datasets" in str(cause)

    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())


def test_create_dataset_header(store, metadata_storage_format, frozen_time):
    table_meta = {"table": make_meta(pd.DataFrame({"col": [1]}), origin="1")}
    new_dataset = create_empty_dataset_header(
        store=store,
        table_meta=table_meta,
        dataset_uuid="new_dataset_uuid",
        metadata_storage_format=metadata_storage_format,
        metadata_version=4,
    )

    expected_dataset = DatasetMetadata(
        uuid="new_dataset_uuid",
        metadata_version=4,
        explicit_partitions=False,
        table_meta=table_meta,
    )
    assert new_dataset == expected_dataset

    storage_keys = list(store.keys())
    assert len(storage_keys) == 2

    loaded = DatasetMetadata.load_from_store(store=store, uuid="new_dataset_uuid")
    assert loaded == expected_dataset

    # If the read succeeds, the schema is written
    read_schema_metadata(dataset_uuid=new_dataset.uuid, store=store, table="table")


def test_copy(frozen_time):
    ds = DatasetMetadata(
        uuid="uuid",
        partitions={"partition_label": {"files": {}}},
        metadata={"some": "metadata"},
        indices={
            "column": ExplicitSecondaryIndex(
                column="column", index_dct={1: ["partition_label"]}
            )
        },
        explicit_partitions=True,
        partition_keys=["P", "L"],
    )
    new_ds = ds.copy()

    # Check that the copy is identical ...
    assert new_ds == ds
    # ... but not the same object
    assert id(new_ds) != id(ds)

    new_ds = ds.copy(metadata={"new": "metadata"})
    assert id(new_ds) != id(ds)
    assert new_ds.metadata == {
        "new": "metadata",
        # The DatasetMetadata constructor ensures that the creation time is
        # always present.
        "creation_time": "2000-01-01T01:01:01.000001",
    }


def test_commit_dataset_from_metapartition(dataset_function, store):
    new_data = [
        pd.DataFrame(
            OrderedDict(
                [
                    ("P", [5]),
                    ("L", [5]),
                    ("TARGET", [5]),
                    ("DATE", [datetime.date(2016, 3, 23)]),
                ]
            )
        )
    ]
    new_partition = write_single_partition(
        store=store, dataset_uuid=dataset_function.uuid, data=new_data
    )
    pre_commit_dataset = DatasetMetadata.load_from_store(
        uuid=dataset_function.uuid, store=store
    )
    # Writing a single partition must not change the stored dataset header
    # before the commit.
    assert pre_commit_dataset == dataset_function

    updated_dataset = commit_dataset(
        store=store,
        dataset_uuid=dataset_function.uuid,
        new_partitions=new_partition,
        delete_scope=None,
        partition_on=None,
    )
    assert updated_dataset != dataset_function
    assert updated_dataset.uuid == dataset_function.uuid
    assert len(updated_dataset.partitions) == len(dataset_function.partitions) + 1

    # Ensure that the new dataset is actually the one on disc
    loaded_dataset = DatasetMetadata.load_from_store(
        uuid=updated_dataset.uuid, store=store
    )
    assert loaded_dataset == updated_dataset

    # Read the data and check whether the rows above are included.
    # This checks whether all necessary information was updated in the header
    # (e.g. the files attribute of the partitions).
    actual = read_table(store=store, dataset_uuid=dataset_function.uuid)
    df_expected = pd.DataFrame(
        OrderedDict(
            [
                (
                    "DATE",
                    [
                        datetime.date(2016, 3, 23),
                        datetime.date(2010, 1, 1),
                        datetime.date(2009, 12, 31),
                    ],
                ),
                ("L", [5, 1, 2]),
                ("P", [5, 1, 2]),
                ("TARGET", [5, 1, 2]),
            ]
        )
    )
    actual = actual.sort_values("DATE", ascending=False).reset_index(drop=True)

    assert_frame_equal(df_expected, actual)


def test_roundtrip_empty_with_store(store, metadata_version):
    dataset_uuid = "dataset_uuid"
    dataset = DatasetMetadata(uuid=dataset_uuid, metadata_version=metadata_version)
    store.put(
        "{}.by-dataset-metadata.json".format(dataset_uuid),
        simplejson.dumps(dataset.to_dict()).encode("utf-8"),
    )
    assert dataset == DatasetMetadata.load_from_store(dataset_uuid, store)


def test_existing_indices_are_added_when_missing_in_cube():
    """
    Test that indices already existing in the dataset are added to the
    validated cube.
    """
    source_metadata = DatasetMetadata.from_dict(
        {
            "dataset_uuid": "source",
            "dataset_metadata_version": 4,
            "schema": FakeExtraTableMetadata(),
            "partition_keys": ["p"],
            "indices": {
                "d1": {"1": ["part_1"]},
                "d2": {"1": ["part_1"]},
                "i1": {"1": ["part_1"]},
                "i2": {"1": ["part_1"]},
            },
        }
    )
    extra_metadata = DatasetMetadata.from_dict(
        {
            "dataset_uuid": "extra",
            "dataset_metadata_version": 4,
            "schema": FakeExtraTableMetadata(),
            "partition_keys": ["p"],
            "indices": {"i1": {"1": ["part_1"]}},
        }
    )
    cube = Cube(
        dimension_columns=["d1", "d2"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i1"],
    )

    validated_cube = ensure_valid_cube_indices(
        {"source": source_metadata, "extra": extra_metadata}, cube
    )

    assert validated_cube.index_columns == {"i1", "i2"}


def test_no_indices_are_suppressed_when_they_already_exist():
    """
    Test that indices marked as suppressed in the cube are not actually
    suppressed when they are already present in the dataset.
    """
    source_metadata = DatasetMetadata.from_dict(
        {
            "dataset_uuid": "source",
            "dataset_metadata_version": 4,
            "schema": FakeSeedTableMetadata(),
            "partition_keys": ["p"],
            "indices": {
                "d1": {"1": ["part_1"]},
                "d2": {"1": ["part_1"]},
                "i1": {"1": ["part_1"]},
            },
        }
    )
    extra_metadata = DatasetMetadata.from_dict(
        {
            "dataset_uuid": "extra",
            "dataset_metadata_version": 4,
            "schema": FakeExtraTableMetadata(),
            "partition_keys": ["p"],
            "indices": {"i1": {"1": ["part_1"]}},
        }
    )
    cube = Cube(
        dimension_columns=["d1", "d2"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        suppress_index_on=["d1", "d2"],
    )

    validated_cube = ensure_valid_cube_indices(
        {"source": source_metadata, "extra": extra_metadata}, cube
    )

    assert validated_cube.suppress_index_on == frozenset()


def test_simple(cli, built_cube, skv, store):
    ds = DatasetMetadata.load_from_store(built_cube.ktk_dataset_uuid("source"), store)
    assert "v1" not in ds.indices

    result = cli("--store=cubes", "my_cube", "index", "source", "v1")
    assert result.exit_code == 0

    ds = DatasetMetadata.load_from_store(built_cube.ktk_dataset_uuid("source"), store)
    assert "v1" in ds.indices


def test_cube_with_valid_indices_is_not_modified_by_validation():
    """
    Test that a cube with valid indices is not modified by
    `ensure_valid_cube_indices`.
    """
    source_metadata = DatasetMetadata.from_dict(
        {
            "dataset_uuid": "source",
            "dataset_metadata_version": 4,
            "schema": FakeSeedTableMetadata(),
            "partition_keys": ["p"],
            "indices": {
                "d1": {"1": ["part_1"]},
                "d2": {"1": ["part_1"]},
                "i1": {"1": ["part_1"]},
            },
        }
    )
    extra_metadata = DatasetMetadata.from_dict(
        {
            "dataset_uuid": "extra",
            "dataset_metadata_version": 4,
            "schema": FakeExtraTableMetadata(),
            "partition_keys": ["p"],
            "indices": {"i1": {"1": ["part_1"]}},
        }
    )
    cube = Cube(
        dimension_columns=["d1", "d2"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i1"],
    )

    validated_cube = ensure_valid_cube_indices(
        {"source": source_metadata, "extra": extra_metadata}, cube
    )

    assert validated_cube == cube


def test_raises_when_cube_defines_index_not_in_dataset():
    """
    Test that a `ValueError` is raised when the cube defines an index that is
    not part of a dataset.
    """
    source_metadata = DatasetMetadata.from_dict(
        {
            "dataset_uuid": "source",
            "dataset_metadata_version": 4,
            "schema": FakeSeedTableMetadata(),
            "partition_keys": ["p"],
            "indices": {
                "d1": {"1": ["part_1"]},
                "d2": {"1": ["part_1"]},
                "i1": {"1": ["part_1"]},
            },
        }
    )
    extra_metadata = DatasetMetadata.from_dict(
        {
            "dataset_uuid": "extra",
            "dataset_metadata_version": 4,
            "schema": FakeExtraTableMetadata(),
            "partition_keys": ["p"],
            "indices": {"i1": {"1": ["part_1"]}},
        }
    )
    cube = Cube(
        dimension_columns=["d1", "d2"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i2"],
    )

    with pytest.raises(ValueError):
        ensure_valid_cube_indices(
            {"source": source_metadata, "extra": extra_metadata}, cube
        )


def test_complicated_uuid():
    expected = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "uuid+namespace-attribute12_underscored",
        "partitions": {"part_1": {"files": {"core": "file.parquet"}}},
    }
    DatasetMetadata.from_dict(expected)


def test_fail_all_empty(driver, function_store):
    """
    Might happen due to DB-based filters.
    """
    df = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]}
    ).loc[[]]
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")

    with pytest.raises(ValueError) as exc:
        driver(data=df, cube=cube, store=function_store)
    assert "Cannot write empty datasets: seed" in str(exc.value)

    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())


def test_indices(driver, function_store, existing_cube):
    idx1_1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("source"), function_store()
        )
        .load_all_indices(function_store())
        .indices["i1"]
        .index_dct.keys()
    )
    idx2_1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("enrich"), function_store()
        )
        .load_all_indices(function_store())
        .indices["i2"]
        .index_dct.keys()
    )

    df_source = pd.DataFrame(
        {
            "x": [0, 1, 2, 3],
            "p": [0, 0, 1, 1],
            "v1": [20, 21, 22, 23],
            "i1": [20, 21, 22, 23],
        }
    )

    result = driver(
        data={"source": df_source}, cube=existing_cube, store=function_store
    )

    assert set(result.keys()) == {"source"}

    ds_source = result["source"]
    ds_enrich = DatasetMetadata.load_from_store(
        existing_cube.ktk_dataset_uuid("enrich"), function_store()
    )

    idx1_2 = set(
        ds_source.load_all_indices(function_store()).indices["i1"].index_dct.keys()
    )
    idx2_2 = set(
        ds_enrich.load_all_indices(function_store()).indices["i2"].index_dct.keys()
    )

    assert idx1_1.issubset(idx1_2)
    assert len(idx1_1) < len(idx1_2)
    assert idx2_1 == idx2_2


def test_overlap_keyspace(store, metadata_version):
    dataset_uuid1 = "uuid+namespace-attribute12_underscored"
    dataset_uuid2 = "uuid+namespace-attribute12_underscored_ext"
    table = "core"

    for dataset_uuid in (dataset_uuid1, dataset_uuid2):
        partition0 = "location=L-0"
        partition0_key = "{}/{}/{}/data.parquet".format(dataset_uuid, table, partition0)
        metadata = {
            "dataset_metadata_version": metadata_version,
            "dataset_uuid": dataset_uuid,
        }

        # Write the dataset metadata, one partition file, and the common
        # schema metadata for each dataset to the store.
        store.put(
            "{}{}.json".format(dataset_uuid, naming.METADATA_BASE_SUFFIX),
            simplejson.dumps(metadata).encode("utf-8"),
        )
        store.put(partition0_key, b"test")
        store_schema_metadata(
            make_meta(pd.DataFrame({"location": ["L-0"]}), origin="1"),
            dataset_uuid,
            store,
            "core",
        )

    for dataset_uuid in (dataset_uuid1, dataset_uuid2):
        partition0_label = "location=L-0/data"
        partition0_key = "{}/{}/{}.parquet".format(
            dataset_uuid, table, partition0_label
        )
        expected_partitions = {
            "location=L-0/data": {"files": {"core": partition0_key}}
        }
        expected_indices = {"location": {"L-0": ["location=L-0/data"]}}

        assert DatasetMetadata.storage_keys(dataset_uuid, store) == [
            "{}{}.json".format(dataset_uuid, naming.METADATA_BASE_SUFFIX),
            _get_common_metadata_key(dataset_uuid, "core"),
            partition0_key,
        ]
        dmd = DatasetMetadata.load_from_store(dataset_uuid, store)
        dmd = dmd.load_partition_indices()
        dmd_dict = dmd.to_dict()
        assert dmd_dict["partitions"] == expected_partitions
        assert dmd_dict["indices"] == expected_indices


def validate_partition_keys(
    dataset_uuid,
    store,
    ds_factory,
    default_metadata_version,
    partition_on,
    **load_kwargs,
):
    if ds_factory or DatasetMetadata.exists(dataset_uuid, _instantiate_store(store)):
        ds_factory = _ensure_factory(
            dataset_uuid=dataset_uuid,
            store=store,
            factory=ds_factory,
            load_dataset_metadata=load_kwargs.pop("load_dataset_metadata", True),
        )

        ds_metadata_version = ds_factory.metadata_version
        if partition_on:
            if not isinstance(partition_on, list):
                partition_on = [partition_on]
            if partition_on != ds_factory.partition_keys:
                raise ValueError(
                    "Incompatible set of partition keys encountered. "
                    "Input partitioning was `{}` while actual dataset was `{}`".format(
                        partition_on, ds_factory.partition_keys
                    )
                )
        else:
            partition_on = ds_factory.partition_keys
    else:
        ds_factory = None
        ds_metadata_version = default_metadata_version
    return ds_factory, ds_metadata_version, partition_on


def test_store_dataset_from_partitions(meta_partitions_files_only, store, frozen_time):
    dataset = store_dataset_from_partitions(
        partition_list=meta_partitions_files_only,
        dataset_uuid="dataset_uuid",
        store=store,
        dataset_metadata={"some": "metadata"},
    )

    expected_metadata = {"some": "metadata", "creation_time": TIME_TO_FREEZE_ISO}

    assert dataset.metadata == expected_metadata
    assert sorted(dataset.partitions.values(), key=lambda x: x.label) == sorted(
        [mp.partition for mp in meta_partitions_files_only], key=lambda x: x.label
    )
    assert dataset.uuid == "dataset_uuid"

    store_files = list(store.keys())
    # Dataset metadata: 1 file
    expected_number_files = 1
    # common metadata for v4 datasets
    expected_number_files += 1
    assert len(store_files) == expected_number_files

    # Ensure the dataset can be loaded properly
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    assert dataset == stored_dataset


def test_store_dataframes_as_dataset_mp_partition_on_none(
    metadata_version, store, store_factory, bound_store_dataframes
):
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    df2 = pd.DataFrame({"P": np.arange(0, 10), "info": np.arange(100, 110)})

    mp = MetaPartition(
        label=gen_uuid(),
        data={"core": df, "helper": df2},
        metadata_version=metadata_version,
    )

    df_list = [None, mp]
    dataset = bound_store_dataframes(
        df_list,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
        partition_on=["P"],
    )

    assert isinstance(dataset, DatasetMetadata)
    assert dataset.partition_keys == ["P"]
    assert len(dataset.partitions) == 10
    assert dataset.metadata_version == metadata_version

    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    assert dataset == stored_dataset


def test_store_dataframes_as_dataset_list_input(
    store_factory, metadata_version, bound_store_dataframes
):
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    df2 = pd.DataFrame(
        {
            "P": np.arange(100, 110),
            "L": np.arange(100, 110),
            "TARGET": np.arange(10, 20),
        }
    )

    df_list = [df, df2]

    dataset = bound_store_dataframes(
        df_list,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 2

    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store_factory())
    assert dataset == stored_dataset


def test_cube_blacklist_dimension_index(function_store, driver):
    cube1 = Cube(
        dimension_columns=["A", "B"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    df_1 = pd.DataFrame({"A": range(10), "P": 1, "B": 1, "payload": ""})
    build_cube(
        data={"source": df_1},
        cube=cube1,
        store=function_store,
        metadata={"source": {"meta_at_create": "data"}},
    )

    cube2 = Cube(
        dimension_columns=["A", "B"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
        suppress_index_on=["B"],
    )
    df_2 = pd.DataFrame({"A": range(10), "P": 1, "B": 2, "payload": ""})
    driver(
        data={"source": df_2}, cube=cube2, store=function_store, remove_conditions=None
    )

    dataset_uuid = cube2.ktk_dataset_uuid(cube2.seed_dataset)
    dm = DatasetMetadata.load_from_store(
        dataset_uuid, function_store(), load_all_indices=True
    )
    obs_values = dm.indices["B"].observed_values()

    assert sorted(obs_values) == [1, 2]


def test_all(cli, built_cube, skv, store):
    result = cli("--store=cubes", "my_cube", "index", "source", "*")
    assert result.exit_code == 0

    ds = DatasetMetadata.load_from_store(built_cube.ktk_dataset_uuid("source"), store)
    assert set(ds.indices.keys()) == set(get_dataset_columns(ds))


def test_update_secondary_indices_subset(store_factory, bound_update_dataset):
    df1 = pd.DataFrame({"A": range(10), "indexed": 1})
    dataset_uuid = "dataset_uuid"
    bound_update_dataset(
        df1,
        dataset_uuid=dataset_uuid,
        store=store_factory,
        secondary_indices="indexed",
    )

    df2 = pd.DataFrame({"A": range(10), "indexed": 2})
    # The secondary index is omitted here. Kartothek should pick it up regardless.
    bound_update_dataset(df2, dataset_uuid=dataset_uuid, store=store_factory)

    dm = DatasetMetadata.load_from_store(
        dataset_uuid, store_factory(), load_all_indices=True
    )
    obs_values = dm.indices["indexed"].observed_values()

    assert sorted(obs_values) == [1, 2]

    with pytest.raises(ValueError, match="Incorrect indices provided"):
        # Providing a secondary index that differs from the existing one must fail.
        bound_update_dataset(
            df2, dataset_uuid=dataset_uuid, store=store_factory, secondary_indices="A"
        )


def test_store_dataframes_as_dataset_no_pipeline_partition_on(store):
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    df2 = pd.DataFrame({"P": np.arange(0, 10), "info": np.arange(100, 110)})

    dataset = store_dataframes_as_dataset(
        store=store,
        dataset_uuid="dataset_uuid",
        dfs={"core": df, "helper": df2},
        partition_on="P",
        metadata_version=4,
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 10

    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    assert dataset == stored_dataset


def test_store_dataframes_as_dataset(
    store_factory, metadata_version, bound_store_dataframes
):
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    df_helper = pd.DataFrame(
        {"P": np.arange(0, 10), "info": string.ascii_lowercase[:10]}
    )

    df_list = [
        {
            "label": "cluster_1",
            "data": [("core", df.copy(deep=True)), ("helper", df_helper)],
        },
        {
            "label": "cluster_2",
            "data": [("core", df.copy(deep=True)), ("helper", df_helper)],
        },
    ]

    dataset = bound_store_dataframes(
        df_list,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
        secondary_indices=["P"],
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 2
    assert "P" in dataset.indices

    store = store_factory()
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    assert dataset.uuid == stored_dataset.uuid
    assert dataset.metadata == stored_dataset.metadata
    assert dataset.partitions == stored_dataset.partitions

    index_dct = stored_dataset.indices["P"].load(store).index_dct
    assert sorted(index_dct.keys()) == list(range(0, 10))
    assert any(sorted(p) == ["cluster_1", "cluster_2"] for p in index_dct.values())

    df_stored = DataFrameSerializer.restore_dataframe(
        key=dataset.partitions["cluster_1"].files["core"], store=store
    )
    pdt.assert_frame_equal(df, df_stored)
    df_stored = DataFrameSerializer.restore_dataframe(
        key=dataset.partitions["cluster_2"].files["core"], store=store
    )
    pdt.assert_frame_equal(df, df_stored)
    df_stored = DataFrameSerializer.restore_dataframe(
        key=dataset.partitions["cluster_1"].files["helper"], store=store
    )
    pdt.assert_frame_equal(df_helper, df_stored)
    df_stored = DataFrameSerializer.restore_dataframe(
        key=dataset.partitions["cluster_2"].files["helper"], store=store
    )
    pdt.assert_frame_equal(df_helper, df_stored)


def test_store_dataframes_as_dataset_mp(metadata_version, store):
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    df2 = pd.DataFrame({"P": np.arange(0, 10), "info": np.arange(100, 110)})

    mp = MetaPartition(
        label=gen_uuid(),
        data={"core": df, "helper": df2},
        metadata_version=metadata_version,
    )

    dataset = store_dataframes_as_dataset(
        store=store,
        dataset_uuid="dataset_uuid",
        dfs=mp,
        metadata_version=metadata_version,
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 1
    assert dataset.metadata_version == metadata_version

    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    assert dataset == stored_dataset