def test_conditions(driver, function_store, existing_cube):
    parts_source1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("source"), function_store()
        ).partitions
    )
    parts_enrich1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("enrich"), function_store()
        ).partitions
    )
    parts_source_to_delete = {part for part in parts_source1 if "p=0" not in part}

    result = driver(
        cube=existing_cube,
        store=function_store,
        ktk_cube_dataset_ids=["source"],
        conditions=C("p") > 0,
    )

    assert set(result.keys()) == {"source", "enrich"}

    ds_source = result["source"]
    ds_enrich = result["enrich"]

    parts_source2 = set(ds_source.partitions)
    parts_enrich2 = set(ds_enrich.partitions)

    assert parts_enrich1 == parts_enrich2
    assert parts_source1 - parts_source_to_delete == parts_source2
def test_append_partitions(driver, function_store, existing_cube):
    partitions_source_1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("source"), function_store()
        ).partitions.keys()
    )
    partitions_enrich_1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("enrich"), function_store()
        ).partitions.keys()
    )

    df_source = pd.DataFrame(
        {
            "x": [0, 1, 2, 3],
            "p": [0, 0, 1, 1],
            "v1": [20, 21, 22, 23],
            "i1": [20, 21, 22, 23],
        }
    )

    result = driver(
        data={"source": df_source}, cube=existing_cube, store=function_store
    )

    assert set(result.keys()) == {"source"}

    ds_source = result["source"]
    ds_enrich = DatasetMetadata.load_from_store(
        existing_cube.ktk_dataset_uuid("enrich"), function_store()
    )

    partitions_source_2 = set(ds_source.partitions.keys())
    partitions_enrich_2 = set(ds_enrich.partitions.keys())

    assert len(partitions_source_2) > len(partitions_source_1)
    assert partitions_source_1.issubset(partitions_source_2)
    assert partitions_enrich_2 == partitions_enrich_1
def test_commit_dataset_from_metapartition(dataset_function, store):
    new_data = [
        pd.DataFrame(
            OrderedDict(
                [
                    ("P", [5]),
                    ("L", [5]),
                    ("TARGET", [5]),
                    ("DATE", [datetime.date(2016, 3, 23)]),
                ]
            )
        )
    ]
    new_partition = write_single_partition(
        store=store, dataset_uuid=dataset_function.uuid, data=new_data
    )
    pre_commit_dataset = DatasetMetadata.load_from_store(
        uuid=dataset_function.uuid, store=store
    )
    # The new partition is not committed yet, so the stored dataset is unchanged
    assert pre_commit_dataset == dataset_function

    updated_dataset = commit_dataset(
        store=store,
        dataset_uuid=dataset_function.uuid,
        new_partitions=new_partition,
        delete_scope=None,
        partition_on=None,
    )
    assert updated_dataset != dataset_function
    assert updated_dataset.uuid == dataset_function.uuid
    assert len(updated_dataset.partitions) == len(dataset_function.partitions) + 1

    # ensure that the new dataset is actually the one on disc
    loaded_dataset = DatasetMetadata.load_from_store(
        uuid=updated_dataset.uuid, store=store
    )
    assert loaded_dataset == updated_dataset

    # Read the data and check whether the rows above are included.
    # This checks whether all necessary information was updated in the header
    # (e.g. the file attributes of the partitions)
    actual = read_table(store=store, dataset_uuid=dataset_function.uuid)
    df_expected = pd.DataFrame(
        OrderedDict(
            [
                (
                    "DATE",
                    [
                        datetime.date(2016, 3, 23),
                        datetime.date(2010, 1, 1),
                        datetime.date(2009, 12, 31),
                    ],
                ),
                ("L", [5, 1, 2]),
                ("P", [5, 1, 2]),
                ("TARGET", [5, 1, 2]),
            ]
        )
    )
    actual = actual.sort_values("DATE", ascending=False).reset_index(drop=True)

    assert_frame_equal(df_expected, actual)
def test_simple(cli, built_cube, skv, store):
    ds = DatasetMetadata.load_from_store(built_cube.ktk_dataset_uuid("source"), store)
    assert "v1" not in ds.indices

    result = cli("--store=cubes", "my_cube", "index", "source", "v1")
    assert result.exit_code == 0

    ds = DatasetMetadata.load_from_store(built_cube.ktk_dataset_uuid("source"), store)
    assert "v1" in ds.indices
def test_indices(driver, function_store, existing_cube):
    idx1_1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("source"), function_store()
        )
        .load_all_indices(function_store())
        .indices["i1"]
        .index_dct.keys()
    )
    idx2_1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("enrich"), function_store()
        )
        .load_all_indices(function_store())
        .indices["i2"]
        .index_dct.keys()
    )

    df_source = pd.DataFrame(
        {
            "x": [0, 1, 2, 3],
            "p": [0, 0, 1, 1],
            "v1": [20, 21, 22, 23],
            "i1": [20, 21, 22, 23],
        }
    )

    result = driver(
        data={"source": df_source}, cube=existing_cube, store=function_store
    )

    assert set(result.keys()) == {"source"}

    ds_source = result["source"]
    ds_enrich = DatasetMetadata.load_from_store(
        existing_cube.ktk_dataset_uuid("enrich"), function_store()
    )

    idx1_2 = set(
        ds_source.load_all_indices(function_store()).indices["i1"].index_dct.keys()
    )
    idx2_2 = set(
        ds_enrich.load_all_indices(function_store()).indices["i2"].index_dct.keys()
    )

    assert idx1_1.issubset(idx1_2)
    assert len(idx1_1) < len(idx1_2)
    assert idx2_1 == idx2_2
def test_all(cli, built_cube, skv, store):
    result = cli("--store=cubes", "my_cube", "index", "source", "*")
    assert result.exit_code == 0

    ds = DatasetMetadata.load_from_store(built_cube.ktk_dataset_uuid("source"), store)
    assert set(ds.indices.keys()) == set(get_dataset_columns(ds))
def test_cube_blacklist_dimension_index(function_store, driver):
    cube1 = Cube(
        dimension_columns=["A", "B"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    df_1 = pd.DataFrame({"A": range(10), "P": 1, "B": 1, "payload": ""})
    build_cube(
        data={"source": df_1},
        cube=cube1,
        store=function_store,
        metadata={"source": {"meta_at_create": "data"}},
    )

    cube2 = Cube(
        dimension_columns=["A", "B"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
        suppress_index_on=["B"],
    )
    df_2 = pd.DataFrame({"A": range(10), "P": 1, "B": 2, "payload": ""})
    driver(
        data={"source": df_2}, cube=cube2, store=function_store, remove_conditions=None
    )

    dataset_uuid = cube2.ktk_dataset_uuid(cube2.seed_dataset)
    dm = DatasetMetadata.load_from_store(
        dataset_uuid, function_store(), load_all_indices=True
    )
    obs_values = dm.indices["B"].observed_values()

    assert sorted(obs_values) == [1, 2]
def test_create_dataset_header(store, metadata_storage_format, frozen_time):
    table_meta = {"table": make_meta(pd.DataFrame({"col": [1]}), origin="1")}
    new_dataset = create_empty_dataset_header(
        store=store,
        table_meta=table_meta,
        dataset_uuid="new_dataset_uuid",
        metadata_storage_format=metadata_storage_format,
        metadata_version=4,
    )

    expected_dataset = DatasetMetadata(
        uuid="new_dataset_uuid",
        metadata_version=4,
        explicit_partitions=False,
        table_meta=table_meta,
    )
    assert new_dataset == expected_dataset

    storage_keys = list(store.keys())
    assert len(storage_keys) == 2

    loaded = DatasetMetadata.load_from_store(store=store, uuid="new_dataset_uuid")
    assert loaded == expected_dataset

    # If the read succeeds, the schema is written
    read_schema_metadata(dataset_uuid=new_dataset.uuid, store=store, table="table")
def test_store_dataframes_as_dataset_dfs_input_formats(store):
    df1 = pd.DataFrame({"B": [pd.Timestamp("2019")]})
    df2 = pd.DataFrame({"A": [1.4]})

    formats = [
        {"data": {"D": df1, "S": df2}},
        {"D": df1, "S": df2},
        {"data": [("D", df1), ("S", df2)]},
        [("D", df1), ("S", df2)],
    ]
    for input_format in formats:
        dataset = store_dataframes_as_dataset(
            store=store,
            dataset_uuid="dataset_uuid",
            dfs=[input_format],
            overwrite=True,
        )

        stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
        assert dataset == stored_dataset
def test_store_dataframes_as_dataset_no_pipeline_partition_on(store):
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    df2 = pd.DataFrame({"P": np.arange(0, 10), "info": np.arange(100, 110)})

    dataset = store_dataframes_as_dataset(
        store=store,
        dataset_uuid="dataset_uuid",
        dfs={"core": df, "helper": df2},
        partition_on="P",
        metadata_version=4,
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 10

    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    assert dataset == stored_dataset
def test_store_dataframes_as_dataset_batch_mode(
    store_factory, metadata_version, bound_store_dataframes
):
    # TODO: Kick this out?
    values_p1 = [1, 2, 3]
    values_p2 = [4, 5, 6]
    df = pd.DataFrame({"P": values_p1})
    df2 = pd.DataFrame({"P": values_p2})

    df_list = [[df, df2]]

    dataset = bound_store_dataframes(
        df_list,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
        secondary_indices="P",
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 2

    store = store_factory()
    stored_dataset = DatasetMetadata.load_from_store(
        "dataset_uuid", store
    ).load_all_indices(store)

    assert dataset.uuid == stored_dataset.uuid
    assert dataset.metadata == stored_dataset.metadata
    assert dataset.partitions == stored_dataset.partitions

    assert "P" in dataset.indices
def test_store_dataframes_as_dataset_empty_dataframe(
    store_factory, metadata_version, df_all_types, bound_store_dataframes
):
    """
    Test that writing an empty dataframe succeeds.
    In particular, this may fail due to too strict schema validation.
    """
    df_empty = df_all_types.drop(0)

    assert df_empty.empty
    df_list = [df_empty]

    dataset = bound_store_dataframes(
        df_list,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 1

    store = store_factory()
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)

    assert dataset.uuid == stored_dataset.uuid
    assert dataset.metadata == stored_dataset.metadata
    assert dataset.partitions == stored_dataset.partitions

    df_stored = DataFrameSerializer.restore_dataframe(
        key=next(iter(dataset.partitions.values())).files["table"], store=store
    )
    pdt.assert_frame_equal(df_empty, df_stored)
def test_store_dataframes_as_dataset_list_input(
    store_factory, metadata_version, bound_store_dataframes
):
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    df2 = pd.DataFrame(
        {
            "P": np.arange(100, 110),
            "L": np.arange(100, 110),
            "TARGET": np.arange(10, 20),
        }
    )
    df_list = [df, df2]

    dataset = bound_store_dataframes(
        df_list,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 2

    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store_factory())
    assert dataset == stored_dataset
def test_store_dataframes_as_dataset_mp_partition_on_none(
    metadata_version, store, store_factory, bound_store_dataframes
):
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    df2 = pd.DataFrame({"P": np.arange(0, 10), "info": np.arange(100, 110)})

    mp = MetaPartition(
        label=gen_uuid(),
        data={"core": df, "helper": df2},
        metadata_version=metadata_version,
    )

    df_list = [None, mp]
    dataset = bound_store_dataframes(
        df_list,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
        partition_on=["P"],
    )

    assert isinstance(dataset, DatasetMetadata)
    assert dataset.partition_keys == ["P"]
    assert len(dataset.partitions) == 10
    assert dataset.metadata_version == metadata_version

    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    assert dataset == stored_dataset
def test_store_dataset_from_partitions(meta_partitions_files_only, store, frozen_time):
    dataset = store_dataset_from_partitions(
        partition_list=meta_partitions_files_only,
        dataset_uuid="dataset_uuid",
        store=store,
        dataset_metadata={"some": "metadata"},
    )

    expected_metadata = {"some": "metadata", "creation_time": TIME_TO_FREEZE_ISO}

    assert dataset.metadata == expected_metadata
    assert sorted(dataset.partitions.values(), key=lambda x: x.label) == sorted(
        [mp.partition for mp in meta_partitions_files_only], key=lambda x: x.label
    )
    assert dataset.uuid == "dataset_uuid"

    store_files = list(store.keys())
    # Dataset metadata: 1 file
    expected_number_files = 1
    # common metadata for v4 datasets
    expected_number_files += 1
    assert len(store_files) == expected_number_files

    # Ensure the dataset can be loaded properly
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    assert dataset == stored_dataset
def test_store_dataframes_as_dataset(
    store_factory, metadata_version, bound_store_dataframes
):
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    df_helper = pd.DataFrame(
        {"P": np.arange(0, 10), "info": string.ascii_lowercase[:10]}
    )

    df_list = [
        {
            "label": "cluster_1",
            "data": [("core", df.copy(deep=True)), ("helper", df_helper)],
        },
        {
            "label": "cluster_2",
            "data": [("core", df.copy(deep=True)), ("helper", df_helper)],
        },
    ]

    dataset = bound_store_dataframes(
        df_list,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
        secondary_indices=["P"],
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 2
    assert "P" in dataset.indices

    store = store_factory()
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)

    assert dataset.uuid == stored_dataset.uuid
    assert dataset.metadata == stored_dataset.metadata
    assert dataset.partitions == stored_dataset.partitions

    index_dct = stored_dataset.indices["P"].load(store).index_dct
    assert sorted(index_dct.keys()) == list(range(0, 10))
    assert any([sorted(p) == ["cluster_1", "cluster_2"] for p in index_dct.values()])

    df_stored = DataFrameSerializer.restore_dataframe(
        key=dataset.partitions["cluster_1"].files["core"], store=store
    )
    pdt.assert_frame_equal(df, df_stored)
    df_stored = DataFrameSerializer.restore_dataframe(
        key=dataset.partitions["cluster_2"].files["core"], store=store
    )
    pdt.assert_frame_equal(df, df_stored)
    df_stored = DataFrameSerializer.restore_dataframe(
        key=dataset.partitions["cluster_1"].files["helper"], store=store
    )
    pdt.assert_frame_equal(df_helper, df_stored)
    df_stored = DataFrameSerializer.restore_dataframe(
        key=dataset.partitions["cluster_2"].files["helper"], store=store
    )
    pdt.assert_frame_equal(df_helper, df_stored)
def test_store_dataframes_as_dataset_mp(metadata_version, store):
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    df2 = pd.DataFrame({"P": np.arange(0, 10), "info": np.arange(100, 110)})

    mp = MetaPartition(
        label=gen_uuid(),
        data={"core": df, "helper": df2},
        metadata_version=metadata_version,
    )

    dataset = store_dataframes_as_dataset(
        store=store,
        dataset_uuid="dataset_uuid",
        dfs=mp,
        metadata_version=metadata_version,
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 1
    assert dataset.metadata_version == metadata_version

    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    assert dataset == stored_dataset
def test_update_secondary_indices_subset(store_factory, bound_update_dataset):
    df1 = pd.DataFrame({"A": range(10), "indexed": 1})
    dataset_uuid = "dataset_uuid"
    bound_update_dataset(
        df1, dataset_uuid=dataset_uuid, store=store_factory, secondary_indices="indexed"
    )

    df2 = pd.DataFrame({"A": range(10), "indexed": 2})
    # secondary index is omitted. Kartothek should pick it up regardless
    bound_update_dataset(df2, dataset_uuid=dataset_uuid, store=store_factory)

    dm = DatasetMetadata.load_from_store(
        dataset_uuid, store_factory(), load_all_indices=True
    )
    obs_values = dm.indices["indexed"].observed_values()

    assert sorted(obs_values) == [1, 2]

    with pytest.raises(ValueError, match="Incorrect indices provided"):
        # requesting a secondary index that conflicts with the existing one must fail
        bound_update_dataset(
            df2, dataset_uuid=dataset_uuid, store=store_factory, secondary_indices="A"
        )
def test_load_from_store_with_indices(store):
    meta_dct = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "uuid",
        "partitions": {
            "product_id=1/part_1": {
                "files": {
                    "core_data": "dataset_uuid/table/location_id=1/part_1.parquet"
                }
            }
        },
        "indices": {
            "product_id": {
                "1": ["part_1"],
                "2": ["part_1"],
                "100": ["part_1"],
                "34": ["part_1"],
            }
        },
    }
    store.put(
        "uuid.by-dataset-metadata.json", simplejson.dumps(meta_dct).encode("utf-8")
    )
    df = pd.DataFrame({"index": [1], "location_id": [1], "product_id": [1]})
    store_schema_metadata(make_meta(df, origin="core"), "uuid", store, "core_data")

    storage_key = "uuid/some_index.parquet"
    index2 = ExplicitSecondaryIndex(
        column="location_id",
        index_dct={1: ["part_1", "part_2"], 3: ["part_3"]},
        index_storage_key=storage_key,
        dtype=pa.int64(),
    )
    index2.store(store, "dataset_uuid")

    dmd = DatasetMetadata.load_from_store(store=store, uuid="uuid")
    assert "location_id" not in dmd.indices

    dmd = DatasetMetadata.load_from_store(
        store=store, uuid="uuid", load_all_indices=True
    )
    assert "location_id" in dmd.indices
def test_dask_partitions(metadata_version):
    """
    Create partitions for one table with dask and check that it can be read
    with kartothek.
    """
    import dask.dataframe

    bucket_dir = tempfile.mkdtemp()
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    os.mkdir("{}/{}".format(bucket_dir, dataset_uuid))
    table_dir = "{}/{}/core".format(bucket_dir, dataset_uuid)
    os.mkdir(table_dir)
    store = storefact.get_store_from_url("hfs://{}".format(bucket_dir))

    locations = ["L-{}".format(i) for i in range(2)]
    df = pd.DataFrame()
    for location in locations:
        core = pd.DataFrame(
            data={
                "date": np.array(
                    ["2017-11-23", "2017-11-23", "2017-11-24", "2017-11-24"]
                ),
                "product": np.array(["P-0", "P-1", "P-0", "P-1"]),
                "location": location,
                "value": np.array(random.sample(range(1, 100), 4)),
            }
        )
        df = pd.concat([df, core])

    ddf = dask.dataframe.from_pandas(df, npartitions=1)
    dask.dataframe.to_parquet(ddf, table_dir, partition_on=["location"])

    partition0 = "{}/core/location=L-0/part.0.parquet".format(dataset_uuid)
    partition1 = "{}/core/location=L-1/part.0.parquet".format(dataset_uuid)
    metadata = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": dataset_uuid,
    }
    expected_partitions = {
        "partitions": {
            "location=L-0": {"files": {"core": partition0}},
            "location=L-1": {"files": {"core": partition1}},
        }
    }
    expected_tables = {"tables": {"core": ["date", "product", "value"]}}

    store.put(
        "{}.by-dataset-metadata.json".format(dataset_uuid),
        simplejson.dumps(metadata).encode(),
    )
    metadata.update(expected_partitions)
    metadata.update(expected_tables)

    dmd = DatasetMetadata.load_from_store(dataset_uuid, store)
    actual_partitions = dmd.to_dict()["partitions"]
    # we partition on location ID which has two values
    assert len(actual_partitions) == 2
    assert dmd.partition_keys == ["location"]
def test_roundtrip_empty_with_store(store, metadata_version):
    dataset_uuid = "dataset_uuid"
    dataset = DatasetMetadata(uuid=dataset_uuid, metadata_version=metadata_version)
    store.put(
        "{}.by-dataset-metadata.json".format(dataset_uuid),
        simplejson.dumps(dataset.to_dict()).encode("utf-8"),
    )
    assert dataset == DatasetMetadata.load_from_store(dataset_uuid, store)
def discover_datasets_unchecked(
    uuid_prefix: str,
    store: Union[Callable[[], KeyValueStore], KeyValueStore],
    filter_ktk_cube_dataset_ids: Optional[Union[str, Iterable[str]]] = None,
) -> Dict[str, DatasetMetadata]:
    """
    Get all known datasets that may belong to a given cube w/o applying any checks.

    .. warning::
        The results are not checked for validity. Found datasets may be
        incompatible w/ the given cube. Use :meth:`check_datasets` to check the
        results, or go for :meth:`discover_datasets` in the first place.

    Parameters
    ----------
    uuid_prefix
        Dataset UUID prefix.
    store
        KV store.
    filter_ktk_cube_dataset_ids
        Optional selection of datasets to include.

    Returns
    -------
    datasets: Dict[str, DatasetMetadata]
        All discovered datasets. Empty dict if no dataset is found.
    """
    if callable(store):
        store = store()
    filter_ktk_cube_dataset_ids = converter_str_set_optional(
        filter_ktk_cube_dataset_ids
    )
    prefix = uuid_prefix + KTK_CUBE_UUID_SEPERATOR

    names = _discover_dataset_meta_files(prefix, store)

    if filter_ktk_cube_dataset_ids is not None:
        names = {
            name
            for name in names
            if name[len(prefix):] in filter_ktk_cube_dataset_ids
        }

    result = {}
    # sorted iteration for deterministic error messages in case
    # DatasetMetadata.load_from_store fails
    for name in sorted(names):
        try:
            result[name[len(prefix):]] = DatasetMetadata.load_from_store(
                uuid=name, store=store, load_schema=True, load_all_indices=False
            )
        except KeyError as e:
            _logger.warning(
                'Ignore dataset "{name}" due to KeyError: {e}'.format(name=name, e=e)
            )

    return result
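# Minimal usage sketch for discover_datasets_unchecked (not part of the original
# code base). The cube prefix, dataset id, and store URL below are hypothetical
# and only illustrate the call shape; any simplekv-compatible store works.
def example_discover_datasets(tmp_path):
    import storefact

    # A store factory (callable) or an instantiated KeyValueStore are both accepted.
    store_factory = lambda: storefact.get_store_from_url("hfs://{}".format(tmp_path))

    datasets = discover_datasets_unchecked(
        uuid_prefix="my_cube",  # hypothetical cube uuid_prefix
        store=store_factory,
        filter_ktk_cube_dataset_ids=["source"],  # restrict to the seed dataset
    )
    # The result is an unchecked Dict[str, DatasetMetadata]; run check_datasets
    # on it before relying on cube consistency.
    return datasets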
def test_update_of_dataset_with_non_default_table_name(
    store_factory, bound_update_dataset
):
    """
    Tests that datasets with table names other than "table" can be created,
    updated and read successfully (regression test for issue #445).
    """

    # Create initial dataset
    dataset_uuid = "dataset_uuid"
    df_create = pd.DataFrame(
        {"date": [date(2021, 1, 1), date(2021, 1, 2)], "value": range(2)}
    )
    store_dataframes_as_dataset(
        dfs=[df_create],
        store=store_factory,
        dataset_uuid=dataset_uuid,
        table_name="non-default-name",
        partition_on=["date"],
    )
    dm = DatasetMetadata.load_from_store(dataset_uuid, store_factory())
    assert dm.table_name == "non-default-name"

    # Update dataset
    df_update = pd.DataFrame(
        {"date": [date(2021, 1, 3), date(2021, 1, 4)], "value": range(2)}
    )
    bound_update_dataset(
        [df_update],
        store=store_factory,
        dataset_uuid=dataset_uuid,
        table_name="non-default-name",
        partition_on=["date"],
    )
    dm = DatasetMetadata.load_from_store(dataset_uuid, store_factory())
    assert dm.table_name == "non-default-name"

    # Assert equality of dataframe
    df_read = (
        read_dataset_as_ddf(dataset_uuid, store_factory(), "table")
        .compute()
        .reset_index(drop=True)
    )
    df_expected = df_create.append(df_update).reset_index(drop=True)
    pd.testing.assert_frame_equal(df_read, df_expected)
def test_metadata_factory_from_dataset_no_store(function_store, ds, load_schema):
    ds2 = DatasetMetadata.load_from_store(
        "uuid", function_store(), load_schema=load_schema
    )
    factory = metadata_factory_from_dataset(ds2, with_schema=load_schema)

    assert factory.dataset_metadata is ds2

    store = factory.store
    with pytest.raises(NotImplementedError):
        store.get("foo")
def test_update_dataset_with_partitions__reducer_delete_only(
    store_factory, metadata_version, frozen_time_em, bound_update_dataset, store
):
    partitions = [
        {
            "label": "cluster_1",
            "data": [("core", pd.DataFrame({"p": [1]}))],
            "indices": {"p": ExplicitSecondaryIndex("p", index_dct={1: ["cluster_1"]})},
        },
        {
            "label": "cluster_2",
            "data": [("core", pd.DataFrame({"p": [2]}))],
            "indices": {"p": ExplicitSecondaryIndex("p", index_dct={2: ["cluster_2"]})},
        },
    ]
    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        metadata={"dataset": "metadata"},
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
    )
    dataset = dataset.load_index("p", store)

    empty_part = []
    dataset_updated = bound_update_dataset(
        [empty_part],
        store=store_factory,
        dataset_uuid="dataset_uuid",
        delete_scope=[{"p": 1}],
        metadata={"extra": "metadata"},
        default_metadata_version=metadata_version,
        secondary_indices=["p"],
    )
    dataset_updated = dataset_updated.load_index("p", store)
    assert sorted(dataset.partitions) == ["cluster_1", "cluster_2"]
    assert list(dataset_updated.partitions) == ["cluster_2"]

    store_files = list(store.keys())
    # 1 dataset metadata file and 1 index file and 2 partition files
    # note: the update writes a new index file but due to frozen_time this gets
    # the same name as the previous one and overwrites it.
    expected_number_files = 4
    # common metadata for v4 datasets (1 table)
    expected_number_files += 1
    assert len(store_files) == expected_number_files

    assert dataset.indices["p"].index_dct == {1: ["cluster_1"], 2: ["cluster_2"]}
    assert dataset_updated.indices["p"].index_dct == {2: ["cluster_2"]}

    # Ensure the dataset can be loaded properly
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    stored_dataset = stored_dataset.load_index("p", store)
    assert dataset_updated == stored_dataset
def test_update_dataset_with_partitions__reducer_nonexistent(
    store_factory, metadata_version, frozen_time_em, bound_update_dataset, store
):
    part3 = {
        "label": "cluster_3",
        "data": [("core", pd.DataFrame({"p": [3]}))],
        "indices": {"p": ExplicitSecondaryIndex("p", index_dct={3: ["cluster_3"]})},
    }
    dataset_updated = bound_update_dataset(
        [part3],
        store=store_factory,
        dataset_uuid="dataset_uuid",
        delete_scope=[{"p": 1}],
        metadata={"extra": "metadata"},
        default_metadata_version=metadata_version,
        secondary_indices=["p"],
    )
    dataset_updated = dataset_updated.load_index("p", store)
    ind_updated = dataset_updated.indices["p"]
    cluster_3_label = ind_updated.eval_operator(op="==", value=3).pop()

    expected_metadata = {"extra": "metadata"}
    expected_metadata["creation_time"] = TIME_TO_FREEZE_ISO
    assert dataset_updated.metadata == expected_metadata
    assert list(dataset_updated.partitions) == [cluster_3_label]

    updated_part_c3 = dataset_updated.partitions[cluster_3_label]
    assert updated_part_c3.label == cluster_3_label

    assert dataset_updated.uuid == "dataset_uuid"

    store_files = list(store.keys())
    # 1 dataset metadata file and 1 index file and 1 partition file
    # note: the update writes a new index file but due to frozen_time this gets
    # the same name as the previous one and overwrites it.
    expected_number_files = 3
    # common metadata for v4 datasets (1 table)
    expected_number_files += 1
    assert len(store_files) == expected_number_files

    exp_updated_idx = {3: [cluster_3_label]}
    assert dataset_updated.indices["p"].index_dct == exp_updated_idx

    # Ensure the dataset can be loaded properly
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    stored_dataset = stored_dataset.load_index("p", store)
    assert dataset_updated == stored_dataset
def test_update_partitions(driver, function_store, remove_partitions, new_partitions):
    df_source, cube = _write_cube(function_store)

    df_source_new = pd.DataFrame(
        {
            "i1": range(200, 200 + len(new_partitions)),
            "p": np.array(new_partitions, np.int64),
            "v1": range(300, 300 + len(new_partitions)),
            "x": range(100, 100 + len(new_partitions)),
        }
    )

    # what should remain of the old data:
    df_source_of_old = df_source.loc[~df_source["p"].isin(set(remove_partitions))]
    df_source_expected_after = pd.concat(
        [df_source_of_old, df_source_new], sort=False, ignore_index=True
    )

    remove_conditions = C("p").isin(remove_partitions)

    result = driver(
        data={"source": df_source_new},
        remove_conditions=remove_conditions,
        cube=cube,
        store=function_store,
        ktk_cube_dataset_ids={"source"},
        metadata={"source": {"some_new_meta": 42}},
    )

    assert set(result.keys()) == {"source"}

    dm_source_after = DatasetMetadata.load_from_store(
        cube.ktk_dataset_uuid("source"), function_store(), load_all_indices=True
    )

    assert "some_new_meta" in dm_source_after.metadata
    assert "meta_at_create" in dm_source_after.metadata

    # check values for "p" are as expected:
    expected_p_source = (set(df_source["p"].unique()) - set(remove_partitions)) | set(
        new_partitions
    )
    assert set(dm_source_after.indices["p"].index_dct) == expected_p_source

    df_read = query_cube(cube, function_store)[0]

    assert set(df_read.columns) == set(df_source_expected_after.columns)

    for df in (df_read, df_source_expected_after):
        df.sort_values("x", inplace=True)
        df.reset_index(drop=True, inplace=True)

    pd.testing.assert_frame_equal(df_read, df_source_expected_after)
def test_metadata_factory_from_dataset_with_store(function_store, ds, load_schema):
    ds2 = DatasetMetadata.load_from_store(
        "uuid", function_store(), load_schema=load_schema
    )
    factory = metadata_factory_from_dataset(
        ds2, with_schema=load_schema, store=function_store
    )

    assert factory.dataset_metadata is ds2

    store = factory.store
    store.put("foo", b"bar")
    assert store.get("foo") == b"bar"
def test_cube_update_secondary_indices_subset(function_store, driver):
    cube1 = Cube(
        dimension_columns=["A"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["indexed"],
    )
    df_1 = pd.DataFrame({"A": range(10), "P": 1, "indexed": 1, "not-indexed": 1})
    build_cube(
        data={"source": df_1},
        cube=cube1,
        store=function_store,
        metadata={"source": {"meta_at_create": "data"}},
    )

    cube2 = Cube(
        dimension_columns=["A"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    df_2 = pd.DataFrame({"A": range(10, 20), "P": 1, "indexed": 2, "not-indexed": 1})
    driver(
        data={"source": df_2}, cube=cube2, store=function_store, remove_conditions=None
    )

    dataset_uuid = cube2.ktk_dataset_uuid(cube2.seed_dataset)
    dm = DatasetMetadata.load_from_store(
        dataset_uuid, function_store(), load_all_indices=True
    )
    obs_values = dm.indices["indexed"].observed_values()

    assert sorted(obs_values) == [1, 2]

    cube2 = Cube(
        dimension_columns=["A"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["not-indexed"],
    )

    with pytest.raises(
        ValueError,
        match='ExplicitSecondaryIndex or PartitionIndex "not-indexed" is missing in dataset',
    ):
        driver(
            data={"source": df_2},
            cube=cube2,
            store=function_store,
            remove_conditions=None,
        )
def test_update_dataset_with_partitions_delete_only(
    store_factory, metadata_version, frozen_time_em, bound_update_dataset, store
):
    partitions = [
        pd.DataFrame({"p": [1]}),
        pd.DataFrame({"p": [2]}),
    ]
    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        metadata={"dataset": "metadata"},
        dataset_uuid="dataset_uuid",
        secondary_indices=["p"],
        metadata_version=metadata_version,
    )
    dataset = dataset.load_index("p", store)

    # FIXME: is this a regression?
    dataset_updated = bound_update_dataset(
        None,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        delete_scope=[{"p": 1}],
        metadata={"extra": "metadata"},
        default_metadata_version=metadata_version,
        secondary_indices=["p"],
    )
    dataset_updated = dataset_updated.load_index("p", store)
    assert len(dataset.partitions) == 2
    assert len(dataset_updated.partitions) == 1

    store_files = list(store.keys())
    # 1 dataset metadata file and 1 index file and 2 partition files
    # note: the update writes a new index file but due to frozen_time this gets
    # the same name as the previous one and overwrites it.
    expected_number_files = 4
    # common metadata for v4 datasets (1 table)
    expected_number_files += 1
    assert len(store_files) == expected_number_files

    assert set(dataset.indices["p"].observed_values()) == {1, 2}
    assert set(dataset_updated.indices["p"].observed_values()) == {2}

    # Ensure the dataset can be loaded properly
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    stored_dataset = stored_dataset.load_index("p", store)
    assert dataset_updated == stored_dataset