def test_raises_multiple_seeds(self, cube, function_store):
    store_data(
        cube=cube,
        function_store=function_store,
        df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}),
        name=cube.seed_dataset,
        metadata={KTK_CUBE_METADATA_KEY_IS_SEED: True},
    )
    store_data(
        cube=cube,
        function_store=function_store,
        df=pd.DataFrame({"x": [0], "p": [0], "q": [0], "i1": [0]}),
        name="enrich",
        metadata={KTK_CUBE_METADATA_KEY_IS_SEED: True},
    )
    with pytest.raises(ValueError) as exc:
        discover_cube(cube.uuid_prefix, function_store)
    assert (
        str(exc.value)
        == 'Found multiple possible seed datasets for cube "cube": enrich, myseed'
    )
def test_raises_no_seed(self, cube, function_store):
    store_data(
        cube=cube,
        function_store=function_store,
        df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0], "i1": [0]}),
        name=cube.seed_dataset,
        metadata={},
    )
    with pytest.raises(ValueError) as exc:
        discover_cube(cube.uuid_prefix, function_store)
    assert str(exc.value) == 'Could not find seed dataset for cube "cube".'
def test_raises_partition_keys_missing_old_metadata(self, cube, function_store):
    store_data(
        cube=cube,
        function_store=function_store,
        df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}),
        name=cube.seed_dataset,
        partition_on=None,
        new_ktk_cube_metadata=False,
    )
    with pytest.raises(ValueError) as exc:
        discover_cube(cube.uuid_prefix, function_store)
    assert str(exc.value) == 'Seed dataset ("myseed") has no partition keys.'
def test_raises_partition_keys_impossible(self, cube, function_store):
    store_data(
        cube=cube,
        function_store=function_store,
        df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}),
        name=cube.seed_dataset,
        partition_on=[],
    )
    with pytest.raises(ValueError) as exc:
        discover_cube(cube.uuid_prefix, function_store)
    assert (
        str(exc.value) == 'Seed dataset "myseed" has missing partition columns: p, q'
    )
def test_raises_dimension_columns_missing(self, cube, function_store):
    store_data(
        cube=cube,
        function_store=function_store,
        df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}),
        name=cube.seed_dataset,
        metadata={KTK_CUBE_METADATA_KEY_IS_SEED: True},
    )
    with pytest.raises(ValueError) as exc:
        discover_cube(cube.uuid_prefix, function_store)
    assert (
        str(exc.value)
        == 'Could not recover dimension columns from seed dataset ("myseed") of cube "cube".'
    )
def test_raises_timestamp_col_is_not_ktk_cube_ts(self, cube, function_store):
    store_data(
        cube=cube,
        function_store=function_store,
        df=pd.DataFrame(
            {"x": [0], "y": [0], "p": [0], "q": [0], "ts": [pd.Timestamp("2000")]}
        ),
        partition_on=["p", "q", "ts"],
        name=cube.seed_dataset,
        new_ktk_cube_metadata=False,
    )
    with pytest.raises(
        NotImplementedError,
        match="Can only read old cubes if the timestamp column is 'KLEE_TS', but 'ts' was detected.",
    ):
        discover_cube(cube.uuid_prefix, function_store)
def test_partition_keys_no_nonseed_other(self, cube, function_store):
    store_data(
        cube=cube,
        function_store=function_store,
        df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}),
        name=cube.seed_dataset,
        partition_on=["p", "q"],
    )
    store_data(
        cube=cube,
        function_store=function_store,
        df=pd.DataFrame({"x": [0], "y": [0], "i1": [0], "v1": [0]}),
        name="enrich",
        partition_on=[],
    )
    cube_actual, datasets = discover_cube(cube.uuid_prefix, function_store)
    assert cube_actual == cube
    assert set(datasets.keys()) == {cube.seed_dataset, "enrich"}
    ds_seed = datasets[cube.seed_dataset]
    assert ds_seed.primary_indices_loaded
    ds_enrich = datasets["enrich"]
    assert (not ds_enrich.partition_keys) or ds_enrich.primary_indices_loaded
def test_multiple(self, cube, function_store):
    store_data(
        cube=cube,
        function_store=function_store,
        df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}),
        name=cube.seed_dataset,
    )
    store_data(
        cube=cube,
        function_store=function_store,
        df=pd.DataFrame({"x": [0], "p": [0], "q": [0], "i1": [0]}),
        name="enrich",
    )
    cube_actual, datasets = discover_cube(cube.uuid_prefix, function_store)
    assert cube_actual == cube
    assert set(datasets.keys()) == {cube.seed_dataset, "enrich"}
    ds_seed = datasets[cube.seed_dataset]
    assert ds_seed.primary_indices_loaded
    ds_enrich = datasets["enrich"]
    assert ds_enrich.primary_indices_loaded
def get_cube(store, uuid_prefix):
    """
    Get cube from store.

    Parameters
    ----------
    store: Union[Callable[[], simplekv.KeyValueStore], simplekv.KeyValueStore]
        KV store.
    uuid_prefix: str
        Dataset UUID prefix.

    Returns
    -------
    cube: Cube
        Cube specification.
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        All discovered datasets.

    Raises
    ------
    click.UsageError
        In case the cube was not found.
    """
    try:
        return discover_cube(uuid_prefix, store)
    except ValueError as e:
        raise click.UsageError("Could not load cube: {e}".format(e=e))
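# For illustration only: a minimal sketch of how get_cube() might be wired into a
# click command. The command name, option names, and the storefact-based store
# factory used here are assumptions for this example, not part of the original code.
import click
from storefact import get_store_from_url


@click.command("info")
@click.option("--store", "store_url", required=True, help="Store URL, e.g. 'hfs:///data'.")
@click.option("--uuid-prefix", required=True, help="Dataset UUID prefix of the cube.")
def info(store_url, uuid_prefix):
    # get_cube() converts a missing or broken cube into click.UsageError, so the
    # CLI exits with a readable message instead of a traceback.
    cube, datasets = get_cube(get_store_from_url(store_url), uuid_prefix)
    click.echo("Cube: {}".format(cube.uuid_prefix))
    click.echo("Datasets: {}".format(", ".join(sorted(datasets))))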
def test_timestamp_col_compat(self, cube, function_store):
    """
    Test that cubes are still readable after timestamp removal.
    """
    metadata_dimension_columns_old = "klee_dimension_columns"
    metadata_is_seed_old = "klee_is_seed"
    metadata_partition_columns_old = "klee_partition_columns"
    metadata_timestamp_column_old = "klee_timestamp_column"
    timestamp_column_old = "KLEE_TS"
    store_data(
        cube=cube,
        function_store=function_store,
        df=pd.DataFrame(
            {
                "x": [0],
                "y": [0],
                "p": [0],
                "q": [0],
                timestamp_column_old: [pd.Timestamp("2000")],
                "i1": [0],
                "a": [0],
            }
        ),
        partition_on=["p", "q", timestamp_column_old],
        name=cube.seed_dataset,
        metadata={
            metadata_dimension_columns_old: cube.dimension_columns,
            metadata_is_seed_old: True,
            metadata_partition_columns_old: cube.partition_columns,
            metadata_timestamp_column_old: timestamp_column_old,
        },
    )
    store_data(
        cube=cube,
        function_store=function_store,
        df=pd.DataFrame(
            {
                "x": [0],
                "y": [0],
                "p": [0],
                "q": [0],
                timestamp_column_old: [pd.Timestamp("2000")],
                "b": [0],
            }
        ),
        partition_on=["p", "q", timestamp_column_old],
        name="enrich",
        metadata={
            metadata_dimension_columns_old: cube.dimension_columns,
            metadata_is_seed_old: False,
            metadata_partition_columns_old: cube.partition_columns,
            metadata_timestamp_column_old: timestamp_column_old,
        },
    )
    cube_discovered, datasets_discovered = discover_cube(
        cube.uuid_prefix, function_store
    )
    assert cube == cube_discovered
    assert set(datasets_discovered.keys()) == {cube.seed_dataset, "enrich"}
def test_reads_suppress_index(self, cube, function_store):
    cube = cube.copy(suppress_index_on=cube.dimension_columns)
    store_data(
        cube=cube,
        function_store=function_store,
        df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0], "i1": [0]}),
        name=cube.seed_dataset,
    )
    cube_actual, datasets = discover_cube(cube.uuid_prefix, function_store)
    assert cube_actual == cube
def test_reads_suppress_index_default(self, cube, function_store):
    # Test that reading also works for old metadata that does not contain the
    # suppress_index_on key.
    store_data(
        cube=cube,
        function_store=function_store,
        df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0], "i1": [0]}),
        name=cube.seed_dataset,
        write_suppress_index_on=False,
    )
    cube_actual, datasets = discover_cube(cube.uuid_prefix, function_store)
    assert cube_actual == cube
def test_seed_only(self, cube, function_store):
    store_data(
        cube=cube,
        function_store=function_store,
        df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0], "i1": [0]}),
        name=cube.seed_dataset,
    )
    cube_actual, datasets = discover_cube(cube.uuid_prefix, function_store)
    assert cube_actual == cube
    assert set(datasets.keys()) == {cube.seed_dataset}
    ds = datasets[cube.seed_dataset]
    assert ds.primary_indices_loaded
def test_raises_partition_keys_impossible_old_metadata(self, cube, function_store):
    store_data(
        cube=cube,
        function_store=function_store,
        df=pd.DataFrame(
            {
                "x": [0],
                "y": [0],
                "p": [0],
                "q": [0],
                "KLEE_TS": [pd.Timestamp("2000")],
            }
        ),
        partition_on=["KLEE_TS"],
        name=cube.seed_dataset,
        new_ktk_cube_metadata=False,
    )
    with pytest.raises(ValueError) as exc:
        discover_cube(cube.uuid_prefix, function_store)
    assert (
        str(exc.value)
        == 'Seed dataset ("myseed") has only a single partition key (KLEE_TS) but should have at least 2.'
    )
def test_without_partition_timestamp_metadata(self, cube, function_store):
    # Test that discovery of a cube still works without the "KLEE_TS" column and
    # the KTK_CUBE_METADATA_PARTITION_COLUMNS metadata key.
    store_data(
        cube=cube,
        function_store=function_store,
        df=pd.DataFrame(
            {
                "x": [0],
                "y": [0],
                "p": [0],
                "q": [0],
                "KLEE_TS": [pd.Timestamp("2000")],
                "i1": [0],
            }
        ),
        partition_on=["p", "q", "KLEE_TS"],
        name=cube.seed_dataset,
        new_ktk_cube_metadata=False,
    )
    cube_actual, datasets = discover_cube(cube.uuid_prefix, function_store)
    assert cube_actual == cube
    assert set(datasets.keys()) == {cube.seed_dataset}