Example #1
def test_conditions(driver, function_store, existing_cube):
    parts_source1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("source"), function_store()
        ).partitions
    )
    parts_enrich1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("enrich"), function_store()
        ).partitions
    )

    parts_source_to_delete = {part for part in parts_source1 if "p=0" not in part}

    result = driver(
        cube=existing_cube,
        store=function_store,
        ktk_cube_dataset_ids=["source"],
        conditions=C("p") > 0,
    )

    assert set(result.keys()) == {"source", "enrich"}

    ds_source = result["source"]
    ds_enrich = result["enrich"]

    parts_source2 = set(ds_source.partitions)
    parts_enrich2 = set(ds_enrich.partitions)

    assert parts_enrich1 == parts_enrich2
    assert parts_source1 - parts_source_to_delete == parts_source2
Example #2
def test_append_partitions(driver, function_store, existing_cube):
    partitions_source_1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("source"),
            function_store()).partitions.keys())
    partitions_enrich_1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("enrich"),
            function_store()).partitions.keys())

    df_source = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v1": [20, 21, 22, 23],
        "i1": [20, 21, 22, 23],
    })

    result = driver(data={"source": df_source},
                    cube=existing_cube,
                    store=function_store)

    assert set(result.keys()) == {"source"}

    ds_source = result["source"]
    ds_enrich = DatasetMetadata.load_from_store(
        existing_cube.ktk_dataset_uuid("enrich"), function_store())

    partitions_source_2 = set(ds_source.partitions.keys())
    partitions_enrich_2 = set(ds_enrich.partitions.keys())

    assert len(partitions_source_2) > len(partitions_source_1)
    assert partitions_source_1.issubset(partitions_source_2)

    assert partitions_enrich_2 == partitions_enrich_1
Example #3
def test_commit_dataset_from_metapartition(dataset_function, store):
    new_data = [
        pd.DataFrame(
            OrderedDict([
                ("P", [5]),
                ("L", [5]),
                ("TARGET", [5]),
                ("DATE", [datetime.date(2016, 3, 23)]),
            ]))
    ]
    new_partition = write_single_partition(store=store,
                                           dataset_uuid=dataset_function.uuid,
                                           data=new_data)
    pre_commit_dataset = DatasetMetadata.load_from_store(
        uuid=dataset_function.uuid, store=store)
    # The new partition is not committed yet, so the dataset stored on disk still equals the original
    assert pre_commit_dataset == dataset_function

    updated_dataset = commit_dataset(
        store=store,
        dataset_uuid=dataset_function.uuid,
        new_partitions=new_partition,
        delete_scope=None,
        partition_on=None,
    )
    assert updated_dataset != dataset_function

    assert updated_dataset.uuid == dataset_function.uuid
    assert len(
        updated_dataset.partitions) == len(dataset_function.partitions) + 1

    # ensure that the new dataset is actually the one on disc
    loaded_dataset = DatasetMetadata.load_from_store(uuid=updated_dataset.uuid,
                                                     store=store)
    assert loaded_dataset == updated_dataset

    # Read the data and check whether the rows above are included.
    # This checks whether all necessary information was updated in the header
    # (e.g. the files attribute of each partition)
    actual = read_table(store=store, dataset_uuid=dataset_function.uuid)
    df_expected = pd.DataFrame(
        OrderedDict([
            (
                "DATE",
                [
                    datetime.date(2016, 3, 23),
                    datetime.date(2010, 1, 1),
                    datetime.date(2009, 12, 31),
                ],
            ),
            ("L", [5, 1, 2]),
            ("P", [5, 1, 2]),
            ("TARGET", [5, 1, 2]),
        ]))
    actual = actual.sort_values("DATE", ascending=False).reset_index(drop=True)

    assert_frame_equal(df_expected, actual)
Example #4
def test_simple(cli, built_cube, skv, store):
    ds = DatasetMetadata.load_from_store(built_cube.ktk_dataset_uuid("source"),
                                         store)
    assert "v1" not in ds.indices

    result = cli("--store=cubes", "my_cube", "index", "source", "v1")
    assert result.exit_code == 0

    ds = DatasetMetadata.load_from_store(built_cube.ktk_dataset_uuid("source"),
                                         store)
    assert "v1" in ds.indices
Example #5
def test_indices(driver, function_store, existing_cube):
    idx1_1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("source"), function_store()
        )
        .load_all_indices(function_store())
        .indices["i1"]
        .index_dct.keys()
    )
    idx2_1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("enrich"), function_store()
        )
        .load_all_indices(function_store())
        .indices["i2"]
        .index_dct.keys()
    )

    df_source = pd.DataFrame(
        {
            "x": [0, 1, 2, 3],
            "p": [0, 0, 1, 1],
            "v1": [20, 21, 22, 23],
            "i1": [20, 21, 22, 23],
        }
    )

    result = driver(
        data={"source": df_source}, cube=existing_cube, store=function_store
    )

    assert set(result.keys()) == {"source"}

    ds_source = result["source"]
    ds_enrich = DatasetMetadata.load_from_store(
        existing_cube.ktk_dataset_uuid("enrich"), function_store()
    )

    idx1_2 = set(
        ds_source.load_all_indices(function_store()).indices["i1"].index_dct.keys()
    )
    idx2_2 = set(
        ds_enrich.load_all_indices(function_store()).indices["i2"].index_dct.keys()
    )

    assert idx1_1.issubset(idx1_2)
    assert len(idx1_1) < len(idx1_2)

    assert idx2_1 == idx2_2
Example #6
def test_all(cli, built_cube, skv, store):
    result = cli("--store=cubes", "my_cube", "index", "source", "*")
    assert result.exit_code == 0

    ds = DatasetMetadata.load_from_store(built_cube.ktk_dataset_uuid("source"),
                                         store)
    assert set(ds.indices.keys()) == set(get_dataset_columns(ds))
Example #7
def test_cube_blacklist_dimension_index(function_store, driver):

    cube1 = Cube(
        dimension_columns=["A", "B"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    df_1 = pd.DataFrame({"A": range(10), "P": 1, "B": 1, "payload": ""})
    build_cube(
        data={"source": df_1},
        cube=cube1,
        store=function_store,
        metadata={"source": {"meta_at_create": "data"}},
    )

    cube2 = Cube(
        dimension_columns=["A", "B"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
        suppress_index_on=["B"],
    )
    df_2 = pd.DataFrame({"A": range(10), "P": 1, "B": 2, "payload": ""})
    driver(
        data={"source": df_2}, cube=cube2, store=function_store, remove_conditions=None
    )

    dataset_uuid = cube2.ktk_dataset_uuid(cube2.seed_dataset)
    dm = DatasetMetadata.load_from_store(
        dataset_uuid, function_store(), load_all_indices=True
    )
    obs_values = dm.indices["B"].observed_values()

    assert sorted(obs_values) == [1, 2]
Example #8
def test_create_dataset_header(store, metadata_storage_format, frozen_time):
    table_meta = {"table": make_meta(pd.DataFrame({"col": [1]}), origin="1")}
    new_dataset = create_empty_dataset_header(
        store=store,
        table_meta=table_meta,
        dataset_uuid="new_dataset_uuid",
        metadata_storage_format=metadata_storage_format,
        metadata_version=4,
    )

    expected_dataset = DatasetMetadata(
        uuid="new_dataset_uuid",
        metadata_version=4,
        explicit_partitions=False,
        table_meta=table_meta,
    )
    assert new_dataset == expected_dataset

    storage_keys = list(store.keys())
    assert len(storage_keys) == 2

    loaded = DatasetMetadata.load_from_store(store=store,
                                             uuid="new_dataset_uuid")
    assert loaded == expected_dataset

    # If this read succeeds, the schema was written to the store
    read_schema_metadata(dataset_uuid=new_dataset.uuid,
                         store=store,
                         table="table")
Example #9
def test_store_dataframes_as_dataset_dfs_input_formats(store):
    df1 = pd.DataFrame({"B": [pd.Timestamp("2019")]})
    df2 = pd.DataFrame({"A": [1.4]})
    formats = [
        {
            "data": {
                "D": df1,
                "S": df2
            }
        },
        {
            "D": df1,
            "S": df2
        },
        {
            "data": [("D", df1), ("S", df2)]
        },
        [("D", df1), ("S", df2)],
    ]
    for input_format in formats:
        dataset = store_dataframes_as_dataset(store=store,
                                              dataset_uuid="dataset_uuid",
                                              dfs=[input_format],
                                              overwrite=True)
        stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
        assert dataset == stored_dataset
Example #10
def test_store_dataframes_as_dataset_no_pipeline_partition_on(store):
    df = pd.DataFrame({
        "P": np.arange(0, 10),
        "L": np.arange(0, 10),
        "TARGET": np.arange(10, 20)
    })

    df2 = pd.DataFrame({"P": np.arange(0, 10), "info": np.arange(100, 110)})

    dataset = store_dataframes_as_dataset(
        store=store,
        dataset_uuid="dataset_uuid",
        dfs={
            "core": df,
            "helper": df2
        },
        partition_on="P",
        metadata_version=4,
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 10

    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)

    assert dataset == stored_dataset
Example #11
def test_store_dataframes_as_dataset_batch_mode(store_factory,
                                                metadata_version,
                                                bound_store_dataframes):
    # TODO: Kick this out?
    values_p1 = [1, 2, 3]
    values_p2 = [4, 5, 6]
    df = pd.DataFrame({"P": values_p1})
    df2 = pd.DataFrame({"P": values_p2})

    df_list = [[df, df2]]

    dataset = bound_store_dataframes(
        df_list,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
        secondary_indices="P",
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 2

    store = store_factory()
    stored_dataset = DatasetMetadata.load_from_store(
        "dataset_uuid", store).load_all_indices(store)
    assert dataset.uuid == stored_dataset.uuid
    assert dataset.metadata == stored_dataset.metadata
    assert dataset.partitions == stored_dataset.partitions

    assert "P" in dataset.indices
Example #12
def test_store_dataframes_as_dataset_empty_dataframe(store_factory,
                                                     metadata_version,
                                                     df_all_types,
                                                     bound_store_dataframes):
    """
    Test that writing an empty dataframe succeeds.
    In particular, this may fail due to too strict schema validation.
    """
    df_empty = df_all_types.drop(0)

    assert df_empty.empty
    df_list = [df_empty]

    dataset = bound_store_dataframes(
        df_list,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 1

    store = store_factory()
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    assert dataset.uuid == stored_dataset.uuid
    assert dataset.metadata == stored_dataset.metadata
    assert dataset.partitions == stored_dataset.partitions

    df_stored = DataFrameSerializer.restore_dataframe(
        key=next(iter(dataset.partitions.values())).files["table"], store=store
    )
    pdt.assert_frame_equal(df_empty, df_stored)
Example #13
def test_store_dataframes_as_dataset_list_input(
    store_factory, metadata_version, bound_store_dataframes
):
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    df2 = pd.DataFrame(
        {
            "P": np.arange(100, 110),
            "L": np.arange(100, 110),
            "TARGET": np.arange(10, 20),
        }
    )
    df_list = [df, df2]

    dataset = bound_store_dataframes(
        df_list,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 2
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store_factory())
    assert dataset == stored_dataset
Example #14
def test_store_dataframes_as_dataset_mp_partition_on_none(
    metadata_version, store, store_factory, bound_store_dataframes
):
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )

    df2 = pd.DataFrame({"P": np.arange(0, 10), "info": np.arange(100, 110)})

    mp = MetaPartition(
        label=gen_uuid(),
        data={"core": df, "helper": df2},
        metadata_version=metadata_version,
    )

    df_list = [None, mp]
    dataset = bound_store_dataframes(
        df_list,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
        partition_on=["P"],
    )

    assert isinstance(dataset, DatasetMetadata)
    assert dataset.partition_keys == ["P"]
    assert len(dataset.partitions) == 10
    assert dataset.metadata_version == metadata_version

    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)

    assert dataset == stored_dataset
Example #15
def test_store_dataset_from_partitions(meta_partitions_files_only, store,
                                       frozen_time):
    dataset = store_dataset_from_partitions(
        partition_list=meta_partitions_files_only,
        dataset_uuid="dataset_uuid",
        store=store,
        dataset_metadata={"some": "metadata"},
    )

    expected_metadata = {
        "some": "metadata",
        "creation_time": TIME_TO_FREEZE_ISO
    }

    assert dataset.metadata == expected_metadata
    assert sorted(dataset.partitions.values(),
                  key=lambda x: x.label) == sorted(
                      [mp.partition for mp in meta_partitions_files_only],
                      key=lambda x: x.label)
    assert dataset.uuid == "dataset_uuid"

    store_files = list(store.keys())
    # Dataset metadata: 1 file
    expected_number_files = 1
    # common metadata for v4 datasets
    expected_number_files += 1
    assert len(store_files) == expected_number_files

    # Ensure the dataset can be loaded properly
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    assert dataset == stored_dataset
Example #16
def test_store_dataframes_as_dataset(store_factory, metadata_version,
                                     bound_store_dataframes):
    df = pd.DataFrame({
        "P": np.arange(0, 10),
        "L": np.arange(0, 10),
        "TARGET": np.arange(10, 20)
    })

    df_helper = pd.DataFrame({
        "P": np.arange(0, 10),
        "info": string.ascii_lowercase[:10]
    })

    df_list = [
        {
            "label": "cluster_1",
            "data": [("core", df.copy(deep=True)), ("helper", df_helper)],
        },
        {
            "label": "cluster_2",
            "data": [("core", df.copy(deep=True)), ("helper", df_helper)],
        },
    ]

    dataset = bound_store_dataframes(
        df_list,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
        secondary_indices=["P"],
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 2

    assert "P" in dataset.indices

    store = store_factory()
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    assert dataset.uuid == stored_dataset.uuid
    assert dataset.metadata == stored_dataset.metadata
    assert dataset.partitions == stored_dataset.partitions

    index_dct = stored_dataset.indices["P"].load(store).index_dct
    assert sorted(index_dct.keys()) == list(range(0, 10))
    assert any(
        [sorted(p) == ["cluster_1", "cluster_2"] for p in index_dct.values()])

    df_stored = DataFrameSerializer.restore_dataframe(
        key=dataset.partitions["cluster_1"].files["core"], store=store)
    pdt.assert_frame_equal(df, df_stored)
    df_stored = DataFrameSerializer.restore_dataframe(
        key=dataset.partitions["cluster_2"].files["core"], store=store)
    pdt.assert_frame_equal(df, df_stored)
    df_stored = DataFrameSerializer.restore_dataframe(
        key=dataset.partitions["cluster_1"].files["helper"], store=store)
    pdt.assert_frame_equal(df_helper, df_stored)
    df_stored = DataFrameSerializer.restore_dataframe(
        key=dataset.partitions["cluster_2"].files["helper"], store=store)
    pdt.assert_frame_equal(df_helper, df_stored)
Example #17
def test_store_dataframes_as_dataset_mp(metadata_version, store):
    df = pd.DataFrame({
        "P": np.arange(0, 10),
        "L": np.arange(0, 10),
        "TARGET": np.arange(10, 20)
    })

    df2 = pd.DataFrame({"P": np.arange(0, 10), "info": np.arange(100, 110)})

    mp = MetaPartition(
        label=gen_uuid(),
        data={
            "core": df,
            "helper": df2
        },
        metadata_version=metadata_version,
    )

    dataset = store_dataframes_as_dataset(
        store=store,
        dataset_uuid="dataset_uuid",
        dfs=mp,
        metadata_version=metadata_version,
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 1
    assert dataset.metadata_version == metadata_version

    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)

    assert dataset == stored_dataset
Example #18
def test_update_secondary_indices_subset(store_factory, bound_update_dataset):
    df1 = pd.DataFrame({"A": range(10), "indexed": 1})
    dataset_uuid = "dataset_uuid"
    bound_update_dataset(df1,
                         dataset_uuid=dataset_uuid,
                         store=store_factory,
                         secondary_indices="indexed")

    df2 = pd.DataFrame({"A": range(10), "indexed": 2})
    # secondary index is omitted. Kartothek should pick it up regardless
    bound_update_dataset(df2, dataset_uuid=dataset_uuid, store=store_factory)

    dm = DatasetMetadata.load_from_store(dataset_uuid,
                                         store_factory(),
                                         load_all_indices=True)
    obs_values = dm.indices["indexed"].observed_values()

    assert sorted(obs_values) == [1, 2]

    with pytest.raises(ValueError, match="Incorrect indices provided"):
        # a secondary index that differs from the existing one is requested; this must fail
        bound_update_dataset(df2,
                             dataset_uuid=dataset_uuid,
                             store=store_factory,
                             secondary_indices="A")
Example #19
def test_load_from_store_with_indices(store):
    meta_dct = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "uuid",
        "partitions": {
            "product_id=1/part_1": {
                "files": {
                    "core_data":
                    "dataset_uuid/table/location_id=1/part_1.parquet"
                }
            }
        },
        "indices": {
            "product_id": {
                "1": ["part_1"],
                "2": ["part_1"],
                "100": ["part_1"],
                "34": ["part_1"],
            }
        },
    }
    store.put("uuid.by-dataset-metadata.json",
              simplejson.dumps(meta_dct).encode("utf-8"))
    df = pd.DataFrame({"index": [1], "location_id": [1], "product_id": [1]})
    store_schema_metadata(make_meta(df, origin="core"), "uuid", store,
                          "core_data")

    storage_key = "uuid/some_index.parquet"
    index2 = ExplicitSecondaryIndex(
        column="location_id",
        index_dct={
            1: ["part_1", "part_2"],
            3: ["part_3"]
        },
        index_storage_key=storage_key,
        dtype=pa.int64(),
    )
    index2.store(store, "dataset_uuid")

    dmd = DatasetMetadata.load_from_store(store=store, uuid="uuid")
    assert "location_id" not in dmd.indices

    dmd = DatasetMetadata.load_from_store(store=store,
                                          uuid="uuid",
                                          load_all_indices=True)
    assert "location_id" in dmd.indices
Example #20
def test_dask_partitions(metadata_version):
    """
    Create partitions for one table with dask
    and check that it can be read with kartothek
    """
    import dask.dataframe

    bucket_dir = tempfile.mkdtemp()
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    os.mkdir("{}/{}".format(bucket_dir, dataset_uuid))
    table_dir = "{}/{}/core".format(bucket_dir, dataset_uuid)
    os.mkdir(table_dir)
    store = storefact.get_store_from_url("hfs://{}".format(bucket_dir))

    locations = ["L-{}".format(i) for i in range(2)]
    df = pd.DataFrame()
    for location in locations:
        core = pd.DataFrame(
            data={
                "date": np.array(
                    ["2017-11-23", "2017-11-23", "2017-11-24", "2017-11-24"]
                ),
                "product": np.array(["P-0", "P-1", "P-0", "P-1"]),
                "location": location,
                "value": np.array(random.sample(range(1, 100), 4)),
            }
        )
        df = pd.concat([df, core])

    ddf = dask.dataframe.from_pandas(df, npartitions=1)
    dask.dataframe.to_parquet(ddf, table_dir, partition_on=["location"])

    partition0 = "{}/core/location=L-0/part.0.parquet".format(dataset_uuid)
    partition1 = "{}/core/location=L-1/part.0.parquet".format(dataset_uuid)
    metadata = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": dataset_uuid,
    }
    expected_partitions = {
        "partitions": {
            "location=L-0": {"files": {"core": partition0}},
            "location=L-1": {"files": {"core": partition1}},
        }
    }
    expected_tables = {"tables": {"core": ["date", "product", "value"]}}

    store.put(
        "{}.by-dataset-metadata.json".format(dataset_uuid),
        simplejson.dumps(metadata).encode(),
    )

    metadata.update(expected_partitions)
    metadata.update(expected_tables)
    dmd = DatasetMetadata.load_from_store(dataset_uuid, store)
    actual_partitions = dmd.to_dict()["partitions"]
    # we partition on location ID which has two values
    assert len(actual_partitions) == 2
    assert dmd.partition_keys == ["location"]
Example #21
def test_roundtrip_empty_with_store(store, metadata_version):
    dataset_uuid = "dataset_uuid"
    dataset = DatasetMetadata(uuid=dataset_uuid,
                              metadata_version=metadata_version)
    store.put(
        "{}.by-dataset-metadata.json".format(dataset_uuid),
        simplejson.dumps(dataset.to_dict()).encode("utf-8"),
    )
    assert dataset == DatasetMetadata.load_from_store(dataset_uuid, store)
Example #22
def discover_datasets_unchecked(
    uuid_prefix: str,
    store: Union[Callable[[], KeyValueStore], KeyValueStore],
    filter_ktk_cube_dataset_ids: Optional[Union[str, Iterable[str]]] = None,
) -> Dict[str, DatasetMetadata]:
    """
    Get all known datasets that may belong to a given cube w/o applying any checks.

    .. warning::
        The results are not checked for validity. Found datasets may be incompatible w/ the given cube. Use
        :meth:`check_datasets` to check the results, or go for :meth:`discover_datasets` in the first place.

    Parameters
    ----------
    uuid_prefix
        Dataset UUID prefix.
    store
        KV store.
    filter_ktk_cube_dataset_ids
        Optional selection of datasets to include.

    Returns
    -------
    datasets: Dict[str, DatasetMetadata]
        All discovered datasets; an empty dict if no datasets are found.
    """
    if callable(store):
        store = store()
    filter_ktk_cube_dataset_ids = converter_str_set_optional(
        filter_ktk_cube_dataset_ids)
    prefix = uuid_prefix + KTK_CUBE_UUID_SEPERATOR

    names = _discover_dataset_meta_files(prefix, store)

    if filter_ktk_cube_dataset_ids is not None:
        names = {
            name
            for name in names
            if name[len(prefix):] in filter_ktk_cube_dataset_ids
        }

    result = {}
    # sorted iteration for deterministic error messages in case DatasetMetadata.load_from_store fails
    for name in sorted(names):
        try:
            result[name[len(prefix):]] = DatasetMetadata.load_from_store(
                uuid=name,
                store=store,
                load_schema=True,
                load_all_indices=False)
        except KeyError as e:
            _logger.warning(
                'Ignore dataset "{name}" due to KeyError: {e}'.format(
                    name=name, e=e))

    return result
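
A rough usage sketch for the function above (the cube prefix "my_cube", the temporary store directory, and the restriction to the "source" dataset are illustrative assumptions, not taken from the examples):

import tempfile

import storefact

# Hypothetical store backed by a temporary directory, mirroring the hfs://
# store used in Example #20.
store = storefact.get_store_from_url("hfs://{}".format(tempfile.mkdtemp()))

# Collect every dataset whose UUID starts with the "my_cube" prefix plus the
# cube separator, without validating that they form a consistent cube.
datasets = discover_datasets_unchecked(
    uuid_prefix="my_cube",
    store=store,  # a KeyValueStore or a factory callable both work
    filter_ktk_cube_dataset_ids=["source"],  # optional subset of dataset ids
)

# Keys are ktk_cube dataset ids, values are DatasetMetadata objects with
# schemas loaded and indices not loaded; the dict is empty if nothing matched.
for ktk_cube_dataset_id, ds in datasets.items():
    print(ktk_cube_dataset_id, len(ds.partitions))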
Example #23
def test_update_of_dataset_with_non_default_table_name(store_factory,
                                                       bound_update_dataset):
    """
    Tests that datasets with table names other than "table" can be created,
    updated and read successfully (regression test for issue #445).
    """

    # Create initial dataset
    dataset_uuid = "dataset_uuid"
    df_create = pd.DataFrame({
        "date": [date(2021, 1, 1), date(2021, 1, 2)],
        "value": range(2)
    })
    store_dataframes_as_dataset(
        dfs=[df_create],
        store=store_factory,
        dataset_uuid=dataset_uuid,
        table_name="non-default-name",
        partition_on=["date"],
    )
    dm = DatasetMetadata.load_from_store(dataset_uuid, store_factory())
    assert dm.table_name == "non-default-name"

    # Update dataset
    df_update = pd.DataFrame({
        "date": [date(2021, 1, 3), date(2021, 1, 4)],
        "value": range(2)
    })
    bound_update_dataset(
        [df_update],
        store=store_factory,
        dataset_uuid=dataset_uuid,
        table_name="non-default-name",
        partition_on=["date"],
    )
    dm = DatasetMetadata.load_from_store(dataset_uuid, store_factory())
    assert dm.table_name == "non-default-name"

    # Assert equality of dataframe
    df_read = (read_dataset_as_ddf(dataset_uuid, store_factory(),
                                   "table").compute().reset_index(drop=True))
    df_expected = df_create.append(df_update).reset_index(drop=True)
    pd.testing.assert_frame_equal(df_read, df_expected)
Example #24
def test_metadata_factory_from_dataset_no_store(function_store, ds, load_schema):
    ds2 = DatasetMetadata.load_from_store(
        "uuid", function_store(), load_schema=load_schema
    )
    factory = metadata_factory_from_dataset(ds2, with_schema=load_schema)
    assert factory.dataset_metadata is ds2

    store = factory.store
    with pytest.raises(NotImplementedError):
        store.get("foo")
Example #25
def test_update_dataset_with_partitions__reducer_delete_only(
    store_factory, metadata_version, frozen_time_em, bound_update_dataset, store
):
    partitions = [
        {
            "label": "cluster_1",
            "data": [("core", pd.DataFrame({"p": [1]}))],
            "indices": {"p": ExplicitSecondaryIndex("p", index_dct={1: ["cluster_1"]})},
        },
        {
            "label": "cluster_2",
            "data": [("core", pd.DataFrame({"p": [2]}))],
            "indices": {"p": ExplicitSecondaryIndex("p", index_dct={2: ["cluster_2"]})},
        },
    ]
    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        metadata={"dataset": "metadata"},
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
    )
    dataset = dataset.load_index("p", store)

    empty_part = []
    dataset_updated = bound_update_dataset(
        [empty_part],
        store=store_factory,
        dataset_uuid="dataset_uuid",
        delete_scope=[{"p": 1}],
        metadata={"extra": "metadata"},
        default_metadata_version=metadata_version,
        secondary_indices=["p"],
    )
    dataset_updated = dataset_updated.load_index("p", store)

    assert sorted(dataset.partitions) == ["cluster_1", "cluster_2"]
    assert list(dataset_updated.partitions) == ["cluster_2"]

    store_files = list(store.keys())
    # 1 dataset metadata file and 1 index file and 2 partition files
    # note: the update writes a new index file but due to frozen_time this gets
    # the same name as the previous one and overwrites it.
    expected_number_files = 4
    # common metadata for v4 datasets (1 table)
    expected_number_files += 1
    assert len(store_files) == expected_number_files

    assert dataset.indices["p"].index_dct == {1: ["cluster_1"], 2: ["cluster_2"]}
    assert dataset_updated.indices["p"].index_dct == {2: ["cluster_2"]}

    # Ensure the dataset can be loaded properly
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    stored_dataset = stored_dataset.load_index("p", store)
    assert dataset_updated == stored_dataset
Example #26
def test_update_dataset_with_partitions__reducer_nonexistent(
        store_factory, metadata_version, frozen_time_em, bound_update_dataset,
        store):

    part3 = {
        "label": "cluster_3",
        "data": [("core", pd.DataFrame({"p": [3]}))],
        "indices": {
            "p": ExplicitSecondaryIndex("p", index_dct={3: ["cluster_3"]})
        },
    }
    dataset_updated = bound_update_dataset(
        [part3],
        store=store_factory,
        dataset_uuid="dataset_uuid",
        delete_scope=[{
            "p": 1
        }],
        metadata={"extra": "metadata"},
        default_metadata_version=metadata_version,
        secondary_indices=["p"],
    )
    dataset_updated = dataset_updated.load_index("p", store)
    ind_updated = dataset_updated.indices["p"]
    cluster_3_label = ind_updated.eval_operator(op="==", value=3).pop()

    expected_metadata = {"extra": "metadata"}

    expected_metadata["creation_time"] = TIME_TO_FREEZE_ISO

    assert dataset_updated.metadata == expected_metadata
    assert list(dataset_updated.partitions) == [cluster_3_label]

    updated_part_c3 = dataset_updated.partitions[cluster_3_label]

    assert updated_part_c3.label == cluster_3_label
    assert dataset_updated.uuid == "dataset_uuid"

    store_files = list(store.keys())
    # 1 dataset metadata file and 1 index file and 1 partition file
    # note: the update writes a new index file but due to frozen_time this gets
    # the same name as the previous one and overwrites it.
    expected_number_files = 3

    # common metadata for v4 datasets (1 table)
    expected_number_files += 1
    assert len(store_files) == expected_number_files
    exp_updated_idx = {3: [cluster_3_label]}
    assert dataset_updated.indices["p"].index_dct == exp_updated_idx

    # Ensure the dataset can be loaded properly
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    stored_dataset = stored_dataset.load_index("p", store)
    assert dataset_updated == stored_dataset
Example #27
def test_update_partitions(driver, function_store, remove_partitions, new_partitions):
    df_source, cube = _write_cube(function_store)

    df_source_new = pd.DataFrame(
        {
            "i1": range(200, 200 + len(new_partitions)),
            "p": np.array(new_partitions, np.int64),
            "v1": range(300, 300 + len(new_partitions)),
            "x": range(100, 100 + len(new_partitions)),
        }
    )

    # what should remain of the old data:
    df_source_of_old = df_source.loc[~df_source["p"].isin(set(remove_partitions))]
    df_source_expected_after = pd.concat(
        [df_source_of_old, df_source_new], sort=False, ignore_index=True
    )

    remove_conditions = C("p").isin(remove_partitions)

    result = driver(
        data={"source": df_source_new},
        remove_conditions=remove_conditions,
        cube=cube,
        store=function_store,
        ktk_cube_dataset_ids={"source"},
        metadata={"source": {"some_new_meta": 42}},
    )

    assert set(result.keys()) == {"source"}

    dm_source_after = DatasetMetadata.load_from_store(
        cube.ktk_dataset_uuid("source"), function_store(), load_all_indices=True
    )

    assert "some_new_meta" in dm_source_after.metadata
    assert "meta_at_create" in dm_source_after.metadata

    # check values for "p" are as expected:
    expected_p_source = (set(df_source["p"].unique()) - set(remove_partitions)) | set(
        new_partitions
    )
    assert set(dm_source_after.indices["p"].index_dct) == expected_p_source

    df_read = query_cube(cube, function_store)[0]

    assert set(df_read.columns) == set(df_source_expected_after.columns)

    for df in (df_read, df_source_expected_after):
        df.sort_values("x", inplace=True)
        df.reset_index(drop=True, inplace=True)

    pd.testing.assert_frame_equal(df_read, df_source_expected_after)
Example #28
def test_metadata_factory_from_dataset_with_store(function_store, ds, load_schema):
    ds2 = DatasetMetadata.load_from_store(
        "uuid", function_store(), load_schema=load_schema
    )
    factory = metadata_factory_from_dataset(
        ds2, with_schema=load_schema, store=function_store
    )
    assert factory.dataset_metadata is ds2

    store = factory.store
    store.put("foo", b"bar")
    assert store.get("foo") == b"bar"
Example #29
def test_cube_update_secondary_indices_subset(function_store, driver):

    cube1 = Cube(
        dimension_columns=["A"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["indexed"],
    )
    df_1 = pd.DataFrame({"A": range(10), "P": 1, "indexed": 1, "not-indexed": 1})
    build_cube(
        data={"source": df_1},
        cube=cube1,
        store=function_store,
        metadata={"source": {"meta_at_create": "data"}},
    )

    cube2 = Cube(
        dimension_columns=["A"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    df_2 = pd.DataFrame({"A": range(10, 20), "P": 1, "indexed": 2, "not-indexed": 1})
    driver(
        data={"source": df_2}, cube=cube2, store=function_store, remove_conditions=None
    )

    dataset_uuid = cube2.ktk_dataset_uuid(cube2.seed_dataset)
    dm = DatasetMetadata.load_from_store(
        dataset_uuid, function_store(), load_all_indices=True
    )
    obs_values = dm.indices["indexed"].observed_values()

    assert sorted(obs_values) == [1, 2]

    cube2 = Cube(
        dimension_columns=["A"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["not-indexed"],
    )
    with pytest.raises(
        ValueError,
        match='ExplicitSecondaryIndex or PartitionIndex "not-indexed" is missing in dataset',
    ):
        driver(
            data={"source": df_2},
            cube=cube2,
            store=function_store,
            remove_conditions=None,
        )
Example #30
def test_update_dataset_with_partitions_delete_only(store_factory,
                                                    metadata_version,
                                                    frozen_time_em,
                                                    bound_update_dataset,
                                                    store):
    partitions = [
        pd.DataFrame({"p": [1]}),
        pd.DataFrame({"p": [2]}),
    ]
    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        metadata={"dataset": "metadata"},
        dataset_uuid="dataset_uuid",
        secondary_indices=["p"],
        metadata_version=metadata_version,
    )
    dataset = dataset.load_index("p", store)

    # FIXME: is this a regression?
    dataset_updated = bound_update_dataset(
        None,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        delete_scope=[{
            "p": 1
        }],
        metadata={"extra": "metadata"},
        default_metadata_version=metadata_version,
        secondary_indices=["p"],
    )
    dataset_updated = dataset_updated.load_index("p", store)

    assert len(dataset.partitions) == 2
    assert len(dataset_updated.partitions) == 1

    store_files = list(store.keys())
    # 1 dataset metadata file and 1 index file and 2 partition files
    # note: the update writes a new index file but due to frozen_time this gets
    # the same name as the previous one and overwrites it.
    expected_number_files = 4
    # common metadata for v4 datasets (1 table)
    expected_number_files += 1
    assert len(store_files) == expected_number_files

    assert set(dataset.indices["p"].observed_values()) == {1, 2}
    assert set(dataset_updated.indices["p"].observed_values()) == {2}

    # Ensure the dataset can be loaded properly
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    stored_dataset = stored_dataset.load_index("p", store)
    assert dataset_updated == stored_dataset