Example #1
def test_invalid_uuid():
    expected = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "uuid.",
        "partitions": {
            "part_1": {
                "files": {
                    "core": "file.parquet"
                }
            }
        },
    }
    with pytest.raises(ValueError):
        DatasetMetadata.from_dict(expected)

    expected = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "mañana",
        "partitions": {
            "part_1": {
                "files": {
                    "core": "file.parquet"
                }
            }
        },
    }
    with pytest.raises(ValueError):
        DatasetMetadata.from_dict(expected)
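
For contrast, the same dictionary with a UUID made only of plain word characters is accepted (compare Example #17 below). A minimal sketch; the exact character set permitted in a dataset UUID is an assumption inferred from these examples rather than a documented rule:

# Hypothetical passing counterpart to the failing dicts above.
valid = {
    "dataset_metadata_version": 4,
    "dataset_uuid": "uuid_without_dots_or_non_ascii",
    "partitions": {"part_1": {"files": {"core": "file.parquet"}}},
}
DatasetMetadata.from_dict(valid)  # should not raise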
Example #2
def test_dataset_get_indices_as_dataframe_duplicates():
    ds = DatasetMetadata(
        "some_uuid",
        indices={
            "l_external_code":
            ExplicitSecondaryIndex("l_external_code", {
                "1": ["part1", "part2"],
                "2": ["part1", "part2"]
            }),
            "p_external_code":
            ExplicitSecondaryIndex("p_external_code", {
                "1": ["part1"],
                "2": ["part2"]
            }),
        },
    )
    expected = pd.DataFrame(
        OrderedDict([
            ("p_external_code", ["1", "1", "2", "2"]),
            ("l_external_code", ["1", "2", "1", "2"]),
        ]),
        index=pd.Index(["part1", "part1", "part2", "part2"], name="partition"),
    )
    result = ds.get_indices_as_dataframe()
    pdt.assert_frame_equal(result, expected)
Example #3
def test_conditions(driver, function_store, existing_cube):
    parts_source1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("source"), function_store()
        ).partitions
    )
    parts_enrich1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("enrich"), function_store()
        ).partitions
    )

    parts_source_to_delete = {part for part in parts_source1 if "p=0" not in part}

    result = driver(
        cube=existing_cube,
        store=function_store,
        ktk_cube_dataset_ids=["source"],
        conditions=C("p") > 0,
    )

    assert set(result.keys()) == {"source", "enrich"}

    ds_source = result["source"]
    ds_enrich = result["enrich"]

    parts_source2 = set(ds_source.partitions)
    parts_enrich2 = set(ds_enrich.partitions)

    assert parts_enrich1 == parts_enrich2
    assert parts_source1 - parts_source_to_delete == parts_source2
Example #4
def test_fail_wrong_types(driver, function_store):
    """
    Might catch nasty pandas and other type bugs.
    """
    df_source = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0.0, 1.0, 2.0, 3.0], "p": [0, 0, 1, 1], "v2": [20, 21, 22, 23]}
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    with pytest.raises(MultiTableCommitAborted) as exc_info:
        driver(
            data={"source": df_source, "enrich": df_enrich},
            cube=cube,
            store=function_store,
        )

    cause = exc_info.value.__cause__
    assert isinstance(cause, ValueError)
    assert 'Found incompatible entries for column "x"' in str(cause)
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
Example #5
def test_append_partitions(driver, function_store, existing_cube):
    partitions_source_1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("source"),
            function_store()).partitions.keys())
    partitions_enrich_1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("enrich"),
            function_store()).partitions.keys())

    df_source = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v1": [20, 21, 22, 23],
        "i1": [20, 21, 22, 23],
    })

    result = driver(data={"source": df_source},
                    cube=existing_cube,
                    store=function_store)

    assert set(result.keys()) == {"source"}

    ds_source = result["source"]
    ds_enrich = DatasetMetadata.load_from_store(
        existing_cube.ktk_dataset_uuid("enrich"), function_store())

    partitions_source_2 = set(ds_source.partitions.keys())
    partitions_enrich_2 = set(ds_enrich.partitions.keys())

    assert len(partitions_source_2) > len(partitions_source_1)
    assert partitions_source_1.issubset(partitions_source_2)

    assert partitions_enrich_2 == partitions_enrich_1
Example #6
def test_fail_partition_on_4(driver, function_store):
    df_source = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v2": [20, 21, 22, 23]}
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )

    with pytest.raises(
        ValueError, match="Unspecified but provided partition columns in enrich: p"
    ):
        driver(
            data={"source": df_source, "enrich": df_enrich},
            cube=cube,
            store=function_store,
            partition_on={"enrich": []},
        )
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
Example #7
def test_fail_nondistinc_payload(driver, function_store):
    """
    This would lead to problems during the query phase.
    """
    df_source = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [20, 21, 22, 23]}
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    with pytest.raises(MultiTableCommitAborted) as exc_info:
        driver(
            data={"source": df_source, "enrich": df_enrich},
            cube=cube,
            store=function_store,
        )
    cause = exc_info.value.__cause__
    assert isinstance(cause, ValueError)
    assert "Found columns present in multiple datasets" in str(cause)
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
Example #8
def test_create_dataset_header(store, metadata_storage_format, frozen_time):
    table_meta = {"table": make_meta(pd.DataFrame({"col": [1]}), origin="1")}
    new_dataset = create_empty_dataset_header(
        store=store,
        table_meta=table_meta,
        dataset_uuid="new_dataset_uuid",
        metadata_storage_format=metadata_storage_format,
        metadata_version=4,
    )

    expected_dataset = DatasetMetadata(
        uuid="new_dataset_uuid",
        metadata_version=4,
        explicit_partitions=False,
        table_meta=table_meta,
    )
    assert new_dataset == expected_dataset

    storage_keys = list(store.keys())
    assert len(storage_keys) == 2

    loaded = DatasetMetadata.load_from_store(store=store,
                                             uuid="new_dataset_uuid")
    assert loaded == expected_dataset

    # If this read succeeds, the schema was written
    read_schema_metadata(dataset_uuid=new_dataset.uuid,
                         store=store,
                         table="table")
Example #9
def test_copy(frozen_time):
    ds = DatasetMetadata(
        uuid="uuid",
        partitions={"partition_label": {
            "files": {}
        }},
        metadata={"some": "metadata"},
        indices={
            "column":
            ExplicitSecondaryIndex(column="column",
                                   index_dct={1: ["partition_label"]})
        },
        explicit_partitions=True,
        partition_keys=["P", "L"],
    )
    new_ds = ds.copy()
    # Check if the copy is identical
    assert new_ds == ds
    # ... but not the same object
    assert id(new_ds) != id(ds)

    new_ds = ds.copy(metadata={"new": "metadata"})
    assert id(new_ds) != id(ds)
    assert new_ds.metadata == {
        "new": "metadata",
        # The DatasetMetadata constructor ensures that the creation time is
        # always present.
        "creation_time": "2000-01-01T01:01:01.000001",
    }
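
As a side note, the override semantics shown above appear to leave every field that is not passed to copy() untouched. A small hedged sketch building on the `ds` object from this test; the carried-over attributes are an assumption based on the equality check at the start of the test:

# Hypothetical continuation: only `metadata` was overridden, so the remaining
# attributes should still come from the original dataset.
new_ds = ds.copy(metadata={"new": "metadata"})
assert new_ds.uuid == ds.uuid
assert new_ds.partition_keys == ds.partition_keys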
Example #10
def test_commit_dataset_from_metapartition(dataset_function, store):
    new_data = [
        pd.DataFrame(
            OrderedDict([
                ("P", [5]),
                ("L", [5]),
                ("TARGET", [5]),
                ("DATE", [datetime.date(2016, 3, 23)]),
            ]))
    ]
    new_partition = write_single_partition(store=store,
                                           dataset_uuid=dataset_function.uuid,
                                           data=new_data)
    pre_commit_dataset = DatasetMetadata.load_from_store(
        uuid=dataset_function.uuid, store=store)
    # The new partition is not committed yet, so the stored dataset still
    # equals the fixture (metadata ordering does not affect equality).
    assert pre_commit_dataset == dataset_function

    updated_dataset = commit_dataset(
        store=store,
        dataset_uuid=dataset_function.uuid,
        new_partitions=new_partition,
        delete_scope=None,
        partition_on=None,
    )
    assert updated_dataset != dataset_function

    assert updated_dataset.uuid == dataset_function.uuid
    assert len(
        updated_dataset.partitions) == len(dataset_function.partitions) + 1

    # Ensure that the new dataset is actually the one persisted in the store
    loaded_dataset = DatasetMetadata.load_from_store(uuid=updated_dataset.uuid,
                                                     store=store)
    assert loaded_dataset == updated_dataset

    # Read the data and check whether the rows above are included.
    # This checks whether all necessary information was updated in the header
    # (e.g. files attributes of the partitions)
    actual = read_table(store=store, dataset_uuid=dataset_function.uuid)
    df_expected = pd.DataFrame(
        OrderedDict([
            (
                "DATE",
                [
                    datetime.date(2016, 3, 23),
                    datetime.date(2010, 1, 1),
                    datetime.date(2009, 12, 31),
                ],
            ),
            ("L", [5, 1, 2]),
            ("P", [5, 1, 2]),
            ("TARGET", [5, 1, 2]),
        ]))
    actual = actual.sort_values("DATE", ascending=False).reset_index(drop=True)

    assert_frame_equal(df_expected, actual)
Example #11
def test_roundtrip_empty_with_store(store, metadata_version):
    dataset_uuid = "dataset_uuid"
    dataset = DatasetMetadata(uuid=dataset_uuid,
                              metadata_version=metadata_version)
    store.put(
        "{}.by-dataset-metadata.json".format(dataset_uuid),
        simplejson.dumps(dataset.to_dict()).encode("utf-8"),
    )
    assert dataset == DatasetMetadata.load_from_store(dataset_uuid, store)
Example #12
def test_existing_indices_are_added_when_missing_in_cube():
    """
    Test that indices already existing in the dataset are added to the validated cube
    """
    source_metadata = DatasetMetadata.from_dict({
        "dataset_uuid":
        "source",
        "dataset_metadata_version":
        4,
        "schema":
        FakeExtraTableMetadata(),
        "partition_keys": ["p"],
        "indices": {
            "d1": {
                "1": ["part_1"]
            },
            "d2": {
                "1": ["part_1"]
            },
            "i1": {
                "1": ["part_1"]
            },
            "i2": {
                "1": ["part_1"]
            },
        },
    })
    extra_metadata = DatasetMetadata.from_dict({
        "dataset_uuid":
        "extra",
        "dataset_metadata_version":
        4,
        "schema":
        FakeExtraTableMetadata(),
        "partition_keys": ["p"],
        "indices": {
            "i1": {
                "1": ["part_1"]
            }
        },
    })
    cube = Cube(
        dimension_columns=["d1", "d2"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i1"],
    )

    validated_cube = ensure_valid_cube_indices(
        {
            "source": source_metadata,
            "extra": extra_metadata
        }, cube)

    assert validated_cube.index_columns == {"i1", "i2"}
Example #13
def test_no_indices_are_suppressed_when_they_already_exist():
    """
    Test that indices marked as suppressed in the cube are not actually suppressed
    when they are already present in the dataset
    """
    source_metadata = DatasetMetadata.from_dict({
        "dataset_uuid":
        "source",
        "dataset_metadata_version":
        4,
        "schema":
        FakeSeedTableMetadata(),
        "partition_keys": ["p"],
        "indices": {
            "d1": {
                "1": ["part_1"]
            },
            "d2": {
                "1": ["part_1"]
            },
            "i1": {
                "1": ["part_1"]
            },
        },
    })
    extra_metadata = DatasetMetadata.from_dict({
        "dataset_uuid":
        "extra",
        "dataset_metadata_version":
        4,
        "schema":
        FakeExtraTableMetadata(),
        "partition_keys": ["p"],
        "indices": {
            "i1": {
                "1": ["part_1"]
            }
        },
    })
    cube = Cube(
        dimension_columns=["d1", "d2"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        suppress_index_on=["d1", "d2"],
    )

    validated_cube = ensure_valid_cube_indices(
        {
            "source": source_metadata,
            "extra": extra_metadata
        }, cube)

    assert validated_cube.suppress_index_on == frozenset()
Example #14
def test_simple(cli, built_cube, skv, store):
    ds = DatasetMetadata.load_from_store(built_cube.ktk_dataset_uuid("source"),
                                         store)
    assert "v1" not in ds.indices

    result = cli("--store=cubes", "my_cube", "index", "source", "v1")
    assert result.exit_code == 0

    ds = DatasetMetadata.load_from_store(built_cube.ktk_dataset_uuid("source"),
                                         store)
    assert "v1" in ds.indices
Example #15
def test_cube_with_valid_indices_is_not_modified_by_validation():
    """
    Test that a cube with valid indices is not modified by `ensure_valid_cube_indices`
    """
    source_metadata = DatasetMetadata.from_dict({
        "dataset_uuid":
        "source",
        "dataset_metadata_version":
        4,
        "schema":
        FakeSeedTableMetadata(),
        "partition_keys": ["p"],
        "indices": {
            "d1": {
                "1": ["part_1"]
            },
            "d2": {
                "1": ["part_1"]
            },
            "i1": {
                "1": ["part_1"]
            },
        },
    })
    extra_metadata = DatasetMetadata.from_dict({
        "dataset_uuid":
        "extra",
        "dataset_metadata_version":
        4,
        "schema":
        FakeExtraTableMetadata(),
        "partition_keys": ["p"],
        "indices": {
            "i1": {
                "1": ["part_1"]
            }
        },
    })
    cube = Cube(
        dimension_columns=["d1", "d2"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i1"],
    )

    validated_cube = ensure_valid_cube_indices(
        {
            "source": source_metadata,
            "extra": extra_metadata
        }, cube)

    assert validated_cube == cube
Example #16
def test_raises_when_cube_defines_index_not_in_dataset():
    """
    Test that a `ValueError` is raised when the cube defines an index that is not part of a dataset
    """
    source_metadata = DatasetMetadata.from_dict({
        "dataset_uuid":
        "source",
        "dataset_metadata_version":
        4,
        "schema":
        FakeSeedTableMetadata(),
        "partition_keys": ["p"],
        "indices": {
            "d1": {
                "1": ["part_1"]
            },
            "d2": {
                "1": ["part_1"]
            },
            "i1": {
                "1": ["part_1"]
            },
        },
    })
    extra_metadata = DatasetMetadata.from_dict({
        "dataset_uuid":
        "extra",
        "dataset_metadata_version":
        4,
        "schema":
        FakeExtraTableMetadata(),
        "partition_keys": ["p"],
        "indices": {
            "i1": {
                "1": ["part_1"]
            }
        },
    })
    cube = Cube(
        dimension_columns=["d1", "d2"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i2"],
    )

    with pytest.raises(ValueError):
        ensure_valid_cube_indices(
            {
                "source": source_metadata,
                "extra": extra_metadata
            }, cube)
Example #17
def test_complicated_uuid():
    expected = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "uuid+namespace-attribute12_underscored",
        "partitions": {
            "part_1": {
                "files": {
                    "core": "file.parquet"
                }
            }
        },
    }
    DatasetMetadata.from_dict(expected)
Example #18
def test_fail_all_empty(driver, function_store):
    """
    Might happen due to DB-based filters.
    """
    df = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]}
    ).loc[[]]
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    with pytest.raises(ValueError) as exc:
        driver(data=df, cube=cube, store=function_store)
    assert "Cannot write empty datasets: seed" in str(exc.value)

    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
Example #19
def test_indices(driver, function_store, existing_cube):
    idx1_1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("source"), function_store()
        )
        .load_all_indices(function_store())
        .indices["i1"]
        .index_dct.keys()
    )
    idx2_1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("enrich"), function_store()
        )
        .load_all_indices(function_store())
        .indices["i2"]
        .index_dct.keys()
    )

    df_source = pd.DataFrame(
        {
            "x": [0, 1, 2, 3],
            "p": [0, 0, 1, 1],
            "v1": [20, 21, 22, 23],
            "i1": [20, 21, 22, 23],
        }
    )

    result = driver(
        data={"source": df_source}, cube=existing_cube, store=function_store
    )

    assert set(result.keys()) == {"source"}

    ds_source = result["source"]
    ds_enrich = DatasetMetadata.load_from_store(
        existing_cube.ktk_dataset_uuid("enrich"), function_store()
    )

    idx1_2 = set(
        ds_source.load_all_indices(function_store()).indices["i1"].index_dct.keys()
    )
    idx2_2 = set(
        ds_enrich.load_all_indices(function_store()).indices["i2"].index_dct.keys()
    )

    assert idx1_1.issubset(idx1_2)
    assert len(idx1_1) < len(idx1_2)

    assert idx2_1 == idx2_2
Example #20
def test_overlap_keyspace(store, metadata_version):
    dataset_uuid1 = "uuid+namespace-attribute12_underscored"
    dataset_uuid2 = "uuid+namespace-attribute12_underscored_ext"
    table = "core"

    for dataset_uuid in (dataset_uuid1, dataset_uuid2):
        partition0 = "location=L-0".format(dataset_uuid)
        partition0_key = "{}/{}/{}/data.parquet".format(
            dataset_uuid, table, partition0)
        metadata = {
            "dataset_metadata_version": metadata_version,
            "dataset_uuid": dataset_uuid,
        }

        # put two partitions for two tables each to store
        store.put(
            "{}{}.json".format(dataset_uuid, naming.METADATA_BASE_SUFFIX),
            simplejson.dumps(metadata).encode("utf-8"),
        )
        store.put(partition0_key, b"test")
        store_schema_metadata(
            make_meta(pd.DataFrame({"location": ["L-0"]}), origin="1"),
            dataset_uuid,
            store,
            "core",
        )

    for dataset_uuid in (dataset_uuid1, dataset_uuid2):
        partition0_label = "location=L-0/data".format(dataset_uuid)
        partition0_key = "{}/{}/{}.parquet".format(dataset_uuid, table,
                                                   partition0_label)
        expected_partitions = {
            "location=L-0/data": {
                "files": {
                    "core": partition0_key
                }
            }
        }
        expected_indices = {"location": {"L-0": ["location=L-0/data"]}}
        assert DatasetMetadata.storage_keys(dataset_uuid, store) == [
            "{}{}.json".format(dataset_uuid, naming.METADATA_BASE_SUFFIX),
            _get_common_metadata_key(dataset_uuid, "core"),
            partition0_key,
        ]
        dmd = DatasetMetadata.load_from_store(dataset_uuid, store)
        dmd = dmd.load_partition_indices()
        dmd_dict = dmd.to_dict()
        assert dmd_dict["partitions"] == expected_partitions
        assert dmd_dict["indices"] == expected_indices
Example #21
def validate_partition_keys(
    dataset_uuid,
    store,
    ds_factory,
    default_metadata_version,
    partition_on,
    **load_kwargs,
):
    if ds_factory or DatasetMetadata.exists(dataset_uuid,
                                            _instantiate_store(store)):
        ds_factory = _ensure_factory(
            dataset_uuid=dataset_uuid,
            store=store,
            factory=ds_factory,
            load_dataset_metadata=load_kwargs.pop("load_dataset_metadata",
                                                  True),
        )

        ds_metadata_version = ds_factory.metadata_version
        if partition_on:
            if not isinstance(partition_on, list):
                partition_on = [partition_on]
            if partition_on != ds_factory.partition_keys:
                raise ValueError(
                    "Incompatible set of partition keys encountered. "
                    "Input partitioning was `{}` while actual dataset was `{}`"
                    .format(partition_on, ds_factory.partition_keys))
        else:
            partition_on = ds_factory.partition_keys
    else:
        ds_factory = None
        ds_metadata_version = default_metadata_version
    return ds_factory, ds_metadata_version, partition_on
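
A hedged usage sketch for the helper above: a write path could call it to resolve the dataset factory and the effective partitioning before committing new partitions. The store factory and dataset UUID below are illustrative placeholders, not part of the original example:

# Hypothetical call site; `my_store_factory` and "my_dataset_uuid" are placeholders.
ds_factory, metadata_version, partition_on = validate_partition_keys(
    dataset_uuid="my_dataset_uuid",
    store=my_store_factory,
    ds_factory=None,
    default_metadata_version=4,
    partition_on=["P"],
)
# If the dataset already exists, `partition_on` must match its partition keys;
# otherwise the requested partitioning and the default metadata version are used.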
Example #22
def test_store_dataset_from_partitions(meta_partitions_files_only, store,
                                       frozen_time):
    dataset = store_dataset_from_partitions(
        partition_list=meta_partitions_files_only,
        dataset_uuid="dataset_uuid",
        store=store,
        dataset_metadata={"some": "metadata"},
    )

    expected_metadata = {
        "some": "metadata",
        "creation_time": TIME_TO_FREEZE_ISO
    }

    assert dataset.metadata == expected_metadata
    assert sorted(dataset.partitions.values(),
                  key=lambda x: x.label) == sorted(
                      [mp.partition for mp in meta_partitions_files_only],
                      key=lambda x: x.label)
    assert dataset.uuid == "dataset_uuid"

    store_files = list(store.keys())
    # Dataset metadata: 1 file
    expected_number_files = 1
    # common metadata for v4 datasets
    expected_number_files += 1
    assert len(store_files) == expected_number_files

    # Ensure the dataset can be loaded properly
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    assert dataset == stored_dataset
Example #23
def test_store_dataframes_as_dataset_mp_partition_on_none(
    metadata_version, store, store_factory, bound_store_dataframes
):
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )

    df2 = pd.DataFrame({"P": np.arange(0, 10), "info": np.arange(100, 110)})

    mp = MetaPartition(
        label=gen_uuid(),
        data={"core": df, "helper": df2},
        metadata_version=metadata_version,
    )

    df_list = [None, mp]
    dataset = bound_store_dataframes(
        df_list,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
        partition_on=["P"],
    )

    assert isinstance(dataset, DatasetMetadata)
    assert dataset.partition_keys == ["P"]
    assert len(dataset.partitions) == 10
    assert dataset.metadata_version == metadata_version

    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)

    assert dataset == stored_dataset
Example #24
def test_store_dataframes_as_dataset_list_input(
    store_factory, metadata_version, bound_store_dataframes
):
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    df2 = pd.DataFrame(
        {
            "P": np.arange(100, 110),
            "L": np.arange(100, 110),
            "TARGET": np.arange(10, 20),
        }
    )
    df_list = [df, df2]

    dataset = bound_store_dataframes(
        df_list,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 2
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store_factory())
    assert dataset == stored_dataset
Example #25
def test_cube_blacklist_dimension_index(function_store, driver):

    cube1 = Cube(
        dimension_columns=["A", "B"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    df_1 = pd.DataFrame({"A": range(10), "P": 1, "B": 1, "payload": ""})
    build_cube(
        data={"source": df_1},
        cube=cube1,
        store=function_store,
        metadata={"source": {"meta_at_create": "data"}},
    )

    cube2 = Cube(
        dimension_columns=["A", "B"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
        suppress_index_on=["B"],
    )
    df_2 = pd.DataFrame({"A": range(10), "P": 1, "B": 2, "payload": ""})
    driver(
        data={"source": df_2}, cube=cube2, store=function_store, remove_conditions=None
    )

    dataset_uuid = cube2.ktk_dataset_uuid(cube2.seed_dataset)
    dm = DatasetMetadata.load_from_store(
        dataset_uuid, function_store(), load_all_indices=True
    )
    obs_values = dm.indices["B"].observed_values()

    assert sorted(obs_values) == [1, 2]
Example #26
def test_all(cli, built_cube, skv, store):
    result = cli("--store=cubes", "my_cube", "index", "source", "*")
    assert result.exit_code == 0

    ds = DatasetMetadata.load_from_store(built_cube.ktk_dataset_uuid("source"),
                                         store)
    assert set(ds.indices.keys()) == set(get_dataset_columns(ds))
Example #27
def test_update_secondary_indices_subset(store_factory, bound_update_dataset):
    df1 = pd.DataFrame({"A": range(10), "indexed": 1})
    dataset_uuid = "dataset_uuid"
    bound_update_dataset(df1,
                         dataset_uuid=dataset_uuid,
                         store=store_factory,
                         secondary_indices="indexed")

    df2 = pd.DataFrame({"A": range(10), "indexed": 2})
    # secondary index is omitted. Kartothek should pick it up regardless
    bound_update_dataset(df2, dataset_uuid=dataset_uuid, store=store_factory)

    dm = DatasetMetadata.load_from_store(dataset_uuid,
                                         store_factory(),
                                         load_all_indices=True)
    obs_values = dm.indices["indexed"].observed_values()

    assert sorted(obs_values) == [1, 2]

    with pytest.raises(ValueError, match="Incorrect indices provided"):
        # Declaring a secondary index that differs from the existing one must fail
        bound_update_dataset(df2,
                             dataset_uuid=dataset_uuid,
                             store=store_factory,
                             secondary_indices="A")
Example #28
def test_store_dataframes_as_dataset_no_pipeline_partition_on(store):
    df = pd.DataFrame({
        "P": np.arange(0, 10),
        "L": np.arange(0, 10),
        "TARGET": np.arange(10, 20)
    })

    df2 = pd.DataFrame({"P": np.arange(0, 10), "info": np.arange(100, 110)})

    dataset = store_dataframes_as_dataset(
        store=store,
        dataset_uuid="dataset_uuid",
        dfs={
            "core": df,
            "helper": df2
        },
        partition_on="P",
        metadata_version=4,
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 10

    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)

    assert dataset == stored_dataset
Example #29
def test_store_dataframes_as_dataset(store_factory, metadata_version,
                                     bound_store_dataframes):
    df = pd.DataFrame({
        "P": np.arange(0, 10),
        "L": np.arange(0, 10),
        "TARGET": np.arange(10, 20)
    })

    df_helper = pd.DataFrame({
        "P": np.arange(0, 10),
        "info": string.ascii_lowercase[:10]
    })

    df_list = [
        {
            "label": "cluster_1",
            "data": [("core", df.copy(deep=True)), ("helper", df_helper)],
        },
        {
            "label": "cluster_2",
            "data": [("core", df.copy(deep=True)), ("helper", df_helper)],
        },
    ]

    dataset = bound_store_dataframes(
        df_list,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
        secondary_indices=["P"],
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 2

    assert "P" in dataset.indices

    store = store_factory()
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    assert dataset.uuid == stored_dataset.uuid
    assert dataset.metadata == stored_dataset.metadata
    assert dataset.partitions == stored_dataset.partitions

    index_dct = stored_dataset.indices["P"].load(store).index_dct
    assert sorted(index_dct.keys()) == list(range(0, 10))
    assert any(
        [sorted(p) == ["cluster_1", "cluster_2"] for p in index_dct.values()])

    df_stored = DataFrameSerializer.restore_dataframe(
        key=dataset.partitions["cluster_1"].files["core"], store=store)
    pdt.assert_frame_equal(df, df_stored)
    df_stored = DataFrameSerializer.restore_dataframe(
        key=dataset.partitions["cluster_2"].files["core"], store=store)
    pdt.assert_frame_equal(df, df_stored)
    df_stored = DataFrameSerializer.restore_dataframe(
        key=dataset.partitions["cluster_1"].files["helper"], store=store)
    pdt.assert_frame_equal(df_helper, df_stored)
    df_stored = DataFrameSerializer.restore_dataframe(
        key=dataset.partitions["cluster_2"].files["helper"], store=store)
    pdt.assert_frame_equal(df_helper, df_stored)
Example #30
def test_store_dataframes_as_dataset_mp(metadata_version, store):
    df = pd.DataFrame({
        "P": np.arange(0, 10),
        "L": np.arange(0, 10),
        "TARGET": np.arange(10, 20)
    })

    df2 = pd.DataFrame({"P": np.arange(0, 10), "info": np.arange(100, 110)})

    mp = MetaPartition(
        label=gen_uuid(),
        data={
            "core": df,
            "helper": df2
        },
        metadata_version=metadata_version,
    )

    dataset = store_dataframes_as_dataset(
        store=store,
        dataset_uuid="dataset_uuid",
        dfs=mp,
        metadata_version=metadata_version,
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 1
    assert dataset.metadata_version == metadata_version

    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)

    assert dataset == stored_dataset