Example #1
def test_partition_on_raises_no_cols_left(empty):
    original_df = pd.DataFrame({"test": [1, 2, 3]})
    if empty:
        original_df = original_df.loc[[]]
    mp = MetaPartition(
        label="label_1", file="file", data=original_df, metadata_version=4
    )
    with pytest.raises(ValueError) as e:
        mp.partition_on(["test"])
    assert str(e.value) == "No data left to save outside partition columns"
Example #2
def test_partition_on_raises_pocols_missing(empty):
    original_df = pd.DataFrame({"test": [1, 2, 3]})
    if empty:
        original_df = original_df.loc[[]]
    mp = MetaPartition(
        label="label_1", file="file", data=original_df, metadata_version=4
    )
    with pytest.raises(ValueError) as e:
        mp.partition_on(["test", "foo", "bar"])
    assert str(e.value) == "Partition column(s) missing: bar, foo"
Example #3
def test_partition_on_scalar_intermediate(df_not_nested):
    """
    Test against a bug where grouping leaves a scalar value
    """
    assert len(df_not_nested) == 1
    mp = MetaPartition(label="somelabel",
                       data={"table": df_not_nested},
                       metadata_version=4)
    for col in df_not_nested:
        if col == "byte":
            with pytest.raises(UnicodeDecodeError):
                mp.partition_on(col)
        else:
            new_mp = mp.partition_on(col)
            assert len(new_mp) == 1
Example #4
def test_partition_on_one_level_ts():
    original_df = pd.DataFrame({
        "test": [
            pd.Timestamp("2001-01-01"),
            pd.Timestamp("2001-01-02"),
            pd.Timestamp("2001-01-03"),
        ],
        "some_values": [1, 2, 3],
    })
    mp = MetaPartition(label="label_1",
                       file="file",
                       data=original_df,
                       metadata_version=4)

    new_mp = mp.partition_on(["test"])

    assert len(new_mp.metapartitions) == 3

    labels = set()
    for mp in new_mp:
        labels.add(mp.label)
        assert len(mp.data) == 1
        assert mp.data is not None
        df = mp.data
        assert df._is_view

        # try to be agnostic about the order
        assert len(df) == 1
        assert "test" not in df
    expected_labels = set([
        "test=2001-01-01%2000%3A00%3A00/label_1",
        "test=2001-01-02%2000%3A00%3A00/label_1",
        "test=2001-01-03%2000%3A00%3A00/label_1",
    ])
    assert labels == expected_labels
Example #5
def test_partition_urlencode():
    original_df = pd.DataFrame({"ÖŒå": [1, 2, 3], "some_values": [1, 2, 3]})
    mp = MetaPartition(label="label_1", data=original_df, metadata_version=4)

    new_mp = mp.partition_on(["ÖŒå"])

    assert len(new_mp.metapartitions) == 3

    labels = set()
    for mp in new_mp:
        labels.add(mp.label)
        assert len(mp.data) == 1
        assert mp.data is not None
        df = mp.data
        assert df._is_view

        # try to be agnostic about the order
        assert len(df) == 1
        assert "ÖŒå" not in df
    expected_labels = set(
        [
            "%C3%96%C5%92%C3%A5=1/label_1",
            "%C3%96%C5%92%C3%A5=2/label_1",
            "%C3%96%C5%92%C3%A5=3/label_1",
        ]
    )
    assert labels == expected_labels
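The encoded prefix in the expected labels above is simply the percent-encoded UTF-8 form of the column name; a quick sketch (not part of the test suite) to see where it comes from:

from urllib.parse import quote

# Illustrative only: reproduce the encoded column-name prefix used in the labels above.
encoded = quote("ÖŒå", safe="")
assert encoded == "%C3%96%C5%92%C3%A5"
print(encoded + "=1/label_1")  # matches the first expected label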
Example #6
def test_partition_on_one_level():
    original_df = pd.DataFrame({"test": [1, 2, 3], "some_values": [1, 2, 3]})
    mp = MetaPartition(
        label="label_1",
        files={"core": "file"},
        data={"core": original_df},
        dataset_metadata={"dataset": "metadata"},
        metadata_version=4,
    )

    new_mp = mp.partition_on(["test"])

    assert len(new_mp.metapartitions) == 3

    labels = set()
    for mp in new_mp:
        labels.add(mp.label)
        assert len(mp.data) == 1
        assert "core" in mp.data
        df = mp.data["core"]
        assert df._is_view

        # try to be agnostic about the order
        assert len(df) == 1
        assert "test" not in df
    expected_labels = set(
        ["test=1/label_1", "test=2/label_1", "test=3/label_1"])
    assert labels == expected_labels
Example #7
def test_partition_on_roundtrip(store):
    original_df = pd.DataFrame(
        OrderedDict([("test", [1, 2, 3]), ("some_values", [1, 2, 3])])
    )
    mp = MetaPartition(label="label_1", data=original_df, metadata_version=4)

    new_mp = mp.partition_on(["test"])
    new_mp = new_mp.store_dataframes(store=store, dataset_uuid="some_uuid")
    store_schema_metadata(new_mp.schema, "some_uuid", store)
    # Test once immediately after dropping and once more below with fresh metapartitions to check table meta reloading
    new_mp = new_mp.load_dataframes(store=store)
    assert len(new_mp.metapartitions) == 3
    dfs = []
    for internal_mp in new_mp:
        dfs.append(internal_mp.data)
    actual_df = pd.concat(dfs).sort_values(by="test").reset_index(drop=True)
    pdt.assert_frame_equal(original_df, actual_df)

    for i in range(1, 4):
        # Check with fresh metapartitions
        new_mp = MetaPartition(
            label=f"test={i}/label_1",
            file=f"some_uuid/table/test={i}/label_1.parquet",
            metadata_version=4,
        )
        new_mp = new_mp.load_dataframes(store=store)

        actual_df = new_mp.data

        expected_df = pd.DataFrame(OrderedDict([("test", [i]), ("some_values", [i])]))
        pdt.assert_frame_equal(expected_df, actual_df)
Example #8
def test_partition_on_with_primary_index(df_not_nested):
    mp = MetaPartition(
        label="pkey=1/base_label",
        data=df_not_nested,
        partition_keys=["pkey"],
        metadata_version=4,
    )
    new = mp.partition_on(["pkey", "int64"])

    split_label = new.label.split("/")

    assert len(split_label) == 3
    assert split_label[0] == "pkey=1"
    assert split_label[1] == "int64=1"
    assert split_label[2] == "base_label"

    assert mp == mp.partition_on(["pkey"])
Example #9
def test_partition_on_scalar_intermediate(df_not_nested, col):
    """
    Test against a bug where grouping leaves a scalar value
    """
    assert len(df_not_nested) == 1
    mp = MetaPartition(label="somelabel", data=df_not_nested, metadata_version=4)
    new_mp = mp.partition_on(col)
    assert len(new_mp) == 1
Example #10
def test_partition_on_explicit_index():
    original_df = pd.DataFrame({
        "level1": [1, 2, 1, 2, 1, 2],
        "level2": [1, 1, 1, 2, 2, 2],
        "explicit_index_col": np.arange(0, 6),
    })
    mp = MetaPartition(
        label="label_1",
        file="file",
        data=original_df,
        indices={
            "explicit_index_col":
            {value: ["label_1"]
             for value in np.arange(0, 6)}
        },
        metadata_version=4,
    )
    new_mp = mp.partition_on(["level1", "level2"])
    assert len(new_mp) == 4

    expected_indices = {
        "explicit_index_col":
        ExplicitSecondaryIndex(
            "explicit_index_col",
            {
                0: ["level1=1/level2=1/label_1"],
                2: ["level1=1/level2=1/label_1"]
            },
        )
    }
    assert expected_indices == new_mp["level1=1/level2=1/label_1"].indices

    expected_indices = {
        "explicit_index_col":
        ExplicitSecondaryIndex("explicit_index_col",
                               {4: ["level1=1/level2=2/label_1"]})
    }
    assert expected_indices == new_mp["level1=1/level2=2/label_1"].indices

    expected_indices = {
        "explicit_index_col":
        ExplicitSecondaryIndex("explicit_index_col",
                               {1: ["level1=2/level2=1/label_1"]})
    }
    assert expected_indices == new_mp["level1=2/level2=1/label_1"].indices

    expected_indices = {
        "explicit_index_col":
        ExplicitSecondaryIndex(
            "explicit_index_col",
            {
                3: ["level1=2/level2=2/label_1"],
                5: ["level1=2/level2=2/label_1"]
            },
        )
    }
    assert expected_indices == new_mp["level1=2/level2=2/label_1"].indices
Example #11
def test_partition_on_valid_schemas():
    """
    Ensure that partitioning is possible even if the output schemas of the
    sub partitions may be different
    """
    df = pd.DataFrame({"partition_col": [0, 1], "values": [None, "str"]})
    mp = MetaPartition(label="base_label", data=df, metadata_version=4)
    mp = mp.partition_on(["partition_col"])
    assert len(mp) == 2
    expected_meta = make_meta(df, origin="1", partition_keys="partition_col")
    assert mp.schema == expected_meta
Example #12
def test_partition_on_keeps_table_name():
    mp = MetaPartition(
        label="label_1",
        data=pd.DataFrame({
            "P": [1, 2, 1, 2],
            "L": [1, 1, 2, 2]
        }),
        table_name="non-default-name",
    )
    repartitioned_mp = mp.partition_on(["P"])
    assert repartitioned_mp.table_name == "non-default-name"
Example #13
def test_partition_on_nested():
    original_df = pd.DataFrame(
        {
            "level1": [1, 2, 3, 1, 2, 3],
            "level2": [1, 1, 1, 2, 2, 2],
            "no_index_col": np.arange(0, 6),
        }
    )
    mp = MetaPartition(
        label="label_1",
        files={"core": "file"},
        data={"core": original_df},
        dataset_metadata={"dataset": "metadata"},
        metadata_version=4,
    )
    mp2 = MetaPartition(
        label="label_2",
        files={"core": "file"},
        data={"core": original_df},
        dataset_metadata={"dataset": "metadata"},
        metadata_version=4,
    )
    mp = mp.add_metapartition(mp2)
    new_mp = mp.partition_on(["level1", "level2"])
    assert len(new_mp.metapartitions) == 12

    labels = []
    for mp in new_mp:
        labels.append(mp.label)
        assert len(mp.data) == 1
        assert "core" in mp.data
        df = mp.data["core"]
        assert df._is_view

        # try to be agnostic about the order
        assert len(df) == 1
        assert "level1" not in df
        assert "level2" not in df
        assert "no_index_col" in df
    expected_labels = [
        "level1=1/level2=1/label_1",
        "level1=1/level2=2/label_1",
        "level1=2/level2=1/label_1",
        "level1=2/level2=2/label_1",
        "level1=3/level2=1/label_1",
        "level1=3/level2=2/label_1",
        "level1=1/level2=1/label_2",
        "level1=1/level2=2/label_2",
        "level1=2/level2=1/label_2",
        "level1=2/level2=2/label_2",
        "level1=3/level2=1/label_2",
        "level1=3/level2=2/label_2",
    ]
    assert sorted(labels) == sorted(expected_labels)
Example #14
def store_dataframes_as_dataset(
    store,
    dataset_uuid,
    dfs,
    metadata=None,
    partition_on=None,
    df_serializer=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
):
    """
    Utility function to store a list of dataframes as a partitioned dataset with multiple tables (files).

    Useful for very small datasets where all data fits into memory.

    Parameters
    ----------
    dfs : dict of pd.DataFrame or pd.DataFrame
        The dataframe(s) to be stored. If only a single dataframe is passed, it will be stored as the `core` table.

    Returns
    -------
    The stored dataset

    """
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if isinstance(dfs, dict):
        dfs = {"data": [(table, df) for table, df in dfs.items()]}

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    mp = parse_input_to_metapartition(dfs, metadata_version)

    if partition_on:
        mp = MetaPartition.partition_on(mp, partition_on)

    mps = mp.store_dataframes(store=store,
                              dataset_uuid=dataset_uuid,
                              df_serializer=df_serializer)

    return store_dataset_from_partitions(
        partition_list=mps,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_storage_format=metadata_storage_format,
    )
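A minimal usage sketch for this helper, assuming an in-memory simplekv store created via storefact (the store setup and the dataset UUID are illustrative, not part of the function above):

import pandas as pd
from storefact import get_store_from_url

# Illustrative in-memory store; any simplekv-compatible store accepted by Kartothek works.
store = get_store_from_url("hmemory://")
df = pd.DataFrame({"partition_col": [1, 1, 2], "values": [10, 20, 30]})

dataset = store_dataframes_as_dataset(
    store=store,
    dataset_uuid="example_uuid",
    dfs=df,                            # a single DataFrame is stored as the `core` table
    partition_on=["partition_col"],
)
# One partition per distinct partition_col value.
print(sorted(dataset.partitions))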
Example #15
def test_partition_on_stable_order():
    """
    Assert that the partition_on algorithm is stable with respect to row ordering
    """
    unique_values = 3
    total_values = 20
    random_index = np.repeat(
        np.arange(unique_values), int(np.ceil(total_values / unique_values))
    )[:total_values]
    np.random.shuffle(random_index)
    df = pd.DataFrame(
        {"partition_key": random_index, "sorted_col": range(total_values)}
    )
    mp = MetaPartition(label="label_1", data=df, metadata_version=4)
    new_mp = mp.partition_on("partition_key")
    for sub_mp in new_mp:
        sub_df = sub_mp.data
        assert sub_df.sorted_col.is_monotonic
Example #16
def test_partition_on_multiple_tables_empty_table():
    original_df = pd.DataFrame({"level1": [1, 2, 3], "no_index_col": np.arange(0, 3)})
    mp = MetaPartition(
        label="label_1",
        data=OrderedDict(
            [
                ("core", original_df),
                ("empty_table", pd.DataFrame(columns=["level1", "another_col"])),
            ]
        ),
        metadata_version=4,
    )
    new_mp = mp.partition_on("level1")

    labels = []
    for mp in new_mp:
        labels.append(mp.label)
        assert "empty_table" in mp.data
        assert mp.data["empty_table"].empty
        assert set(mp.data["empty_table"].columns) == {"another_col"}
Example #17
def test_partition_two_level():
    original_df = pd.DataFrame(
        {
            "level1": [1, 2, 3, 1, 2, 3],
            "level2": [1, 1, 1, 2, 2, 2],
            "no_index_col": np.arange(0, 6),
        }
    )
    mp = MetaPartition(
        label="label_1", file="file", data=original_df, metadata_version=4
    )

    new_mp = mp.partition_on(["level1", "level2"])
    assert len(new_mp.metapartitions) == 6

    labels = []
    for mp in new_mp:
        labels.append(mp.label)
        assert len(mp.data) == 1
        assert mp.data is not None
        df = mp.data
        assert df._is_view

        # try to be agnostic about the order
        assert len(df) == 1
        assert "level1" not in df
        assert "level2" not in df
        assert "no_index_col" in df
    expected_labels = [
        "level1=1/level2=1/label_1",
        "level1=1/level2=2/label_1",
        "level1=2/level2=1/label_1",
        "level1=2/level2=2/label_1",
        "level1=3/level2=1/label_1",
        "level1=3/level2=2/label_1",
    ]
    assert sorted(labels) == sorted(expected_labels)
Example #18
def prepare_data_for_ktk(df,
                         ktk_cube_dataset_id,
                         cube,
                         existing_payload,
                         partition_on,
                         consume_df=False):
    """
    Prepare data so it can be handed over to Kartothek.

    Some checks will be applied to the data to ensure it is sane.

    Parameters
    ----------
    df: pandas.DataFrame
        DataFrame to be passed to Kartothek.
    ktk_cube_dataset_id: str
        Ktk_cube dataset UUID (w/o cube prefix).
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    existing_payload: Set[str]
        Existing payload columns.
    partition_on: Iterable[str]
        Partition-on attribute for given dataset.
    consume_df: bool
        Whether the incoming DataFrame can be destroyed while processing it.

    Returns
    -------
    mp: kartothek.io_components.metapartition.MetaPartition
        Kartothek-ready MetaPartition, may be sentinel (aka empty and w/o label).

    Raises
    ------
    ValueError
        In case any of the sanity checks on the input data fail.
    """
    check_user_df(ktk_cube_dataset_id, df, cube, existing_payload,
                  partition_on)

    if (df is None) or df.empty:
        # fast-path for empty DF
        return MetaPartition(
            label=None,
            metadata_version=KTK_CUBE_METADATA_VERSION,
            partition_keys=list(partition_on),
        )

    # TODO: find a more elegant solution that works w/o copy
    df_orig = df
    df = df.copy()
    if consume_df:
        # the original df is still referenced in the parent scope, so drop it
        df_orig.drop(columns=df_orig.columns,
                     index=df_orig.index,
                     inplace=True)
    df_columns = list(df.columns)
    df_columns_set = set(df_columns)

    # normalize value order and reset index
    sort_keys = [
        col for col in itertools.chain(cube.partition_columns,
                                       cube.dimension_columns)
        if col in df_columns_set
    ]
    df = sort_dataframe(df=df, columns=sort_keys)

    # check duplicate cells
    _check_duplicates(ktk_cube_dataset_id, df, sort_keys, cube)

    # check+convert column names to unicode strings
    df.rename(columns={c: converter_str(c) for c in df_columns}, inplace=True)

    # create MetaPartition object for easier handling
    mp = MetaPartition(
        label=gen_uuid(),
        data=df,
        metadata_version=KTK_CUBE_METADATA_VERSION,
    )
    del df

    # partition data
    mp = mp.partition_on(list(partition_on))

    # reset indices again (because partition_on breaks it)
    for mp2 in mp:
        mp2.data.reset_index(drop=True, inplace=True)
        del mp2

    # calculate indices
    indices_to_build = set(cube.index_columns) & df_columns_set
    if ktk_cube_dataset_id == cube.seed_dataset:
        indices_to_build |= set(cube.dimension_columns) - set(
            cube.suppress_index_on)
    indices_to_build -= set(partition_on)

    mp = mp.build_indices(indices_to_build)

    return mp
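The reset_index loop in the middle of this function compensates for plain pandas behaviour: splitting a frame by group keeps each sub-frame's original, now non-contiguous, row index. A small sketch of that effect, independent of Kartothek:

import pandas as pd

df = pd.DataFrame({"p": [1, 2, 1], "x": [10, 20, 30]})
for _, sub in df.groupby("p"):
    print(sub.index.tolist())  # e.g. [0, 2] -- the original row labels survive the split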
Example #19
def test_partition_on_with_primary_index_invalid(df_not_nested):
    mp = MetaPartition(
        label="pkey=1/pkey2=2/base_label",
        data=df_not_nested,
        partition_keys=["pkey", "pkey2"],
        metadata_version=4,
    )
    with pytest.raises(ValueError, match="Incompatible"):
        mp.partition_on("int64")

    with pytest.raises(ValueError, match="Incompatible"):
        mp.partition_on(["int64", "pkey"])

    with pytest.raises(ValueError, match="Incompatible"):
        mp.partition_on(["pkey", "int64"])

    with pytest.raises(ValueError, match="Incompatible"):
        mp.partition_on(["pkey2", "pkey1", "int64"])

    mp.partition_on(["pkey", "pkey2"])
    mp.partition_on(["pkey", "pkey2", "int64"])