Example #1
def multipartition_cube(module_store, fullrange_data, fullrange_cube):
    """
    Build a copy of the full-range cube in which every dataset is written
    twice, once per ``z`` value (0 and 1).
    """

    def _gen(part):
        result = {}
        for dataset_id, df in fullrange_data.items():
            df = df.copy()
            df["z"] = part
            result[dataset_id] = df
        return result

    cube = fullrange_cube.copy(uuid_prefix="multipartition_cube")
    build_cube_from_bag(
        data=db.from_sequence([0, 1], partition_size=1).map(_gen),
        store=module_store,
        cube=cube,
        ktk_cube_dataset_ids=["seed", "enrich_dense", "enrich_sparse"],
    ).compute()
    return cube
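
The snippets in this section assume a shared test-module preamble that is not part of the extract. A minimal sketch, assuming kartothek's public module layout (the exact import paths and the fixture decoration are assumptions):

# Assumed module-level imports for the examples in this section.
import dask
import dask.bag as db
import dask.core
import pandas as pd
import pytest

from kartothek.core.cube.cube import Cube  # assumption: Cube lives here
from kartothek.io.dask.bag_cube import build_cube_from_bag  # assumption

# `multipartition_cube` above is presumably registered as a pytest fixture
# (e.g. decorated with @pytest.fixture); `module_store`, `fullrange_data`
# and `fullrange_cube` are fixtures defined elsewhere in the suite.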
Example #2
def test_multifile(driver, function_store):
    """
    Two bag partitions that target the same cube partition produce a single
    partition backed by two files; the stats driver must report both.
    """
    dfs = [pd.DataFrame({"x": [i], "p": [0], "v1": [10]}) for i in range(2)]
    cube = Cube(dimension_columns=["x"],
                partition_columns=["p"],
                uuid_prefix="cube")
    build_cube_from_bag(data=db.from_sequence(dfs, partition_size=1),
                        cube=cube,
                        store=function_store).compute()

    result = driver(cube=cube, store=function_store)

    assert set(result.keys()) == {cube.seed_dataset}
    stats_seed = result[cube.seed_dataset]
    assert stats_seed["partitions"] == 1
    assert stats_seed["files"] == 2
    assert stats_seed["rows"] == 2
    assert stats_seed["blobsize"] > 0
Example #3
def test_function_executed_once(driver, function_store, driver_name,
                                skip_eager):
    """
    Test that the payload function is only executed once per branch.

    This was a bug in the dask_bag backend.
    """
    if driver_name == "dask_dataframe":
        pytest.skip("not relevant for dask.dataframe")

    df_source1 = pd.DataFrame({"x": [0, 1], "p": [0, 0], "v1": [10, 11]})
    df_source2 = pd.DataFrame({"x": [2, 3], "p": [1, 1], "v1": [12, 13]})
    df_enrich1 = pd.DataFrame({"x": [0, 1], "p": [0, 0], "v2": [20, 21]})
    df_enrich2 = pd.DataFrame({"x": [2, 3], "p": [1, 1], "v2": [22, 23]})

    dfs = [
        {
            "source": df_source1,
            "enrich": df_enrich1
        },
        {
            "source": df_source2,
            "enrich": df_enrich2
        },
    ]

    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )

    if driver_name in ("dask_bag_bs1", "dask_bag_bs3"):
        bag = db.from_sequence(
            dfs, partition_size=1 if driver_name == "dask_bag_bs1" else 3).map(
                _count_execution_to_store, store=function_store)
        bag = build_cube_from_bag(
            data=bag,
            cube=cube,
            store=function_store,
            ktk_cube_dataset_ids=["source", "enrich"],
        )
        bag.compute()
    else:
        raise ValueError(
            "Missing implementation for driver: {}".format(driver_name))

    assert len(function_store().keys(prefix="counter.")) == 2
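
`_count_execution_to_store` is defined elsewhere in the test module. A hypothetical sketch of what it needs to do for the final assertion to hold, namely leave exactly one ``counter.*`` key in the store per invocation (the key format and payload are assumptions):

import pickle
import uuid


def _count_execution_to_store(obj, store):
    # `store` is a factory, as in the fixtures above; drop one unique key per
    # call so the test can count how often the payload function actually ran.
    store = store()
    store.put("counter.{}".format(uuid.uuid4().hex),
              pickle.dumps(obj, pickle.HIGHEST_PROTOCOL))
    return obj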
Example #4
def test_dask_bag_fusing(driver, function_store, driver_name, skip_eager):
    """
    There were two issues with the dask.bag write path.

    Ideal
    -----
    With 4 partitions and 2 datasets to write, it should look like this:

        o-+
          +
        o-+
          +-o
        o-+
          +
        o-+

    Missing linear fusing
    ---------------------
    The bags did not have linear fusing:

        o-o-o-o-o-+
                  +
        o-o-o-o-o-+
                  +-o
        o-o-o-o-o-+
                  +
        o-o-o-o-o-+

    Process-then-write instead of one-at-a-time
    -------------------------------------------
    Because the write path used one write bag per dataset plus a pluck/split
    operation, the data for the whole bag partition was kept, then split, then
    written. Instead, we aim to process (including the write) each DataFrame
    in the partition and only then move the metadata to the correct write
    path:

        o-s>-+
          v  |
          |  |
        o-------s>-+
          |  |  v  |
          |  |  |  |
        o-------------s>-+
          |  |  |  |  v  |
          |  |  |  |  |  |
        o-------------------s--+
          |  |  |  |  |  |  |  |
          +-----+-----+-----+-----o--+
             |     |     |     |     +-o
             +-----+-----+-----+--o--+
    """

    partition_size = 1 if driver_name == "dask_bag_bs1" else 3
    n_partitions = 4

    dfs = [
        {
            "source": pd.DataFrame(
                {"x": [2 * i, 2 * i + 1], "p": i, "v1": 42}
            ),
            "enrich": pd.DataFrame(
                {"x": [2 * i, 2 * i + 1], "p": i, "v2": 1337}
            ),
        }
        for i in range(partition_size * n_partitions)
    ]

    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )

    bag = db.from_sequence(dfs, partition_size=partition_size).map(
        _count_execution_to_store, store=function_store)
    bag = build_cube_from_bag(
        data=bag,
        cube=cube,
        store=function_store,
        ktk_cube_dataset_ids=["source", "enrich"],
    )
    # After optimization there should be one fused task per bag partition plus
    # a single final task, i.e. no unfused linear chains.
    dct = dask.optimize(bag)[0].__dask_graph__()
    tasks = {k for k, v in dct.items() if dask.core.istask(v)}
    assert len(tasks) == (n_partitions + 1)
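
The same graph inspection can be reproduced on a plain dask.bag, independent of kartothek; a minimal sketch, assuming only dask is installed:

import dask
import dask.bag as db
import dask.core

bag = (
    db.from_sequence(range(8), npartitions=4)
    .map(lambda x: x + 1)
    .map(lambda x: x * 2)
)
graph = dask.optimize(bag)[0].__dask_graph__()
n_tasks = sum(1 for v in graph.values() if dask.core.istask(v))
# With linear fusing, the chained map calls collapse to roughly one task per
# partition; without it, each map stays a separate task per partition.
print(n_tasks)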