def multipartition_cube(module_store, fullrange_data, fullrange_cube):
    def _gen(part):
        result = {}
        for dataset_id, df in fullrange_data.items():
            df = df.copy()
            df["z"] = part
            result[dataset_id] = df
        return result

    cube = fullrange_cube.copy(uuid_prefix="multipartition_cube")
    build_cube_from_bag(
        data=db.from_sequence([0, 1], partition_size=1).map(_gen),
        store=module_store,
        cube=cube,
        ktk_cube_dataset_ids=["seed", "enrich_dense", "enrich_sparse"],
    ).compute()
    return cube
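
# Purely illustrative sketch (not used by the fixture above; the sample data is
# hypothetical): ``_gen`` stamps every dataset DataFrame of a bag partition with a
# constant ``z`` value, so the two bag partitions [0, 1] end up carrying z == 0
# and z == 1 respectively.
def _example_gen_output(part=0):
    fullrange_data = {"seed": pd.DataFrame({"x": [0, 1], "p": [0, 0], "v1": [10, 11]})}
    return {dataset_id: df.assign(z=part) for dataset_id, df in fullrange_data.items()}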

def test_multifile(driver, function_store):
    dfs = [pd.DataFrame({"x": [i], "p": [0], "v1": [10]}) for i in range(2)]
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    build_cube_from_bag(
        data=db.from_sequence(dfs, partition_size=1), cube=cube, store=function_store
    ).compute()

    result = driver(cube=cube, store=function_store)

    assert set(result.keys()) == {cube.seed_dataset}
    stats_seed = result[cube.seed_dataset]
    assert stats_seed["partitions"] == 1
    assert stats_seed["files"] == 2
    assert stats_seed["rows"] == 2
    assert stats_seed["blobsize"] > 0
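
# ``driver`` and ``function_store`` are assumed to come from the shared conftest:
# ``function_store`` is a zero-argument callable returning a simplekv-style store,
# ``driver`` is the stats backend under test. A hypothetical sketch of a compatible
# store-factory fixture (illustrative only; the real fixture may differ):
@pytest.fixture
def example_function_store(tmp_path):
    import storefact  # illustrative dependency; minimalkv offers the same helper

    def factory():
        # every call returns a fresh store object backed by the same directory
        return storefact.get_store_from_url("hfs://{}".format(tmp_path))

    return factory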

def test_function_executed_once(driver, function_store, driver_name, skip_eager):
    """
    Test that the payload function is only executed once per branch.

    This was a bug in the dask_bag backend.
    """
    if driver_name == "dask_dataframe":
        pytest.skip("not relevant for dask.dataframe")

    df_source1 = pd.DataFrame({"x": [0, 1], "p": [0, 0], "v1": [10, 11]})
    df_source2 = pd.DataFrame({"x": [2, 3], "p": [1, 1], "v1": [12, 13]})
    df_enrich1 = pd.DataFrame({"x": [0, 1], "p": [0, 0], "v2": [20, 21]})
    df_enrich2 = pd.DataFrame({"x": [2, 3], "p": [1, 1], "v2": [22, 23]})

    dfs = [
        {"source": df_source1, "enrich": df_enrich1},
        {"source": df_source2, "enrich": df_enrich2},
    ]

    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )

    if driver_name in ("dask_bag_bs1", "dask_bag_bs3"):
        bag = db.from_sequence(
            dfs, partition_size=1 if driver_name == "dask_bag_bs1" else 3
        ).map(_count_execution_to_store, store=function_store)
        bag = build_cube_from_bag(
            data=bag,
            cube=cube,
            store=function_store,
            ktk_cube_dataset_ids=["source", "enrich"],
        )
        bag.compute()
    else:
        raise ValueError("Missing implementation for driver: {}".format(driver_name))

    assert len(function_store().keys(prefix="counter.")) == 2
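
# ``_count_execution_to_store`` is defined elsewhere in this module/conftest; a
# hypothetical reconstruction of what it does: write one marker blob per
# invocation so the assertion above can count how often the payload ran.
def _count_execution_to_store_sketch(obj, store):
    import uuid  # stdlib; only needed for this illustrative sketch

    store = store()  # resolve the store factory into an actual store
    store.put("counter.{}".format(uuid.uuid4().hex), b"1")
    return obj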

def test_dask_bag_fusing(driver, function_store, driver_name, skip_eager):
    """
    There were two issues with the dask.bag write path.

    Ideal
    -----
    With 4 partitions and 2 datasets to write, it should look like this:

        o-+
          +
        o-+
          +-o
        o-+
          +
        o-+

    Missing linear fusing
    ---------------------
    The bags did not have linear fusing:

        o-o-o-o-o-+
                  +
        o-o-o-o-o-+
                  +-o
        o-o-o-o-o-+
                  +
        o-o-o-o-o-+

    Process-then-write instead of one-at-a-time
    -------------------------------------------
    Because the implementation used one write bag per dataset plus a pluck/split
    operation, the data for the whole bag partition was kept, then split, then
    written. Instead we aim to process (including the write) each DF in the
    partition one at a time and then move all metadata to the correct write path:

        o-s>-+
          v  |
          |  |
        o-------s>-+
          |  |  v  |
          |  |  |  |
        o-------------s>-+
          |  |  |  |  v  |
          |  |  |  |  |  |
        o-------------------s--+
          |  |  |  |  |  |  |  |
          +-----+-----+-----+-----o--+
             |     |     |     |     +-o
             +-----+-----+-----+--o--+
    """
    partition_size = 1 if driver_name == "dask_bag_bs1" else 3
    n_partitions = 4

    dfs = [
        {
            "source": pd.DataFrame({"x": [2 * i, 2 * i + 1], "p": i, "v1": 42}),
            "enrich": pd.DataFrame({"x": [2 * i, 2 * i + 1], "p": i, "v2": 1337}),
        }
        for i in range(partition_size * n_partitions)
    ]

    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )

    bag = db.from_sequence(dfs, partition_size=partition_size).map(
        _count_execution_to_store, store=function_store
    )
    bag = build_cube_from_bag(
        data=bag,
        cube=cube,
        store=function_store,
        ktk_cube_dataset_ids=["source", "enrich"],
    )

    dct = dask.optimize(bag)[0].__dask_graph__()
    tasks = {k for k, v in dct.items() if dask.core.istask(v)}
    assert len(tasks) == (n_partitions + 1)
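
# Standalone illustration of the property checked above: on a plain dask.bag,
# linearly chained ``map`` calls are fused by the default bag optimizer, leaving
# essentially one task per partition in the optimized graph.
def _example_bag_fusion_tasks():
    bag = db.from_sequence(range(8), partition_size=2).map(lambda x: x + 1).map(str)
    graph = dask.optimize(bag)[0].__dask_graph__()
    return {k for k, v in graph.items() if dask.core.istask(v)}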