예제 #1
0
def ds(function_store, cube_has_ts_col):
    """Store a small two-frame dataset with UUID ``"uuid"`` in ``function_store``.

    Two data frames are built (``x`` is 0 in the first and 1 in the second),
    each with the columns ``x``, ``p``, ``i`` and ``_foo``.  When
    ``cube_has_ts_col`` is truthy, a constant ``KLEE_TS`` timestamp column is
    added to both frames and is also used as a partitioning column.

    Each frame is wrapped in a ``MetaPartition`` that is partitioned on ``p``
    (plus ``KLEE_TS`` when present) and indexed on ``i``; the resulting
    sequence is written through ``store_bag_as_dataset``.

    Returns the result of calling ``.compute()`` on the store graph
    (presumably the stored dataset's metadata object -- confirm against the
    kartothek API).
    """
    # Columns to partition on; shared by the meta partitions and the dataset.
    partition_columns = ["p", "KLEE_TS"] if cube_has_ts_col else ["p"]

    frames = []
    for x_value in (0, 1):
        frame = pd.DataFrame({"x": x_value, "p": [0, 1], "i": 0, "_foo": 0})
        if cube_has_ts_col:
            frame["KLEE_TS"] = pd.Timestamp("2019-01-01")
        frames.append(frame)

    meta_partitions = []
    for idx, frame in enumerate(frames):
        partition = MetaPartition(
            label="mp{}".format(idx),
            data={SINGLE_TABLE: frame},
            metadata_version=KTK_CUBE_METADATA_VERSION,
        )
        # Hand out a fresh list each time so no callee shares our local list.
        partition = partition.partition_on(list(partition_columns))
        meta_partitions.append(partition.build_indices(["i"]))

    # One bag partition per meta partition.
    bag = db.from_sequence(meta_partitions, partition_size=1)
    store_graph = store_bag_as_dataset(
        bag=bag,
        store=function_store,
        dataset_uuid="uuid",
        partition_on=list(partition_columns),
        metadata_storage_format=KTK_CUBE_METADATA_STORAGE_FORMAT,
        metadata_version=KTK_CUBE_METADATA_VERSION,
        df_serializer=KTK_CUBE_DF_SERIALIZER,
    )
    return store_graph.compute()
예제 #2
0
def _store_dataframes(execution_mode, df_list, *args, **kwargs):
    """Store ``df_list`` as a dataset using the requested dask backend.

    ``execution_mode`` selects the backend:

    * ``"dask.bag"`` -- ``df_list`` is turned into a bag with
      ``db.from_sequence`` and passed to ``store_bag_as_dataset``.
    * ``"dask.delayed"`` -- ``df_list`` is passed directly to
      ``store_delayed_as_dataset``.

    All remaining positional and keyword arguments are forwarded unchanged
    to the selected store function.  The returned graph is computed and its
    result is returned.

    Raises:
        ValueError: if ``execution_mode`` is neither of the two values above.
    """
    if execution_mode == "dask.bag":
        graph = store_bag_as_dataset(db.from_sequence(df_list), *args, **kwargs)
    elif execution_mode == "dask.delayed":
        graph = store_delayed_as_dataset(df_list, *args, **kwargs)
    else:
        raise ValueError("Unknown execution mode: {}".format(execution_mode))

    return graph.compute()
예제 #3
0
def _store_dataframes(execution_mode, df_list, *args, **kwargs):
    """Store ``df_list`` as a dataset after pickling the dask graph once.

    ``execution_mode`` selects the backend:

    * ``"dask.bag"`` -- ``df_list`` is turned into a bag with
      ``db.from_sequence`` and passed to ``store_bag_as_dataset``.
    * ``"dask.delayed"`` -- ``df_list`` is passed directly to
      ``store_delayed_as_dataset``.

    All remaining positional and keyword arguments are forwarded unchanged
    to the selected store function.  Before it is computed, the resulting
    graph object is serialized with ``pickle`` (highest protocol) and
    deserialized again, so the computation runs on the unpickled copy.

    Raises:
        ValueError: if ``execution_mode`` is neither of the two values above.
    """
    if execution_mode == "dask.bag":
        graph = store_bag_as_dataset(db.from_sequence(df_list), *args, **kwargs)
    elif execution_mode == "dask.delayed":
        graph = store_delayed_as_dataset(df_list, *args, **kwargs)
    else:
        raise ValueError("Unknown execution mode: {}".format(execution_mode))

    # Round-trip through pickle; the graph is created locally, not read from
    # an external source.
    restored = pickle.loads(pickle.dumps(graph, pickle.HIGHEST_PROTOCOL))
    return restored.compute()
예제 #4
0
def _store_dataframes(df_list, *args, **kwargs):
    """Store ``df_list`` as a dataset through a dask bag and compute it.

    ``df_list`` is converted into a bag with ``db.from_sequence``; every
    other positional and keyword argument is forwarded unchanged to
    ``store_bag_as_dataset``.  Returns the result of computing the graph.
    """
    source_bag = db.from_sequence(df_list)
    store_graph = store_bag_as_dataset(source_bag, *args, **kwargs)
    return store_graph.compute()
예제 #5
0
def _store_dataframes(df_list, *args, **kwargs):
    """Store ``df_list`` through a dask bag, pickling the graph before computing.

    ``df_list`` is converted into a bag with ``db.from_sequence``; every
    other positional and keyword argument is forwarded unchanged to
    ``store_bag_as_dataset``.  The resulting graph is serialized with
    ``pickle`` (highest protocol), deserialized again, and the unpickled
    copy is computed.  Returns the result of that computation.
    """
    store_graph = store_bag_as_dataset(db.from_sequence(df_list), *args, **kwargs)
    # Round-trip through pickle; the graph is created locally, not read from
    # an external source.
    restored = pickle.loads(pickle.dumps(store_graph, pickle.HIGHEST_PROTOCOL))
    return restored.compute()