def ds(function_store, cube_has_ts_col):
    """Build a small two-partition dataset for cube tests.

    Creates two DataFrames (x = 0 and x = 1), optionally adds the
    ``KLEE_TS`` timestamp column, wraps each in a MetaPartition
    partitioned on ``p`` (and ``KLEE_TS`` when present) with an index
    on ``i``, and stores the result via a dask bag.
    """
    # Columns the dataset is partitioned on; KLEE_TS only for cube datasets.
    partition_cols = ["p"] + (["KLEE_TS"] if cube_has_ts_col else [])

    frames = [pd.DataFrame({"x": x, "p": [0, 1], "i": 0, "_foo": 0}) for x in [0, 1]]
    if cube_has_ts_col:
        for frame in frames:
            frame["KLEE_TS"] = pd.Timestamp("2019-01-01")

    partitions = []
    for idx, frame in enumerate(frames):
        mp = MetaPartition(
            label="mp{}".format(idx),
            data={SINGLE_TABLE: frame},
            metadata_version=KTK_CUBE_METADATA_VERSION,
        )
        partitions.append(mp.partition_on(partition_cols).build_indices(["i"]))

    bag = db.from_sequence(partitions, partition_size=1)
    return store_bag_as_dataset(
        bag=bag,
        store=function_store,
        dataset_uuid="uuid",
        partition_on=partition_cols,
        metadata_storage_format=KTK_CUBE_METADATA_STORAGE_FORMAT,
        metadata_version=KTK_CUBE_METADATA_VERSION,
        df_serializer=KTK_CUBE_DF_SERIALIZER,
    ).compute()
def _store_dataframes(execution_mode, df_list, *args, **kwargs): if execution_mode == "dask.bag": return store_bag_as_dataset(db.from_sequence(df_list), *args, **kwargs).compute() elif execution_mode == "dask.delayed": return store_delayed_as_dataset(df_list, *args, **kwargs).compute() else: raise ValueError("Unknown execution mode: {}".format(execution_mode))
def _store_dataframes(execution_mode, df_list, *args, **kwargs): if execution_mode == "dask.bag": bag = store_bag_as_dataset(db.from_sequence(df_list), *args, **kwargs) s = pickle.dumps(bag, pickle.HIGHEST_PROTOCOL) bag = pickle.loads(s) return bag.compute() elif execution_mode == "dask.delayed": tasks = store_delayed_as_dataset(df_list, *args, **kwargs) s = pickle.dumps(tasks, pickle.HIGHEST_PROTOCOL) tasks = pickle.loads(s) return tasks.compute() else: raise ValueError("Unknown execution mode: {}".format(execution_mode))
def _store_dataframes(df_list, *args, **kwargs):
    """Store *df_list* as a dataset via a dask bag and return the result."""
    bag = db.from_sequence(df_list)
    dataset = store_bag_as_dataset(bag, *args, **kwargs)
    return dataset.compute()
def _store_dataframes(df_list, *args, **kwargs):
    """Store *df_list* as a dataset via a dask bag, round-tripping the
    bag through pickle before computing (exercises graph serialization)."""
    bag = store_bag_as_dataset(db.from_sequence(df_list), *args, **kwargs)
    # Simulate distributed-scheduler serialization of the dask graph.
    restored = pickle.loads(pickle.dumps(bag, pickle.HIGHEST_PROTOCOL))
    return restored.compute()