def test_tune_simple():
    def t1(a: int, b: int) -> float:
        return a + b

    for distributable in [True, False, None]:
        with FugueWorkflow() as dag:
            df = space_to_df(dag, Space(a=Grid(0, 1), b=Grid(2, 3)))
            tune(df, t1, distributable=distributable).show()

    @tunable()
    def t2(e: ExecutionEngine, a: int, b: int) -> float:
        assert isinstance(e, ExecutionEngine)
        return a + b

    # taking an ExecutionEngine parameter makes the tunable non-distributable
    for distributable in [False, None]:
        with FugueWorkflow() as dag:
            df = space_to_df(dag, Space(a=Grid(0, 1), b=Grid(2, 3)))
            tune(df, t2, distributable=distributable).show()

    # equivalent syntax sugar
    with FugueWorkflow() as dag:
        t2.space(a=Grid(0, 1), b=Grid(2, 3)).tune(dag).show()

    # forcing distributable=True on t2 must fail at compile time
    with raises(FugueTuneCompileError):
        with FugueWorkflow() as dag:
            df = space_to_df(dag, Space(a=Grid(0, 1), b=Grid(2, 3)))
            tune(df, t2, distributable=True).show()
def test_space_to_df():
    with FugueWorkflow() as dag:
        df = space_to_df(dag, Space(a=Grid(0, 1), b=Grid(2, 3)))
        df.assert_eq(
            dag.df(
                [
                    ['[{"a": 0, "b": 2}]'],
                    ['[{"a": 0, "b": 3}]'],
                    ['[{"a": 1, "b": 2}]'],
                    ['[{"a": 1, "b": 3}]'],
                ],
                "__fmin_params__:str",
            )
        )

    with FugueWorkflow() as dag:
        df = space_to_df(
            dag, Space(a=Grid(0, 1), b=Grid(2, 3)), batch_size=3, shuffle=False
        )
        df.assert_eq(
            dag.df(
                [
                    ['[{"a": 0, "b": 2}, {"a": 0, "b": 3}, {"a": 1, "b": 2}]'],
                    ['[{"a": 1, "b": 3}]'],
                ],
                "__fmin_params__:str",
            )
        )
def test_save_and_use():
    dag = FugueWorkflow()
    a = dag.create(mock_create1, params=dict(n=1))
    b = dag.create(mock_create1, params=dict(n=1))
    a = a.save_and_use("xx", fmt="parquet", mode="overwrite")
    b.save_and_use("xx", mode="append")
    b.save_and_use("xx", mode="error")
    a = a.save_and_use("xx.csv", fmt="csv", mode="error", single=True, header=True)
    a = a.partition(by=["x"]).save_and_use("xx", mode="overwrite")
    dag.create(mock_create1, params=dict(n=2)).save_and_use("xx", mode="overwrite")
    assert_eq(
        """
        a=create using mock_create1(n=1)
        b=create using mock_create1(n=1)
        a=save and use a overwrite parquet "xx"
        save and use b append "xx"
        save and use b to "xx"
        save and use a to single csv "xx.csv"(header=True)
        save and use prepartition by x overwrite "xx"
        save and use (create using mock_create1(n=2)) overwrite "xx"
        """,
        dag,
    )
def test_hyperband(tmpdir):
    def assert_metric(df: Iterable[Dict[str, Any]], metric: float, ct: int) -> None:
        n = 0
        for row in df:
            if metric > 0:
                assert row[TUNE_REPORT_METRIC] == metric
            n += 1
        assert n == ct

    space = Space(a=Grid(0, 1, 2, 3))
    dag = FugueWorkflow()
    dataset = TuneDatasetBuilder(space, str(tmpdir)).build(dag)
    obj = F()
    res = optimize_by_hyperband(
        obj,
        dataset,
        plans=[
            [[1.0, 3], [1.0, 2], [1.0, 1], [1.0, 1]],
            [[2.0, 2], [1.0, 1], [1.0, 1]],
        ],
        checkpoint_path=str(tmpdir),
    )
    res.result().output(assert_metric, dict(metric=0.0, ct=2))
    res.result(1).output(assert_metric, dict(metric=1.0, ct=1))

    dag.run()
def test_visualize_top_n(tmpdir):
    def t1(a: int, b: int) -> float:
        return a + b

    with FugueWorkflow() as dag:
        df = space_to_df(dag, Space(a=Grid(0, 1), b=Grid(2, 3)))
        visualize_top_n(tune(df, t1, distributable=False), top=2)

    @tunable()
    def t2(df1: pd.DataFrame, df2: pd.DataFrame, a: int, b: int) -> Dict[str, Any]:
        return {
            "error": float(a + b + df1["y"].sum() + df2["y"].sum()),
            "metadata": {"a": a},
        }

    e = NativeExecutionEngine(conf={FUGUE_TUNE_TEMP_PATH: str(tmpdir)})
    with FugueWorkflow(e) as dag:
        df1 = dag.df([[0, 1], [1, 2], [0, 2]], "x:int,y:int").partition(by=["x"])
        df2 = dag.df([[0, 10], [1, 20]], "x:int,y:int").partition(by=["x"])
        res = t2.space(df1=df1, df2=df2, a=Grid(0, 1), b=Grid(2, 3)).tune()
        visualize_top_n(res, top=2)
def test_input_module():
    # pylint: disable=no-value-for-parameter

    @module()
    def input1(wf: FugueWorkflow) -> WorkflowDataFrame:
        return wf.df([[0]], "a:int")

    @module()
    def input2(wf: FugueWorkflow, a: int) -> WorkflowDataFrame:
        return wf.df([[a]], "a:int")

    @module()
    def input3(wf: FugueWorkflow, a: int, b: int) -> WorkflowDataFrames:
        return WorkflowDataFrames(a=wf.df([[a]], "a:int"), b=wf.df([[b]], "b:int"))

    assert not input1.has_input
    assert input1.has_single_output
    assert not input1.has_no_output
    assert input3.has_multiple_output

    with FugueWorkflow() as dag:
        input1(dag).assert_eq(dag.df([[0]], "a:int"))

    with FugueWorkflow() as dag:
        input2(a=10, wf=dag).assert_eq(dag.df([[10]], "a:int"))

    with FugueWorkflow() as dag:
        dfs = input3(dag, 10, 11)
        dfs["a"].assert_eq(dag.df([[10]], "a:int"))
        dfs["b"].assert_eq(dag.df([[11]], "b:int"))
def make_dataset(
    self,
    dag: FugueWorkflow,
    dataset: Any,
    df: Any = None,
    df_name: str = TUNE_DATASET_DF_DEFAULT_NAME,
    test_df: Any = None,
    test_df_name: str = TUNE_DATASET_VALIDATION_DF_DEFAULT_NAME,
    partition_keys: Optional[List[str]] = None,
    temp_path: str = "",
) -> TuneDataset:
    assert_or_throw(dataset is not None, TuneCompileError("dataset can't be None"))
    if isinstance(dataset, TuneDataset):
        assert_or_throw(
            df is None, TuneCompileError("can't set df when dataset is TuneDataset")
        )
        return dataset
    if isinstance(dataset, Space):
        path = self.get_path_or_temp(temp_path)
        builder = TuneDatasetBuilder(dataset, path)
        if df is not None:
            wdf = dag.df(df)
            if partition_keys is not None and len(partition_keys) > 0:
                wdf = wdf.partition_by(*partition_keys)
            builder.add_df(df_name, wdf)
        if test_df is not None:
            wdf = dag.df(test_df)
            # without partition keys the validation df is cross-joined to every
            # trial row; with keys it is inner-joined within each partition
            how = "cross"
            if partition_keys is not None and len(partition_keys) > 0:
                wdf = wdf.partition_by(*partition_keys)
                how = "inner"
            builder.add_df(test_df_name, wdf, how=how)
        return builder.build(dag, batch_size=1, shuffle=True)
    raise TuneCompileError(f"{dataset} can't be converted to TuneDataset")
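# A minimal usage sketch for make_dataset (hypothetical data and temp path;
# it assumes the factory instance is exposed as TUNE_OBJECT_FACTORY, as used
# in suggest_sk_models_by_cv below). Passing a Space plus a raw dataframe
# takes the Space branch: the df is partitioned by the given keys and attached
# to the built TuneDataset, while an existing TuneDataset is returned as-is.
import pandas as pd

dag = FugueWorkflow()
train = pd.DataFrame({"x": [0, 0, 1], "y": [1.0, 2.0, 3.0]})
dataset = TUNE_OBJECT_FACTORY.make_dataset(
    dag,
    Space(a=Grid(0, 1)),    # search space expanded into trial rows
    df=train,               # anything dag.df() accepts
    partition_keys=["x"],   # tune each "x" partition independently
    temp_path="/tmp/tune",  # hypothetical temp path
)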
def test_run_asha(tmpdir):
    class M(Monitor):
        def on_report(self, report: TrialReport) -> None:
            print(report.jsondict)

    def assert_metric(df: Iterable[Dict[str, Any]], metric: float, ct: int) -> None:
        n = 0
        for row in df:
            assert row[TUNE_REPORT_METRIC] == metric
            n += 1
        assert n == ct

    space = Space(a=Grid(0, 1, 2, 3))
    dag = FugueWorkflow()
    dataset = TuneDatasetBuilder(space, str(tmpdir)).build(dag, shuffle=False)
    obj = F()
    res = optimize_by_continuous_asha(
        obj,
        dataset,
        plan=[[1.0, 3], [1.0, 2], [1.0, 1], [1.0, 1]],
        checkpoint_path=str(tmpdir),
    )
    res.result(1).output(assert_metric, dict(metric=1.0, ct=1))
    res = optimize_by_continuous_asha(
        obj,
        dataset,
        plan=[[2.0, 2], [1.0, 1], [1.0, 1]],
        checkpoint_path=str(tmpdir),
        monitor=M(),
    )
    res.result(1).output(assert_metric, dict(metric=1.0, ct=1))

    dag.run()
def test_create():
    id0 = FugueWorkflow().df([[0]], "a:int32").workflow.spec_uuid()
    id1 = FugueWorkflow().df([[1]], "a:int32").workflow.spec_uuid()
    id2 = FugueWorkflow().df([[1]], "a:int32").workflow.spec_uuid()

    assert id1 != id0
    assert id1 == id2
def assert_eq(expr, expected: FugueWorkflow):
    global_vars, local_vars = get_caller_global_local_vars()
    sql = FugueSQL(expr, "fugueLanguage", ignore_case=True, simple_assign=True)
    wf = FugueWorkflow()
    v = _Extensions(
        sql, FugueSQLHooks(), wf, global_vars=global_vars, local_vars=local_vars
    )
    v.visit(sql.tree)
    assert expected.spec_uuid() == v.workflow.spec_uuid()
def test_process_module():
    # pylint: disable=no-value-for-parameter

    def process(df: pd.DataFrame, d: int = 1) -> pd.DataFrame:
        df["a"] += d
        return df

    @module
    def p1(wf: FugueSQLWorkflow, df: WorkflowDataFrame) -> WorkflowDataFrame:
        return df.process(process)

    @module()
    def p2(wf: FugueWorkflow, dfs: WorkflowDataFrames, d: int) -> WorkflowDataFrames:
        return WorkflowDataFrames(
            {k: v.process(process, params={"d": d}) for k, v in dfs.items()}
        )

    @module(as_method=True, name="p4")
    def p3(df: WorkflowDataFrame) -> WorkflowDataFrame:
        return df.process(process)

    assert p1.has_input
    assert not p1.has_dfs_input
    assert p2.has_dfs_input

    with FugueSQLWorkflow() as dag:
        df = dag.df([[0]], "a:int")
        p1(df).assert_eq(dag.df([[1]], "a:int"))
        p1(dag, df).assert_eq(dag.df([[1]], "a:int"))
        p1(df=df).assert_eq(dag.df([[1]], "a:int"))
        p1(df=df, wf=dag).assert_eq(dag.df([[1]], "a:int"))

    with FugueWorkflow() as dag:
        dfs = WorkflowDataFrames(aa=dag.df([[0]], "a:int"), bb=dag.df([[10]], "a:int"))
        r = p2(dag, dfs, 1)
        r["aa"].assert_eq(dag.df([[1]], "a:int"))
        r["bb"].assert_eq(dag.df([[11]], "a:int"))
        r = p2(dfs, 1)
        r["aa"].assert_eq(dag.df([[1]], "a:int"))
        r["bb"].assert_eq(dag.df([[11]], "a:int"))
        r = p2(d=1, dfs=dfs, wf=dag)
        r["aa"].assert_eq(dag.df([[1]], "a:int"))
        r["bb"].assert_eq(dag.df([[11]], "a:int"))
        r = p2(d=1, dfs=dfs)
        r["aa"].assert_eq(dag.df([[1]], "a:int"))
        r["bb"].assert_eq(dag.df([[11]], "a:int"))

    with FugueWorkflow() as dag:
        df = dag.df([[0]], "a:int")
        p3(df).assert_eq(dag.df([[1]], "a:int"))
        p3(df=df).assert_eq(dag.df([[1]], "a:int"))
        df.p4().assert_eq(dag.df([[1]], "a:int"))
def test_workflow_dataframes():
    dag1 = FugueWorkflow()
    df1 = dag1.df([[0]], "a:int")
    df2 = dag1.df([[0]], "b:int")
    dag2 = FugueWorkflow()
    df3 = dag2.df([[0]], "a:int")
    dfs1 = WorkflowDataFrames(a=df1, b=df2)
    assert dfs1["a"] is df1
    assert dfs1["b"] is df2
    dfs2 = WorkflowDataFrames(dfs1, aa=df1, bb=df2)
    assert 4 == len(dfs2)

    with raises(ValueError):
        WorkflowDataFrames(a=df1, b=df3)  # from different workflows

    with raises(ValueError):
        WorkflowDataFrames(a=df1, b=ArrayDataFrame([[0]], "a:int"))

    dag = FugueWorkflow()
    df = dag.df([[0, 1], [1, 1]], "a:int,b:int")
    assert df.partition_spec.empty
    df2 = df.partition(by=["a"])
    assert df.partition_spec.empty
    assert df2.partition_spec == PartitionSpec(by=["a"])
    df3 = df.partition_by("a", "b")
    assert df.partition_spec.empty
    assert df3.partition_spec == PartitionSpec(by=["a", "b"])
    df4 = df.per_partition_by("a", "b")
    assert df.partition_spec.empty
    assert df4.partition_spec == PartitionSpec(by=["a", "b"], algo="even")
    df4 = df.per_row()
    assert df.partition_spec.empty
    assert df4.partition_spec == PartitionSpec("per_row")
def test_runtime_exception():
    if sys.version_info < (3, 7):
        return

    def tr(df: pd.DataFrame) -> pd.DataFrame:
        raise Exception

    def show(df):
        df.show()

    dag = FugueWorkflow()
    df = dag.df([[0]], "a:int")
    df = df.transform(tr, schema="*")
    show(df)

    # by default the framework prunes its own frames from the traceback
    try:
        dag.run()
    except Exception:
        assert len(traceback.extract_tb(sys.exc_info()[2])) < 10

    try:
        dag.run("native", {FUGUE_CONF_WORKFLOW_EXCEPTION_OPTIMIZE: False})
    except Exception:
        assert len(traceback.extract_tb(sys.exc_info()[2])) > 10

    try:
        dag.run("native", {FUGUE_CONF_WORKFLOW_EXCEPTION_HIDE: ""})
    except Exception:
        assert len(traceback.extract_tb(sys.exc_info()[2])) > 10
def suggest_sk_models_by_cv(
    space: Space,
    train_df: Any,
    scoring: str,
    cv: int = 5,
    temp_path: str = "",
    feature_prefix: str = "",
    label_col: str = "label",
    save_model: bool = False,
    partition_keys: Optional[List[str]] = None,
    top_n: int = 1,
    local_optimizer: Optional[NonIterativeObjectiveLocalOptimizer] = None,
    monitor: Any = None,
    stopper: Any = None,
    stop_check_interval: Any = None,
    distributed: Optional[bool] = None,
    execution_engine: Any = None,
    execution_engine_conf: Any = None,
) -> List[TrialReport]:
    dag = FugueWorkflow()
    dataset = TUNE_OBJECT_FACTORY.make_dataset(
        dag,
        space,
        df=train_df,
        partition_keys=partition_keys,
        temp_path=temp_path,
    )
    objective = SKCVObjective(
        scoring=scoring,
        cv=cv,
        feature_prefix=feature_prefix,
        label_col=label_col,
        checkpoint_path=temp_path if save_model else None,
    )
    study = optimize_noniterative(
        objective=objective,
        dataset=dataset,
        optimizer=local_optimizer,
        distributed=distributed,
        monitor=monitor,
        stopper=stopper,
        stop_check_interval=stop_check_interval,
    )
    study.result(top_n).yield_dataframe_as("result")

    rows = list(
        dag.run(
            execution_engine,
            conf=execution_engine_conf,
        )["result"].as_dict_iterable()
    )
    return [
        from_base64(r[TUNE_REPORT])
        for r in sorted(rows, key=lambda r: r[TUNE_REPORT_METRIC])
    ]
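# A hypothetical call sketch for suggest_sk_models_by_cv. It assumes `space`
# encodes sklearn estimators in the form SKCVObjective expects, `train` is a
# pandas DataFrame containing a "label" column, and the temp path is made up;
# the TrialReport attribute access (.metric, .trial.params) is also assumed.
reports = suggest_sk_models_by_cv(
    space,
    train,
    scoring="neg_mean_absolute_error",  # any sklearn scoring string
    cv=3,
    temp_path="/tmp/tune",
    top_n=2,
)
for r in reports:
    # reports come back sorted ascending by the tuning metric
    print(r.metric, r.trial.params)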
def test_transform():
    w = (
        FugueWorkflow()
        .df([[0], [1]], "a:int", data_determiner=to_uuid)
        .transform(mock_transformer, schema=Schema("a:int"), params=dict(n=2))
    )
    assert_eq(
        """
        create [[0],[1]] schema a:int
        transform using mock_transformer(n=2) schema a:int
        """,
        w.workflow,
    )

    w = (
        FugueWorkflow()
        .df([[0], [1]], "a:int", data_determiner=to_uuid)
        .partition(by=["a"], presort="b DESC", num="ROWCOUNT/2")
        .transform(mock_transformer, schema="*", params=dict(n=2))
    )
    assert_eq(
        """
        create [[0],[1]] schema a:int
        transform prepartition ROWCOUNT / 2 by a presort b desc
        using mock_transformer(n=2) schema *
        """,
        w.workflow,
    )

    def _func(a: int, b: int) -> int:
        return a + b

    w = (
        FugueWorkflow()
        .df([[0], [1]], "a:int", data_determiner=to_uuid)
        .partition(by=["a"], presort="b DESC", num="ROWCOUNT/2")
        .transform(mock_transformer, schema="*", params=dict(n=2), callback=_func)
    )
    assert_eq(
        """
        create [[0],[1]] schema a:int
        transform prepartition ROWCOUNT / 2 by a presort b desc
        using mock_transformer(n=2) schema * callback _func
        """,
        w.workflow,
    )
def test_checkpoint():
    id0 = FugueWorkflow().df([[0]], "a:int32").workflow.spec_uuid()
    id1 = FugueWorkflow().df([[0]], "a:int32").checkpoint().workflow.spec_uuid()
    id2 = FugueWorkflow().df([[0]], "a:int32").checkpoint("1").workflow.spec_uuid()
    id3 = FugueWorkflow().df([[0]], "a:int32").checkpoint(1).workflow.spec_uuid()

    assert id1 != id0
    assert id1 != id2
    assert id2 != id0
    assert id2 == id3
def test_dataset(tmpdir):
    space = Space(a=Grid(0, 1, 2, 3, 4), b=Grid(5, 6, 7, 8, 9))
    builder = TuneDatasetBuilder(space, str(tmpdir))
    dag = FugueWorkflow()
    dataset = builder.build(dag)
    ds = dataset.split([4, 1], 0)
    assert 2 == len(ds)
    ds[0].data.yield_dataframe_as("a")
    ds[1].data.yield_dataframe_as("b")
    res = dag.run()
    assert 25 == len(res["a"].as_array()) + len(res["b"].as_array())
    assert len(res["b"].as_array()) < 10
def trim_index(
    compute_engine: FugueExecutionEngine,
    df_graph: FugueDataFrame,
    indexed: bool = False,
    directed: bool = True,
    max_out_deg: int = 0,
    random_seed: Optional[int] = None,
) -> Tuple[FugueDataFrame, Optional[FugueDataFrame]]:
    """The very first steps to treat the input graph:

    1) basic validation of the input graph format: it must at least have
       ["src", "dst"] cols; it is treated as an unweighted graph if there
       is no "weight" col.
    2) trim some edges to avoid super hotspot vertices: if the number of
       edges of a vertex is greater than a threshold, random sampling is
       done on all its edges; this is critical to reduce data skewness
       and save disk space.
    3) index the graph vertices by using sequential integers to represent
       vertices; this is critical to save memory.

    :param compute_engine: an execution engine supported by Fugue
    :param df_graph: the input graph data as a general Fugue dataframe
    :param indexed: whether the input graph already uses sequential
        integers to represent vertices
    :param directed: whether the graph is directed
    :param max_out_deg: the threshold for trimming hotspot vertices; set
        it to <= 0 to turn off trimming
    :param random_seed: optional random seed, for testing only

    :return: a validated, trimmed, and indexed graph, plus an optional
        mapping from vertex names to integer ids
    """
    logging.info("trim_index(): start validating, trimming, and indexing ...")
    if "src" not in df_graph.schema or "dst" not in df_graph.schema:
        raise ValueError(f"Input graph NOT in the right format: {df_graph.schema}")

    params = {"max_out_degree": max_out_deg, "random_seed": random_seed}
    dag = FugueWorkflow(compute_engine)
    df = (
        dag.df(df_graph)
        .partition(by=["src"])
        .transform(
            trim_hotspot_vertices,
            schema="*",
            params=params,
        )
        .compute()
    )

    name_id = None
    if indexed is True:
        return df, name_id
    if isinstance(compute_engine, SparkExecutionEngine):
        df_res, name_id = index_graph_spark(df.native, directed)  # type: ignore
        return SparkDataFrame(df_res), SparkDataFrame(name_id)
    df_res, name_id = index_graph_pandas(df.as_pandas(), directed)
    return PandasDataFrame(df_res), PandasDataFrame(name_id)
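# A hypothetical usage sketch for trim_index on a local engine (the engine
# class import and the sample edge list are assumptions, not from the code
# above): hotspot sources are capped at 2 outgoing edges, and string vertex
# names are replaced by sequential integer ids, returned together with the
# name->id mapping.
import pandas as pd
from fugue import NativeExecutionEngine, PandasDataFrame

engine = NativeExecutionEngine()
edges = PandasDataFrame(
    pd.DataFrame({"src": ["a", "a", "a", "b"], "dst": ["b", "c", "d", "c"]})
)
graph, name_id = trim_index(engine, edges, max_out_deg=2, random_seed=0)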
def test_out_transform():
    class OT(OutputTransformer):
        def process(self, df):
            return

    o = _to_output_transformer(OT)
    w = FugueWorkflow()
    w.df([[0], [1]], "a:int", data_determiner=to_uuid).out_transform(
        o, params=dict(n=2)
    )
    assert_eq(
        """
        create [[0],[1]] schema a:int
        outtransform using OT(n=2)
        """,
        w,
    )

    w = FugueWorkflow()
    w.df([[0], [1]], "a:int", data_determiner=to_uuid).partition(
        by=["a"], presort="b DESC", num="ROWCOUNT/2"
    ).out_transform(mock_transformer, params=dict(n=2))
    assert_eq(
        """
        create [[0],[1]] schema a:int
        outtransform prepartition ROWCOUNT / 2 by a presort b desc
        using mock_transformer(n=2)
        """,
        w,
    )
def test_select_with():
    dag = FugueWorkflow()
    dag.select(
        "with x as ( select * from a ) , y as ( select * from b ) "
        "select * from x union select * from y"
    )
    assert_eq(
        """
        with x as (select * from a), y as (select * from b)
        select * from x union select * from y
        """,
        dag,
    )
def test_rename():
    dag = FugueWorkflow()
    a = dag.create(mock_create1)
    b = a.rename({"a": "aa", "b": "bb"})
    c = a.rename({"a": "aaa", "b": "bbb"})
    assert_eq(
        """
        a=create using mock_create1
        rename columns a:aa,b:bb
        rename columns a:aaa,b:bbb from a
        """,
        dag,
    )
def test_alter_columns():
    dag = FugueWorkflow()
    a = dag.create(mock_create1)
    a.alter_columns(Schema("a:str,b:str"))
    a.alter_columns(Schema("a:float,b:double"))
    assert_eq(
        """
        a=create using mock_create1
        alter columns a:str, b:str
        alter columns a:float, b:double from a
        """,
        dag,
    )
def test_sample():
    dag = FugueWorkflow()
    a = dag.create(mock_create1)
    a.sample(frac=0.1, replace=False, seed=None)
    a.sample(n=5, replace=True, seed=7)
    assert_eq(
        """
        a=create using mock_create1
        sample 10 percent
        sample replace 5 rows seed 7 from a
        """,
        dag,
    )
def test_workflow_determinism_2():
    dag1 = FugueWorkflow()
    # extra node: identical data, but it makes dag1's spec differ from dag2's
    dag1.create_data([[0], [0], [1]], "a:int32")
    a1 = dag1.create_data([[0], [0], [1]], "a:int32")
    b1 = a1.transform(mock_tf1, "*,b:int", pre_partition=dict(by=["a"], num=2))
    a1.show()

    dag2 = FugueWorkflow()
    a2 = dag2.create_data([[0], [0], [1]], "a:int32")
    b2 = a2.transform(mock_tf1, "*,b:int", pre_partition=dict(by=["a"], num=2))
    a2.show()

    # equivalent nodes keep equal uuids even though the workflows differ
    assert a1.spec_uuid() == a2.spec_uuid()
    assert b1.spec_uuid() == b2.spec_uuid()
    assert dag1.spec_uuid() != dag2.spec_uuid()
def test_cotransform():
    dag = FugueWorkflow()
    a1 = dag.create(mock_create1, params=dict(n=1))
    a2 = dag.create(mock_create1, params=dict(n=2))
    z = dag.zip(a1, a2)
    t = z.partition(num=3).transform(mock_cotransformer1, params=dict(n=3))
    assert_eq(
        """
        zip (create using mock_create1 params n:1),
            (create using mock_create1 params n:2)
        transform prepartition 3 using mock_cotransformer1(n=3)
        """,
        dag,
    )
def test_load():
    dag = FugueWorkflow()
    dag.load("xx")
    dag.load("xx", fmt="csv")
    dag.load("xx", columns="a:int,b:str")
    dag.load("xx", columns=["a", "b"], header=True)
    assert_eq(
        """
        load "xx"
        load csv "xx"
        load "xx" columns a:int, b:str
        load "xx"(header=True) columns a, b
        """,
        dag,
    )
def test_wf():
    @tunable()
    def func(a: float, b: float, c: int, d: int) -> float:
        return a * a + b * b + c + d

    with FugueWorkflow() as dag:
        space = space_to_df(
            dag,
            Space(a=Grid(1, 2), b=Rand(-100, 100), c=Choice(1, -1), d=RandInt(0, 3)),
        )
        tune(space, func, objective_runner=HyperoptRunner(100, seed=3)).show()

    with FugueWorkflow() as dag:
        space = space_to_df(dag, Space(a=Grid(1, 2), b=Grid(0, 1), c=1, d=2))
        tune(space, func, objective_runner=HyperoptRunner(100, seed=3)).show()
def test_output_module():
    # pylint: disable=no-value-for-parameter

    @module()
    def o1(wf: FugueWorkflow, df: WorkflowDataFrame) -> None:
        pass

    @module()
    def o2(wf: FugueWorkflow, df: WorkflowDataFrame):
        pass

    @module()
    def o3(df: WorkflowDataFrame):
        pass

    assert o1.has_input
    assert o1.has_no_output
    assert o2.has_no_output
    assert o3.has_no_output

    with FugueWorkflow() as dag:
        df = dag.df([[0]], "a:int")
        o1(df)
        o1(dag, df)
        o2(df=df)
        o2(df=df, wf=dag)
        o3(df)
def outputter(df: LocalDataFrame) -> None:
    keys = [
        k
        for k in df.schema.names
        if not k.startswith("__df_") and not k.startswith("__fmin_")
    ]

    def show(subdf: pd.DataFrame) -> None:
        if subdf.shape[0] == 0:  # pragma: no cover
            return
        # "top" is captured from the enclosing scope
        subdf = subdf.sort_values("__fmin_value__").head(top)
        title = (
            json.dumps({k: str(subdf[k].iloc[0]) for k in keys})
            if len(keys) > 0
            else ""
        )
        pdf = pd.DataFrame([json.loads(x) for x in subdf["__fmin_params__"]])
        fig = plt.figure(figsize=(12, 3 * len(pdf.columns)))
        if len(keys) > 0:
            fig.suptitle(
                title,
                va="center",
                size=15,
                weight="bold",
                y=0.93,
            )
        for i in range(len(pdf.columns)):
            ax = fig.add_subplot(len(pdf.columns), 1, i + 1)
            pdf[pdf.columns[i]].hist(ax=ax).set_title(pdf.columns[i])
        plt.subplots_adjust(hspace=0.5)

    if len(keys) == 0:
        show(df.as_pandas())
    else:
        with FugueWorkflow() as dag:
            dag.df(df).partition(by=keys).out_transform(show)
def _process_stack_space(
    engine: ExecutionEngine, df: DataFrame, keys: List[str], space: Space
) -> DataFrame:
    fe_schema = df.schema.extract(keys) + "__fmin_fe__:str"

    def _merge_space(df: List[Dict[str, Any]]) -> Iterable[Dict[str, Any]]:
        p = json.dumps([json.loads(row["__fmin_params__"]) for row in df])
        res = df[0]
        res["__fmin_fe__"] = p
        yield res

    # schema: *-__fmin_fe__
    def _construct_final_space(
        df: Iterable[Dict[str, Any]]
    ) -> Iterable[Dict[str, Any]]:
        for row in df:
            op = json.loads(row["__fmin_params__"])
            for o in op:
                o["_sk__estimators"] = row["__fmin_fe__"]
            row["__fmin_params__"] = json.dumps(op)
            yield row

    with FugueWorkflow(engine) as dag:
        ddf = dag.df(df)
        space_df = space_to_df(dag, space).broadcast()
        if len(keys) == 0:
            fe = ddf.process(_merge_space, schema=fe_schema)
        else:
            fe = ddf.partition(by=keys).transform(_merge_space, schema=fe_schema)
        result = fe.cross_join(space_df).transform(_construct_final_space)
    # the workflow runs on exiting the context, after which the computed
    # dataframe is available
    return result.result
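# A tiny standalone replay of the JSON plumbing above (hypothetical values),
# runnable without Fugue: _merge_space packs a key group's "__fmin_params__"
# lists into one "__fmin_fe__" string, and _construct_final_space injects that
# string into each cross-joined space row under "_sk__estimators".
import json

group = [{"__fmin_params__": '[{"m": "rf"}]'}, {"__fmin_params__": '[{"m": "gbm"}]'}]
fe = json.dumps([json.loads(r["__fmin_params__"]) for r in group])

space_row = {"__fmin_params__": '[{"alpha": 0.1}]', "__fmin_fe__": fe}
op = json.loads(space_row["__fmin_params__"])
for o in op:
    o["_sk__estimators"] = space_row["__fmin_fe__"]
space_row["__fmin_params__"] = json.dumps(op)
print(space_row["__fmin_params__"])
# every stacked-space row now carries the serialized first-level estimators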