def test_out_transform():
    """out_transform in a workflow: bare transformer and prepartitioned variant."""

    class OT(OutputTransformer):
        # output transformer: consumes the partition, returns nothing
        def process(self, df):
            return

    o = _to_output_transformer(OT)
    w = FugueWorkflow()
    w.df([[0], [1]], "a:int", data_determiner=to_uuid).out_transform(
        o, params=dict(n=2)
    )
    # the programmatic workflow must compile to the same spec as the Fugue SQL
    assert_eq(
        """
    create [[0],[1]] schema a:int
    outtransform using OT(n=2)
    """,
        w,
    )
    w = FugueWorkflow()
    w.df([[0], [1]], "a:int", data_determiner=to_uuid).partition(
        by=["a"], presort="b DESC", num="ROWCOUNT/2"
    ).out_transform(mock_transformer, params=dict(n=2))
    assert_eq(
        """
    create [[0],[1]] schema a:int
    outtransform prepartition ROWCOUNT / 2 by a presort b desc
    using mock_transformer(n=2)
    """,
        w,
    )
def test_process_stack_space(tmpdir):
    """_process_stack_space expands the stacking space per partition key set."""
    space1 = ss(LinearRegression, normalize=Grid(True, False))
    space2 = ss(LinearRegression, fit_intercept=Grid(True, False))
    dag = FugueWorkflow()
    # no partition keys: one row group -> 2 stacked configs
    result0 = build_sk_cv(
        space1,
        dag.df(_create_mock_data()),
        scoring="neg_mean_absolute_error",
        cv=2,
        label_col="l",
        feature_prefix="f_",
    ).tune(distributable=False, serialize_path=str(tmpdir))
    res0 = result0.process(_process_stack_space, params=dict(keys=[], space=space2))
    res0.show()
    # partitioned by "p": expands per partition (4 partitions * 2 = 8 expected)
    result1 = build_sk_cv(
        space1,
        dag.df(_create_mock_data()).partition(by=["p"]),
        scoring="neg_mean_absolute_error",
        cv=2,
        label_col="l",
        feature_prefix="f_",
    ).tune(distributable=False, serialize_path=str(tmpdir))
    res1 = result1.process(_process_stack_space, params=dict(keys=["p"], space=space2))
    dag.run()
    assert 2 == len(res0.result.as_array())
    assert 8 == len(res1.result.as_array())
def make_dataset(
    self,
    dag: FugueWorkflow,
    dataset: Any,
    df: Any = None,
    df_name: str = TUNE_DATASET_DF_DEFAULT_NAME,
    test_df: Any = None,
    test_df_name: str = TUNE_DATASET_VALIDATION_DF_DEFAULT_NAME,
    partition_keys: Optional[List[str]] = None,
    test_df_how_note: None = None,
    temp_path: str = "",
) -> TuneDataset:
    """Coerce *dataset* into a :class:`TuneDataset`.

    A ``TuneDataset`` is returned as-is (``df`` must then be None).  A
    :class:`Space` is combined with ``df``/``test_df`` via
    ``TuneDatasetBuilder`` and built with ``batch_size=1, shuffle=True``.

    :param dag: workflow the dataset dataframes are attached to
    :param dataset: a ``TuneDataset`` or a ``Space``
    :param df: training data, only valid when *dataset* is a ``Space``
    :param df_name: name under which the training df is registered
    :param test_df: optional validation data
    :param test_df_name: name under which the validation df is registered
    :param partition_keys: when non-empty, both dfs are partitioned by these
        keys and the validation df is joined with ``inner`` instead of
        ``cross``
    :param temp_path: storage path; a temp path is derived when empty
    :raises TuneCompileError: on None dataset, df set for a TuneDataset, or an
        unconvertible dataset type
    """
    assert_or_throw(dataset is not None, TuneCompileError("dataset can't be None"))
    if isinstance(dataset, TuneDataset):
        assert_or_throw(
            df is None, TuneCompileError("can't set df when dataset is TuneDataset")
        )
        return dataset
    if isinstance(dataset, Space):
        path = self.get_path_or_temp(temp_path)
        builder = TuneDatasetBuilder(dataset, path)
        if df is not None:
            wdf = dag.df(df)
            if partition_keys is not None and len(partition_keys) > 0:
                wdf = wdf.partition_by(*partition_keys)
            builder.add_df(df_name, wdf)
        if test_df is not None:
            wdf = dag.df(test_df)
            how = "cross"
            if partition_keys is not None and len(partition_keys) > 0:
                wdf = wdf.partition_by(*partition_keys)
                how = "inner"
            builder.add_df(test_df_name, wdf, how=how)
        return builder.build(dag, batch_size=1, shuffle=True)
    raise TuneCompileError(f"{dataset} can't be converted to TuneDataset")
def test_worflow_dataframes():
    """WorkflowDataFrames collections and partition-spec immutability."""
    # NOTE(review): function name has a typo ("worflow") but renaming would
    # change the collected test id — left as-is.
    dag1 = FugueWorkflow()
    df1 = dag1.df([[0]], "a:int")
    df2 = dag1.df([[0]], "b:int")
    dag2 = FugueWorkflow()
    df3 = dag2.df([[0]], "a:int")
    dfs1 = WorkflowDataFrames(a=df1, b=df2)
    assert dfs1["a"] is df1
    assert dfs1["b"] is df2
    dfs2 = WorkflowDataFrames(dfs1, aa=df1, bb=df2)
    assert 4 == len(dfs2)
    # all members must come from the same workflow
    with raises(ValueError):
        WorkflowDataFrames(a=df1, b=df3)
    # only WorkflowDataFrame values are accepted, not raw DataFrames
    with raises(ValueError):
        WorkflowDataFrames(a=df1, b=ArrayDataFrame([[0]], "a:int"))
    dag = FugueWorkflow()
    df = dag.df([[0, 1], [1, 1]], "a:int,b:int")
    assert df.partition_spec.empty
    # partition() & friends return a new df; the source spec stays empty
    df2 = df.partition(by=["a"])
    assert df.partition_spec.empty
    assert df2.partition_spec == PartitionSpec(by=["a"])
    df3 = df.partition_by("a", "b")
    assert df.partition_spec.empty
    assert df3.partition_spec == PartitionSpec(by=["a", "b"])
    df4 = df.per_partition_by("a", "b")
    assert df.partition_spec.empty
    assert df4.partition_spec == PartitionSpec(by=["a", "b"], algo="even")
    df4 = df.per_row()
    assert df.partition_spec.empty
    assert df4.partition_spec == PartitionSpec("per_row")
def test_study(tmpdir):
    """Non-iterative optimization over a grid space, with/without partitions."""
    space = Space(a=Grid(-2, 0, 1))
    input_df = pd.DataFrame([[0, 1], [1, 1], [0, 2]], columns=["a", "b"])
    dag = FugueWorkflow()
    monitor = M()
    # no data partition
    builder = TuneDatasetBuilder(space, str(tmpdir)).add_df("b", dag.df(input_df))
    dataset = builder.build(dag, 1)
    for distributed in [True, False, None]:
        # min_better = True
        result = optimize_noniterative(
            objective=to_noniterative_objective(objective),
            dataset=dataset,
            distributed=distributed,
        )
        result.result()[[TUNE_REPORT, TUNE_REPORT_METRIC]].output(
            assert_metric, params=dict(metrics=[3.0, 4.0, 7.0])
        )
        # result(2): top-2 by metric
        result.result(2)[[TUNE_REPORT, TUNE_REPORT_METRIC]].output(
            assert_metric, params=dict(metrics=[3.0, 4.0])
        )
        # min_better = False
        result = optimize_noniterative(
            objective=to_noniterative_objective(objective, min_better=False),
            dataset=dataset,
            distributed=distributed,
        )
        result.result()[[TUNE_REPORT, TUNE_REPORT_METRIC]].output(
            assert_metric, params=dict(metrics=[-7.0, -4.0, -3.0])
        )
        result.result(2)[[TUNE_REPORT, TUNE_REPORT_METRIC]].output(
            assert_metric, params=dict(metrics=[-7.0, -4.0])
        )
    # with data partition
    builder = TuneDatasetBuilder(space, str(tmpdir)).add_df(
        "b", dag.df(input_df).partition_by("a")
    )
    dataset = builder.build(dag, 1)
    for distributed in [True, False, None]:
        result = optimize_noniterative(
            objective=to_noniterative_objective(objective),
            dataset=dataset,
            distributed=distributed,
            monitor=monitor,
        )
        result.result()[[TUNE_REPORT, TUNE_REPORT_METRIC]].output(
            assert_metric, params=dict(metrics=[2.0, 3.0, 6.0, 1.0, 2.0, 5.0])
        )
        result.result(1)[[TUNE_REPORT, TUNE_REPORT_METRIC]].output(
            assert_metric, params=dict(metrics=[1.0, 2.0])
        )
    dag.run()
    # 3 distributed modes * 3 trials * 2 partitions were monitored
    assert 3 * 3 * 2 == len(monitor._reports)
def test_yield(tmpdir): df = pd.DataFrame([[0, 0]], columns=["a", "b"]) # schema: * def t(df: pd.DataFrame) -> pd.DataFrame: return df.assign(b=df.b + 1) dag = FugueWorkflow() dag.df(df).transform(t).yield_dataframe_as("x") result = dag.run()["x"] assert [[0, 1]] == result.as_array() dag1 = FugueWorkflow() dag1.df(df).transform(t).yield_file_as("x") dag1.run("", {FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)}) dag2 = FugueWorkflow() dag2.df(dag1.yields["x"]).transform(t).yield_dataframe_as("y") result = dag2.run("", {FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)})["y"] assert [[0, 2]] == result.as_array() dag3 = FugueWorkflow() dag3.df(dag2.yields["y"]).transform(t).yield_dataframe_as("z") result = dag3.run()["z"] assert [[0, 3]] == result.as_array()
def test_auto_persist():
    """Auto persist equals an explicit weak_checkpoint and keeps determinism."""
    dag1 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag1.df([[0]], "a:int")
    df1.show()
    df1.show()
    id1 = dag1.spec_uuid()
    dag2 = FugueWorkflow(NativeExecutionEngine({"fugue.workflow.auto_persist": True}))
    df1 = dag2.df([[0]], "a:int")
    df1.show()
    df1.show()
    id2 = dag2.spec_uuid()
    dag3 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag3.df([[0]], "a:int").weak_checkpoint(level=None)
    df1.show()
    df1.show()
    id3 = dag3.spec_uuid()
    assert id1 == id2
    assert id2 == id3
    # custom persist level propagates through auto_persist_value
    dag2 = FugueWorkflow(
        NativeExecutionEngine(
            {
                "fugue.workflow.auto_persist": True,
                "fugue.workflow.auto_persist_value": "abc",
            }
        )
    )
    df1 = dag2.df([[0]], "a:int")
    df1.show()
    df1.show()
    id2 = dag2.spec_uuid()
    dag3 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag3.df([[0]], "a:int").weak_checkpoint(level="abc")
    df1.show()
    df1.show()
    id3 = dag3.spec_uuid()
    assert id2 == id3
    # single consumer: auto persist will not trigger
    dag1 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag1.df([[0]], "a:int")
    df1.show()
    id1 = dag1.spec_uuid()
    dag2 = FugueWorkflow(NativeExecutionEngine({"fugue.workflow.auto_persist": True}))
    df1 = dag2.df([[0]], "a:int")
    df1.show()  # auto persist will not trigger
    id2 = dag2.spec_uuid()
    dag3 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag3.df([[0]], "a:int").weak_checkpoint(level=None)
    df1.show()
    id3 = dag3.spec_uuid()
    assert id1 == id2
    assert id2 == id3  # checkpoint, including auto_persist doesn't change determinism
def test_auto_persist():
    """Auto persist matches explicit persist() and changes the spec uuid."""
    # NOTE(review): unlike the weak_checkpoint variant of this test, here
    # persist() DOES alter determinism (id1 != id2) — presumably an older
    # semantic of persist; confirm against the fugue version under test.
    dag1 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag1.df([[0]], "a:int")
    df1.show()
    df1.show()
    id1 = dag1.spec_uuid()
    dag2 = FugueWorkflow(
        NativeExecutionEngine({"fugue.workflow.auto_persist": True})
    )
    df1 = dag2.df([[0]], "a:int")
    df1.show()
    df1.show()
    id2 = dag2.spec_uuid()
    dag3 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag3.df([[0]], "a:int").persist()
    df1.show()
    df1.show()
    id3 = dag3.spec_uuid()
    assert id1 != id2
    assert id2 == id3
    # custom persist value propagates through auto_persist_value
    dag2 = FugueWorkflow(
        NativeExecutionEngine(
            {
                "fugue.workflow.auto_persist": True,
                "fugue.workflow.auto_persist_value": "abc",
            }
        )
    )
    df1 = dag2.df([[0]], "a:int")
    df1.show()
    df1.show()
    id2 = dag2.spec_uuid()
    dag3 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag3.df([[0]], "a:int").persist("abc")
    df1.show()
    df1.show()
    id3 = dag3.spec_uuid()
    assert id2 == id3
    # single consumer: auto persist does not trigger
    dag1 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag1.df([[0]], "a:int")
    df1.show()
    id1 = dag1.spec_uuid()
    dag2 = FugueWorkflow(
        NativeExecutionEngine({"fugue.workflow.auto_persist": True})
    )
    df1 = dag2.df([[0]], "a:int")
    df1.show()  # auto persist will not trigger
    id2 = dag2.spec_uuid()
    dag3 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag3.df([[0]], "a:int").persist()
    df1.show()
    id3 = dag3.spec_uuid()
    assert id1 == id2
    assert id2 != id3
def test_workflow():
    """End-to-end workflow: build, compile errors, run on multiple engines."""
    builder = FugueWorkflow()
    a = builder.create_data([[0], [0], [1]], "a:int")
    # tasks are single-use — any form of copying is an invalid operation
    raises(InvalidOperationError, lambda: a._task.copy())
    raises(InvalidOperationError, lambda: copy.copy(a._task))
    raises(InvalidOperationError, lambda: copy.deepcopy(a._task))
    a.show()
    a.show()
    # an int is not convertible to a workflow dataframe
    raises(FugueWorkflowCompileError, lambda: builder.df(123))
    b = a.transform(mock_tf1, "*,b:int", pre_partition=dict(by=["a"]))
    b.show()
    builder.create_data([[0], [1]], "b:int").show()
    c = ArrayDataFrame([[100]], "a:int")
    builder.show(a, b, c)
    b = a.partition(by=["a"]).transform(mock_tf2).persist().broadcast()
    b.show()
    builder.run()
    df_eq(a.result, [[0], [0], [1]], "a:int")
    # engine must be resolvable — a random string is a TypeError
    raises(TypeError, lambda: builder.run("abc"))
    builder.run(FugueWorkflowContext())
    df_eq(a.result, [[0], [0], [1]], "a:int")
    builder.run("NativeExecutionEngine")
    df_eq(b.result, [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
    df_eq(b.compute(), [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
    df_eq(b.compute(NativeExecutionEngine), [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
def test_runtime_exception():
    """Traceback is trimmed by default; disabling optimization keeps it long.

    Fix: replaced bare ``except:`` (which also swallows BaseException such as
    KeyboardInterrupt) with ``except Exception:`` — the transformer raises a
    plain Exception, so behavior is unchanged.
    """
    if sys.version_info < (3, 7):
        return

    def tr(df: pd.DataFrame) -> pd.DataFrame:
        raise Exception

    def show(df):
        df.show()

    dag = FugueWorkflow()
    df = dag.df([[0]], "a:int")
    df = df.transform(tr, schema="*")
    show(df)
    try:
        dag.run()
    except Exception:
        # optimized (default): framework frames are hidden
        assert len(traceback.extract_tb(sys.exc_info()[2])) < 10
    try:
        dag.run("native", {FUGUE_CONF_WORKFLOW_EXCEPTION_OPTIMIZE: False})
    except Exception:
        assert len(traceback.extract_tb(sys.exc_info()[2])) > 10
    try:
        dag.run("native", {FUGUE_CONF_WORKFLOW_EXCEPTION_HIDE: ""})
    except Exception:
        assert len(traceback.extract_tb(sys.exc_info()[2])) > 10
def test_run_ibis_duck(self):
    """run_ibis accepts both 'duck' and 'duckdb' as engine aliases."""

    def _test1(con: ibis.BaseBackend) -> ibis.Expr:
        # identity: just return table "a"
        tb = con.table("a")
        return tb

    def _test2(con: ibis.BaseBackend) -> ibis.Expr:
        # derived column c = a + b
        tb = con.table("a")
        return tb.mutate(c=tb.a + tb.b)

    dag = FugueWorkflow()
    df = dag.df([[0, 1], [2, 3]], "a:long,b:long")
    res = run_ibis(_test1, ibis_engine="duck", a=df)
    res.assert_eq(df)
    df = dag.df([[0, 1], [2, 3]], "a:long,b:long")
    res = run_ibis(_test2, ibis_engine="duckdb", a=df)
    df2 = dag.df([[0, 1, 1], [2, 3, 5]], "a:long,b:long,c:long")
    res.assert_eq(df2)
    dag.run(NativeExecutionEngine())
def test_out_transform(tmpdir):
    """out_transform call counts, partitioning, yielded inputs, and callbacks."""
    pdf = pd.DataFrame([[1, 10], [0, 0], [1, 1], [0, 20]], columns=["a", "b"])

    class T:
        # counts how many times the output transformer is invoked
        def __init__(self):
            self.n = 0

        def f(self, df: Iterable[Dict[str, Any]]) -> None:
            self.n += 1

    t = T()
    out_transform(pdf, t.f)
    assert 1 == t.n  # once for the whole dataframe
    t = T()
    out_transform(pdf, t.f, partition=dict(by=["a"]))
    assert 2 == t.n  # once per "a" partition
    dag = FugueWorkflow()
    dag.df(pdf).yield_dataframe_as("x1")
    dag.df(pdf).yield_dataframe_as("x2")
    dag.run("", {FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)})
    t = T()
    out_transform(dag.yields["x1"], t.f)
    assert 1 == t.n
    t = T()
    out_transform(
        dag.yields["x2"],
        t.f,
        partition=dict(by=["a"]),
        engine_conf={FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)},
    )
    assert 2 == t.n

    # schema: *
    def f3(df: pd.DataFrame, called: callable) -> pd.DataFrame:
        called()
        return df

    cb = Callback()
    result = out_transform(pdf, f3, callback=cb.called)
    assert 1 == cb.ct
def test_yield():
    """yield_file_as / yield_dataframe_as don't change the workflow spec uuid."""
    dag = FugueWorkflow()
    dag.df([[0]], "a:int32").show()
    id0 = dag.spec_uuid()
    x = FugueWorkflow().df([[0]], "a:int32")
    x.yield_file_as("x")
    x.show()
    id1 = x.workflow.spec_uuid()
    x = FugueWorkflow().df([[0]], "a:int32")
    x.deterministic_checkpoint().yield_file_as("y")
    x.show()
    id2 = x.workflow.spec_uuid()
    x = FugueWorkflow().df([[0]], "a:int32")
    x.deterministic_checkpoint().yield_dataframe_as("z")
    x.show()
    id3 = x.workflow.spec_uuid()
    # yield doesn't change determinism
    assert id0 == id1
    assert id0 == id2
    assert id0 == id3
def test_transform_from_yield(tmpdir):
    """transform() accepts yielded results from previously-run workflows."""

    # schema: *,x:int
    def f(df: pd.DataFrame) -> pd.DataFrame:
        return df.assign(x=1)

    dag = FugueWorkflow()
    dag.df([[0]], "a:int").yield_dataframe_as("x1")
    dag.df([[1]], "b:int").yield_dataframe_as("x2")
    dag.run("", {FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)})
    result = transform(dag.yields["x1"], f)
    assert isinstance(result, DataFrame)
    assert result.as_array(type_safe=True) == [[0, 1]]
    # engine_conf with the checkpoint path is needed for file-backed yields
    result = transform(
        dag.yields["x2"],
        f,
        engine_conf={FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)},
    )
    assert isinstance(result, DataFrame)
    assert result.as_array(type_safe=True) == [[1, 1]]
def test_invalid_module():
    """Modules reject input dataframes that come from different workflows."""
    # pylint: disable=no-value-for-parameter

    @module()
    def o1(wf: FugueWorkflow, df1: WorkflowDataFrame, df2: WorkflowDataFrame) -> None:
        pass

    @module()
    def o2(wf: FugueWorkflow, dfs: WorkflowDataFrames) -> None:
        pass

    dag1 = FugueWorkflow()
    df1 = dag1.df([[0]], "a:int")
    dag2 = FugueWorkflow()
    df2 = dag2.df([[1]], "a:int")
    # df1 and df2 belong to different dags -> invalid
    with raises(ValueError):
        o1(df1, df2)
    with raises(ValueError):
        o2(WorkflowDataFrames(a=df1, b=df2))
def trim_index(
    compute_engine: FugueExecutionEngine,
    df_graph: FugueDataFrame,
    indexed: bool = False,
    directed: bool = True,
    max_out_deg: int = 0,
    random_seed: Optional[int] = None,
) -> Tuple[FugueDataFrame, Optional[FugueDataFrame]]:
    """
    The very first steps to treat the input graph:

    1) basic validation of the input graph format: at least have
       ["src", "dst"] cols, it will be an unweighted graph if no "weight" col.
    2) trim some edges to avoid super hotspot vertices: random sampling will
       be done on all the edges of a vertex if the number of edges is greater
       than a threshold, this is critical to reduce data skewness and save
       disk space
    3) index the graph vertices by using sequential integers to represent
       vertices, this is critical to save memory

    :param compute_engine: an execution engine supported by Fugue
    :param df_graph: the input graph data as general Fugue dataframe
    :param indexed: if the input graph is using sequential integers to note
        vertices
    :param directed: if the graph is directed or not
    :param max_out_deg: the threshold for trimming hotspot vertices, set it
        to <= 0 to turn off trimming
    :param random_seed: optional random seed, for testing only

    Returns a validated, trimmed, and indexed graph
    """
    logging.info("trim_index(): start validating, trimming, and indexing ...")
    if "src" not in df_graph.schema or "dst" not in df_graph.schema:
        raise ValueError(f"Input graph NOT in the right format: {df_graph.schema}")

    params = {"max_out_degree": max_out_deg, "random_seed": random_seed}
    dag = FugueWorkflow(compute_engine)
    # partition by src so trimming sees all out-edges of a vertex together
    df = (
        dag.df(df_graph)
        .partition(by=["src"])
        .transform(trim_hotspot_vertices, schema="*", params=params)
        .compute()
    )
    name_id = None
    if indexed:  # fix: idiomatic truthiness instead of `indexed is True`
        return df, name_id
    if isinstance(compute_engine, SparkExecutionEngine):
        df_res, name_id = index_graph_spark(df.native, directed)  # type: ignore
        return SparkDataFrame(df_res), SparkDataFrame(name_id)
    df_res, name_id = index_graph_pandas(df.as_pandas(), directed)
    # fix: the trailing return was split across lines in the original source
    return PandasDataFrame(df_res), PandasDataFrame(name_id)
def test_build_sk_cv(tmpdir):
    """build_sk_cv over a summed space runs tune end to end and shows results."""
    # sum of spaces is a union of the candidate configurations
    space = sum(
        [
            ss(LinearRegression, fit_intercept=Grid(True, False)),
            ss(LinearRegression, normalize=Grid(True, False)),
        ]
    )
    dag = FugueWorkflow()
    build_sk_cv(
        space,
        dag.df(_create_mock_data()),
        scoring="neg_mean_absolute_error",
        cv=4,
        label_col="l",
        feature_prefix="f_",
        save_path=str(tmpdir),
    ).tune(distributable=False, serialize_path=str(tmpdir)).show()
    dag.run()
def test_fill():
    """fillna compiles to the same spec as Fugue SQL 'fill nulls'."""
    dag = FugueWorkflow()
    a = dag.df([[None, 1], [1, None]], "a:int, b:int", data_determiner=to_uuid)
    b = a.fillna({"a": 99, "b": -99})
    # named-df form
    assert_eq(
        """
    a=create [[NULL, 1],[1, NULL]] schema a:int, b:int
    fill nulls params a:99, b:-99 from a""",
        dag,
    )
    # anonymous form
    assert_eq(
        """
    create [[NULL, 1],[1, NULL]] schema a:int, b:int
    fill nulls (a:99, b:-99)""",
        dag,
    )
def space_to_df(
    wf: FugueWorkflow, space: Space, batch_size: int = 1, shuffle: bool = True
) -> WorkflowDataFrame:
    """Encode *space* into a one-column workflow dataframe of JSON batches.

    Each output row holds a JSON-encoded list of up to *batch_size* encoded
    configurations. When *shuffle* is True the encoded configurations are
    shuffled with a fixed seed (0) so the result stays deterministic.
    """

    def get_data() -> Iterable[List[Any]]:
        encoded = list(space.encode())  # type: ignore
        if shuffle:
            # fixed seed keeps the workflow spec deterministic
            random.seed(0)
            random.shuffle(encoded)
        batch: List[Any] = []
        for item in encoded:
            batch.append(item)
            if len(batch) == batch_size:
                yield [json.dumps(batch)]
                batch = []
        if batch:
            yield [json.dumps(batch)]

    return wf.df(IterableDataFrame(get_data(), "__fmin_params__:str"))
def suggest_sk_model(
    space: Space,
    train_df: Any,
    scoring: str,
    serialize_path: str,
    cv: int = 5,
    feature_prefix: str = "",
    label_col: str = "label",
    save_model: bool = False,
    partition_keys: List[str] = _EMPTY_LIST,
    top_n: int = 1,
    visualize_top_n: int = 0,
    objective_runner: Optional[ObjectiveRunner] = None,
    distributable: Optional[bool] = None,
    execution_engine: Any = None,
) -> List[Dict[str, Any]]:
    """Tune sklearn models over *space* with cross validation, return the best.

    :param space: search space of model configurations
    :param train_df: training data (anything convertible by ``dag.df``)
    :param scoring: sklearn scoring string
    :param serialize_path: path used to serialize intermediate data
    :param cv: number of CV folds
    :param feature_prefix: prefix identifying feature columns
    :param label_col: label column name
    :param save_model: when True, models are saved under *serialize_path*
    :param partition_keys: when non-empty, tune per data partition
    :param top_n: how many best results to return; <= 0 returns all
    :param visualize_top_n: how many top results to visualize
    :param objective_runner: optional custom objective runner
    :param distributable: force/forbid distributed execution; None = auto
    :param execution_engine: engine spec passed to ``make_execution_engine``
    :return: best tuning results as a list of dicts
    """
    e = make_execution_engine(execution_engine)
    # only persist models if explicitly requested
    model_path = serialize_path if save_model else ""
    dag = FugueWorkflow()
    df = dag.df(train_df)
    if len(partition_keys) > 0:
        df = df.partition(by=partition_keys)
    skcv = build_sk_cv(
        space=space,
        train_df=df,
        scoring=scoring,
        cv=cv,
        feature_prefix=feature_prefix,
        label_col=label_col,
        save_path=model_path,
    )
    result = skcv.tune(
        objective_runner=objective_runner,
        distributable=distributable,
        serialize_path=serialize_path,
        shuffle=True,
    ).persist()
    best = select_best(result, top=top_n) if top_n > 0 else result
    visualize_top(result, top=visualize_top_n)
    dag.run(e)
    return list(best.result.as_dict_iterable())
def _space_to_df(
    self, wf: FugueWorkflow, batch_size: int = 1, shuffle: bool = True
) -> WorkflowDataFrame:
    """Serialize the search space into a one-column dataframe of pickled batches.

    Each row carries a pickled list of up to *batch_size* configurations; a
    fixed seed (0) keeps the optional shuffle deterministic.
    """

    def get_data() -> Iterable[List[Any]]:
        configs = list(self._space)  # type: ignore
        if shuffle:
            random.seed(0)
            random.shuffle(configs)
        batch: List[Any] = []
        for cfg in configs:
            batch.append(cfg)
            if len(batch) == batch_size:
                yield [pickle.dumps(batch)]
                batch = []
        if batch:
            yield [pickle.dumps(batch)]

    return wf.df(
        IterableDataFrame(get_data(), f"{TUNE_DATASET_PARAMS_PREFIX}:binary")
    )
def test_modified_exception():
    """A frame-local ``__modified_exception__`` becomes the raised cause."""
    if sys.version_info < (3, 7):
        return

    def tr(df: pd.DataFrame) -> pd.DataFrame:
        raise Exception

    def show(df):
        df.show()

    def tt(df):
        # magic local picked up by Fugue's exception rewriting — do not rename
        __modified_exception__ = NotImplementedError()
        return df.transform(tr, schema="*")

    dag = FugueWorkflow()
    df = dag.df([[0]], "a:int")
    df = tt(df)
    show(df)
    try:
        dag.run()
    except Exception as ex:
        assert isinstance(ex.__cause__, NotImplementedError)
def test_head():
    """take() with prepartition/presort compiles like SQL 'take ... nulls first'."""
    dag = FugueWorkflow()
    a = dag.df(
        [[None, 1], [None, 2], [1, None], [1, 2]],
        "a:double, b:double",
        data_determiner=to_uuid,
    )
    b = a.partition(by=["a"], presort="b desc").take(1, na_position="first")
    c = b.take(1, presort="b desc", na_position="first")
    assert_eq(
        """
    a=create [[NULL, 1], [NULL, 2], [1, NULL], [1, 2]] schema a:double, b:double
    b=take 1 row from a prepartition by a presort b desc nulls first
    c=take 1 row from b presort b desc nulls first""",
        dag,
    )
    # anonymous
    assert_eq(
        """
    create [[NULL, 1], [NULL, 2], [1, NULL], [1, 2]] schema a:double, b:double
    take 1 row prepartition by a presort b desc nulls first
    take 1 row presort b desc nulls first""",
        dag,
    )
def suggest_sk_stacking_model(
    space: Space,
    stack_space: Space,
    train_df: Any,
    scoring: str,
    serialize_path: str,
    cv: int = 5,
    feature_prefix: str = "",
    label_col: str = "label",
    save_model: bool = False,
    partition_keys: List[str] = _EMPTY_LIST,
    top_n: int = 1,
    visualize_top_n: int = 0,
    objective_runner: Optional[ObjectiveRunner] = None,
    distributable: Optional[bool] = None,
    execution_engine: Any = None,
    stack_cv: int = 2,
    stack_method: str = "auto",
    stack_passthrough: bool = False,
) -> List[Dict[str, Any]]:
    """Tune base models, then tune a stacking ensemble built from the best.

    First tunes *space* by CV, extracts the best base models (per partition
    when *partition_keys* is set), expands *stack_space* with stacking
    parameters, then tunes the stacked estimator and returns the best result.

    :param space: search space for the base models
    :param stack_space: search space for the stacking estimator
    :param train_df: training data (anything convertible by ``dag.df``)
    :param scoring: sklearn scoring string
    :param serialize_path: path used to serialize intermediate data
    :param cv: CV folds for base-model tuning
    :param save_model: when True, the stacked models are saved
    :param partition_keys: when non-empty, tune per data partition
    :param top_n: how many best base models feed the ensemble; <= 0 keeps all
    :param stack_cv: CV folds inside the stacking estimator
    :param stack_method: sklearn ``StackingClassifier/Regressor`` method arg
    :param stack_passthrough: pass original features through the stacker
    :return: best stacking results as a list of dicts
    """
    e = make_execution_engine(execution_engine)
    model_path = serialize_path if save_model else ""
    dag = FugueWorkflow()
    df = dag.df(train_df)
    if len(partition_keys) > 0:
        df = df.partition(by=partition_keys)
    skcv = build_sk_cv(
        space=space,
        train_df=df,
        scoring=scoring,
        cv=cv,
        feature_prefix=feature_prefix,
        label_col=label_col,
    )
    result = skcv.tune(
        objective_runner=objective_runner,
        distributable=distributable,
        serialize_path=serialize_path,
        shuffle=True,
    ).persist()
    # best model per configuration, then per partition when top_n > 0
    best_models = select_best(result.transform(_extract_model), top=1)
    if top_n > 0:
        best_models = select_best(best_models.drop(["_sk__model"]), top=top_n)
    # stacking hyperparameters merged into the stacking search space
    kwargs = Space(
        _sk__scoring=scoring,
        _sk__cv=cv,
        _sk__feature_prefix=feature_prefix,
        _sk__label_col=label_col,
        _sk__save_path=model_path,
        _sk__stack_cv=stack_cv,
        _sk__method=stack_method,
        _sk__passthrough=stack_passthrough,
    )
    space_df = best_models.process(
        _process_stack_space,
        params=dict(keys=partition_keys, space=stack_space * kwargs),
    )
    data = serialize_df(df, name="_sk__train_df", path=serialize_path)
    # inner join keeps per-partition alignment; cross join otherwise
    if len(partition_keys) > 0:
        data = data.inner_join(space_df.broadcast())
    else:
        data = data.cross_join(space_df.broadcast())
    result = tune(
        data,
        tunable=tunable(_sk_stack_cv),
        distributable=distributable,
        objective_runner=objective_runner,
    )
    best = select_best(result, top=1)
    visualize_top(result, top=visualize_top_n)
    dag.run(e)
    return list(best.result.as_dict_iterable())
def input3(wf: FugueWorkflow, a: int, b: int) -> WorkflowDataFrames:
    """Build a pair of single-value dataframes named ``a`` and ``b``."""
    df_a = wf.df([[a]], "a:int")
    df_b = wf.df([[b]], "b:int")
    return WorkflowDataFrames(a=df_a, b=df_b)
def input2(wf: FugueWorkflow, a: int) -> WorkflowDataFrame:
    """Single-row, single-column dataframe holding *a*."""
    data = [[a]]
    return wf.df(data, "a:int")
def input1(wf: FugueWorkflow) -> WorkflowDataFrame:
    """Constant single-row dataframe with value 0."""
    row = [[0]]
    return wf.df(row, "a:int")
def create(wf: FugueWorkflow, n: int = 1) -> WorkflowDataFrame:
    """Single-row dataframe whose only cell is *n* (defaults to 1)."""
    row = [[n]]
    return wf.df(row, "a:int")