def test_global_funcs():
    """Check named/default registration of execution engines and SQL engines.

    Walks through the global registries in sequence: the initial default, a
    named execution engine, default registration with ``on_dup="ignore"`` and
    ``on_dup="overwrite"``, then the same for SQL engines.
    """

    def build_mock_engine(conf, **kwargs):
        return _MockExecutionEngine(conf, **kwargs)

    def build_mock_sql(engine):
        return _MockSQlEngine(engine)

    def build_mock_sql_with_other(engine):
        return _MockSQlEngine(engine, other=10)

    # Before anything is registered the default is expected to be native.
    initial = make_execution_engine()
    assert isinstance(initial, NativeExecutionEngine)

    # A named registration is resolved through its name.
    register_execution_engine("xyz", build_mock_engine)
    named = make_execution_engine("xyz")
    assert isinstance(named, _MockExecutionEngine)

    # on_dup="ignore" is expected to leave the current default untouched.
    register_default_execution_engine(build_mock_engine, on_dup="ignore")
    after_ignore = make_execution_engine()
    assert not isinstance(after_ignore, _MockExecutionEngine)

    # on_dup="overwrite" is expected to replace the default.
    register_default_execution_engine(build_mock_engine, on_dup="overwrite")
    after_overwrite = make_execution_engine()
    assert isinstance(after_overwrite, _MockExecutionEngine)

    # NOTE(review): this passes the factory function itself, not an engine
    # instance -- confirm that is intended.
    sqlite = SqliteEngine(make_execution_engine)
    assert make_sql_engine(sqlite) is sqlite

    unregistered_sql = make_sql_engine(None, make_execution_engine())
    assert not isinstance(unregistered_sql, _MockSQlEngine)

    register_sql_engine("x", build_mock_sql)
    named_sql = make_sql_engine("x", make_execution_engine())
    assert isinstance(named_sql, _MockSQlEngine)

    # With a default SQL engine registered, a freshly made execution engine
    # is expected to expose it through ``sql_engine``.
    register_default_sql_engine(build_mock_sql_with_other)
    final = make_execution_engine()
    assert isinstance(final, _MockExecutionEngine)
    assert isinstance(final.sql_engine, _MockSQlEngine)
    assert final.sql_engine.other == 10
def get_engine(self, line: str, lc: Dict[str, Any]) -> ExecutionEngine:
    """Create an execution engine from a one-line textual specification.

    Two forms of ``line`` are accepted (surrounding whitespace is ignored):

    * ``<engine> {...}`` -- everything from the first ``{`` onward is parsed
      as JSON and used as the configuration.
    * ``<engine> [<name>]`` -- ``<name>``, when present, is looked up in
      ``lc`` and the value found there is used as the configuration.

    The final configuration starts from ``self._pre_conf``, is updated with
    the configuration from ``line``, and then has every entry of
    ``self._post_conf`` applied on top.

    If the engine part contains ``+`` it is split once on it and passed to
    ``make_execution_engine`` as a tuple; otherwise it is passed as a string.

    :param line: the specification text
    :param lc: mapping used to resolve ``<name>`` in the second form
    :raises ValueError: if a key of ``self._post_conf`` is already set to a
        different value by ``self._pre_conf`` or by ``line``
    :raises KeyError: if ``<name>`` is not a key of ``lc``
    :return: whatever ``make_execution_engine`` returns
    """
    line = line.strip()
    brace = line.find("{")
    if brace >= 0:
        engine = line[:brace].strip()
        conf = json.loads(line[brace:])
    else:
        # Split on any run of whitespace so that tabs or repeated spaces
        # between the engine and the name do not end up inside the lookup
        # key. An empty line keeps yielding an empty engine name.
        parts = line.split(maxsplit=1) or [""]
        engine = parts[0]
        conf = ParamDict(lc[parts[1]] if len(parts) > 1 else None)
    cf = dict(self._pre_conf)
    cf.update(conf)
    for k, v in self._post_conf.items():
        # Enforced settings may be repeated with the same value, but not
        # contradicted.
        if k in cf and cf[k] != v:
            raise ValueError(
                f"{k} must be {v}, but you set to {cf[k]}, you may unset it"
            )
        cf[k] = v
    if "+" in engine:
        return make_execution_engine(tuple(engine.split("+", 1)), cf)
    return make_execution_engine(engine, cf)
def suggest_sk_model(
    space: Space,
    train_df: Any,
    scoring: str,
    serialize_path: str,
    cv: int = 5,
    feature_prefix: str = "",
    label_col: str = "label",
    save_model: bool = False,
    partition_keys: List[str] = _EMPTY_LIST,
    top_n: int = 1,
    visualize_top_n: int = 0,
    objective_runner: Optional[ObjectiveRunner] = None,
    distributable: Optional[bool] = None,
    execution_engine: Any = None,
) -> List[Dict[str, Any]]:
    """Build and run a tuning workflow over ``space`` and return the result rows.

    :param space: forwarded to ``build_sk_cv`` as ``space``
    :param train_df: turned into a workflow dataframe with ``dag.df``
    :param scoring: forwarded to ``build_sk_cv``
    :param serialize_path: forwarded to ``tune``; also used as the
        ``save_path`` of ``build_sk_cv`` when ``save_model`` is true
    :param cv: forwarded to ``build_sk_cv``
    :param feature_prefix: forwarded to ``build_sk_cv``
    :param label_col: forwarded to ``build_sk_cv``
    :param save_model: when false, an empty ``save_path`` is passed
    :param partition_keys: when non-empty, the training dataframe is
        partitioned by these keys
    :param top_n: when positive, ``select_best`` is applied with this ``top``;
        otherwise the whole tuning result is returned
    :param visualize_top_n: forwarded to ``visualize_top`` as ``top``
    :param objective_runner: forwarded to ``tune``
    :param distributable: forwarded to ``tune``
    :param execution_engine: passed to ``make_execution_engine`` to obtain
        the engine the workflow runs on
    :return: the selected rows, each as a dict
    """
    engine = make_execution_engine(execution_engine)

    if save_model:
        save_path = serialize_path
    else:
        save_path = ""

    workflow = FugueWorkflow()
    train = workflow.df(train_df)
    if len(partition_keys) > 0:
        train = train.partition(by=partition_keys)

    cv_tunable = build_sk_cv(
        space=space,
        train_df=train,
        scoring=scoring,
        cv=cv,
        feature_prefix=feature_prefix,
        label_col=label_col,
        save_path=save_path,
    )
    tuned = cv_tunable.tune(
        objective_runner=objective_runner,
        distributable=distributable,
        serialize_path=serialize_path,
        shuffle=True,
    )
    tuned = tuned.persist()

    if top_n > 0:
        chosen = select_best(tuned, top=top_n)
    else:
        chosen = tuned
    visualize_top(tuned, top=visualize_top_n)

    # Everything above only declares the workflow; it is executed here.
    workflow.run(engine)
    return list(chosen.result.as_dict_iterable())
def test_tune_df(tmpdir):
    """Tune functions that take dataframes, across the ``distributable`` modes.

    Covers three usages: the explicit ``tune`` call on a serialized dataframe
    joined with a space dataframe, the ``tunable.space(...).tune(...)``
    shortcut with one dataframe, and the shortcut with two partitioned
    dataframes and a dict-returning tunable.
    """

    @tunable()
    def t1(a: int, df: pd.DataFrame, b: int) -> float:
        return float(a + b + df["y"].sum())

    e = make_execution_engine(None, {FUGUE_TUNE_TEMP_PATH: str(tmpdir)})

    for distributable in [True, False, None]:
        with FugueWorkflow(e) as dag:
            s = space_to_df(dag, Space(a=Grid(0, 1), b=Grid(2, 3)), batch_size=3)
            t = dag.df([[0, 1], [1, 2], [0, 2]], "x:int,y:int").partition(by=["x"])
            df = serialize_df(t, "df", str(tmpdir)).cross_join(s.broadcast())
            tune(df, t1, distributable=distributable).show()

    for distributable in [True, False, None]:
        with FugueWorkflow(e) as dag:
            df = dag.df([[0, 1], [1, 2], [0, 2]], "x:int,y:int")
            # Forward the loop variable; without it the three iterations
            # would be identical and the modes would go unexercised.
            t1.space(a=Grid(0, 1), b=Grid(2, 3), df=df).tune(
                distributable=distributable
            ).show()

    @tunable()
    def t2(df1: pd.DataFrame, df2: pd.DataFrame, a: int, b: int) -> Dict[str, Any]:
        return {
            "error": float(a + b + df1["y"].sum() + df2["y"].sum()),
            "metadata": {"a": a},
        }

    with FugueWorkflow(e) as dag:
        df1 = dag.df([[0, 1], [1, 2], [0, 2]], "x:int,y:int").partition(by=["x"])
        df2 = dag.df([[0, 10], [0, 20]], "x:int,y:int").partition(by=["x"])
        t2.space(df1=df1, df2=df2, a=Grid(0, 1), b=Grid(2, 3)).tune().show()
def test_make_execution_engine():
    """Check ``make_execution_engine`` for each accepted kind of engine spec.

    Specs covered: ``None``, an engine class, an engine instance, a string
    name, and ``(engine, sql_engine)`` tuples. In every case the result must
    be a ``NativeExecutionEngine`` whose compile conf carries the
    ignore-case flag.
    """

    def ignore_case_conf():
        # A new dict per call, so no conf object is shared between engines.
        return {FUGUE_CONF_SQL_IGNORE_CASE: True}

    def check_native_with_flag(engine):
        assert isinstance(engine, NativeExecutionEngine)
        assert engine.compile_conf.get_or_throw(FUGUE_CONF_SQL_IGNORE_CASE, bool)

    # No spec at all.
    check_native_with_flag(make_execution_engine(None, ignore_case_conf()))

    # Engine class.
    check_native_with_flag(
        make_execution_engine(NativeExecutionEngine, ignore_case_conf())
    )

    # Engine instance plus extra conf: the extras must be visible in the
    # compile conf, while the instance's own conf keeps only what it was
    # constructed with.
    from_instance = make_execution_engine(
        NativeExecutionEngine({"ab": "c"}), ignore_case_conf(), de="f"
    )
    check_native_with_flag(from_instance)
    assert from_instance.compile_conf.get_or_throw("ab", str) == "c"
    assert from_instance.compile_conf.get_or_throw("de", str) == "f"
    assert from_instance.conf.get_or_throw("ab", str) == "c"
    assert "de" not in from_instance.conf

    # String name.
    check_native_with_flag(make_execution_engine("pandas", ignore_case_conf()))

    # (engine class, sql engine name) tuple.
    check_native_with_flag(
        make_execution_engine((NativeExecutionEngine, "sqlite"), ignore_case_conf())
    )

    # Engine instance already carrying the flag, no extra conf.
    check_native_with_flag(
        make_execution_engine(NativeExecutionEngine(ignore_case_conf()))
    )

    # (engine instance, sql engine name) tuple, no extra conf.
    check_native_with_flag(
        make_execution_engine((NativeExecutionEngine(ignore_case_conf()), "sqlite"))
    )
def suggest_sk_stacking_model(
    space: Space,
    stack_space: Space,
    train_df: Any,
    scoring: str,
    serialize_path: str,
    cv: int = 5,
    feature_prefix: str = "",
    label_col: str = "label",
    save_model: bool = False,
    partition_keys: List[str] = _EMPTY_LIST,
    top_n: int = 1,
    visualize_top_n: int = 0,
    objective_runner: Optional[ObjectiveRunner] = None,
    distributable: Optional[bool] = None,
    execution_engine: Any = None,
    stack_cv: int = 2,
    stack_method: str = "auto",
    stack_passthrough: bool = False,
) -> List[Dict[str, Any]]:
    """Tune base models, then tune a stacking step, and return the best row.

    The workflow has two tuning stages. The first tunes ``space`` through
    ``build_sk_cv``. Its selected rows are then combined with ``stack_space``
    by ``_process_stack_space`` and joined to the serialized training data
    for a second ``tune`` call on ``_sk_stack_cv``.

    :param space: search space of the first stage
    :param stack_space: search space of the second stage; it is multiplied
        with a space of fixed ``_sk__*`` settings built from the arguments
    :param train_df: turned into a workflow dataframe with ``dag.df``
    :param scoring: forwarded to both stages
    :param serialize_path: forwarded to the first ``tune`` call and to
        ``serialize_df``; also the ``_sk__save_path`` setting when
        ``save_model`` is true
    :param cv: forwarded to both stages
    :param feature_prefix: forwarded to both stages
    :param label_col: forwarded to both stages
    :param save_model: when false, ``_sk__save_path`` is an empty string
    :param partition_keys: when non-empty, the training dataframe is
        partitioned by these keys and the second stage uses an inner join;
        otherwise a cross join is used
    :param top_n: when positive, the first-stage selection is narrowed again
        with ``select_best(..., top=top_n)`` after dropping ``_sk__model``
    :param visualize_top_n: forwarded to ``visualize_top`` as ``top``
    :param objective_runner: forwarded to both ``tune`` calls
    :param distributable: forwarded to both ``tune`` calls
    :param execution_engine: passed to ``make_execution_engine`` to obtain
        the engine the workflow runs on
    :param stack_cv: becomes the ``_sk__stack_cv`` setting
    :param stack_method: becomes the ``_sk__method`` setting
    :param stack_passthrough: becomes the ``_sk__passthrough`` setting
    :return: the rows picked by ``select_best(..., top=1)`` on the
        second-stage result, each as a dict
    """
    engine = make_execution_engine(execution_engine)

    if save_model:
        save_path = serialize_path
    else:
        save_path = ""

    workflow = FugueWorkflow()
    train = workflow.df(train_df)
    if len(partition_keys) > 0:
        train = train.partition(by=partition_keys)

    # Stage 1: tune the base models.
    base_tunable = build_sk_cv(
        space=space,
        train_df=train,
        scoring=scoring,
        cv=cv,
        feature_prefix=feature_prefix,
        label_col=label_col,
    )
    base_result = base_tunable.tune(
        objective_runner=objective_runner,
        distributable=distributable,
        serialize_path=serialize_path,
        shuffle=True,
    )
    base_result = base_result.persist()

    with_models = base_result.transform(_extract_model)
    candidates = select_best(with_models, top=1)
    if top_n > 0:
        without_model_col = candidates.drop(["_sk__model"])
        candidates = select_best(without_model_col, top=top_n)

    # Fixed settings that accompany every point of the stacking space.
    fixed_settings = Space(
        _sk__scoring=scoring,
        _sk__cv=cv,
        _sk__feature_prefix=feature_prefix,
        _sk__label_col=label_col,
        _sk__save_path=save_path,
        _sk__stack_cv=stack_cv,
        _sk__method=stack_method,
        _sk__passthrough=stack_passthrough,
    )
    stack_params = {"keys": partition_keys, "space": stack_space * fixed_settings}
    stack_space_df = candidates.process(_process_stack_space, params=stack_params)

    # Stage 2: pair the serialized training data with the stacking space.
    serialized = serialize_df(train, name="_sk__train_df", path=serialize_path)
    if len(partition_keys) > 0:
        join = serialized.inner_join
    else:
        join = serialized.cross_join
    stack_input = join(stack_space_df.broadcast())

    stack_result = tune(
        stack_input,
        tunable=tunable(_sk_stack_cv),
        distributable=distributable,
        objective_runner=objective_runner,
    )
    winner = select_best(stack_result, top=1)
    visualize_top(stack_result, top=visualize_top_n)

    # Everything above only declares the workflow; it is executed here.
    workflow.run(engine)
    return list(winner.result.as_dict_iterable())