Example #1
def test_global_funcs():
    # the built-in default engine is NativeExecutionEngine
    assert isinstance(make_execution_engine(), NativeExecutionEngine)
    register_execution_engine(
        "xyz", lambda conf, **kwargs: _MockExecutionEngine(conf, **kwargs))
    assert isinstance(make_execution_engine("xyz"), _MockExecutionEngine)
    # on_dup="ignore" keeps the existing default registration intact
    register_default_execution_engine(
        lambda conf, **kwargs: _MockExecutionEngine(conf, **kwargs),
        on_dup="ignore")
    assert not isinstance(make_execution_engine(), _MockExecutionEngine)
    # on_dup="overwrite" replaces the default
    register_default_execution_engine(
        lambda conf, **kwargs: _MockExecutionEngine(conf, **kwargs),
        on_dup="overwrite")
    assert isinstance(make_execution_engine(), _MockExecutionEngine)

    # SQL engine registration mirrors the execution engine factory
    se = SqliteEngine(make_execution_engine)
    assert make_sql_engine(se) is se
    assert not isinstance(make_sql_engine(None, make_execution_engine()),
                          _MockSQlEngine)
    register_sql_engine("x", lambda engine: _MockSQlEngine(engine))
    assert isinstance(make_sql_engine("x", make_execution_engine()),
                      _MockSQlEngine)
    register_default_sql_engine(
        lambda engine: _MockSQlEngine(engine, other=10))
    e = make_execution_engine()
    assert isinstance(e, _MockExecutionEngine)
    assert isinstance(e.sql_engine, _MockSQlEngine)
    assert 10 == e.sql_engine.other
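
A minimal standalone sketch of the same registration flow (the name "my_engine" and the conf key are hypothetical; the import path assumes a recent fugue version):

from fugue import (NativeExecutionEngine, make_execution_engine,
                   register_execution_engine)

# register a named factory, then resolve it through the same entry point
register_execution_engine(
    "my_engine", lambda conf, **kwargs: NativeExecutionEngine(conf, **kwargs))

engine = make_execution_engine("my_engine", {"my.conf.key": "value"})
assert isinstance(engine, NativeExecutionEngine)
assert "value" == engine.conf.get_or_throw("my.conf.key", str)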
Example #2
def get_engine(self, line: str, lc: Dict[str, Any]) -> ExecutionEngine:
    # Parse a magic-cell line such as 'spark {"conf": "value"}' or
    # 'dask conf_var' into an engine name and a configuration dict.
    line = line.strip()
    p = line.find("{")
    if p >= 0:
        # an inline JSON conf follows the engine name
        engine = line[:p].strip()
        conf = json.loads(line[p:])
    else:
        # an optional second token names a local variable holding the conf
        parts = line.split(" ", 1)
        engine = parts[0]
        conf = ParamDict(None if len(parts) == 1 else lc[parts[1]])
    cf = dict(self._pre_conf)
    cf.update(conf)
    for k, v in self._post_conf.items():
        if k in cf and cf[k] != v:
            raise ValueError(
                f"{k} must be {v}, but it is set to {cf[k]}; you may unset it"
            )
        cf[k] = v
    if "+" in engine:
        # 'engine+sqlengine' selects both an execution and a SQL engine
        return make_execution_engine(tuple(engine.split("+", 1)), cf)
    return make_execution_engine(engine, cf)
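
The branching above can be isolated into a pure helper; a sketch under the same convention (hypothetical function name, plain dict lookup in place of the notebook's local variables):

import json
from typing import Any, Dict, Tuple

def parse_engine_line(line: str, variables: Dict[str, Any]) -> Tuple[str, Dict[str, Any]]:
    line = line.strip()
    p = line.find("{")
    if p >= 0:
        # inline JSON conf follows the engine name
        return line[:p].strip(), json.loads(line[p:])
    parts = line.split(" ", 1)
    # an optional second token names a variable holding the conf
    conf = {} if len(parts) == 1 else variables[parts[1]]
    return parts[0], conf

assert parse_engine_line('spark {"a": 1}', {}) == ("spark", {"a": 1})
assert parse_engine_line("dask my_conf", {"my_conf": {"b": 2}}) == ("dask", {"b": 2})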
Example #3
def suggest_sk_model(
    space: Space,
    train_df: Any,
    scoring: str,
    serialize_path: str,
    cv: int = 5,
    feature_prefix: str = "",
    label_col: str = "label",
    save_model: bool = False,
    partition_keys: List[str] = _EMPTY_LIST,
    top_n: int = 1,
    visualize_top_n: int = 0,
    objective_runner: Optional[ObjectiveRunner] = None,
    distributable: Optional[bool] = None,
    execution_engine: Any = None,
) -> List[Dict[str, Any]]:
    e = make_execution_engine(execution_engine)
    model_path = serialize_path if save_model else ""

    # build the workflow lazily; nothing executes until dag.run(e) below
    dag = FugueWorkflow()
    df = dag.df(train_df)
    if len(partition_keys) > 0:
        df = df.partition(by=partition_keys)
    skcv = build_sk_cv(
        space=space,
        train_df=df,
        scoring=scoring,
        cv=cv,
        feature_prefix=feature_prefix,
        label_col=label_col,
        save_path=model_path,
    )
    result = skcv.tune(
        objective_runner=objective_runner,
        distributable=distributable,
        serialize_path=serialize_path,
        shuffle=True,
    ).persist()  # persist so select_best and visualize_top reuse the result
    best = select_best(result, top=top_n) if top_n > 0 else result
    visualize_top(result, top=visualize_top_n)
    dag.run(e)
    return list(best.result.as_dict_iterable())
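
Note the ordering above: the engine is created eagerly, but FugueWorkflow is only a description, and nothing executes until dag.run(e). A minimal sketch of that lazy pattern (import path assumes a recent fugue version):

from fugue import FugueWorkflow, make_execution_engine

dag = FugueWorkflow()
dag.df([[0, 1]], "a:int,b:int").show()  # declarative only; nothing runs yet
dag.run(make_execution_engine(None))    # the whole DAG executes here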
Example #4
def test_tune_df(tmpdir):
    @tunable()
    def t1(a: int, df: pd.DataFrame, b: int) -> float:
        return float(a + b + df["y"].sum())

    # FUGUE_TUNE_TEMP_PATH is where partitions get serialized for tuning
    e = make_execution_engine(None, {FUGUE_TUNE_TEMP_PATH: str(tmpdir)})

    for distributable in [True, False, None]:
        with FugueWorkflow(e) as dag:
            s = space_to_df(dag,
                            Space(a=Grid(0, 1), b=Grid(2, 3)),
                            batch_size=3)
            t = dag.df([[0, 1], [1, 2], [0, 2]],
                       "x:int,y:int").partition(by=["x"])
            # pair every serialized partition with every parameter batch
            df = serialize_df(t, "df", str(tmpdir)).cross_join(s.broadcast())
            tune(df, t1, distributable=distributable).show()

    for distributable in [True, False, None]:
        with FugueWorkflow(e) as dag:
            df = dag.df([[0, 1], [1, 2], [0, 2]], "x:int,y:int")
            t1.space(a=Grid(0, 1), b=Grid(2, 3), df=df).tune().show()

    @tunable()
    def t2(df1: pd.DataFrame, df2: pd.DataFrame, a: int,
           b: int) -> Dict[str, Any]:
        return {
            "error": float(a + b + df1["y"].sum() + df2["y"].sum()),
            "metadata": {
                "a": a
            },
        }

    with FugueWorkflow(e) as dag:
        df1 = dag.df([[0, 1], [1, 2], [0, 2]],
                     "x:int,y:int").partition(by=["x"])
        df2 = dag.df([[0, 10], [0, 20]], "x:int,y:int").partition(by=["x"])
        t2.space(df1=df1, df2=df2, a=Grid(0, 1), b=Grid(2, 3)).tune().show()
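
For an objective without dataframes the same decorator flow is shorter. A sketch reusing the tunable and Grid names already imported by this test module (the exact import path depends on the fugue-tune version):

@tunable()
def objective(a: int, b: int) -> float:
    # the returned float is the error to minimize
    return float(a * a + b)

# attach a grid space and run the tuning workflow, as t1 does above
objective.space(a=Grid(-1, 0, 1), b=Grid(2, 3)).tune().show()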
Example #5
def test_make_execution_engine():
    e = make_execution_engine(None, {FUGUE_CONF_SQL_IGNORE_CASE: True})
    assert isinstance(e, NativeExecutionEngine)
    assert e.compile_conf.get_or_throw(FUGUE_CONF_SQL_IGNORE_CASE, bool)

    e = make_execution_engine(NativeExecutionEngine,
                              {FUGUE_CONF_SQL_IGNORE_CASE: True})
    assert isinstance(e, NativeExecutionEngine)
    assert e.compile_conf.get_or_throw(FUGUE_CONF_SQL_IGNORE_CASE, bool)

    e = make_execution_engine(NativeExecutionEngine({"ab": "c"}),
                              {FUGUE_CONF_SQL_IGNORE_CASE: True},
                              de="f")
    assert isinstance(e, NativeExecutionEngine)
    assert e.compile_conf.get_or_throw(FUGUE_CONF_SQL_IGNORE_CASE, bool)
    assert "c" == e.compile_conf.get_or_throw("ab", str)
    assert "f" == e.compile_conf.get_or_throw("de", str)
    assert "c" == e.conf.get_or_throw("ab", str)
    assert "de" not in e.conf

    e = make_execution_engine("pandas", {FUGUE_CONF_SQL_IGNORE_CASE: True})
    assert isinstance(e, NativeExecutionEngine)
    assert e.compile_conf.get_or_throw(FUGUE_CONF_SQL_IGNORE_CASE, bool)

    e = make_execution_engine((NativeExecutionEngine, "sqlite"),
                              {FUGUE_CONF_SQL_IGNORE_CASE: True})
    assert isinstance(e, NativeExecutionEngine)
    assert e.compile_conf.get_or_throw(FUGUE_CONF_SQL_IGNORE_CASE, bool)

    e = make_execution_engine(
        NativeExecutionEngine({FUGUE_CONF_SQL_IGNORE_CASE: True}))
    assert isinstance(e, NativeExecutionEngine)
    assert e.compile_conf.get_or_throw(FUGUE_CONF_SQL_IGNORE_CASE, bool)

    e = make_execution_engine(
        (NativeExecutionEngine({FUGUE_CONF_SQL_IGNORE_CASE: True}), "sqlite"))
    assert isinstance(e, NativeExecutionEngine)
    assert e.compile_conf.get_or_throw(FUGUE_CONF_SQL_IGNORE_CASE, bool)
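
Condensed, the assertions pin down the configuration layering (import path assumes a recent fugue version):

from fugue import NativeExecutionEngine, make_execution_engine

e = make_execution_engine(NativeExecutionEngine({"ab": "c"}), {"x": "y"}, de="f")
assert "c" == e.conf.get_or_throw("ab", str)          # engine-level conf only
assert "de" not in e.conf                             # extras never reach e.conf
assert "f" == e.compile_conf.get_or_throw("de", str)  # merged, compile-time view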
Example #6
def suggest_sk_stacking_model(
    space: Space,
    stack_space: Space,
    train_df: Any,
    scoring: str,
    serialize_path: str,
    cv: int = 5,
    feature_prefix: str = "",
    label_col: str = "label",
    save_model: bool = False,
    partition_keys: List[str] = _EMPTY_LIST,
    top_n: int = 1,
    visualize_top_n: int = 0,
    objective_runner: Optional[ObjectiveRunner] = None,
    distributable: Optional[bool] = None,
    execution_engine: Any = None,
    stack_cv: int = 2,
    stack_method: str = "auto",
    stack_passthrough: bool = False,
) -> List[Dict[str, Any]]:
    e = make_execution_engine(execution_engine)
    model_path = serialize_path if save_model else ""

    dag = FugueWorkflow()
    df = dag.df(train_df)
    if len(partition_keys) > 0:
        df = df.partition(by=partition_keys)
    skcv = build_sk_cv(
        space=space,
        train_df=df,
        scoring=scoring,
        cv=cv,
        feature_prefix=feature_prefix,
        label_col=label_col,
    )
    result = skcv.tune(
        objective_runner=objective_runner,
        distributable=distributable,
        serialize_path=serialize_path,
        shuffle=True,
    ).persist()
    # stage 1: keep the best base models from the initial search
    best_models = select_best(result.transform(_extract_model), top=1)
    if top_n > 0:
        best_models = select_best(best_models.drop(["_sk__model"]), top=top_n)
    kwargs = Space(
        _sk__scoring=scoring,
        _sk__cv=cv,
        _sk__feature_prefix=feature_prefix,
        _sk__label_col=label_col,
        _sk__save_path=model_path,
        _sk__stack_cv=stack_cv,
        _sk__method=stack_method,
        _sk__passthrough=stack_passthrough,
    )
    space_df = best_models.process(
        _process_stack_space,
        params=dict(keys=partition_keys, space=stack_space * kwargs),
    )
    data = serialize_df(df, name="_sk__train_df", path=serialize_path)
    if len(partition_keys) > 0:
        # each partition joins to its own stacking space
        data = data.inner_join(space_df.broadcast())
    else:
        # a single space applies to every row
        data = data.cross_join(space_df.broadcast())
    # stage 2: tune the stacking ensemble built from the best base models
    result = tune(
        data,
        tunable=tunable(_sk_stack_cv),
        distributable=distributable,
        objective_runner=objective_runner,
    )
    best = select_best(result, top=1)
    visualize_top(result, top=visualize_top_n)
    dag.run(e)
    return list(best.result.as_dict_iterable())
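
The stack_space * kwargs expression relies on Space's operator algebra: "*" takes the cross product of two spaces, so the fixed stacking parameters attach to every candidate configuration. A sketch with hypothetical parameter names, reusing the Space and Grid names imported by this module:

fixed = Space(cv=2, method="auto")        # constant parameters
candidates = Space(alpha=Grid(0.1, 1.0))  # searched parameters
combined = candidates * fixed             # each grid point gets the fixed params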