Example #1
def test_run_asha(tmpdir):
    class M(Monitor):
        def on_report(self, report: TrialReport) -> None:
            print(report.jsondict)

    def assert_metric(df: Iterable[Dict[str, Any]], metric: float,
                      ct: int) -> None:
        n = 0
        for row in df:
            assert row[TUNE_REPORT_METRIC] == metric
            n += 1
        assert n == ct

    space = Space(a=Grid(0, 1, 2, 3))
    dag = FugueWorkflow()
    dataset = TuneDatasetBuilder(space, str(tmpdir)).build(dag, shuffle=False)
    obj = F()
    res = optimize_by_continuous_asha(
        obj,
        dataset,
        plan=[[1.0, 3], [1.0, 2], [1.0, 1], [1.0, 1]],
        checkpoint_path=str(tmpdir),
    )
    res.result(1).output(assert_metric, dict(metric=1.0, ct=1))

    res = optimize_by_continuous_asha(
        obj,
        dataset,
        plan=[[2.0, 2], [1.0, 1], [1.0, 1]],
        checkpoint_path=str(tmpdir),
        monitor=M(),
    )
    res.result(1).output(assert_metric, dict(metric=1.0, ct=1))
    dag.run()
Example #2
def test_compile_conf():
    def assert_conf(e: ExecutionEngine, **kwargs) -> pd.DataFrame:
        for k, v in kwargs.items():
            assert e.compile_conf[k] == v
        return pd.DataFrame([[0]], columns=["a"])

    dag = FugueWorkflow(conf={"a": 1})
    dag.create(assert_conf, params=dict(a=1))

    dag.run()

    with raises(KeyError):  # non-compile-time param is not kept by the new engine
        dag.run(NativeExecutionEngine())

    dag = FugueWorkflow(conf={FUGUE_CONF_WORKFLOW_EXCEPTION_HIDE: "abc"})
    dag.create(assert_conf,
               params=dict({FUGUE_CONF_WORKFLOW_EXCEPTION_HIDE: "abc"}))

    dag.run()

    # non-compile-time param is kept
    dag.run(NativeExecutionEngine())

    # non-compile-time params can't be changed by new engines;
    # the new engine's compile conf will be overwritten
    dag.run(NativeExecutionEngine({FUGUE_CONF_WORKFLOW_EXCEPTION_HIDE: "def"}))
Example #3
def test_yield(tmpdir):
    df = pd.DataFrame([[0, 0]], columns=["a", "b"])

    # schema: *
    def t(df: pd.DataFrame) -> pd.DataFrame:
        return df.assign(b=df.b + 1)

    dag = FugueWorkflow()
    dag.df(df).transform(t).yield_dataframe_as("x")
    result = dag.run()["x"]
    assert [[0, 1]] == result.as_array()

    dag1 = FugueWorkflow()
    dag1.df(df).transform(t).yield_file_as("x")
    dag1.run("", {FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)})

    dag2 = FugueWorkflow()
    dag2.df(dag1.yields["x"]).transform(t).yield_dataframe_as("y")
    result = dag2.run("",
                      {FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)})["y"]
    assert [[0, 2]] == result.as_array()

    dag3 = FugueWorkflow()
    dag3.df(dag2.yields["y"]).transform(t).yield_dataframe_as("z")
    result = dag3.run()["z"]
    assert [[0, 3]] == result.as_array()
Example #4
def test_process_stack_space(tmpdir):
    space1 = ss(LinearRegression, normalize=Grid(True, False))
    space2 = ss(LinearRegression, fit_intercept=Grid(True, False))
    dag = FugueWorkflow()
    result0 = build_sk_cv(
        space1,
        dag.df(_create_mock_data()),
        scoring="neg_mean_absolute_error",
        cv=2,
        label_col="l",
        feature_prefix="f_",
    ).tune(distributable=False, serialize_path=str(tmpdir))
    res0 = result0.process(_process_stack_space,
                           params=dict(keys=[], space=space2))
    res0.show()

    result1 = build_sk_cv(
        space1,
        dag.df(_create_mock_data()).partition(by=["p"]),
        scoring="neg_mean_absolute_error",
        cv=2,
        label_col="l",
        feature_prefix="f_",
    ).tune(distributable=False, serialize_path=str(tmpdir))
    res1 = result1.process(_process_stack_space,
                           params=dict(keys=["p"], space=space2))
    dag.run()

    assert 2 == len(res0.result.as_array())
    assert 8 == len(res1.result.as_array())
Example #5
def test_hyperband(tmpdir):
    def assert_metric(df: Iterable[Dict[str, Any]], metric: float,
                      ct: int) -> None:
        n = 0
        for row in df:
            if metric > 0:
                assert row[TUNE_REPORT_METRIC] == metric
            n += 1
        assert n == ct

    space = Space(a=Grid(0, 1, 2, 3))
    dag = FugueWorkflow()
    dataset = TuneDatasetBuilder(space, str(tmpdir)).build(dag)
    obj = F()
    res = optimize_by_hyperband(
        obj,
        dataset,
        plans=[
            [[1.0, 3], [1.0, 2], [1.0, 1], [1.0, 1]],
            [[2.0, 2], [1.0, 1], [1.0, 1]],
        ],
        checkpoint_path=str(tmpdir),
    )
    res.result().output(assert_metric, dict(metric=0.0, ct=2))
    res.result(1).output(assert_metric, dict(metric=1.0, ct=1))
    dag.run()
Example #6
def test_study(tmpdir):
    space = Space(a=Grid(-2, 0, 1))
    input_df = pd.DataFrame([[0, 1], [1, 1], [0, 2]], columns=["a", "b"])
    dag = FugueWorkflow()
    monitor = M()

    # no data partition
    builder = TuneDatasetBuilder(space,
                                 str(tmpdir)).add_df("b", dag.df(input_df))
    dataset = builder.build(dag, 1)
    for distributed in [True, False, None]:
        # min_better = True
        result = optimize_noniterative(
            objective=to_noniterative_objective(objective),
            dataset=dataset,
            distributed=distributed,
        )
        result.result()[[TUNE_REPORT, TUNE_REPORT_METRIC
                         ]].output(assert_metric,
                                   params=dict(metrics=[3.0, 4.0, 7.0]))
        result.result(2)[[TUNE_REPORT, TUNE_REPORT_METRIC
                          ]].output(assert_metric,
                                    params=dict(metrics=[3.0, 4.0]))

        # min_better = False
        result = optimize_noniterative(
            objective=to_noniterative_objective(objective, min_better=False),
            dataset=dataset,
            distributed=distributed,
        )
        result.result()[[TUNE_REPORT, TUNE_REPORT_METRIC
                         ]].output(assert_metric,
                                   params=dict(metrics=[-7.0, -4.0, -3.0]))
        result.result(2)[[TUNE_REPORT, TUNE_REPORT_METRIC
                          ]].output(assert_metric,
                                    params=dict(metrics=[-7.0, -4.0]))

    # with data partition
    builder = TuneDatasetBuilder(space, str(tmpdir)).add_df(
        "b",
        dag.df(input_df).partition_by("a"))
    dataset = builder.build(dag, 1)
    for distributed in [True, False, None]:
        result = optimize_noniterative(
            objective=to_noniterative_objective(objective),
            dataset=dataset,
            distributed=distributed,
            monitor=monitor,
        )
        result.result()[[TUNE_REPORT, TUNE_REPORT_METRIC]].output(
            assert_metric, params=dict(metrics=[2.0, 3.0, 6.0, 1.0, 2.0, 5.0]))
        result.result(1)[[TUNE_REPORT, TUNE_REPORT_METRIC
                          ]].output(assert_metric,
                                    params=dict(metrics=[1.0, 2.0]))

    dag.run()

    assert 3 * 3 * 2 == len(monitor._reports)
Example #7
def test_build_sk_cv(tmpdir):
    space = sum([
        ss(LinearRegression, fit_intercept=Grid(True, False)),
        ss(LinearRegression, normalize=Grid(True, False)),
    ])
    dag = FugueWorkflow()
    build_sk_cv(
        space,
        dag.df(_create_mock_data()),
        scoring="neg_mean_absolute_error",
        cv=4,
        label_col="l",
        feature_prefix="f_",
        save_path=str(tmpdir),
    ).tune(distributable=False, serialize_path=str(tmpdir)).show()
    dag.run()
Example #8
def test_workflow():
    builder = FugueWorkflow()

    a = builder.create_data([[0], [0], [1]], "a:int")
    raises(InvalidOperationError, lambda: a._task.copy())
    raises(InvalidOperationError, lambda: copy.copy(a._task))
    raises(InvalidOperationError, lambda: copy.deepcopy(a._task))
    a.show()
    a.show()

    raises(FugueWorkflowCompileError, lambda: builder.df(123))

    b = a.transform(mock_tf1, "*,b:int", pre_partition=dict(by=["a"]))
    b.show()
    builder.create_data([[0], [1]], "b:int").show()
    c = ArrayDataFrame([[100]], "a:int")
    builder.show(a, b, c)
    b = a.partition(by=["a"]).transform(mock_tf2).persist().broadcast()
    b.show()

    builder.run()
    df_eq(a.result, [[0], [0], [1]], "a:int")
    raises(TypeError, lambda: builder.run("abc"))
    builder.run(FugueWorkflowContext())
    df_eq(a.result, [[0], [0], [1]], "a:int")
    builder.run("NativeExecutionEngine")
    df_eq(b.result, [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
    df_eq(b.compute(), [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
    df_eq(b.compute(NativeExecutionEngine), [[0, 2], [0, 2], [1, 1]],
          "a:int,b:int")
Example #9
    def test_run_ibis_duck(self):
        def _test1(con: ibis.BaseBackend) -> ibis.Expr:
            tb = con.table("a")
            return tb

        def _test2(con: ibis.BaseBackend) -> ibis.Expr:
            tb = con.table("a")
            return tb.mutate(c=tb.a + tb.b)

        dag = FugueWorkflow()
        df = dag.df([[0, 1], [2, 3]], "a:long,b:long")
        res = run_ibis(_test1, ibis_engine="duck", a=df)
        res.assert_eq(df)
        df = dag.df([[0, 1], [2, 3]], "a:long,b:long")
        res = run_ibis(_test2, ibis_engine="duckdb", a=df)
        df2 = dag.df([[0, 1, 1], [2, 3, 5]], "a:long,b:long,c:long")
        res.assert_eq(df2)
        dag.run(NativeExecutionEngine())
Example #10
def test_out_transform(tmpdir):
    pdf = pd.DataFrame([[1, 10], [0, 0], [1, 1], [0, 20]], columns=["a", "b"])

    class T:
        def __init__(self):
            self.n = 0

        def f(self, df: Iterable[Dict[str, Any]]) -> None:
            self.n += 1

    t = T()
    out_transform(pdf, t.f)
    assert 1 == t.n

    t = T()
    out_transform(pdf, t.f, partition=dict(by=["a"]))
    assert 2 == t.n

    dag = FugueWorkflow()
    dag.df(pdf).yield_dataframe_as("x1")
    dag.df(pdf).yield_dataframe_as("x2")
    dag.run("", {FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)})

    t = T()
    out_transform(dag.yields["x1"], t.f)
    assert 1 == t.n

    t = T()
    out_transform(
        dag.yields["x2"],
        t.f,
        partition=dict(by=["a"]),
        engine_conf={FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)},
    )
    assert 2 == t.n

    # schema: *
    def f3(df: pd.DataFrame, called: callable) -> pd.DataFrame:
        called()
        return df

    cb = Callback()
    result = out_transform(pdf, f3, callback=cb.called)
    assert 1 == cb.ct
Example #11
def suggest_sk_model(
    space: Space,
    train_df: Any,
    scoring: str,
    serialize_path: str,
    cv: int = 5,
    feature_prefix: str = "",
    label_col: str = "label",
    save_model: bool = False,
    partition_keys: List[str] = _EMPTY_LIST,
    top_n: int = 1,
    visualize_top_n: int = 0,
    objective_runner: Optional[ObjectiveRunner] = None,
    distributable: Optional[bool] = None,
    execution_engine: Any = None,
) -> List[Dict[str, Any]]:
    e = make_execution_engine(execution_engine)
    model_path = serialize_path if save_model else ""

    dag = FugueWorkflow()
    df = dag.df(train_df)
    if len(partition_keys) > 0:
        df = df.partition(by=partition_keys)
    skcv = build_sk_cv(
        space=space,
        train_df=df,
        scoring=scoring,
        cv=cv,
        feature_prefix=feature_prefix,
        label_col=label_col,
        save_path=model_path,
    )
    result = skcv.tune(
        objective_runner=objective_runner,
        distributable=distributable,
        serialize_path=serialize_path,
        shuffle=True,
    ).persist()
    best = select_best(result, top=top_n) if top_n > 0 else result
    visualize_top(result, top=visualize_top_n)
    dag.run(e)
    return list(best.result.as_dict_iterable())
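
A minimal usage sketch for suggest_sk_model follows. The space helper ss is the same one used in the other examples in this listing; the imports, temp path, and toy data are assumptions rather than part of the source.

# Hedged usage sketch for suggest_sk_model; imports, paths, and data are assumptions.
import pandas as pd
from sklearn.linear_model import LinearRegression

# "ss" is the space-building helper used elsewhere in this listing (see Example #4).
space = ss(LinearRegression, fit_intercept=Grid(True, False))

train = pd.DataFrame(
    {"f_a": [0.0, 1.0, 2.0, 3.0], "f_b": [1.0, 0.0, 1.0, 0.0], "l": [0.5, 1.5, 2.5, 3.5]}
)

best = suggest_sk_model(
    space,
    train,
    scoring="neg_mean_absolute_error",
    serialize_path="/tmp/tune_temp",  # scratch dir for serialized data (and models if save_model=True)
    cv=2,
    feature_prefix="f_",
    label_col="l",
    top_n=1,
)
# "best" is a list of dicts describing the top-scoring configuration(s).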
Example #12
def suggest_sk_models_by_cv(
    space: Space,
    train_df: Any,
    scoring: str,
    cv: int = 5,
    temp_path: str = "",
    feature_prefix: str = "",
    label_col: str = "label",
    save_model: bool = False,
    partition_keys: Optional[List[str]] = None,
    top_n: int = 1,
    local_optimizer: Optional[NonIterativeObjectiveLocalOptimizer] = None,
    monitor: Any = None,
    stopper: Any = None,
    stop_check_interval: Any = None,
    distributed: Optional[bool] = None,
    execution_engine: Any = None,
    execution_engine_conf: Any = None,
) -> List[TrialReport]:
    dag = FugueWorkflow()
    dataset = TUNE_OBJECT_FACTORY.make_dataset(
        dag,
        space,
        df=train_df,
        partition_keys=partition_keys,
        temp_path=temp_path,
    )
    objective = SKCVObjective(
        scoring=scoring,
        cv=cv,
        feature_prefix=feature_prefix,
        label_col=label_col,
        checkpoint_path=temp_path if save_model else None,
    )
    study = optimize_noniterative(
        objective=objective,
        dataset=dataset,
        optimizer=local_optimizer,
        distributed=distributed,
        monitor=monitor,
        stopper=stopper,
        stop_check_interval=stop_check_interval,
    )
    study.result(top_n).yield_dataframe_as("result")

    rows = list(
        dag.run(
            execution_engine,
            conf=execution_engine_conf,
        )["result"].as_dict_iterable())
    return [
        from_base64(r[TUNE_REPORT])
        for r in sorted(rows, key=lambda r: r[TUNE_REPORT_METRIC])
    ]
Example #13
def test_transform_from_yield(tmpdir):
    # schema: *,x:int
    def f(df: pd.DataFrame) -> pd.DataFrame:
        return df.assign(x=1)

    dag = FugueWorkflow()
    dag.df([[0]], "a:int").yield_dataframe_as("x1")
    dag.df([[1]], "b:int").yield_dataframe_as("x2")
    dag.run("", {FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)})

    result = transform(dag.yields["x1"], f)
    assert isinstance(result, DataFrame)
    assert result.as_array(type_safe=True) == [[0, 1]]

    result = transform(
        dag.yields["x2"],
        f,
        engine_conf={FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)},
    )
    assert isinstance(result, DataFrame)
    assert result.as_array(type_safe=True) == [[1, 1]]
Example #14
def test_runtime_exception():
    if sys.version_info < (3, 7):
        return

    def tr(df: pd.DataFrame) -> pd.DataFrame:
        raise Exception

    def show(df):
        df.show()

    dag = FugueWorkflow()
    df = dag.df([[0]], "a:int")
    df = df.transform(tr, schema="*")
    show(df)

    try:
        dag.run()
    except:
        assert len(traceback.extract_tb(sys.exc_info()[2])) < 10

    try:
        dag.run("native", {FUGUE_CONF_WORKFLOW_EXCEPTION_OPTIMIZE: False})
    except:
        assert len(traceback.extract_tb(sys.exc_info()[2])) > 10

    try:
        dag.run("native", {FUGUE_CONF_WORKFLOW_EXCEPTION_HIDE: ""})
    except:
        assert len(traceback.extract_tb(sys.exc_info()[2])) > 10
Example #15
def test_dataset(tmpdir):
    space = Space(a=Grid(0, 1, 2, 3, 4), b=Grid(5, 6, 7, 8, 9))
    builder = TuneDatasetBuilder(space, str(tmpdir))

    dag = FugueWorkflow()
    dataset = builder.build(dag)
    ds = dataset.split([4, 1], 0)
    assert 2 == len(ds)
    ds[0].data.yield_dataframe_as("a")
    ds[1].data.yield_dataframe_as("b")
    res = dag.run()
    assert 25 == len(res["a"].as_array()) + len(res["b"].as_array())
    assert len(res["b"].as_array()) < 10
Example #16
def _run(dag: FugueWorkflow, execution_engine: Any,
         execution_engine_conf: Any) -> List[TrialReport]:
    try:
        rows = list(
            dag.run(
                execution_engine,
                conf=execution_engine_conf,
            )["result"].as_dict_iterable())
        return [
            from_base64(r[TUNE_REPORT])
            for r in sorted(rows, key=lambda r: r[TUNE_REPORT_METRIC])
        ]
    except FugueDataFrameError as e:
        raise e.__cause__ or e.__context__ or e
Example #17
def test_modified_exception():
    if sys.version_info < (3, 7):
        return

    def tr(df: pd.DataFrame) -> pd.DataFrame:
        raise Exception

    def show(df):
        df.show()

    def tt(df):
        __modified_exception__ = NotImplementedError()
        return df.transform(tr, schema="*")

    dag = FugueWorkflow()
    df = dag.df([[0]], "a:int")
    df = tt(df)
    show(df)

    try:
        dag.run()
    except Exception as ex:
        assert isinstance(ex.__cause__, NotImplementedError)
Example #18
def suggest_by_hyperband(
    objective: Any,
    space: Space,
    plans: List[List[Tuple[float, int]]],
    train_df: Any = None,
    temp_path: str = "",
    partition_keys: Optional[List[str]] = None,
    top_n: int = 1,
    monitor: Any = None,
    distributed: Optional[bool] = None,
    execution_engine: Any = None,
    execution_engine_conf: Any = None,
) -> List[TrialReport]:
    assert_or_throw(
        not space.has_random_parameter,
        TuneCompileError("space can't contain random parameters, "
                         "use sample method before calling this function"),
    )
    dag = FugueWorkflow()
    dataset = TUNE_OBJECT_FACTORY.make_dataset(
        dag,
        space,
        df=train_df,
        partition_keys=partition_keys,
        temp_path=temp_path,
    )
    study = optimize_by_hyperband(
        objective=objective,
        dataset=dataset,
        plans=plans,
        checkpoint_path=temp_path,
        distributed=distributed,
        monitor=monitor,
    )
    study.result(top_n).yield_dataframe_as("result")

    rows = list(
        dag.run(
            execution_engine,
            conf=execution_engine_conf,
        )["result"].as_dict_iterable())
    return [
        TrialReport.from_jsondict(json.loads(r[TUNE_REPORT]))
        for r in sorted(rows, key=lambda r: r[TUNE_REPORT_METRIC])
    ]
Example #19
def suggest_sk_stacking_model(
    space: Space,
    stack_space: Space,
    train_df: Any,
    scoring: str,
    serialize_path: str,
    cv: int = 5,
    feature_prefix: str = "",
    label_col: str = "label",
    save_model: bool = False,
    partition_keys: List[str] = _EMPTY_LIST,
    top_n: int = 1,
    visualize_top_n: int = 0,
    objective_runner: Optional[ObjectiveRunner] = None,
    distributable: Optional[bool] = None,
    execution_engine: Any = None,
    stack_cv: int = 2,
    stack_method: str = "auto",
    stack_passthrough: bool = False,
) -> List[Dict[str, Any]]:
    e = make_execution_engine(execution_engine)
    model_path = serialize_path if save_model else ""

    dag = FugueWorkflow()
    df = dag.df(train_df)
    if len(partition_keys) > 0:
        df = df.partition(by=partition_keys)
    skcv = build_sk_cv(
        space=space,
        train_df=df,
        scoring=scoring,
        cv=cv,
        feature_prefix=feature_prefix,
        label_col=label_col,
    )
    result = skcv.tune(
        objective_runner=objective_runner,
        distributable=distributable,
        serialize_path=serialize_path,
        shuffle=True,
    ).persist()
    best_models = select_best(result.transform(_extract_model), top=1)
    if top_n > 0:
        best_models = select_best(best_models.drop(["_sk__model"]), top=top_n)
    kwargs = Space(
        _sk__scoring=scoring,
        _sk__cv=cv,
        _sk__feature_prefix=feature_prefix,
        _sk__label_col=label_col,
        _sk__save_path=model_path,
        _sk__stack_cv=stack_cv,
        _sk__method=stack_method,
        _sk__passthrough=stack_passthrough,
    )
    space_df = best_models.process(
        _process_stack_space,
        params=dict(keys=partition_keys, space=stack_space * kwargs),
    )
    data = serialize_df(df, name="_sk__train_df", path=serialize_path)
    if len(partition_keys) > 0:
        data = data.inner_join(space_df.broadcast())
    else:
        data = data.cross_join(space_df.broadcast())
    result = tune(
        data,
        tunable=tunable(_sk_stack_cv),
        distributable=distributable,
        objective_runner=objective_runner,
    )
    best = select_best(result, top=1)
    visualize_top(result, top=visualize_top_n)
    dag.run(e)
    return list(best.result.as_dict_iterable())