Example No. 1
def test_tune_simple():
    def t1(a: int, b: int) -> float:
        return a + b

    for distributable in [True, False, None]:
        with FugueWorkflow() as dag:
            df = space_to_df(dag, Space(a=Grid(0, 1), b=Grid(2, 3)))
            tune(df, t1, distributable=distributable).show()

    @tunable()
    def t2(e: ExecutionEngine, a: int, b: int) -> float:
        assert isinstance(e, ExecutionEngine)
        return a + b

    for distributable in [False, None]:
        with FugueWorkflow() as dag:
            df = space_to_df(dag, Space(a=Grid(0, 1), b=Grid(2, 3)))
            tune(df, t2, distributable=distributable).show()

    # equivalent syntactic sugar
    with FugueWorkflow() as dag:
        t2.space(a=Grid(0, 1), b=Grid(2, 3)).tune(dag).show()

    with raises(FugueTuneCompileError):
        with FugueWorkflow() as dag:
            df = space_to_df(dag, Space(a=Grid(0, 1), b=Grid(2, 3)))
            tune(df, t2, distributable=True).show()
Example No. 2
def test_space_to_df():
    with FugueWorkflow() as dag:
        df = space_to_df(dag, Space(a=Grid(0, 1), b=Grid(2, 3)))
        df.assert_eq(
            dag.df(
                [
                    ['[{"a": 0, "b": 2}]'],
                    ['[{"a": 0, "b": 3}]'],
                    ['[{"a": 1, "b": 2}]'],
                    ['[{"a": 1, "b": 3}]'],
                ],
                "__fmin_params__:str",
            ))

    with FugueWorkflow() as dag:
        df = space_to_df(dag,
                         Space(a=Grid(0, 1), b=Grid(2, 3)),
                         batch_size=3,
                         shuffle=False)
        df.assert_eq(
            dag.df(
                [
                    ['[{"a": 0, "b": 2}, {"a": 0, "b": 3}, {"a": 1, "b": 2}]'],
                    ['[{"a": 1, "b": 3}]'],
                ],
                "__fmin_params__:str",
            ))
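The four expected rows above are simply the cross product of the two grids, JSON-encoded one batch per row; with batch_size=3 and shuffle=False the same configurations are packed three per row plus a remainder row. A minimal pure-Python sketch of that expansion (an illustration only, not the fugue-tune implementation):

import json
from itertools import product

# cross product of Grid(0, 1) and Grid(2, 3), in the order shown above
grid = {"a": [0, 1], "b": [2, 3]}
configs = [dict(zip(grid, values)) for values in product(*grid.values())]

def to_rows(configs, batch_size=1):
    # each output row holds a JSON array with up to `batch_size` parameter dicts
    return [
        json.dumps(configs[i:i + batch_size])
        for i in range(0, len(configs), batch_size)
    ]

assert to_rows(configs) == [
    '[{"a": 0, "b": 2}]',
    '[{"a": 0, "b": 3}]',
    '[{"a": 1, "b": 2}]',
    '[{"a": 1, "b": 3}]',
]
assert len(to_rows(configs, batch_size=3)) == 2  # 3 configs + the remaining 1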
Example No. 3
def test_save_and_use():
    dag = FugueWorkflow()
    a = dag.create(mock_create1, params=dict(n=1))
    b = dag.create(mock_create1, params=dict(n=1))
    a = a.save_and_use("xx", fmt="parquet", mode="overwrite")
    b.save_and_use("xx", mode="append")
    b.save_and_use("xx", mode="error")
    a = a.save_and_use("xx.csv",
                       fmt="csv",
                       mode="error",
                       single=True,
                       header=True)
    a = a.partition(by=["x"]).save_and_use("xx", mode="overwrite")
    dag.create(mock_create1, params=dict(n=2)).save_and_use("xx",
                                                            mode="overwrite")
    assert_eq(
        """
    a=create using mock_create1(n=1)
    b=create using mock_create1(n=1)
    a=save and use a overwrite parquet "xx"
    save and use b append "xx"
    save and use b to "xx"
    save and use a to single csv "xx.csv"(header=True)
    save and use prepartition by x overwrite "xx"
    save and use (create using mock_create1(n=2)) overwrite "xx"
    """,
        dag,
    )
Example No. 4
def test_hyperband(tmpdir):
    def assert_metric(df: Iterable[Dict[str, Any]], metric: float,
                      ct: int) -> None:
        n = 0
        for row in df:
            if metric > 0:
                assert row[TUNE_REPORT_METRIC] == metric
            n += 1
        assert n == ct

    space = Space(a=Grid(0, 1, 2, 3))
    dag = FugueWorkflow()
    dataset = TuneDatasetBuilder(space, str(tmpdir)).build(dag)
    obj = F()
    res = optimize_by_hyperband(
        obj,
        dataset,
        plans=[
            [[1.0, 3], [1.0, 2], [1.0, 1], [1.0, 1]],
            [[2.0, 2], [1.0, 1], [1.0, 1]],
        ],
        checkpoint_path=str(tmpdir),
    )
    res.result().output(assert_metric, dict(metric=0.0, ct=2))
    res.result(1).output(assert_metric, dict(metric=1.0, ct=1))
    dag.run()
Example No. 5
def test_visualize_top_n(tmpdir):
    def t1(a: int, b: int) -> float:
        return a + b

    with FugueWorkflow() as dag:
        df = space_to_df(dag, Space(a=Grid(0, 1), b=Grid(2, 3)))
        visualize_top_n(tune(df, t1, distributable=False), top=2)

    @tunable()
    def t2(df1: pd.DataFrame, df2: pd.DataFrame, a: int,
           b: int) -> Dict[str, Any]:
        return {
            "error": float(a + b + df1["y"].sum() + df2["y"].sum()),
            "metadata": {
                "a": a
            },
        }

    e = NativeExecutionEngine(conf={FUGUE_TUNE_TEMP_PATH: str(tmpdir)})
    with FugueWorkflow(e) as dag:
        df1 = dag.df([[0, 1], [1, 2], [0, 2]],
                     "x:int,y:int").partition(by=["x"])
        df2 = dag.df([[0, 10], [1, 20]], "x:int,y:int").partition(by=["x"])
        res = t2.space(df1=df1, df2=df2, a=Grid(0, 1), b=Grid(2, 3)).tune()
        visualize_top_n(res, top=2)
Example No. 6
def test_input_module():
    # pylint: disable=no-value-for-parameter
    @module()
    def input1(wf: FugueWorkflow) -> WorkflowDataFrame:
        return wf.df([[0]], "a:int")

    @module()
    def input2(wf: FugueWorkflow, a: int) -> WorkflowDataFrame:
        return wf.df([[a]], "a:int")

    @module()
    def input3(wf: FugueWorkflow, a: int, b: int) -> WorkflowDataFrames:
        return WorkflowDataFrames(a=wf.df([[a]], "a:int"),
                                  b=wf.df([[b]], "b:int"))

    assert not input1.has_input
    assert input1.has_single_output
    assert not input1.has_no_output

    assert input3.has_multiple_output

    with FugueWorkflow() as dag:
        input1(dag).assert_eq(dag.df([[0]], "a:int"))

    with FugueWorkflow() as dag:
        input2(a=10, wf=dag).assert_eq(dag.df([[10]], "a:int"))

    with FugueWorkflow() as dag:
        dfs = input3(dag, 10, 11)
        dfs["a"].assert_eq(dag.df([[10]], "a:int"))
        dfs["b"].assert_eq(dag.df([[11]], "b:int"))
Example No. 7
def make_dataset(
    self,
    dag: FugueWorkflow,
    dataset: Any,
    df: Any = None,
    df_name: str = TUNE_DATASET_DF_DEFAULT_NAME,
    test_df: Any = None,
    test_df_name: str = TUNE_DATASET_VALIDATION_DF_DEFAULT_NAME,
    partition_keys: Optional[List[str]] = None,
    temp_path: str = "",
) -> TuneDataset:
    assert_or_throw(dataset is not None,
                    TuneCompileError("dataset can't be None"))
    if isinstance(dataset, TuneDataset):
        assert_or_throw(
            df is None,
            TuneCompileError("can't set df when dataset is TuneDataset"))
        return dataset
    if isinstance(dataset, Space):
        path = self.get_path_or_temp(temp_path)
        builder = TuneDatasetBuilder(dataset, path)
        if df is not None:
            wdf = dag.df(df)
            if partition_keys is not None and len(partition_keys) > 0:
                wdf = wdf.partition_by(*partition_keys)
            builder.add_df(df_name, wdf)
        if test_df is not None:
            wdf = dag.df(test_df)
            how = "cross"
            if partition_keys is not None and len(partition_keys) > 0:
                wdf = wdf.partition_by(*partition_keys)
                how = "inner"
            builder.add_df(test_df_name, wdf, how=how)
        return builder.build(dag, batch_size=1, shuffle=True)
    raise TuneCompileError(f"{dataset} can't be converted to TuneDataset")
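make_dataset accepts either an existing TuneDataset, which is returned unchanged (df must then be None), or a Space, optionally paired with training and validation dataframes that get partitioned by partition_keys. A hedged usage sketch; the import paths, temp path, and sample data are assumptions, but the call shape mirrors suggest_sk_models_by_cv further below:

import pandas as pd
from fugue import FugueWorkflow
from tune import TUNE_OBJECT_FACTORY, Grid, Space  # assumed import paths

dag = FugueWorkflow()
train_df = pd.DataFrame({"x": [0, 1, 2], "label": [0.0, 1.0, 2.0]})

# Space + raw dataframe: a TuneDataset is built (batch_size=1, shuffled)
dataset = TUNE_OBJECT_FACTORY.make_dataset(
    dag,
    Space(a=Grid(0, 1), b=Grid(2, 3)),
    df=train_df,
    temp_path="/tmp/tune",  # any writable path; get_path_or_temp falls back to a configured default when empty
)

# an existing TuneDataset passes through unchanged; combining it with df
# would raise TuneCompileError, as enforced above
same = TUNE_OBJECT_FACTORY.make_dataset(dag, dataset)
assert same is dataset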
Example No. 8
def test_run_asha(tmpdir):
    class M(Monitor):
        def on_report(self, report: TrialReport) -> None:
            print(report.jsondict)

    def assert_metric(df: Iterable[Dict[str, Any]], metric: float,
                      ct: int) -> None:
        n = 0
        for row in df:
            assert row[TUNE_REPORT_METRIC] == metric
            n += 1
        assert n == ct

    space = Space(a=Grid(0, 1, 2, 3))
    dag = FugueWorkflow()
    dataset = TuneDatasetBuilder(space, str(tmpdir)).build(dag, shuffle=False)
    obj = F()
    res = optimize_by_continuous_asha(
        obj,
        dataset,
        plan=[[1.0, 3], [1.0, 2], [1.0, 1], [1.0, 1]],
        checkpoint_path=str(tmpdir),
    )
    res.result(1).output(assert_metric, dict(metric=1.0, ct=1))

    res = optimize_by_continuous_asha(
        obj,
        dataset,
        plan=[[2.0, 2], [1.0, 1], [1.0, 1]],
        checkpoint_path=str(tmpdir),
        monitor=M(),
    )
    res.result(1).output(assert_metric, dict(metric=1.0, ct=1))
    dag.run()
Example No. 9
def test_create():
    id0 = FugueWorkflow().df([[0]], "a:int32").workflow.spec_uuid()
    id1 = FugueWorkflow().df([[1]], "a:int32").workflow.spec_uuid()
    id2 = FugueWorkflow().df([[1]], "a:int32").workflow.spec_uuid()

    assert id1 != id0
    assert id1 == id2
Example No. 10
def assert_eq(expr, expected: FugueWorkflow):
    global_vars, local_vars = get_caller_global_local_vars()
    sql = FugueSQL(expr, "fugueLanguage", ignore_case=True, simple_assign=True)
    wf = FugueWorkflow()
    v = _Extensions(
        sql, FugueSQLHooks(), wf, global_vars=global_vars, local_vars=local_vars
    )
    obj = v.visit(sql.tree)
    assert expected.spec_uuid() == v.workflow.spec_uuid()
Example No. 11
def test_process_module():
    # pylint: disable=no-value-for-parameter
    def process(df: pd.DataFrame, d: int = 1) -> pd.DataFrame:
        df["a"] += d
        return df

    @module
    def p1(wf: FugueSQLWorkflow, df: WorkflowDataFrame) -> WorkflowDataFrame:
        return df.process(process)

    @module()
    def p2(wf: FugueWorkflow, dfs: WorkflowDataFrames,
           d: int) -> WorkflowDataFrames:
        return WorkflowDataFrames(
            {k: v.process(process, params={"d": d})
             for k, v in dfs.items()})

    @module(as_method=True, name="p4")
    def p3(df: WorkflowDataFrame) -> WorkflowDataFrame:
        return df.process(process)

    assert p1.has_input
    assert not p1.has_dfs_input
    assert p2.has_dfs_input

    with FugueSQLWorkflow() as dag:
        df = dag.df([[0]], "a:int")
        p1(df).assert_eq(dag.df([[1]], "a:int"))
        p1(dag, df).assert_eq(dag.df([[1]], "a:int"))
        p1(df=df).assert_eq(dag.df([[1]], "a:int"))
        p1(df=df, wf=dag).assert_eq(dag.df([[1]], "a:int"))

    with FugueWorkflow() as dag:
        dfs = WorkflowDataFrames(aa=dag.df([[0]], "a:int"),
                                 bb=dag.df([[10]], "a:int"))
        r = p2(dag, dfs, 1)
        r["aa"].assert_eq(dag.df([[1]], "a:int"))
        r["bb"].assert_eq(dag.df([[11]], "a:int"))

        r = p2(dfs, 1)
        r["aa"].assert_eq(dag.df([[1]], "a:int"))
        r["bb"].assert_eq(dag.df([[11]], "a:int"))

        r = p2(d=1, dfs=dfs, wf=dag)
        r["aa"].assert_eq(dag.df([[1]], "a:int"))
        r["bb"].assert_eq(dag.df([[11]], "a:int"))

        r = p2(d=1, dfs=dfs)
        r["aa"].assert_eq(dag.df([[1]], "a:int"))
        r["bb"].assert_eq(dag.df([[11]], "a:int"))

    with FugueWorkflow() as dag:
        df = dag.df([[0]], "a:int")
        p3(df).assert_eq(dag.df([[1]], "a:int"))
        p3(df=df).assert_eq(dag.df([[1]], "a:int"))
        df.p4().assert_eq(dag.df([[1]], "a:int"))
Example No. 12
def test_workflow_dataframes():
    dag1 = FugueWorkflow()
    df1 = dag1.df([[0]], "a:int")
    df2 = dag1.df([[0]], "b:int")
    dag2 = FugueWorkflow()
    df3 = dag2.df([[0]], "a:int")

    dfs1 = WorkflowDataFrames(a=df1, b=df2)
    assert dfs1["a"] is df1
    assert dfs1["b"] is df2

    dfs2 = WorkflowDataFrames(dfs1, aa=df1, bb=df2)
    assert 4 == len(dfs2)

    with raises(ValueError):
        WorkflowDataFrames(a=df1, b=df3)

    with raises(ValueError):
        WorkflowDataFrames(a=df1, b=ArrayDataFrame([[0]], "a:int"))

    dag = FugueWorkflow()
    df = dag.df([[0, 1], [1, 1]], "a:int,b:int")
    assert df.partition_spec.empty
    df2 = df.partition(by=["a"])
    assert df.partition_spec.empty
    assert df2.partition_spec == PartitionSpec(by=["a"])
    df3 = df.partition_by("a", "b")
    assert df.partition_spec.empty
    assert df3.partition_spec == PartitionSpec(by=["a", "b"])
    df4 = df.per_partition_by("a", "b")
    assert df.partition_spec.empty
    assert df4.partition_spec == PartitionSpec(by=["a", "b"], algo="even")
    df4 = df.per_row()
    assert df.partition_spec.empty
    assert df4.partition_spec == PartitionSpec("per_row")
Example No. 13
def test_runtime_exception():
    if sys.version_info < (3, 7):
        return

    def tr(df: pd.DataFrame) -> pd.DataFrame:
        raise Exception

    def show(df):
        df.show()

    dag = FugueWorkflow()
    df = dag.df([[0]], "a:int")
    df = df.transform(tr, schema="*")
    show(df)

    try:
        dag.run()
    except:
        assert len(traceback.extract_tb(sys.exc_info()[2])) < 10

    try:
        dag.run("native", {FUGUE_CONF_WORKFLOW_EXCEPTION_OPTIMIZE: False})
    except:
        assert len(traceback.extract_tb(sys.exc_info()[2])) > 10

    try:
        dag.run("native", {FUGUE_CONF_WORKFLOW_EXCEPTION_HIDE: ""})
    except:
        assert len(traceback.extract_tb(sys.exc_info()[2])) > 10
Example No. 14
def suggest_sk_models_by_cv(
    space: Space,
    train_df: Any,
    scoring: str,
    cv: int = 5,
    temp_path: str = "",
    feature_prefix: str = "",
    label_col: str = "label",
    save_model: bool = False,
    partition_keys: Optional[List[str]] = None,
    top_n: int = 1,
    local_optimizer: Optional[NonIterativeObjectiveLocalOptimizer] = None,
    monitor: Any = None,
    stopper: Any = None,
    stop_check_interval: Any = None,
    distributed: Optional[bool] = None,
    execution_engine: Any = None,
    execution_engine_conf: Any = None,
) -> List[TrialReport]:
    dag = FugueWorkflow()
    dataset = TUNE_OBJECT_FACTORY.make_dataset(
        dag,
        space,
        df=train_df,
        partition_keys=partition_keys,
        temp_path=temp_path,
    )
    objective = SKCVObjective(
        scoring=scoring,
        cv=cv,
        feature_prefix=feature_prefix,
        label_col=label_col,
        checkpoint_path=temp_path if save_model else None,
    )
    study = optimize_noniterative(
        objective=objective,
        dataset=dataset,
        optimizer=local_optimizer,
        distributed=distributed,
        monitor=monitor,
        stopper=stopper,
        stop_check_interval=stop_check_interval,
    )
    study.result(top_n).yield_dataframe_as("result")

    rows = list(
        dag.run(
            execution_engine,
            conf=execution_engine_conf,
        )["result"].as_dict_iterable())
    return [
        from_base64(r[TUNE_REPORT])
        for r in sorted(rows, key=lambda r: r[TUNE_REPORT_METRIC])
    ]
Example No. 15
def test_transform():
    w = (FugueWorkflow().df([[0], [1]], "a:int",
                            data_determiner=to_uuid).transform(
                                mock_transformer,
                                schema=Schema("a:int"),
                                params=dict(n=2)))
    assert_eq(
        """
    create [[0],[1]] schema a:int
    transform using mock_transformer(n=2) schema a:int
    """,
        w.workflow,
    )

    w = (FugueWorkflow().df([[0], [1]], "a:int",
                            data_determiner=to_uuid).partition(
                                by=["a"], presort="b DESC",
                                num="ROWCOUNT/2").transform(mock_transformer,
                                                            schema="*",
                                                            params=dict(n=2)))
    assert_eq(
        """
    create [[0],[1]] schema a:int

    transform
        prepartition ROWCOUNT / 2 by a presort b desc
        using mock_transformer(n=2) schema *
    """,
        w.workflow,
    )

    def _func(a: int, b: int) -> int:
        return a + b

    w = (FugueWorkflow().df([[0], [1]], "a:int",
                            data_determiner=to_uuid).partition(
                                by=["a"], presort="b DESC",
                                num="ROWCOUNT/2").transform(mock_transformer,
                                                            schema="*",
                                                            params=dict(n=2),
                                                            callback=_func))
    assert_eq(
        """
    create [[0],[1]] schema a:int

    transform
        prepartition ROWCOUNT / 2 by a presort b desc
        using mock_transformer(n=2) schema *
        callback _func
    """,
        w.workflow,
    )
Example No. 16
def test_checkpoint():
    id0 = FugueWorkflow().df([[0]], "a:int32").workflow.spec_uuid()
    id1 = FugueWorkflow().df([[0]],
                             "a:int32").checkpoint().workflow.spec_uuid()
    id2 = FugueWorkflow().df([[0]],
                             "a:int32").checkpoint("1").workflow.spec_uuid()
    id3 = FugueWorkflow().df([[0]],
                             "a:int32").checkpoint(1).workflow.spec_uuid()

    assert id1 != id0
    assert id1 != id2
    assert id2 != id0
    assert id2 == id3
Example No. 17
def test_dataset(tmpdir):
    space = Space(a=Grid(0, 1, 2, 3, 4), b=Grid(5, 6, 7, 8, 9))
    builder = TuneDatasetBuilder(space, str(tmpdir))

    dag = FugueWorkflow()
    dataset = builder.build(dag)
    ds = dataset.split([4, 1], 0)
    assert 2 == len(ds)
    ds[0].data.yield_dataframe_as("a")
    ds[1].data.yield_dataframe_as("b")
    res = dag.run()
    assert 25 == len(res["a"].as_array()) + len(res["b"].as_array())
    assert len(res["b"].as_array()) < 10
Example No. 18
def trim_index(
    compute_engine: FugueExecutionEngine,
    df_graph: FugueDataFrame,
    indexed: bool = False,
    directed: bool = True,
    max_out_deg: int = 0,
    random_seed: Optional[int] = None,
) -> Tuple[FugueDataFrame, Optional[FugueDataFrame]]:
    """
    The first steps to prepare the input graph:

    1) basic validation of the input format: the graph must have at least the
       ["src", "dst"] columns; it is treated as an unweighted graph if there is
       no "weight" column.
    2) trim edges to avoid super-hotspot vertices: the edges of a vertex are
       randomly down-sampled when their count exceeds a threshold; this is
       critical to reduce data skew and save disk space.
    3) index the graph vertices with sequential integers; this is critical to
       save memory.

    :param compute_engine: an execution engine supported by Fugue
    :param df_graph: the input graph data as a general Fugue dataframe
    :param indexed: whether the input graph already uses sequential integers to
                    denote vertices
    :param directed: whether the graph is directed
    :param max_out_deg: the threshold for trimming hotspot vertices; set it
                        to <= 0 to turn off trimming
    :param random_seed: optional random seed, for testing only
    :return: the validated, trimmed, and indexed graph, plus the vertex
             name-to-index mapping (None if the input was already indexed)
    """
    logging.info("trim_index(): start validating, trimming, and indexing ...")
    if "src" not in df_graph.schema or "dst" not in df_graph.schema:
        raise ValueError(
            f"Input graph NOT in the right format: {df_graph.schema}")

    params = {"max_out_degree": max_out_deg, "random_seed": random_seed}
    dag = FugueWorkflow(compute_engine)
    df = (dag.df(df_graph).partition(by=["src"]).transform(
        trim_hotspot_vertices,
        schema="*",
        params=params,
    ).compute())

    name_id = None
    if indexed is True:
        return df, name_id
    if isinstance(compute_engine, SparkExecutionEngine):
        df_res, name_id = index_graph_spark(df.native,
                                            directed)  # type: ignore
        return SparkDataFrame(df_res), SparkDataFrame(name_id)
    else:
        df_res, name_id = index_graph_pandas(df.as_pandas(), directed)
        return PandasDataFrame(df_res), PandasDataFrame(name_id)
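Step 2 of the docstring, trimming hotspot vertices, is what fights data skew: when a source vertex has more outgoing edges than max_out_deg, its edges are randomly down-sampled. A pandas-only sketch of that idea (an illustration using a made-up helper name and the same <= 0 convention, not the trim_hotspot_vertices transformer used above):

import pandas as pd

def trim_hotspots(edges: pd.DataFrame, max_out_deg: int, seed: int = 0) -> pd.DataFrame:
    # hypothetical helper: keep at most max_out_deg outgoing edges per "src" vertex
    if max_out_deg <= 0:  # trimming turned off, mirroring the convention above
        return edges

    def sample_group(g: pd.DataFrame) -> pd.DataFrame:
        return g if len(g) <= max_out_deg else g.sample(n=max_out_deg, random_state=seed)

    return edges.groupby("src", group_keys=False).apply(sample_group)

edges = pd.DataFrame({"src": [0, 0, 0, 1], "dst": [1, 2, 3, 2], "weight": [1.0] * 4})
trimmed = trim_hotspots(edges, max_out_deg=2)
assert trimmed.groupby("src").size().max() <= 2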
Example No. 19
def test_out_transform():
    class OT(OutputTransformer):
        def process(self, df):
            return

    o = _to_output_transformer(OT)
    w = FugueWorkflow()
    w.df([[0], [1]], "a:int", data_determiner=to_uuid).out_transform(
        o, params=dict(n=2)
    )
    assert_eq(
        """
    create [[0],[1]] schema a:int
    outtransform using OT(n=2)
    """,
        w,
    )

    w = FugueWorkflow()
    w.df([[0], [1]], "a:int", data_determiner=to_uuid).partition(
        by=["a"], presort="b DESC", num="ROWCOUNT/2"
    ).out_transform(mock_transformer, params=dict(n=2))
    assert_eq(
        """
    create [[0],[1]] schema a:int

    outtransform
        prepartition ROWCOUNT / 2 by a presort b desc
        using mock_transformer(n=2)
    """,
        w,
    )
Example No. 20
def test_select_with():
    dag = FugueWorkflow()
    dag.select("with x as ( select * from a ) , y as ( select * from b ) "
               "select * from x union select * from y")
    assert_eq(
        """
    with
        x as (select * from a),
        y as (select * from b)
    select *   from x union select * from y

    """,
        dag,
    )
Example No. 21
def test_rename():
    dag = FugueWorkflow()
    a = dag.create(mock_create1)
    b = a.rename({"a": "aa", "b": "bb"})
    c = a.rename({"a": "aaa", "b": "bbb"})

    assert_eq(
        """
    a=create using mock_create1
    rename columns a:aa,b:bb
    rename columns a:aaa,b:bbb from a
    """,
        dag,
    )
Example No. 22
def test_alter_columns():
    dag = FugueWorkflow()
    a = dag.create(mock_create1)
    a.alter_columns(Schema("a:str,b:str"))
    a.alter_columns(Schema("a:float,b:double"))

    assert_eq(
        """
    a=create using mock_create1
    alter columns a:str, b:str
    alter columns a:float, b:double from a
    """,
        dag,
    )
Example No. 23
def test_sample():
    dag = FugueWorkflow()
    a = dag.create(mock_create1)
    a.sample(frac=0.1, replace=False, seed=None)
    a.sample(n=5, replace=True, seed=7)

    assert_eq(
        """
    a=create using mock_create1
    sample 10 percent
    sample replace 5 rows seed 7 from a
    """,
        dag,
    )
Example No. 24
def test_workflow_determinism_2():
    dag1 = FugueWorkflow()
    dag1.create_data([[0], [0], [1]], "a:int32")  # <---- extra create; the only difference from dag2
    a1 = dag1.create_data([[0], [0], [1]], "a:int32")
    b1 = a1.transform(mock_tf1, "*,b:int", pre_partition=dict(by=["a"], num=2))
    a1.show()

    dag2 = FugueWorkflow()
    a2 = dag2.create_data([[0], [0], [1]], "a:int32")
    b2 = a2.transform(mock_tf1, "*,b:int", pre_partition=dict(by=["a"], num=2))
    a2.show()

    assert a1.spec_uuid() == a2.spec_uuid()
    assert b1.spec_uuid() == b2.spec_uuid()
    assert dag1.spec_uuid() != dag2.spec_uuid()
Example No. 25
def test_cotransform():
    dag = FugueWorkflow()
    a1 = dag.create(mock_create1, params=dict(n=1))
    a2 = dag.create(mock_create1, params=dict(n=2))
    z = dag.zip(a1, a2)
    t = z.partition(num=3).transform(mock_cotransformer1, params=dict(n=3))
    assert_eq(
        """
    zip
        (create using mock_create1 params n:1),
        (create using mock_create1 params n:2)
    transform prepartition 3 using mock_cotransformer1(n=3)
    """,
        dag,
    )
Example No. 26
def test_load():
    dag = FugueWorkflow()
    dag.load("xx")
    dag.load("xx", fmt="csv")
    dag.load("xx", columns="a:int,b:str")
    dag.load("xx", columns=["a", "b"], header=True)
    assert_eq(
        """
    load "xx"
    load csv "xx"
    load "xx" columns a:int, b:str
    load "xx"(header=True) columns a, b
    """,
        dag,
    )
Example No. 27
def test_wf():
    @tunable()
    def func(a: float, b: float, c: int, d: int) -> float:
        return a * a + b * b + c + d

    with FugueWorkflow() as dag:
        space = space_to_df(
            dag,
            Space(a=Grid(1, 2), b=Rand(-100, 100), c=Choice(1, -1), d=RandInt(0, 3)),
        )
        tune(space, func, objective_runner=HyperoptRunner(100, seed=3)).show()

    with FugueWorkflow() as dag:
        space = space_to_df(dag, Space(a=Grid(1, 2), b=Grid(0, 1), c=1, d=2))
        tune(space, func, objective_runner=HyperoptRunner(100, seed=3)).show()
Example No. 28
def test_output_module():
    # pylint: disable=no-value-for-parameter

    @module()
    def o1(wf: FugueWorkflow, df: WorkflowDataFrame) -> None:
        pass

    @module()
    def o2(wf: FugueWorkflow, df: WorkflowDataFrame):
        pass

    @module()
    def o3(df: WorkflowDataFrame):
        pass

    assert o1.has_input
    assert o1.has_no_output
    assert o2.has_no_output
    assert o3.has_no_output

    with FugueWorkflow() as dag:
        df = dag.df([[0]], "a:int")
        o1(df)
        o1(dag, df)
        o2(df=df)
        o2(df=df, wf=dag)
        o3(df)
Example No. 29
    def outputter(df: LocalDataFrame) -> None:
        keys = [
            k for k in df.schema.names
            if not k.startswith("__df_") and not k.startswith("__fmin_")
        ]

        def show(subdf: pd.DataFrame) -> None:
            if subdf.shape[0] == 0:  # pragma: no cover
                return
            subdf = subdf.sort_values("__fmin_value__").head(top)
            title = (json.dumps({k: str(subdf[k].iloc[0])
                                 for k in keys}) if len(keys) > 0 else "")
            pdf = pd.DataFrame(
                [json.loads(x) for x in subdf["__fmin_params__"]])
            fig = plt.figure(figsize=(12, 3 * len(pdf.columns)))
            if len(keys) > 0:
                fig.suptitle(
                    title,
                    va="center",
                    size=15,
                    weight="bold",
                    y=0.93,
                )
            for i in range(len(pdf.columns)):
                ax = fig.add_subplot(len(pdf.columns), 1, i + 1)
                pdf[pdf.columns[i]].hist(ax=ax).set_title(pdf.columns[i])
                plt.subplots_adjust(hspace=0.5)

        if len(keys) == 0:
            show(df.as_pandas())
        else:
            with FugueWorkflow() as dag:
                dag.df(df).partition(by=keys).out_transform(show)
Example No. 30
def _process_stack_space(engine: ExecutionEngine, df: DataFrame,
                         keys: List[str], space: Space) -> DataFrame:
    fe_schema = df.schema.extract(keys) + "__fmin_fe__:str"

    def _merge_space(df: List[Dict[str, Any]]) -> Iterable[Dict[str, Any]]:
        p = json.dumps([json.loads(row["__fmin_params__"]) for row in df])
        res = df[0]
        res["__fmin_fe__"] = p
        yield res

    # schema: *-__fmin_fe__
    def _construct_final_space(
            df: Iterable[Dict[str, Any]]) -> Iterable[Dict[str, Any]]:
        for row in df:
            op = json.loads(row["__fmin_params__"])
            for o in op:
                o["_sk__estimators"] = row["__fmin_fe__"]
            row["__fmin_params__"] = json.dumps(op)
            yield row

    with FugueWorkflow(engine) as dag:
        ddf = dag.df(df)
        space_df = space_to_df(dag, space).broadcast()
        if len(keys) == 0:
            fe = ddf.process(_merge_space, schema=fe_schema)
        else:
            fe = ddf.partition(by=keys).transform(_merge_space,
                                                  schema=fe_schema)
        result = fe.cross_join(space_df).transform(_construct_final_space)

    return result.result
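Stripped of the Fugue plumbing, the two row-wise helpers above just rewrite JSON strings: _merge_space collapses each partition into one row whose __fmin_fe__ column packs all candidate parameter lists, and _construct_final_space, applied after the cross join, attaches that packed list to every stacking configuration as "_sk__estimators". A tiny pure-Python sketch with made-up row contents:

import json

partition_rows = [
    {"x": 1, "__fmin_params__": '[{"model": "lr"}]'},
    {"x": 1, "__fmin_params__": '[{"model": "rf"}]'},
]

# _merge_space: keep the first row of the partition and pack every candidate's
# params into a single __fmin_fe__ JSON string
merged = dict(partition_rows[0])
merged["__fmin_fe__"] = json.dumps(
    [json.loads(r["__fmin_params__"]) for r in partition_rows])

# _construct_final_space: each stacking configuration produced by the cross
# join gets the packed estimator list attached as "_sk__estimators"
row = {"__fmin_params__": '[{"method": "mean"}]',
       "__fmin_fe__": merged["__fmin_fe__"]}
ops = json.loads(row["__fmin_params__"])
for o in ops:
    o["_sk__estimators"] = row["__fmin_fe__"]
row["__fmin_params__"] = json.dumps(ops)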