Пример #1
0
def test_worflow_dataframes():
    """Check WorkflowDataFrames construction and the partition helpers."""
    first_dag = FugueWorkflow()
    left = first_dag.df([[0]], "a:int")
    right = first_dag.df([[0]], "b:int")
    second_dag = FugueWorkflow()
    foreign = second_dag.df([[0]], "a:int")

    # Keyword construction hands back the very same dataframe objects.
    named = WorkflowDataFrames(a=left, b=right)
    assert named["a"] is left
    assert named["b"] is right

    # An existing collection plus two extra named entries gives four items.
    extended = WorkflowDataFrames(named, aa=left, bb=right)
    assert 4 == len(extended)

    # Mixing dataframes created by two different workflows must raise.
    with raises(ValueError):
        WorkflowDataFrames(a=left, b=foreign)

    # An ArrayDataFrame that was not created through a workflow must raise.
    with raises(ValueError):
        WorkflowDataFrames(a=left, b=ArrayDataFrame([[0]], "a:int"))

    # Each partition helper returns a dataframe carrying the new spec while
    # the source dataframe keeps its empty spec.
    dag = FugueWorkflow()
    base = dag.df([[0, 1], [1, 1]], "a:int,b:int")
    assert base.partition_spec.empty

    by_keyword = base.partition(by=["a"])
    assert base.partition_spec.empty
    assert by_keyword.partition_spec == PartitionSpec(by=["a"])

    by_columns = base.partition_by("a", "b")
    assert base.partition_spec.empty
    assert by_columns.partition_spec == PartitionSpec(by=["a", "b"])

    even_by_columns = base.per_partition_by("a", "b")
    assert base.partition_spec.empty
    assert even_by_columns.partition_spec == PartitionSpec(
        by=["a", "b"], algo="even")

    row_wise = base.per_row()
    assert base.partition_spec.empty
    assert row_wise.partition_spec == PartitionSpec("per_row")
Пример #2
0
 def visitFugueModuleTask(self, ctx: fp.FugueModuleTaskContext) -> None:
     """Run a module statement and record its output, if it has one.

     The module is resolved from the ``using`` clause and then called with
     the dataframe collection, the workflow, positional dataframes or
     keyword dataframes, depending on what the module declares and on how
     the input dataframes were supplied.
     """
     parsed = self.get_dict(ctx, "assign", "dfs", "using", "params")
     module_fn = _to_module(
         parsed["using"],
         global_vars=self.global_vars,
         local_vars=self.local_vars,
     )
     target = parsed["assign"][0] if "assign" in parsed else None
     if target is not None:
         # An assignment is only valid when the module produces output.
         assert_or_throw(
             module_fn.has_single_output or module_fn.has_multiple_output,
             FugueSQLSyntaxError(
                 "invalid assignment for module without output"),
         )
     # Pick the input dataframes: none for input-less modules, otherwise
     # the explicit ones or, failing that, the last dataframe.
     if not module_fn.has_input:
         inputs = WorkflowDataFrames()
     elif "dfs" in parsed:
         inputs = parsed["dfs"]
     else:
         inputs = WorkflowDataFrames(self.last)
     kwargs = parsed["params"] if "params" in parsed else {}
     if module_fn.has_dfs_input:
         outcome = module_fn(inputs, **kwargs)
     elif len(inputs) == 0:
         outcome = module_fn(self.workflow, **kwargs)
     elif len(inputs) == 1 or not inputs.has_key:
         outcome = module_fn(*inputs.values(), **kwargs)
     else:
         outcome = module_fn(**inputs, **kwargs)
     if module_fn.has_single_output or module_fn.has_multiple_output:
         # NOTE(review): without an assignment ``target`` is None, so the
         # result is stored under the key None - confirm this is intended.
         self.variables[target] = outcome
     if module_fn.has_single_output:
         self._last = outcome
Пример #3
0
def serialize_dfs(dfs: WorkflowDataFrames,
                  how: str = "inner",
                  path="") -> WorkflowDataFrame:
    """Serialize every named dataframe and join the serialized results.

    :param dfs: dataframe collection; it must have keys
    :param how: join type handed to the workflow ``join``, defaults to
      ``inner``
    :param path: passed through unchanged to ``serialize_df``
    :returns: the result of joining all serialized dataframes on the
      workflow of the first dataframe in ``dfs``
    """
    assert_or_throw(dfs.has_key, "all datarames must be named")
    serialized_by_name = {}
    for name, frame in dfs.items():
        serialized_by_name[name] = serialize_df(frame, name, path)
    serialized = WorkflowDataFrames(serialized_by_name)
    # The join is issued on the workflow that owns the first dataframe.
    workflow: FugueWorkflow = dfs.get_value_by_index(0).workflow
    return workflow.join(serialized, how=how)
Пример #4
0
def test_process_module():
    """Exercise ``@module`` functions that take and return dataframes.

    Three decorated functions are called with positional and keyword
    arguments, with and without the workflow argument, and the results are
    compared with the expected dataframes.
    """
    # pylint: disable=no-value-for-parameter
    def process(df: pd.DataFrame, d: int = 1) -> pd.DataFrame:
        # Adds ``d`` to column "a" on the incoming frame and returns it.
        df["a"] += d
        return df

    # Bare decorator form; takes the workflow plus a single dataframe.
    @module
    def p1(wf: FugueSQLWorkflow, df: WorkflowDataFrame) -> WorkflowDataFrame:
        return df.process(process)

    # Called decorator form; takes a whole dataframe collection and
    # returns a collection with the same keys.
    @module()
    def p2(wf: FugueWorkflow, dfs: WorkflowDataFrames,
           d: int) -> WorkflowDataFrames:
        return WorkflowDataFrames(
            {k: v.process(process, params={"d": d})
             for k, v in dfs.items()})

    # Declared with ``as_method=True`` and ``name="p4"``; the last section
    # of this test calls it as ``df.p4()``.
    @module(as_method=True, name="p4")
    def p3(df: WorkflowDataFrame) -> WorkflowDataFrame:
        return df.process(process)

    assert p1.has_input
    assert not p1.has_dfs_input
    assert p2.has_dfs_input

    # p1: the workflow may be omitted, given positionally or by keyword.
    with FugueSQLWorkflow() as dag:
        df = dag.df([[0]], "a:int")
        p1(df).assert_eq(dag.df([[1]], "a:int"))
        p1(dag, df).assert_eq(dag.df([[1]], "a:int"))
        p1(df=df).assert_eq(dag.df([[1]], "a:int"))
        p1(df=df, wf=dag).assert_eq(dag.df([[1]], "a:int"))

    # p2: same four calling conventions, applied to a named collection.
    with FugueWorkflow() as dag:
        dfs = WorkflowDataFrames(aa=dag.df([[0]], "a:int"),
                                 bb=dag.df([[10]], "a:int"))
        r = p2(dag, dfs, 1)
        r["aa"].assert_eq(dag.df([[1]], "a:int"))
        r["bb"].assert_eq(dag.df([[11]], "a:int"))

        r = p2(dfs, 1)
        r["aa"].assert_eq(dag.df([[1]], "a:int"))
        r["bb"].assert_eq(dag.df([[11]], "a:int"))

        r = p2(d=1, dfs=dfs, wf=dag)
        r["aa"].assert_eq(dag.df([[1]], "a:int"))
        r["bb"].assert_eq(dag.df([[11]], "a:int"))

        r = p2(d=1, dfs=dfs)
        r["aa"].assert_eq(dag.df([[1]], "a:int"))
        r["bb"].assert_eq(dag.df([[11]], "a:int"))

    # p3: callable as a function and as the dataframe method ``p4``.
    with FugueWorkflow() as dag:
        df = dag.df([[0]], "a:int")
        p3(df).assert_eq(dag.df([[1]], "a:int"))
        p3(df=df).assert_eq(dag.df([[1]], "a:int"))
        df.p4().assert_eq(dag.df([[1]], "a:int"))
Пример #5
0
 def add_dfs(self,
             dfs: WorkflowDataFrames,
             how: str = "") -> "TuneDatasetBuilder":
     """Add every dataframe of a named collection via ``add_df``.

     :param dfs: dataframe collection; it must have keys, which are used
       as the dataframe names
     :param how: join type forwarded to ``add_df``; it is not forwarded
       while ``self._dfs_spec`` is still empty
     :returns: the builder itself
     """
     assert_or_throw(dfs.has_key, "all datarames must be named")
     for name, frame in dfs.items():
         # The emptiness check is repeated on every iteration on purpose:
         # only a dataframe added while nothing is registered skips ``how``.
         if len(self._dfs_spec) > 0:
             self.add_df(name, frame, how=how)
         else:
             self.add_df(name, frame)
     return self
Пример #6
0
 def visitFuguePrintTask(self, ctx: fp.FuguePrintTaskContext) -> None:
     """Show the given dataframes, or the last dataframe when none given.

     Optional ``rows``, ``count`` and ``title`` parts of the statement are
     translated into the ``rows``, ``show_count`` and ``title`` keyword
     arguments of the workflow ``show`` call.
     """
     parsed = self.get_dict(ctx, "dfs")
     if "dfs" in parsed:
         frames = parsed["dfs"]
     else:
         frames = WorkflowDataFrames(self.last)
     options: Dict[str, Any] = {}
     if ctx.rows is not None:
         options["rows"] = int(self.ctxToStr(ctx.rows))
     if ctx.count is not None:
         options["show_count"] = True
     if ctx.title is not None:
         # NOTE(review): eval() runs the title text as Python code; confirm
         # the text can only ever be a quoted string literal.
         options["title"] = eval(self.ctxToStr(ctx.title))
     self.workflow.show(frames, **options)
Пример #7
0
def test_module():
    """Build a workflow from plain functions and check it against SQL.

    The same sequence of steps is written once as Python calls on ``dag``
    and once as a SQL script using ``sub ... using``; ``assert_eq`` is
    then given the script and ``dag``.
    """
    # pylint: disable=no-value-for-parameter

    # Takes only the workflow; produces a one-row dataframe holding ``n``.
    def create(wf: FugueWorkflow, n: int = 1) -> WorkflowDataFrame:
        return wf.df([[n]], "a:int")

    # Two dataframes in, a named collection out; the first key is ``k``.
    def merge(df1: WorkflowDataFrame,
              df2: WorkflowDataFrame,
              k: str = "aa") -> WorkflowDataFrames:
        return WorkflowDataFrames({k: df1, "bb": df2})

    # A collection in, the ``k``-th dataframe out.
    def merge2(wf: FugueWorkflow,
               dfs: WorkflowDataFrames,
               k: int = 0) -> WorkflowDataFrame:
        return dfs[k]

    # Two dataframes in, an unnamed collection out.
    def merge3(df1: WorkflowDataFrame,
               df2: WorkflowDataFrame) -> WorkflowDataFrames:
        return WorkflowDataFrames(df1, df2)

    # The only function decorated explicitly; it returns nothing.
    @module()
    def out1(wf: FugueWorkflow, df: WorkflowDataFrame) -> None:
        df.show()

    # Python version of the workflow. The statement order mirrors the SQL
    # script below line by line.
    dag = FugueWorkflow()
    a = create(dag)
    b = create(dag, n=2)
    dfs = merge(a, b, k="a1")
    dfs["a1"].show()
    dfs["bb"].show()
    df = merge2(dag, WorkflowDataFrames(a, b), k=1)
    out1(df)
    dfs = merge3(b, a)
    dfs[0].show()
    dfs[1].show()

    # SQL version of the same workflow. NOTE(review): ``assert_eq`` is
    # defined outside this snippet; presumably it builds a workflow from
    # the script and compares it with ``dag`` - confirm.
    assert_eq(
        """
    a=sub using create
    b=sub using create(n=2)
    dfs=sub a,b using merge(k="a1")
    print dfs[a1]
    print dfs[bb]
    sub a,b using merge2(k=1)
    sub using out1
    dfs=sub df2:a,df1:b using merge3
    print dfs[0]
    print dfs[1]
    """,
        dag,
    )
Пример #8
0
 def visitFugueOutputTask(self, ctx: fp.FugueOutputTaskContext):
     """Send the given dataframes (or the last one) to an outputter.

     The outputter is resolved from the ``using`` clause; ``params`` and
     ``partition`` are optional and passed as None when absent.
     """
     parsed = self.get_dict(ctx, "dfs", "using", "params", "partition")
     if "dfs" in parsed:
         frames = parsed["dfs"]
     else:
         # No explicit input: fall back to the last dataframe.
         frames = WorkflowDataFrames(self.last)
     outputter = _to_outputter(
         parsed["using"],
         global_vars=self.global_vars,
         local_vars=self.local_vars,
     )
     self.workflow.output(
         frames,
         using=outputter,
         params=parsed.get("params"),
         pre_partition=parsed.get("partition"),
     )
Пример #9
0
 def space(self, *args: Space, **kwargs: Any) -> "TunableWithSpace":
     """Combine spaces and keyword arguments into a ``TunableWithSpace``.

     :param args: spaces that are multiplied together, left to right, and
       then multiplied with the space built from ``kwargs``
     :param kwargs: values that are ``WorkflowDataFrame`` instances become
       the dataframe collection; all other values become the keyword
       arguments of a new ``Space``
     :returns: ``TunableWithSpace(self, space, dfs)``
     """
     # Split the keyword arguments into space parameters and dataframes.
     space_params = {}
     frames = {}
     for name, value in kwargs.items():
         if isinstance(value, WorkflowDataFrame):
             frames[name] = value
         else:
             space_params[name] = value
     combined = Space(**space_params)
     if args:
         head, *tail = args
         product = head
         for extra in tail:
             product = product * extra
         # The positional spaces go on the left-hand side of the product.
         combined = product * combined
     return TunableWithSpace(self, combined, WorkflowDataFrames(frames))
Пример #10
0
 def visitFugueProcessTask(
         self, ctx: fp.FugueProcessTaskContext) -> WorkflowDataFrame:
     """Apply a processor to the given dataframes (or the last one).

     The ``params`` entry of the parsed statement carries the processor
     under ``using`` plus the optional ``schema`` and ``params`` values.

     :returns: the dataframe returned by the workflow ``process`` call
     """
     parsed = self.get_dict(ctx, "partition", "dfs", "params")
     if "dfs" in parsed:
         frames = parsed["dfs"]
     else:
         # No explicit input: fall back to the last dataframe.
         frames = WorkflowDataFrames(self.last)
     spec = parsed["params"]
     processor = _to_processor(
         spec["using"],
         schema=spec.get("schema"),
         global_vars=self.global_vars,
         local_vars=self.local_vars,
     )
     return self.workflow.process(
         frames,
         using=processor,
         params=spec.get("params"),
         pre_partition=parsed.get("partition"),
     )
Пример #11
0
    def add_dfs(self,
                dfs: WorkflowDataFrames,
                how: str = "") -> "TuneDatasetBuilder":
        """Register several named dataframes that share one join type.

        :param dfs: dictionary like dataframe collection whose keys become
          the dataframe names
        :param how: join type, one of ``semi``, ``left_semi``, ``anti``,
          ``left_anti``, ``inner``, ``left_outer``, ``right_outer``,
          ``full_outer``, ``cross``
        :returns: the builder itself
        """
        assert_or_throw(dfs.has_key, "all datarames must be named")
        for name, frame in dfs.items():
            # ``how`` is only forwarded once something is registered in
            # ``self._dfs_spec``; the check is redone for every dataframe.
            if len(self._dfs_spec) != 0:
                self.add_df(name, frame, how=how)
            else:
                self.add_df(name, frame)
        return self
Пример #12
0
 def visitFugueOutputTransformTask(
         self, ctx: fp.FugueOutputTransformTaskContext) -> None:
     """Run an output transformer on the given dataframes (or the last).

     The transformer comes from the ``using`` clause; ``params``,
     ``partition`` and ``callback`` are optional.
     """
     parsed = self.get_dict(ctx, "partition", "dfs", "using", "params",
                            "callback")
     if "dfs" in parsed:
         frames = parsed["dfs"]
     else:
         # No explicit input: fall back to the last dataframe.
         frames = WorkflowDataFrames(self.last)
     transformer = _to_output_transformer(
         parsed["using"],
         global_vars=self.global_vars,
         local_vars=self.local_vars,
     )
     # The callback is converted to a function only when one was given.
     callback = None
     if "callback" in parsed:
         callback = to_function(parsed["callback"], self.global_vars,
                                self.local_vars)
     # ignore errors is not implemented
     self.workflow.out_transform(
         frames,
         using=transformer,
         params=parsed.get("params"),
         pre_partition=parsed.get("partition"),
         callback=callback,
     )
Пример #13
0
def test_invalid_module():
    """Modules must reject dataframes that come from different workflows."""
    # pylint: disable=no-value-for-parameter

    @module()
    def o1(wf: FugueWorkflow, df1: WorkflowDataFrame,
           df2: WorkflowDataFrame) -> None:
        """Accept two separate dataframes and do nothing."""

    @module()
    def o2(wf: FugueWorkflow, dfs: WorkflowDataFrames) -> None:
        """Accept a dataframe collection and do nothing."""

    # Two dataframes, each owned by its own workflow.
    first_dag = FugueWorkflow()
    from_first = first_dag.df([[0]], "a:int")
    second_dag = FugueWorkflow()
    from_second = second_dag.df([[1]], "a:int")

    # Passed as separate positional dataframes.
    with raises(ValueError):
        o1(from_first, from_second)

    # Passed bundled into one collection.
    with raises(ValueError):
        o2(WorkflowDataFrames(a=from_first, b=from_second))
Пример #14
0
 def visitFugueTransformTask(
         self, ctx: fp.FugueTransformTaskContext) -> WorkflowDataFrame:
     """Apply a transformer to the given dataframes (or the last one).

     The ``params`` entry of the parsed statement carries the transformer
     under ``using`` plus the optional ``schema`` and ``params`` values;
     ``partition`` and ``callback`` are optional.

     :returns: the dataframe returned by the workflow ``transform`` call
     """
     data = self.get_dict(ctx, "partition", "dfs", "params", "callback")
     # No explicit input: fall back to the last dataframe.
     if "dfs" not in data:
         data["dfs"] = WorkflowDataFrames(self.last)
     p = data["params"]
     using = _to_transformer(
         p["using"],
         schema=p.get("schema"),
         global_vars=self.global_vars,
         local_vars=self.local_vars,
     )
     # NOTE(review): this local is never read in this method; presumably
     # it is picked up by frame inspection elsewhere - confirm before
     # renaming or removing it.
     __modified_exception__ = self.to_runtime_error(ctx)  # noqa
     # TODO: ignore errors is not implemented
     return self.workflow.transform(
         data["dfs"],
         using=using,
         params=p.get("params"),
         pre_partition=data.get("partition"),
         # The callback is converted to a function only when one was given.
         callback=to_function(data["callback"], self.global_vars,
                              self.local_vars)
         if "callback" in data else None,
     )
Пример #15
0
 def merge(
     df1: WorkflowDataFrame, df2: WorkflowDataFrame, k: str = "aa"
 ) -> WorkflowDataFrames:
     """Bundle two dataframes into a collection keyed by ``k`` and "bb"."""
     named = {k: df1}
     # Assigned second, so "bb" wins if ``k`` happens to be "bb" as well.
     named["bb"] = df2
     return WorkflowDataFrames(named)
Пример #16
0
 def p2(wf: FugueWorkflow, dfs: WorkflowDataFrames,
        d: int) -> WorkflowDataFrames:
     """Run ``process`` with ``d`` on every dataframe, keeping the keys."""
     processed = {}
     for name, frame in dfs.items():
         processed[name] = frame.process(process, params={"d": d})
     return WorkflowDataFrames(processed)
Пример #17
0
 def input3(wf: FugueWorkflow, a: int, b: int) -> WorkflowDataFrames:
     """Create two one-cell dataframes on ``wf``, named "a" and "b"."""
     frame_a = wf.df([[a]], "a:int")
     frame_b = wf.df([[b]], "b:int")
     return WorkflowDataFrames(a=frame_a, b=frame_b)
Пример #18
0
 def merge3(df1: WorkflowDataFrame, df2: WorkflowDataFrame) -> WorkflowDataFrames:
     """Bundle two dataframes, in order, into an unnamed collection."""
     ordered = (df1, df2)
     return WorkflowDataFrames(*ordered)
Пример #19
0
 def visitFugueDataFramesDict(
         self, ctx: fp.FugueDataFramesDictContext) -> WorkflowDataFrames:
     """Wrap the collected dataframe-pair children of ``ctx`` in a
     ``WorkflowDataFrames``.
     """
     return WorkflowDataFrames(
         self.collectChildren(ctx, fp.FugueDataFramePairContext))
Пример #20
0
def test_builder(tmpdir):
    """Check ``TuneDatasetBuilder`` with no, one and several dataframes.

    Each section builds a dataset inside a workflow and checks the row
    count and schema of the resulting dataframe through ``assert_count``.

    :param tmpdir: temporary directory; its path is given to the first
      builder and to the engine config under ``TUNE_TEMP_PATH``
    """
    space = Space(a=1, b=2, c=Grid(2, 3))
    builder = TuneDatasetBuilder(space, str(tmpdir))

    # Used as an outputter: checks the row count and, optionally, the
    # schema of the dataframe it receives.
    def assert_count(df: DataFrame, n: int, schema=None) -> None:
        assert len(df.as_array()) == n
        if schema is not None:
            assert df.schema == schema

    # test to_space
    with FugueWorkflow() as dag:
        df = builder.build(dag).data
        df.show()

    df1 = ArrayDataFrame([[0, 1], [1, 1], [0, 2]], "a:int,b:int")

    # test single df
    # The same builder is reused, so this adds to its existing state.
    with FugueWorkflow() as dag:
        builder.add_dfs(WorkflowDataFrames(x=dag.df(df1)))
        dataset = builder.build(dag)
        assert ["x"] == dataset.dfs
        assert [] == dataset.keys
        df = dataset.data
        df.show()
        df.output(
            assert_count,
            params=dict(n=2,
                        schema=f"__tune_df__x:str,{TUNE_DATASET_TRIALS}:str"),
        )

    # A fresh space and builder for the multi-dataframe section; this
    # builder gets no path, the engine config carries the temp path.
    space = Space(b=Rand(0, 1), a=1, c=Grid(2, 3), d=Grid("a", "b"))
    df2 = ArrayDataFrame([[0, 1], [1, 1], [3, 2]], "a:int,bb:int")
    df3 = ArrayDataFrame([[10, 1], [11, 1], [10, 2]], "a:int,c:int")
    builder = TuneDatasetBuilder(space)
    engine = NativeExecutionEngine(conf={TUNE_TEMP_PATH: str(tmpdir)})

    # test multiple dfs, batch_size and config
    with FugueWorkflow(engine) as dag:
        # "a" and "b" are both partitioned by column "a".
        dfs = WorkflowDataFrames(a=dag.df(df1).partition_by("a"),
                                 b=dag.df(df2).partition_by("a"))
        # "a" and "b" are added with "inner", then "c" with "cross".
        dataset = (builder.add_dfs(dfs,
                                   "inner").add_df("c", dag.df(df3),
                                                   "cross").build(dag))
        assert ["a"] == dataset.keys
        assert ["a", "b", "c"] == dataset.dfs
        df = dataset.data
        df.show()
        df.output(
            assert_count,
            params=dict(
                n=8,
                schema="a:int,__tune_df__a:str,__tune_df__b:str,"
                f"__tune_df__c:str,{TUNE_DATASET_TRIALS}:str",
            ),
        )

        # Same builder with batch_size=3: same schema, four rows expected.
        df = builder.build(dag, batch_size=3).data
        df.show()
        df.output(
            assert_count,
            params=dict(
                n=4,
                schema="a:int,__tune_df__a:str,__tune_df__b:str,"
                f"__tune_df__c:str,{TUNE_DATASET_TRIALS}:str",
            ),
        )