def test_worflow_dataframes():
    """WorkflowDataFrames construction, validation and partition helpers."""
    wf_a = FugueWorkflow()
    left = wf_a.df([[0]], "a:int")
    right = wf_a.df([[0]], "b:int")
    wf_b = FugueWorkflow()
    foreign = wf_b.df([[0]], "a:int")

    named = WorkflowDataFrames(a=left, b=right)
    assert named["a"] is left
    assert named["b"] is right
    combined = WorkflowDataFrames(named, aa=left, bb=right)
    assert len(combined) == 4

    # dataframes from different workflows can't be mixed in one collection
    with raises(ValueError):
        WorkflowDataFrames(a=left, b=foreign)
    # only workflow dataframes are accepted, not raw dataframes
    with raises(ValueError):
        WorkflowDataFrames(a=left, b=ArrayDataFrame([[0]], "a:int"))

    wf = FugueWorkflow()
    base = wf.df([[0, 1], [1, 1]], "a:int,b:int")
    assert base.partition_spec.empty

    # each partition helper returns a new df; the source stays unpartitioned
    parted = base.partition(by=["a"])
    assert base.partition_spec.empty
    assert parted.partition_spec == PartitionSpec(by=["a"])

    parted = base.partition_by("a", "b")
    assert base.partition_spec.empty
    assert parted.partition_spec == PartitionSpec(by=["a", "b"])

    parted = base.per_partition_by("a", "b")
    assert base.partition_spec.empty
    assert parted.partition_spec == PartitionSpec(by=["a", "b"], algo="even")

    parted = base.per_row()
    assert base.partition_spec.empty
    assert parted.partition_spec == PartitionSpec("per_row")
def visitFugueModuleTask(self, ctx: fp.FugueModuleTaskContext) -> None:
    """Execute a ``sub ... using <module>`` statement.

    Resolves the module callable, collects its input dataframes and params,
    invokes it with the calling convention the module declares, and records
    the result in ``self.variables`` / ``self._last`` when it has output.
    """
    data = self.get_dict(ctx, "assign", "dfs", "using", "params")
    sub = _to_module(
        data["using"],
        global_vars=self.global_vars,
        local_vars=self.local_vars,
    )
    # assignment target (e.g. ``x=sub using ...``), if present
    varname = data["assign"][0] if "assign" in data else None
    if varname is not None:
        # assigning the result only makes sense if the module produces output
        assert_or_throw(
            sub.has_single_output or sub.has_multiple_output,
            FugueSQLSyntaxError(
                "invalid assignment for module without output"),
        )
    if sub.has_input:
        # default to the most recent dataframe when no inputs are listed
        dfs = data["dfs"] if "dfs" in data else WorkflowDataFrames(
            self.last)
    else:
        dfs = WorkflowDataFrames()
    p = data["params"] if "params" in data else {}
    # dispatch based on the module's declared input style
    if sub.has_dfs_input:
        # module takes the whole collection as one argument
        result = sub(dfs, **p)
    elif len(dfs) == 0:
        # no dataframe inputs: pass the workflow itself
        result = sub(self.workflow, **p)
    elif len(dfs) == 1 or not dfs.has_key:
        # positional dataframe arguments
        result = sub(*list(dfs.values()), **p)
    else:
        # named dataframes passed as keyword arguments
        result = sub(**dfs, **p)
    if sub.has_single_output or sub.has_multiple_output:
        self.variables[varname] = result
    if sub.has_single_output:
        self._last = result
def serialize_dfs(dfs: WorkflowDataFrames, how: str = "inner", path="") -> WorkflowDataFrame:
    """Serialize every named dataframe in ``dfs`` and join the results.

    :param dfs: named collection of dataframes; every entry must have a key
      (an unnamed collection is rejected)
    :param how: join type used to combine the serialized dataframes
    :param path: base path passed to ``serialize_df`` for temporary storage
    :returns: a single dataframe joining all serialized inputs
    """
    # fix: error message typo ("datarames" -> "dataframes")
    assert_or_throw(dfs.has_key, "all dataframes must be named")
    serialized = WorkflowDataFrames(
        {k: serialize_df(v, k, path) for k, v in dfs.items()})
    # all members belong to the same workflow, so any member's workflow works
    wf: FugueWorkflow = dfs.get_value_by_index(0).workflow
    return wf.join(serialized, how=how)
def test_process_module():
    """Test @module functions used as processors, with every calling style."""
    # pylint: disable=no-value-for-parameter

    # plain pandas processor: shifts column "a" by d
    def process(df: pd.DataFrame, d: int = 1) -> pd.DataFrame:
        df["a"] += d
        return df

    # single-dataframe module (bare decorator form)
    @module
    def p1(wf: FugueSQLWorkflow, df: WorkflowDataFrame) -> WorkflowDataFrame:
        return df.process(process)

    # dfs-collection module (decorator-call form)
    @module()
    def p2(wf: FugueWorkflow, dfs: WorkflowDataFrames, d: int) -> WorkflowDataFrames:
        return WorkflowDataFrames(
            {k: v.process(process, params={"d": d}) for k, v in dfs.items()})

    # module also exposed as a dataframe method named "p4"
    @module(as_method=True, name="p4")
    def p3(df: WorkflowDataFrame) -> WorkflowDataFrame:
        return df.process(process)

    assert p1.has_input
    assert not p1.has_dfs_input
    assert p2.has_dfs_input

    # p1 accepts positional/keyword combinations, with or without the workflow
    with FugueSQLWorkflow() as dag:
        df = dag.df([[0]], "a:int")
        p1(df).assert_eq(dag.df([[1]], "a:int"))
        p1(dag, df).assert_eq(dag.df([[1]], "a:int"))
        p1(df=df).assert_eq(dag.df([[1]], "a:int"))
        p1(df=df, wf=dag).assert_eq(dag.df([[1]], "a:int"))

    # p2 takes a WorkflowDataFrames plus an extra parameter d
    with FugueWorkflow() as dag:
        dfs = WorkflowDataFrames(aa=dag.df([[0]], "a:int"),
                                 bb=dag.df([[10]], "a:int"))
        r = p2(dag, dfs, 1)
        r["aa"].assert_eq(dag.df([[1]], "a:int"))
        r["bb"].assert_eq(dag.df([[11]], "a:int"))
        r = p2(dfs, 1)
        r["aa"].assert_eq(dag.df([[1]], "a:int"))
        r["bb"].assert_eq(dag.df([[11]], "a:int"))
        r = p2(d=1, dfs=dfs, wf=dag)
        r["aa"].assert_eq(dag.df([[1]], "a:int"))
        r["bb"].assert_eq(dag.df([[11]], "a:int"))
        r = p2(d=1, dfs=dfs)
        r["aa"].assert_eq(dag.df([[1]], "a:int"))
        r["bb"].assert_eq(dag.df([[11]], "a:int"))

    # p3 works both as a function and as the injected method df.p4()
    with FugueWorkflow() as dag:
        df = dag.df([[0]], "a:int")
        p3(df).assert_eq(dag.df([[1]], "a:int"))
        p3(df=df).assert_eq(dag.df([[1]], "a:int"))
        df.p4().assert_eq(dag.df([[1]], "a:int"))
def add_dfs(self, dfs: WorkflowDataFrames, how: str = "") -> "TuneDatasetBuilder":
    """Add multiple named dataframes sharing the same join type.

    :param dfs: dictionary-like dataframe collection; keys become the
      dataframe names
    :param how: join type applied to every dataframe after the first
    :returns: the builder itself (fluent interface)
    """
    # fix: error message typo ("datarames" -> "dataframes")
    assert_or_throw(dfs.has_key, "all dataframes must be named")
    for k, v in dfs.items():
        # the first added dataframe needs no join type
        if len(self._dfs_spec) == 0:
            self.add_df(k, v)
        else:
            self.add_df(k, v, how=how)
    return self
def visitFuguePrintTask(self, ctx: fp.FuguePrintTaskContext) -> None:
    """Translate a PRINT statement into a ``show()`` call on the workflow."""
    data = self.get_dict(ctx, "dfs")
    # default to the most recent dataframe when none is specified
    dfs = data["dfs"] if "dfs" in data else WorkflowDataFrames(self.last)
    kwargs: Dict[str, Any] = {}
    if ctx.rows is not None:
        kwargs["rows"] = int(self.ctxToStr(ctx.rows))
    if ctx.count is not None:
        kwargs["show_count"] = True
    if ctx.title is not None:
        # NOTE(review): eval is used to unquote the title token; it will
        # execute arbitrary expressions from the SQL text — consider
        # ast.literal_eval if only string literals are expected (confirm)
        kwargs["title"] = eval(self.ctxToStr(ctx.title))
    self.workflow.show(dfs, **kwargs)
def test_module():
    """Modules built from plain functions must match their fugue-sql form."""
    # pylint: disable=no-value-for-parameter

    # NOTE: helper names, parameter names and local variable names below
    # (create, merge, merge2, merge3, out1, a, b, df1, df2) are referenced
    # by name from the SQL text passed to assert_eq — keep them in sync.
    def create(wf: FugueWorkflow, n: int = 1) -> WorkflowDataFrame:
        return wf.df([[n]], "a:int")

    def merge(df1: WorkflowDataFrame,
              df2: WorkflowDataFrame,
              k: str = "aa") -> WorkflowDataFrames:
        return WorkflowDataFrames({k: df1, "bb": df2})

    def merge2(wf: FugueWorkflow,
               dfs: WorkflowDataFrames,
               k: int = 0) -> WorkflowDataFrame:
        return dfs[k]

    def merge3(df1: WorkflowDataFrame,
               df2: WorkflowDataFrame) -> WorkflowDataFrames:
        return WorkflowDataFrames(df1, df2)

    @module()
    def out1(wf: FugueWorkflow, df: WorkflowDataFrame) -> None:
        df.show()

    # build the expected workflow programmatically
    dag = FugueWorkflow()
    a = create(dag)
    b = create(dag, n=2)
    dfs = merge(a, b, k="a1")
    dfs["a1"].show()
    dfs["bb"].show()
    df = merge2(dag, WorkflowDataFrames(a, b), k=1)
    out1(df)
    dfs = merge3(b, a)
    dfs[0].show()
    dfs[1].show()
    # the SQL version must produce an identical workflow
    assert_eq(
        """
        a=sub using create
        b=sub using create(n=2)
        dfs=sub a,b using merge(k="a1")
        print dfs[a1]
        print dfs[bb]
        sub a,b using merge2(k=1)
        sub using out1
        dfs=sub df2:a,df1:b using merge3
        print dfs[0]
        print dfs[1]
        """,
        dag,
    )
def visitFugueOutputTask(self, ctx: fp.FugueOutputTaskContext):
    """Translate an OUTPUT statement into a ``workflow.output()`` call."""
    data = self.get_dict(ctx, "dfs", "using", "params", "partition")
    if "dfs" not in data:
        # no explicit inputs: operate on the most recent dataframe
        data["dfs"] = WorkflowDataFrames(self.last)
    outputter = _to_outputter(
        data["using"],
        global_vars=self.global_vars,
        local_vars=self.local_vars,
    )
    self.workflow.output(
        data["dfs"],
        using=outputter,
        params=data.get("params"),
        pre_partition=data.get("partition"),
    )
def space(self, *args: Space, **kwargs: Any) -> "TunableWithSpace":
    """Attach a search space (and optional input dataframes) to this tunable.

    Keyword arguments whose values are ``WorkflowDataFrame`` become input
    dataframes; all the others become search-space parameters. Positional
    ``Space`` args are multiplied together and then with the keyword space.
    """
    # split kwargs in a single pass
    df_args = {}
    space_args = {}
    for key, value in kwargs.items():
        if isinstance(value, WorkflowDataFrame):
            df_args[key] = value
        else:
            space_args[key] = value
    combined = Space(**space_args)
    if args:
        product = args[0]
        for extra in args[1:]:
            product = product * extra
        combined = product * combined
    return TunableWithSpace(self, combined, WorkflowDataFrames(df_args))
def visitFugueProcessTask(
        self, ctx: fp.FugueProcessTaskContext) -> WorkflowDataFrame:
    """Translate a PROCESS statement into a ``workflow.process()`` call."""
    data = self.get_dict(ctx, "partition", "dfs", "params")
    if "dfs" not in data:
        # no explicit inputs: operate on the most recent dataframe
        data["dfs"] = WorkflowDataFrames(self.last)
    task = data["params"]
    processor = _to_processor(
        task["using"],
        schema=task.get("schema"),
        global_vars=self.global_vars,
        local_vars=self.local_vars,
    )
    return self.workflow.process(
        data["dfs"],
        using=processor,
        params=task.get("params"),
        pre_partition=data.get("partition"),
    )
def add_dfs(self, dfs: WorkflowDataFrames, how: str = "") -> "TuneDatasetBuilder":
    """Add multiple dataframes with the same join type

    :param dfs: dictionary like dataframe collection. The keys will be used
      as the dataframe names
    :param how: join type, can accept ``semi``, ``left_semi``, ``anti``,
      ``left_anti``, ``inner``, ``left_outer``, ``right_outer``,
      ``full_outer``, ``cross``
    :returns: the builder itself
    """
    # fix: error message typo ("datarames" -> "dataframes")
    assert_or_throw(dfs.has_key, "all dataframes must be named")
    for k, v in dfs.items():
        # the first added dataframe needs no join type
        if len(self._dfs_spec) == 0:
            self.add_df(k, v)
        else:
            self.add_df(k, v, how=how)
    return self
def visitFugueOutputTransformTask(
        self, ctx: fp.FugueOutputTransformTaskContext) -> None:
    """Translate an OUTTRANSFORM statement into ``workflow.out_transform()``."""
    data = self.get_dict(ctx, "partition", "dfs", "using", "params", "callback")
    if "dfs" not in data:
        # no explicit inputs: operate on the most recent dataframe
        data["dfs"] = WorkflowDataFrames(self.last)
    transformer = _to_output_transformer(
        data["using"],
        global_vars=self.global_vars,
        local_vars=self.local_vars,
    )
    # resolve the optional callback up front for readability
    callback = (
        to_function(data["callback"], self.global_vars, self.local_vars)
        if "callback" in data
        else None
    )
    # ignore errors is not implemented
    self.workflow.out_transform(
        data["dfs"],
        using=transformer,
        params=data.get("params"),
        pre_partition=data.get("partition"),
        callback=callback,
    )
def test_invalid_module():
    """Modules must reject dataframes coming from different workflows."""
    # pylint: disable=no-value-for-parameter

    @module()
    def o1(wf: FugueWorkflow, df1: WorkflowDataFrame, df2: WorkflowDataFrame) -> None:
        pass

    @module()
    def o2(wf: FugueWorkflow, dfs: WorkflowDataFrames) -> None:
        pass

    wf_first = FugueWorkflow()
    left = wf_first.df([[0]], "a:int")
    wf_second = FugueWorkflow()
    right = wf_second.df([[1]], "a:int")

    # mixing dataframes from two workflows fails for both input styles
    with raises(ValueError):
        o1(left, right)
    with raises(ValueError):
        o2(WorkflowDataFrames(a=left, b=right))
def visitFugueTransformTask(
        self, ctx: fp.FugueTransformTaskContext) -> WorkflowDataFrame:
    """Translate a TRANSFORM statement into a ``workflow.transform()`` call.

    :param ctx: the ANTLR parse-tree node for the TRANSFORM statement
    :returns: the resulting workflow dataframe
    """
    data = self.get_dict(ctx, "partition", "dfs", "params", "callback")
    if "dfs" not in data:
        # no explicit inputs: operate on the most recent dataframe
        data["dfs"] = WorkflowDataFrames(self.last)
    p = data["params"]
    using = _to_transformer(
        p["using"],
        schema=p.get("schema"),
        global_vars=self.global_vars,
        local_vars=self.local_vars,
    )
    # presumably consumed by error-reporting machinery via frame inspection
    # to attach the SQL location to runtime errors — do not remove/rename
    __modified_exception__ = self.to_runtime_error(ctx)  # noqa
    # TODO: ignore errors is not implemented
    return self.workflow.transform(
        data["dfs"],
        using=using,
        params=p.get("params"),
        pre_partition=data.get("partition"),
        callback=to_function(data["callback"], self.global_vars, self.local_vars)
        if "callback" in data else None,
    )
def merge(
    df1: WorkflowDataFrame, df2: WorkflowDataFrame, k: str = "aa"
) -> WorkflowDataFrames:
    """Pack the two dataframes into a named collection.

    ``df1`` is stored under ``k`` and ``df2`` under the fixed name ``bb``.
    """
    named = {k: df1, "bb": df2}
    return WorkflowDataFrames(named)
def p2(wf: FugueWorkflow, dfs: WorkflowDataFrames, d: int) -> WorkflowDataFrames:
    """Apply ``process`` with parameter ``d`` to every dataframe, keeping names."""
    processed = {}
    for name, df in dfs.items():
        processed[name] = df.process(process, params={"d": d})
    return WorkflowDataFrames(processed)
def input3(wf: FugueWorkflow, a: int, b: int) -> WorkflowDataFrames:
    """Build two single-cell dataframes, named ``a`` and ``b``."""
    df_a = wf.df([[a]], "a:int")
    df_b = wf.df([[b]], "b:int")
    return WorkflowDataFrames(a=df_a, b=df_b)
def merge3(df1: WorkflowDataFrame, df2: WorkflowDataFrame) -> WorkflowDataFrames:
    """Combine the two dataframes positionally into an unnamed collection."""
    combined = WorkflowDataFrames(df1, df2)
    return combined
def visitFugueDataFramesDict(
        self, ctx: fp.FugueDataFramesDictContext) -> WorkflowDataFrames:
    """Collect the name:dataframe pairs under this node into a collection."""
    pairs = self.collectChildren(ctx, fp.FugueDataFramePairContext)
    return WorkflowDataFrames(pairs)
def test_builder(tmpdir):
    """TuneDatasetBuilder: space-only, single-df, multi-df and batched builds."""
    space = Space(a=1, b=2, c=Grid(2, 3))
    builder = TuneDatasetBuilder(space, str(tmpdir))

    # helper outputter: checks row count and (optionally) the exact schema
    def assert_count(df: DataFrame, n: int, schema=None) -> None:
        assert len(df.as_array()) == n
        if schema is not None:
            assert df.schema == schema

    # test to_space
    with FugueWorkflow() as dag:
        df = builder.build(dag).data
        df.show()

    df1 = ArrayDataFrame([[0, 1], [1, 1], [0, 2]], "a:int,b:int")

    # test single df
    with FugueWorkflow() as dag:
        builder.add_dfs(WorkflowDataFrames(x=dag.df(df1)))
        dataset = builder.build(dag)
        assert ["x"] == dataset.dfs
        assert [] == dataset.keys
        df = dataset.data
        df.show()
        df.output(
            assert_count,
            params=dict(n=2, schema=f"__tune_df__x:str,{TUNE_DATASET_TRIALS}:str"),
        )

    space = Space(b=Rand(0, 1), a=1, c=Grid(2, 3), d=Grid("a", "b"))
    df2 = ArrayDataFrame([[0, 1], [1, 1], [3, 2]], "a:int,bb:int")
    df3 = ArrayDataFrame([[10, 1], [11, 1], [10, 2]], "a:int,c:int")
    builder = TuneDatasetBuilder(space)
    engine = NativeExecutionEngine(conf={TUNE_TEMP_PATH: str(tmpdir)})

    # test multiple dfs, batch_size and config
    with FugueWorkflow(engine) as dag:
        # a and b are partitioned by "a"; c is cross-joined
        dfs = WorkflowDataFrames(a=dag.df(df1).partition_by("a"),
                                 b=dag.df(df2).partition_by("a"))
        dataset = (builder.add_dfs(dfs, "inner")
                   .add_df("c", dag.df(df3), "cross").build(dag))
        assert ["a"] == dataset.keys
        assert ["a", "b", "c"] == dataset.dfs
        df = dataset.data
        df.show()
        df.output(
            assert_count,
            params=dict(
                n=8,
                schema="a:int,__tune_df__a:str,__tune_df__b:str,"
                f"__tune_df__c:str,{TUNE_DATASET_TRIALS}:str",
            ),
        )

        # same builder, rebuilt with batch_size=3: fewer, larger rows
        df = builder.build(dag, batch_size=3).data
        df.show()
        df.output(
            assert_count,
            params=dict(
                n=4,
                schema="a:int,__tune_df__a:str,__tune_df__b:str,"
                f"__tune_df__c:str,{TUNE_DATASET_TRIALS}:str",
            ),
        )