def test_run_processor():
    df = ArrayDataFrame([[0]], "a:int")
    dfs = DataFrames(df1=df, df2=df)
    dfs2 = DataFrames(df, df)
    assert not dfs2.has_key

    o1 = _to_processor(t3)
    assert 4 == o1(df, df, 2).as_array()[0][0]
    o1._params = ParamDict([("a", 2)], deep=False)
    o1._execution_engine = None
    assert 4 == o1.process(dfs).as_array()[0][0]
    o1._params = ParamDict([("a", 2)], deep=False)
    o1._execution_engine = None
    assert 4 == o1.process(dfs2).as_array()[0][0]

    o1 = _to_processor(t5)
    assert 4 == o1("dummy", dfs, 2)[0][0]
    assert 4 == o1("dummy", dfs2, 2)[0][0]
    o1._params = ParamDict([("a", 2)], deep=False)
    o1._execution_engine = "dummy"
    assert 4 == o1.process(dfs).as_array()[0][0]
    o1._params = ParamDict([("a", 2)], deep=False)
    o1._execution_engine = "dummy"
    assert 4 == o1.process(dfs2).as_array()[0][0]
def _generate_comap_empty_dfs(schemas: Any, named: bool) -> DataFrames:
    if named:
        return DataFrames({k: ArrayDataFrame([], v) for k, v in schemas.items()})
    else:
        return DataFrames([ArrayDataFrame([], v) for v in schemas.values()])
def execute(self, ctx: TaskContext) -> None:
    self._outputter._execution_engine = self._get_execution_engine(ctx)
    if self._input_has_key:
        self._outputter.process(DataFrames(ctx.inputs))
    else:
        self._outputter.process(DataFrames(ctx.inputs.values()))
    # TODO: output dummy to force cache to work, should we fix adagio?
    ctx.outputs["_0"] = ArrayDataFrame([], "_0:int")
def execute(self, ctx: TaskContext) -> None:
    e = self._get_execution_engine(ctx)
    self._processor._execution_engine = e
    if self._input_has_key:
        inputs = DataFrames(ctx.inputs)
    else:
        inputs = DataFrames(ctx.inputs.values())
    self._processor.validate_on_runtime(inputs)
    df = self._processor.process(inputs)
    df = self.set_result(ctx, df)
    ctx.outputs["_0"] = df
def execute(self, ctx: TaskContext) -> None:
    e = self._get_execution_engine(ctx)
    self._processor._execution_engine = e
    if self._input_has_key:
        df = self._processor.process(DataFrames(ctx.inputs))
    else:
        df = self._processor.process(DataFrames(ctx.inputs.values()))
    df = self.handle_persist(df, e)
    df = self.handle_broadcast(df, e)
    self._set_result(ctx, df)
    ctx.outputs["_0"] = df
def execute(self, ctx: TaskContext) -> None:
    self._outputter._execution_engine = self._get_execution_engine(ctx)
    if self._input_has_key:
        inputs = DataFrames(ctx.inputs)
    else:
        inputs = DataFrames(ctx.inputs.values())

    def exe():
        self._outputter.validate_on_runtime(inputs)
        self._outputter.process(inputs)

    self._execute_with_modified_traceback(exe)
    # TODO: output dummy to force cache to work, should we fix adagio?
    ctx.outputs["_0"] = ArrayDataFrame([], "_0:int")
def execute(self, ctx: TaskContext) -> None:
    e = self._get_execution_engine(ctx)
    self._processor._execution_engine = e
    if self._input_has_key:
        inputs = DataFrames(ctx.inputs)
    else:
        inputs = DataFrames(ctx.inputs.values())

    def exe() -> Any:
        self._processor.validate_on_runtime(inputs)
        return self._processor.process(inputs)

    df = self._execute_with_modified_traceback(exe)
    df = self.set_result(ctx, df)
    ctx.outputs["_0"] = df
def select(self, dfs: DataFrames, statement: str) -> DataFrame:
    dask_dfs = {
        k: self.execution_engine.to_df(v).native  # type: ignore
        for k, v in dfs.items()
    }
    df = run_sql_on_dask(statement, dask_dfs)
    return DaskDataFrame(df)
def transform(self, dfs: DataFrames) -> LocalDataFrame:
    cb = _get_callback(self)
    if self._dfs_input:  # function has DataFrames input
        self._wrapper.run(  # type: ignore
            [dfs] + cb,
            self.params,
            ignore_unknown=False,
            output=False,
        )
    elif not dfs.has_key:  # input does not have key
        self._wrapper.run(  # type: ignore
            list(dfs.values()) + cb,
            self.params,
            ignore_unknown=False,
            output=False,
        )
    else:  # input DataFrames has key
        p = dict(dfs)
        p.update(self.params)
        self._wrapper.run(
            [] + cb, p, ignore_unknown=False, output=False  # type: ignore
        )
    return ArrayDataFrame([], OUTPUT_TRANSFORMER_DUMMY_SCHEMA)
def select(self, dfs: DataFrames, statement: str) -> DataFrame:
    _dfs = {
        k: self.execution_engine.to_df(v).as_pandas()  # type: ignore
        for k, v in dfs.items()
    }
    df = run_sql_on_pandas(statement, _dfs)
    return self.execution_engine.to_df(df)
def process(self, dfs: DataFrames) -> None:
    # TODO: how do we make sure multiple dfs are printed together?
    title = self.params.get_or_none("title", object)
    title = str(title) if title is not None else None
    rows = self.params.get("rows", 10)
    show_count = self.params.get("show_count", False)
    df_arr = list(dfs.values())
    heads = [df.head(rows) for df in df_arr]
    counts = [df.count() if show_count else -1 for df in df_arr]
    with Show.LOCK:
        if Show._hook is None:
            if title is not None:
                print(title)
            for df, head, count in zip(df_arr, heads, counts):
                df._show(head_rows=head, rows=rows, count=count, title=None)
        else:
            for df, head, count in zip(df_arr, heads, counts):
                Show._hook(  # pylint: disable=E1102
                    schema=df.schema,
                    head_rows=head,
                    title=title,
                    rows=rows,
                    count=count,
                )
def run(self, cursor: PartitionCursor, df: LocalDataFrame) -> LocalDataFrame:
    data = df.as_array(type_safe=True)
    assert_or_throw(
        len(data) == 1,
        FugueBug("each comap partition can have one and only one row"),
    )
    dfs = DataFrames(list(self._get_dfs(data[0])))
    return self.func(cursor, dfs)
def select(self, dfs: DataFrames, statement: str) -> DataFrame:
    sql_engine = create_engine("sqlite:///:memory:")
    for k, v in dfs.items():
        v.as_pandas().to_sql(k, sql_engine, if_exists="replace", index=False)
    df = pd.read_sql_query(statement, sql_engine)
    return PandasDataFrame(df)
def test_transformer(): assert isinstance(t1, CoTransformer) df1 = ArrayDataFrame([[0, 2]], "a:int,b:int") df2 = ArrayDataFrame([[0, 2]], "a:int,c:int") dfs = DataFrames(df1, df2) t1._output_schema = t1.get_output_schema(dfs) assert t1.output_schema == "a:int,b:int" t2._output_schema = t2.get_output_schema(dfs) assert t2.output_schema == "b:int,a:int" assert [[0, 2, 1]] == list(t3(df1.as_array(), df2.as_pandas()))
def visitFugueOutputTask(self, ctx: fp.FugueOutputTaskContext): data = self.get_dict(ctx, "dfs", "using", "params", "partition") if "dfs" not in data: data["dfs"] = DataFrames(self.last) self.workflow.output( data["dfs"], using=data["using"], params=data.get("params"), pre_partition=data.get("partition"), )
def test_run_outputter():
    df = ArrayDataFrame([[0]], "a:int")
    dfs = DataFrames(df1=df, df2=df)
    dfs2 = DataFrames(df, df)
    assert not dfs2.has_key

    class Ct(object):
        pass

    c = Ct()
    o1 = _to_outputter(t3)
    o1(df, df, 2, c)
    assert 4 == c.value
    c.value = 0
    o1._params = ParamDict([("a", 2), ("b", c)], deep=False)
    o1._execution_engine = None
    o1.process(dfs)
    assert 4 == c.value
    c.value = 0
    o1._params = ParamDict([("a", 2), ("b", c)], deep=False)
    o1._execution_engine = None
    o1.process(dfs2)
    assert 4 == c.value

    c = Ct()
    o1 = _to_outputter(t5)
    o1("dummy", dfs, 2, c)
    assert 4 == c.value
    c.value = 0
    o1("dummy", dfs2, 2, c)
    assert 4 == c.value
    c.value = 0
    o1._params = ParamDict([("a", 2), ("b", c)], deep=False)
    o1._execution_engine = NativeExecutionEngine()
    o1.process(dfs)
    assert 4 == c.value
    c.value = 0
    o1._params = ParamDict([("a", 2), ("b", c)], deep=False)
    o1._execution_engine = NativeExecutionEngine()
    o1.process(dfs2)
    assert 4 == c.value
def select(self, dfs: DataFrames, statement: str) -> DataFrame:
    _dfs = {
        k: self.execution_engine.to_df(v).as_pandas()  # type: ignore
        for k, v in dfs.items()
    }
    df = run_sql_on_pandas(
        statement,
        _dfs,
        ignore_case=self.execution_engine.compile_conf.get(
            FUGUE_CONF_SQL_IGNORE_CASE, False
        ),
    )
    return self.execution_engine.to_df(df)
def visitFuguePrintTask(self, ctx: fp.FuguePrintTaskContext) -> None:
    data = self.get_dict(ctx, "dfs")
    if "dfs" not in data:
        data["dfs"] = DataFrames(self.last)
    params: Dict[str, Any] = {}
    if ctx.rows is not None:
        params["rows"] = int(self.ctxToStr(ctx.rows))
    if ctx.count is not None:
        params["show_count"] = True
    if ctx.title is not None:
        params["title"] = eval(self.ctxToStr(ctx.title))
    self.workflow.show(data["dfs"], **params)
def visitFugueProcessTask(
    self, ctx: fp.FugueProcessTaskContext
) -> WorkflowDataFrame:
    data = self.get_dict(ctx, "partition", "dfs", "params")
    if "dfs" not in data:
        data["dfs"] = DataFrames(self.last)
    p = data["params"]
    return self.workflow.process(
        data["dfs"],
        using=p["using"],
        schema=p.get("schema"),
        params=p.get("params"),
        pre_partition=data.get("partition"),
    )
def visitFugueTransformTask(
    self, ctx: fp.FugueTransformTaskContext
) -> WorkflowDataFrame:
    data = self.get_dict(ctx, "partition", "dfs", "params")
    if "dfs" not in data:
        data["dfs"] = DataFrames(self.last)
    p = data["params"]
    # ignore errors is not implemented
    return self.workflow.transform(
        data["dfs"],
        using=p["using"],
        schema=p.get("schema"),
        params=p.get("params"),
        pre_partition=data.get("partition"),
    )
def process(self, dfs: DataFrames) -> None:
    args: List[Any] = []
    kwargs: Dict[str, Any] = {}
    if self._need_engine:
        args.append(self.execution_engine)
    if self._use_dfs:
        args.append(dfs)
    else:
        if not dfs.has_key:
            args += dfs.values()
        else:
            kwargs.update(dfs)
    kwargs.update(self.params)
    return self._wrapper.run(args=args, kwargs=kwargs)
def visitFugueOutputTask(self, ctx: fp.FugueOutputTaskContext): data = self.get_dict(ctx, "dfs", "using", "params", "partition") if "dfs" not in data: data["dfs"] = DataFrames(self.last) using = _to_outputter( data["using"], global_vars=self.global_vars, local_vars=self.local_vars, ) self.workflow.output( data["dfs"], using=using, params=data.get("params"), pre_partition=data.get("partition"), )
def process(self, dfs: DataFrames) -> DataFrame:
    args: List[Any] = []
    kwargs: Dict[str, Any] = {}
    if self._engine_param is not None:
        args.append(self._engine_param.to_input(self.execution_engine))
    if self._use_dfs:
        args.append(dfs)
    else:
        if not dfs.has_key:
            args += dfs.values()
        else:
            kwargs.update(dfs)
    kwargs.update(self.params)
    return self._wrapper.run(
        args=args,
        kwargs=kwargs,
        output_schema=self.output_schema if self._need_output_schema else None,
        ctx=self.execution_engine,
    )
def transform(self, dfs: DataFrames) -> LocalDataFrame:
    if self._dfs_input:  # function has DataFrames input
        return self._wrapper.run(  # type: ignore
            [dfs],
            self.params,
            ignore_unknown=False,
            output_schema=self.output_schema,
        )
    if not dfs.has_key:  # input does not have key
        return self._wrapper.run(  # type: ignore
            list(dfs.values()),
            self.params,
            ignore_unknown=False,
            output_schema=self.output_schema,
        )
    else:  # input DataFrames has key
        p = dict(dfs)
        p.update(self.params)
        return self._wrapper.run(  # type: ignore
            [], p, ignore_unknown=False, output_schema=self.output_schema
        )
def test_transformer(): assert isinstance(t1, CoTransformer) df1 = ArrayDataFrame([[0, 2]], "a:int,b:int") df2 = ArrayDataFrame([[0, 2]], "a:int,c:int") dfs = DataFrames(df1, df2) assert t1.get_output_schema(dfs) == OUTPUT_TRANSFORMER_DUMMY_SCHEMA
def select(self, dfs: DataFrames, statement: str) -> DataFrame:
    for k, v in dfs.items():
        self.execution_engine.register(v, k)  # type: ignore
    return SparkDataFrame(
        self.execution_engine.spark_session.sql(statement)  # type: ignore
    )
def mock_processor2(e: ExecutionEngine, dfs: DataFrames) -> DataFrame:
    assert "test" in e.conf
    return ArrayDataFrame([[sum(s.count() for s in dfs.values())]], "a:int")
def visitFugueDataFramesDict(
    self, ctx: fp.FugueDataFramesDictContext
) -> DataFrames:
    dfs = self.collectChildren(ctx, fp.FugueDataFramePairContext)
    return DataFrames(dfs)
def zip_all(
    self,
    dfs: DataFrames,
    how: str = "inner",
    partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
    temp_path: Optional[str] = None,
    to_file_threshold: Any = -1,
) -> DataFrame:
    """Zip multiple dataframes together with given partition specifications.

    :param dfs: |DataFramesLikeObject|
    :param how: can accept ``inner``, ``left_outer``, ``right_outer``,
        ``full_outer``, ``cross``, defaults to ``inner``
    :param partition_spec: |PartitionLikeObject|, defaults to empty.
    :param temp_path: file path to store the data (used only if the serialized
        data is larger than ``to_file_threshold``), defaults to None
    :param to_file_threshold: file byte size threshold, defaults to -1
    :return: a zipped dataframe, the metadata of the dataframe will indicate
        it's zipped

    :Notice:

    * Please also read :meth:`~.zip`
    * If ``dfs`` is dict like, the zipped dataframe will be dict like;
      if ``dfs`` is list like, the zipped dataframe will be list like
    * It's fine to contain only one dataframe in ``dfs``

    For more details and examples, read
    :ref:`Zip & Comap <tutorial:/tutorials/execution_engine.ipynb#zip-&-comap>`.
    """
    assert_or_throw(len(dfs) > 0, "can't zip 0 dataframes")
    pairs = list(dfs.items())
    has_name = dfs.has_key
    if len(dfs) == 1:
        return self._serialize_by_partition(
            pairs[0][1],
            partition_spec,
            pairs[0][0],
            temp_path,
            to_file_threshold,
            has_name=has_name,
        )
    df = self.zip(
        pairs[0][1],
        pairs[1][1],
        how=how,
        partition_spec=partition_spec,
        temp_path=temp_path,
        to_file_threshold=to_file_threshold,
        df1_name=pairs[0][0] if has_name else None,
        df2_name=pairs[1][0] if has_name else None,
    )
    for i in range(2, len(dfs)):
        df = self.zip(
            df,
            pairs[i][1],
            how=how,
            partition_spec=partition_spec,
            temp_path=temp_path,
            to_file_threshold=to_file_threshold,
            df2_name=pairs[i][0] if has_name else None,
        )
    return df
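# A minimal usage sketch for zip_all, not part of the library code above.
# It assumes a NativeExecutionEngine (referenced elsewhere in this section)
# and two small ArrayDataFrames; the names `engine`, `a`, `b`, and the
# partition key "a" are illustrative only.
def _zip_all_usage_sketch():
    engine = NativeExecutionEngine()
    a = ArrayDataFrame([[0, 1]], "a:int,b:int")
    b = ArrayDataFrame([[0, 2]], "a:int,c:int")
    # dict-like input (has_key is True) produces a dict-like zipped dataframe;
    # passing a list-like DataFrames instead would produce a list-like result
    zipped = engine.zip_all(
        DataFrames(df1=a, df2=b),
        how="inner",
        partition_spec=PartitionSpec(by=["a"]),
    )
    return zipped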
def select(
    self,
    df: DataFrame,
    cols: SelectColumns,
    where: Optional[ColumnExpr] = None,
    having: Optional[ColumnExpr] = None,
    metadata: Any = None,
) -> DataFrame:
    """The functional interface for SQL select statement

    :param df: the dataframe to be operated on
    :param cols: column expressions
    :param where: ``WHERE`` condition expression, defaults to None
    :param having: ``HAVING`` condition expression, defaults to None. It is
        used when ``cols`` contains aggregation columns
    :param metadata: dict-like object to add to the result dataframe,
        defaults to None. It's currently not used
    :return: the select result as a dataframe

    .. admonition:: New Since
        :class: hint

        **0.6.0**

    .. attention::

        This interface is experimental, it's subject to change in new versions.

    .. seealso::

        Please find more expression examples in :mod:`fugue.column.sql` and
        :mod:`fugue.column.functions`

    .. admonition:: Examples

        .. code-block:: python

            import fugue.column.functions as f

            # select existing and new columns
            engine.select(df, SelectColumns(col("a"), col("b"), lit(1, "another")))
            engine.select(df, SelectColumns(col("a"), (col("b") + lit(1)).alias("x")))

            # aggregation
            # SELECT COUNT(DISTINCT *) AS x FROM df
            engine.select(
                df, SelectColumns(f.count_distinct(col("*")).alias("x")))

            # SELECT a, MAX(b+1) AS x FROM df GROUP BY a
            engine.select(
                df, SelectColumns(col("a"), f.max(col("b") + lit(1)).alias("x")))

            # SELECT a, MAX(b+1) AS x FROM df
            #   WHERE b<2 AND a>1
            #   GROUP BY a
            #   HAVING MAX(b+1)>0
            engine.select(
                df,
                SelectColumns(col("a"), f.max(col("b") + lit(1)).alias("x")),
                where=(col("b") < 2) & (col("a") > 1),
                having=f.max(col("b") + lit(1)) > 0,
            )
    """
    gen = SQLExpressionGenerator(enable_cast=False)
    sql = gen.select(cols, "df", where=where, having=having)
    res = self.sql_engine.select(DataFrames(df=self.to_df(df)), sql)
    diff = gen.correct_select_schema(df.schema, cols, res.schema)
    return res if diff is None else res.alter_columns(diff)