Example #1
def test_run_processor():
    df = ArrayDataFrame([[0]], "a:int")
    dfs = DataFrames(df1=df, df2=df)
    dfs2 = DataFrames(df, df)
    assert not dfs2.has_key

    o1 = _to_processor(t3)
    assert 4 == o1(df, df, 2).as_array()[0][0]

    o1._params = ParamDict([("a", 2)], deep=False)
    o1._execution_engine = None
    assert 4 == o1.process(dfs).as_array()[0][0]
    o1._params = ParamDict([("a", 2)], deep=False)
    o1._execution_engine = None
    assert 4 == o1.process(dfs2).as_array()[0][0]

    o1 = _to_processor(t5)
    assert 4 == o1("dummy", dfs, 2)[0][0]
    assert 4 == o1("dummy", dfs2, 2)[0][0]
    o1._params = ParamDict([("a", 2)], deep=False)
    o1._execution_engine = "dummy"
    assert 4 == o1.process(dfs).as_array()[0][0]
    o1._params = ParamDict([("a", 2)], deep=False)
    o1._execution_engine = "dummy"
    assert 4 == o1.process(dfs2).as_array()[0][0]
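
For reference, here is a minimal standalone sketch (not from the fugue source) of the named vs. positional DataFrames construction that these tests exercise; it assumes fugue's top-level exports:

from fugue import ArrayDataFrame, DataFrames

df = ArrayDataFrame([[0]], "a:int")

named = DataFrames(df1=df, df2=df)   # dict-like: user-defined keys are kept
unnamed = DataFrames(df, df)         # list-like: no user-defined keys

assert named.has_key
assert not unnamed.has_key
assert list(named.keys()) == ["df1", "df2"]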
Example #2
def _generate_comap_empty_dfs(schemas: Any, named: bool) -> DataFrames:
    if named:
        return DataFrames(
            {k: ArrayDataFrame([], v)
             for k, v in schemas.items()})
    else:
        return DataFrames([ArrayDataFrame([], v) for v in schemas.values()])
Example #3
File: _tasks.py  Project: zywillc/fugue
 def execute(self, ctx: TaskContext) -> None:
     self._outputter._execution_engine = self._get_execution_engine(ctx)
     if self._input_has_key:
         self._outputter.process(DataFrames(ctx.inputs))
     else:
         self._outputter.process(DataFrames(ctx.inputs.values()))
     # TODO: output dummy to force cache to work, should we fix adagio?
     ctx.outputs["_0"] = ArrayDataFrame([], "_0:int")
Example #4
 def execute(self, ctx: TaskContext) -> None:
     e = self._get_execution_engine(ctx)
     self._processor._execution_engine = e
     if self._input_has_key:
         inputs = DataFrames(ctx.inputs)
     else:
         inputs = DataFrames(ctx.inputs.values())
     self._processor.validate_on_runtime(inputs)
     df = self._processor.process(inputs)
     df = self.set_result(ctx, df)
     ctx.outputs["_0"] = df
Example #5
File: _tasks.py  Project: zywillc/fugue
 def execute(self, ctx: TaskContext) -> None:
     e = self._get_execution_engine(ctx)
     self._processor._execution_engine = e
     if self._input_has_key:
         df = self._processor.process(DataFrames(ctx.inputs))
     else:
         df = self._processor.process(DataFrames(ctx.inputs.values()))
     df = self.handle_persist(df, e)
     df = self.handle_broadcast(df, e)
     self._set_result(ctx, df)
     ctx.outputs["_0"] = df
Example #6
File: _tasks.py  Project: gityow/fugue
    def execute(self, ctx: TaskContext) -> None:
        self._outputter._execution_engine = self._get_execution_engine(ctx)
        if self._input_has_key:
            inputs = DataFrames(ctx.inputs)
        else:
            inputs = DataFrames(ctx.inputs.values())

        def exe():
            self._outputter.validate_on_runtime(inputs)
            self._outputter.process(inputs)

        self._execute_with_modified_traceback(exe)
        # TODO: output dummy to force cache to work, should we fix adagio?
        ctx.outputs["_0"] = ArrayDataFrame([], "_0:int")
Example #7
File: _tasks.py  Project: gityow/fugue
    def execute(self, ctx: TaskContext) -> None:
        e = self._get_execution_engine(ctx)
        self._processor._execution_engine = e
        if self._input_has_key:
            inputs = DataFrames(ctx.inputs)
        else:
            inputs = DataFrames(ctx.inputs.values())

        def exe() -> Any:
            self._processor.validate_on_runtime(inputs)
            return self._processor.process(inputs)

        df = self._execute_with_modified_traceback(exe)
        df = self.set_result(ctx, df)
        ctx.outputs["_0"] = df
Example #8
 def select(self, dfs: DataFrames, statement: str) -> DataFrame:
     dask_dfs = {
         k: self.execution_engine.to_df(v).native  # type: ignore
         for k, v in dfs.items()
     }
     df = run_sql_on_dask(statement, dask_dfs)
     return DaskDataFrame(df)
Example #9
 def transform(self, dfs: DataFrames) -> LocalDataFrame:
     cb = _get_callback(self)
     if self._dfs_input:  # function has DataFrames input
         self._wrapper.run(  # type: ignore
             [dfs] + cb,
             self.params,
             ignore_unknown=False,
             output=False,
         )
     elif not dfs.has_key:  # input does not have key
         self._wrapper.run(  # type: ignore
             list(dfs.values()) + cb,
             self.params,
             ignore_unknown=False,
             output=False,
         )
     else:  # input DataFrames has key
         p = dict(dfs)
         p.update(self.params)
         self._wrapper.run(
             [] + cb,
             p,
             ignore_unknown=False,
             output=False  # type: ignore
         )
     return ArrayDataFrame([], OUTPUT_TRANSFORMER_DUMMY_SCHEMA)
Example #10
 def select(self, dfs: DataFrames, statement: str) -> DataFrame:
     _dfs = {
         k: self.execution_engine.to_df(v).as_pandas()  # type: ignore
         for k, v in dfs.items()
     }
     df = run_sql_on_pandas(statement, _dfs)
     return self.execution_engine.to_df(df)
Example #11
 def process(self, dfs: DataFrames) -> None:
     # TODO: how do we make sure multiple dfs are printed together?
     title = self.params.get_or_none("title", object)
     title = str(title) if title is not None else None
     rows = self.params.get("rows", 10)
     show_count = self.params.get("show_count", False)
     df_arr = list(dfs.values())
     heads = [df.head(rows) for df in df_arr]
     counts = [df.count() if show_count else -1 for df in df_arr]
     with Show.LOCK:
         if Show._hook is None:
             if title is not None:
                 print(title)
             for df, head, count in zip(df_arr, heads, counts):
                 df._show(head_rows=head,
                          rows=rows,
                          count=count,
                          title=None)
         else:
             for df, head, count in zip(df_arr, heads, counts):
                 Show._hook(  # pylint: disable=E1102
                     schema=df.schema,
                     head_rows=head,
                     title=title,
                     rows=rows,
                     count=count,
                 )
Example #12
 def run(self, cursor: PartitionCursor, df: LocalDataFrame) -> LocalDataFrame:
     data = df.as_array(type_safe=True)
     assert_or_throw(
         len(data) == 1,
         FugueBug("each comap partition can have one and only one row"),
     )
     dfs = DataFrames(list(self._get_dfs(data[0])))
     return self.func(cursor, dfs)
Example #13
 def select(self, dfs: DataFrames, statement: str) -> DataFrame:
     sql_engine = create_engine("sqlite:///:memory:")
     for k, v in dfs.items():
         v.as_pandas().to_sql(k,
                              sql_engine,
                              if_exists="replace",
                              index=False)
     df = pd.read_sql_query(statement, sql_engine)
     return PandasDataFrame(df)
Example #14
def test_transformer():
    assert isinstance(t1, CoTransformer)
    df1 = ArrayDataFrame([[0, 2]], "a:int,b:int")
    df2 = ArrayDataFrame([[0, 2]], "a:int,c:int")
    dfs = DataFrames(df1, df2)
    t1._output_schema = t1.get_output_schema(dfs)
    assert t1.output_schema == "a:int,b:int"
    t2._output_schema = t2.get_output_schema(dfs)
    assert t2.output_schema == "b:int,a:int"
    assert [[0, 2, 1]] == list(t3(df1.as_array(), df2.as_pandas()))
Example #15
 def visitFugueOutputTask(self, ctx: fp.FugueOutputTaskContext):
     data = self.get_dict(ctx, "dfs", "using", "params", "partition")
     if "dfs" not in data:
         data["dfs"] = DataFrames(self.last)
     self.workflow.output(
         data["dfs"],
         using=data["using"],
         params=data.get("params"),
         pre_partition=data.get("partition"),
     )
Example #16
def test_run_outputter():
    df = ArrayDataFrame([[0]], "a:int")
    dfs = DataFrames(df1=df, df2=df)
    dfs2 = DataFrames(df, df)
    assert not dfs2.has_key

    class Ct(object):
        pass

    c = Ct()
    o1 = _to_outputter(t3)
    o1(df, df, 2, c)
    assert 4 == c.value
    c.value = 0
    o1._params = ParamDict([("a", 2), ("b", c)], deep=False)
    o1._execution_engine = None
    o1.process(dfs)
    assert 4 == c.value
    c.value = 0
    o1._params = ParamDict([("a", 2), ("b", c)], deep=False)
    o1._execution_engine = None
    o1.process(dfs2)
    assert 4 == c.value

    c = Ct()
    o1 = _to_outputter(t5)
    o1("dummy", dfs, 2, c)
    assert 4 == c.value
    c.value = 0
    o1("dummy", dfs2, 2, c)
    assert 4 == c.value
    c.value = 0
    o1._params = ParamDict([("a", 2), ("b", c)], deep=False)
    o1._execution_engine = NativeExecutionEngine()
    o1.process(dfs)
    assert 4 == c.value
    c.value = 0
    o1._params = ParamDict([("a", 2), ("b", c)], deep=False)
    o1._execution_engine = NativeExecutionEngine()
    o1.process(dfs2)
    assert 4 == c.value
Example #17
 def select(self, dfs: DataFrames, statement: str) -> DataFrame:
     _dfs = {
         k: self.execution_engine.to_df(v).as_pandas()  # type: ignore
         for k, v in dfs.items()
     }
     df = run_sql_on_pandas(
         statement,
         _dfs,
         ignore_case=self.execution_engine.compile_conf.get(
             FUGUE_CONF_SQL_IGNORE_CASE, False),
     )
     return self.execution_engine.to_df(df)
Example #18
 def visitFuguePrintTask(self, ctx: fp.FuguePrintTaskContext) -> None:
     data = self.get_dict(ctx, "dfs")
     if "dfs" not in data:
         data["dfs"] = DataFrames(self.last)
     params: Dict[str, Any] = {}
     if ctx.rows is not None:
         params["rows"] = int(self.ctxToStr(ctx.rows))
     if ctx.count is not None:
         params["show_count"] = True
     if ctx.title is not None:
         params["title"] = eval(self.ctxToStr(ctx.title))
     self.workflow.show(data["dfs"], **params)
Example #19
File: _visitors.py  Project: zywillc/fugue
 def visitFugueProcessTask(
         self, ctx: fp.FugueProcessTaskContext) -> WorkflowDataFrame:
     data = self.get_dict(ctx, "partition", "dfs", "params")
     if "dfs" not in data:
         data["dfs"] = DataFrames(self.last)
     p = data["params"]
     return self.workflow.process(
         data["dfs"],
         using=p["using"],
         schema=p.get("schema"),
         params=p.get("params"),
         pre_partition=data.get("partition"),
     )
Example #20
File: _visitors.py  Project: zywillc/fugue
 def visitFugueTransformTask(
         self, ctx: fp.FugueTransformTaskContext) -> WorkflowDataFrame:
     data = self.get_dict(ctx, "partition", "dfs", "params")
     if "dfs" not in data:
         data["dfs"] = DataFrames(self.last)
     p = data["params"]
     # ignore errors is not implemented
     return self.workflow.transform(
         data["dfs"],
         using=p["using"],
         schema=p.get("schema"),
         params=p.get("params"),
         pre_partition=data.get("partition"),
     )
Example #21
 def process(self, dfs: DataFrames) -> None:
     args: List[Any] = []
     kwargs: Dict[str, Any] = {}
     if self._need_engine:
         args.append(self.execution_engine)
     if self._use_dfs:
         args.append(dfs)
     else:
         if not dfs.has_key:
             args += dfs.values()
         else:
             kwargs.update(dfs)
     kwargs.update(self.params)
     return self._wrapper.run(args=args, kwargs=kwargs)
Example #22
File: _visitors.py  Project: WangCHX/fugue
 def visitFugueOutputTask(self, ctx: fp.FugueOutputTaskContext):
     data = self.get_dict(ctx, "dfs", "using", "params", "partition")
     if "dfs" not in data:
         data["dfs"] = DataFrames(self.last)
     using = _to_outputter(
         data["using"],
         global_vars=self.global_vars,
         local_vars=self.local_vars,
     )
     self.workflow.output(
         data["dfs"],
         using=using,
         params=data.get("params"),
         pre_partition=data.get("partition"),
     )
Example #23
 def process(self, dfs: DataFrames) -> DataFrame:
     args: List[Any] = []
     kwargs: Dict[str, Any] = {}
     if self._engine_param is not None:
         args.append(self._engine_param.to_input(self.execution_engine))
     if self._use_dfs:
         args.append(dfs)
     else:
         if not dfs.has_key:
             args += dfs.values()
         else:
             kwargs.update(dfs)
     kwargs.update(self.params)
     return self._wrapper.run(
         args=args,
         kwargs=kwargs,
         output_schema=self.output_schema
         if self._need_output_schema else None,
         ctx=self.execution_engine,
     )
Example #24
 def transform(self, dfs: DataFrames) -> LocalDataFrame:
     if self._dfs_input:  # function has DataFrames input
         return self._wrapper.run(  # type: ignore
             [dfs],
             self.params,
             ignore_unknown=False,
             output_schema=self.output_schema,
         )
     if not dfs.has_key:  # input does not have key
         return self._wrapper.run(  # type: ignore
             list(dfs.values()),
             self.params,
             ignore_unknown=False,
             output_schema=self.output_schema,
         )
     else:  # input DataFrames has key
         p = dict(dfs)
         p.update(self.params)
         return self._wrapper.run(  # type: ignore
             [],
             p,
             ignore_unknown=False,
             output_schema=self.output_schema)
Example #25
def test_transformer():
    assert isinstance(t1, CoTransformer)
    df1 = ArrayDataFrame([[0, 2]], "a:int,b:int")
    df2 = ArrayDataFrame([[0, 2]], "a:int,c:int")
    dfs = DataFrames(df1, df2)
    assert t1.get_output_schema(dfs) == OUTPUT_TRANSFORMER_DUMMY_SCHEMA
Example #26
 def select(self, dfs: DataFrames, statement: str) -> DataFrame:
     for k, v in dfs.items():
         self.execution_engine.register(v, k)  # type: ignore
     return SparkDataFrame(
         self.execution_engine.spark_session.sql(statement)  # type: ignore
     )
Example #27
def mock_processor2(e: ExecutionEngine, dfs: DataFrames) -> DataFrame:
    assert "test" in e.conf
    return ArrayDataFrame([[sum(s.count() for s in dfs.values())]], "a:int")
Example #28
 def visitFugueDataFramesDict(
     self, ctx: fp.FugueDataFramesDictContext
 ) -> DataFrames:
     dfs = self.collectChildren(ctx, fp.FugueDataFramePairContext)
     return DataFrames(dfs)
Example #29
    def zip_all(
        self,
        dfs: DataFrames,
        how: str = "inner",
        partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
        temp_path: Optional[str] = None,
        to_file_threshold: Any = -1,
    ) -> DataFrame:
        """Zip multiple dataframes together with given partition
        specifications.

        :param dfs: |DataFramesLikeObject|
        :param how: can accept ``inner``, ``left_outer``, ``right_outer``,
          ``full_outer``, ``cross``, defaults to ``inner``
        :param partition_spec: |PartitionLikeObject|, defaults to empty.
        :param temp_path: file path to store the data (used only if the serialized data
          is larger than ``to_file_threshold``), defaults to None
        :param to_file_threshold: file byte size threshold, defaults to -1

        :return: a zipped dataframe, the metadata of the
          dataframe will indicate that it's zipped

        :Notice:

        * Please also read :meth:`~.zip`
        * If ``dfs`` is dict like, the zipped dataframe will be dict like;
          if ``dfs`` is list like, the zipped dataframe will be list like
        * It's fine to contain only one dataframe in ``dfs``

        For more details and examples, read
        :ref:`Zip & Comap <tutorial:/tutorials/execution_engine.ipynb#zip-&-comap>`.
        """
        assert_or_throw(len(dfs) > 0, "can't zip 0 dataframes")
        pairs = list(dfs.items())
        has_name = dfs.has_key
        if len(dfs) == 1:
            return self._serialize_by_partition(
                pairs[0][1],
                partition_spec,
                pairs[0][0],
                temp_path,
                to_file_threshold,
                has_name=has_name,
            )
        df = self.zip(
            pairs[0][1],
            pairs[1][1],
            how=how,
            partition_spec=partition_spec,
            temp_path=temp_path,
            to_file_threshold=to_file_threshold,
            df1_name=pairs[0][0] if has_name else None,
            df2_name=pairs[1][0] if has_name else None,
        )
        for i in range(2, len(dfs)):
            df = self.zip(
                df,
                pairs[i][1],
                how=how,
                partition_spec=partition_spec,
                temp_path=temp_path,
                to_file_threshold=to_file_threshold,
                df2_name=pairs[i][0] if has_name else None,
            )
        return df
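
A minimal usage sketch for zip_all, assuming fugue's NativeExecutionEngine and the DataFrames/PartitionSpec classes used above (illustrative only, not taken from the fugue test suite):

from fugue import ArrayDataFrame, DataFrames, NativeExecutionEngine
from fugue.collections.partition import PartitionSpec

engine = NativeExecutionEngine()
df1 = ArrayDataFrame([[0, 1]], "a:int,b:int")
df2 = ArrayDataFrame([[0, 2]], "a:int,c:int")

# dict-like input -> the zipped dataframe is dict-like as well
zipped = engine.zip_all(
    DataFrames(x=df1, y=df2),
    how="inner",
    partition_spec=PartitionSpec(by=["a"]),
)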
Example #30
    def select(
        self,
        df: DataFrame,
        cols: SelectColumns,
        where: Optional[ColumnExpr] = None,
        having: Optional[ColumnExpr] = None,
        metadata: Any = None,
    ) -> DataFrame:
        """The functional interface for SQL select statement

        :param df: the dataframe to be operated on
        :param cols: column expressions
        :param where: ``WHERE`` condition expression, defaults to None
        :param having: ``HAVING`` condition expression, used when ``cols``
          contains aggregation columns, defaults to None
        :param metadata: dict-like object to add to the result dataframe,
            defaults to None. It's currently not used
        :return: the select result as a dataframe

        .. admonition:: New Since
            :class: hint

            **0.6.0**

        .. attention::

            This interface is experimental, it's subject to change in new versions.

        .. seealso::

            Please find more expression examples in :mod:`fugue.column.sql` and
            :mod:`fugue.column.functions`

        .. admonition:: Examples

            .. code-block:: python

                import fugue.column.functions as f

                # select existing and new columns
                engine.select(df, SelectColumns(col("a"),col("b"),lit(1,"another")))
                engine.select(df, SelectColumns(col("a"),(col("b")+lit(1)).alias("x")))

                # aggregation
                # SELECT COUNT(DISTINCT *) AS x FROM df
                engine.select(
                    df,
                    SelectColumns(f.count_distinct(col("*")).alias("x")))

                # SELECT a, MAX(b+1) AS x FROM df GROUP BY a
                engine.select(
                    df,
                    SelectColumns(col("a"),f.max(col("b")+lit(1)).alias("x")))

                # SELECT a, MAX(b+1) AS x FROM df
                #   WHERE b<2 AND a>1
                #   GROUP BY a
                #   HAVING MAX(b+1)>0
                engine.select(
                    df,
                    SelectColumns(col("a"),f.max(col("b")+lit(1)).alias("x")),
                    where=(col("b")<2) & (col("a")>1),
                    having=f.max(col("b")+lit(1))>0
                )
        """
        gen = SQLExpressionGenerator(enable_cast=False)
        sql = gen.select(cols, "df", where=where, having=having)
        res = self.sql_engine.select(DataFrames(df=self.to_df(df)), sql)
        diff = gen.correct_select_schema(df.schema, cols, res.schema)
        return res if diff is None else res.alter_columns(diff)
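
A hedged end-to-end sketch of the functional select interface shown above; it assumes fugue >= 0.6, where fugue.column exposes col, lit and SelectColumns, and is illustrative rather than taken from the fugue source:

import fugue.column.functions as f
from fugue import ArrayDataFrame, NativeExecutionEngine
from fugue.column import SelectColumns, col, lit

engine = NativeExecutionEngine()
df = ArrayDataFrame([[1, 2], [1, 5]], "a:int,b:int")

# SELECT a, MAX(b+1) AS x FROM df GROUP BY a
res = engine.select(
    df,
    SelectColumns(col("a"), f.max(col("b") + lit(1)).alias("x")),
)
print(res.as_array())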