Example #1
def test_show():
    class _CustomShow(object):
        def __init__(self):
            self.called = False

        def show(self, schema, head_rows, title, rows, count):
            print(schema, head_rows)
            print(title, rows, count)
            self.called = True

    cs = _CustomShow()
    with FugueSQLWorkflow() as dag:
        dag("""
        a = CREATE[[0], [1]] SCHEMA a: int
        b = CREATE[[0], [1]] SCHEMA a: int
        PRINT 10 ROWS FROM a, b ROWCOUNT TITLE "abc"
        PRINT a, b
        """)
    assert not cs.called
    Show.set_hook(cs.show)
    with FugueSQLWorkflow() as dag:
        dag("""
        a = CREATE[[0], [1]] SCHEMA a: int
        b = CREATE[[0], [1]] SCHEMA a: int
        PRINT 10 ROWS FROM a, b ROWCOUNT TITLE "abc"
        PRINT a, b
        """)

    assert cs.called
Example #2
def test_use_special_df(tmpdir):
    # external non-workflowdataframe
    arr = ArrayDataFrame([[0], [1]], "a:int")
    fsql(
        """
        b=CREATE[[0], [1]] SCHEMA a: int
        a = SELECT * FROM a.x
        OUTPUT a, b USING assert_eq
        a = SELECT x.* FROM a.x AS x
        OUTPUT a, b USING assert_eq
        c=CREATE [[0,0],[1,1]] SCHEMA a:int,b:int
        d = SELECT x.*,y.a AS b FROM a.x x INNER JOIN a.x y ON x.a=y.a
        OUTPUT c, d USING assert_eq
        """,
        {
            "a.x": arr
        },
    ).run()

    # from yield file
    engine = NativeExecutionEngine(
        conf={"fugue.workflow.checkpoint.path": os.path.join(tmpdir, "ck")})
    with FugueSQLWorkflow(engine) as dag:
        dag("CREATE[[0], [1]] SCHEMA a: int YIELD FILE AS b")
        res = dag.yields["b"]

    with FugueSQLWorkflow(engine) as dag:
        dag(
            """
        b=CREATE[[0], [1]] SCHEMA a: int
        a = SELECT * FROM a.x
        OUTPUT a, b USING assert_eq
        """,
            {"a.x": res},
        )
Example #3
def test_conf_override():
    with raises(FugueSQLSyntaxError):
        FugueSQLWorkflow()("create [[0]] schema a:int")
    with FugueSQLWorkflow(None,
                          {"fugue.sql.compile.ignore_case": "true"}) as dag:
        a = dag.df([[0], [1]], "a:int")
        dag("""
        create [[0],[1]] schema a:int
        b = select *
        output a,b using assert_eq""")
Example #4
def test_conf_override():
    with raises(FugueSQLSyntaxError):
        FugueSQLWorkflow()("create [[0]] schema a:int")
    with FugueSQLWorkflow(
            NativeExecutionEngine({"fugue.sql.compile.ignore_case":
                                   "true"})) as dag:
        a = dag.df([[0], [1]], "a:int")
        dag("""
        b = create [[0],[1]] schema a:int
        output a,b using assert_eq""")
Example #5
def test_sql():
    register_execution_engine(
        "da", lambda conf, **kwargs: DaskExecutionEngine(conf=conf))
    df = dd.from_pandas(pd.DataFrame([[0], [1]], columns=["a"]), npartitions=2)
    dag = FugueSQLWorkflow()
    dag(
        """
    SELECT * FROM df WHERE a>0
    PRINT
    """,
        df=df,
    )
    dag.run("da")
Example #6
def test(self):
    with FugueSQLWorkflow() as dag:
        dag("""
        a = CREATE [[0],[1]] SCHEMA a:int
        b = TRANSFORM USING self.t
        OUTPUT a,b USING assert_eq
        """)
Example #7
def test_sql():
    session = SparkSession.builder.getOrCreate()
    register_execution_engine(
        "s",
        lambda conf, **kwargs: SparkExecutionEngine(conf=conf,
                                                    spark_session=session),
    )
    df = session.createDataFrame(pd.DataFrame([[0], [1]], columns=["a"]))
    dag = FugueSQLWorkflow()
    dag(
        """
    SELECT * FROM df WHERE a>0
    PRINT
    """,
        df=df,
    )
    dag.run("s")
Example #8
def test_workflow_conf():
    dag = FugueSQLWorkflow(NativeExecutionEngine({"x": 10}))
    assert 10 == dag.conf.get_or_throw("x", int)
    assert not dag.conf.get_or_throw("fugue.sql.compile.ignore_case", bool)

    dag = FugueSQLWorkflow(
        NativeExecutionEngine({
            "x": 10,
            "fugue.sql.compile.ignore_case": True
        }))
    assert 10 == dag.conf.get_or_throw("x", int)
    assert dag.conf.get_or_throw("fugue.sql.compile.ignore_case", bool)

    dag = FugueSQLWorkflow(NativeExecutionEngine({"x": 10}),
                           {"fugue.sql.compile.ignore_case": "true"})
    assert 10 == dag.conf.get_or_throw("x", int)
    assert dag.conf.get_or_throw("fugue.sql.compile.ignore_case", bool)
Example #9
def test_workflow_conf():
    dag = FugueSQLWorkflow(
        NativeExecutionEngine({
            "x": 10,
            "fugue.sql.compile.simple_assign": "false"
        }))
    assert 10 == dag.conf.get_or_throw("x", int)
    assert not dag.conf.get_or_throw("fugue.sql.compile.simple_assign", bool)
    assert not dag.conf.get_or_throw("fugue.sql.compile.ignore_case", bool)
Example #10
def test_process_module():
    # pylint: disable=no-value-for-parameter
    def process(df: pd.DataFrame, d: int = 1) -> pd.DataFrame:
        df["a"] += d
        return df

    @module
    def p1(wf: FugueSQLWorkflow, df: WorkflowDataFrame) -> WorkflowDataFrame:
        return df.process(process)

    @module()
    def p2(wf: FugueWorkflow, dfs: WorkflowDataFrames,
           d: int) -> WorkflowDataFrames:
        return WorkflowDataFrames(
            {k: v.process(process, params={"d": d})
             for k, v in dfs.items()})

    @module(as_method=True, name="p4")
    def p3(df: WorkflowDataFrame) -> WorkflowDataFrame:
        return df.process(process)

    assert p1.has_input
    assert not p1.has_dfs_input
    assert p2.has_dfs_input

    with FugueSQLWorkflow() as dag:
        df = dag.df([[0]], "a:int")
        p1(df).assert_eq(dag.df([[1]], "a:int"))
        p1(dag, df).assert_eq(dag.df([[1]], "a:int"))
        p1(df=df).assert_eq(dag.df([[1]], "a:int"))
        p1(df=df, wf=dag).assert_eq(dag.df([[1]], "a:int"))

    with FugueWorkflow() as dag:
        dfs = WorkflowDataFrames(aa=dag.df([[0]], "a:int"),
                                 bb=dag.df([[10]], "a:int"))
        r = p2(dag, dfs, 1)
        r["aa"].assert_eq(dag.df([[1]], "a:int"))
        r["bb"].assert_eq(dag.df([[11]], "a:int"))

        r = p2(dfs, 1)
        r["aa"].assert_eq(dag.df([[1]], "a:int"))
        r["bb"].assert_eq(dag.df([[11]], "a:int"))

        r = p2(d=1, dfs=dfs, wf=dag)
        r["aa"].assert_eq(dag.df([[1]], "a:int"))
        r["bb"].assert_eq(dag.df([[11]], "a:int"))

        r = p2(d=1, dfs=dfs)
        r["aa"].assert_eq(dag.df([[1]], "a:int"))
        r["bb"].assert_eq(dag.df([[11]], "a:int"))

    with FugueWorkflow() as dag:
        df = dag.df([[0]], "a:int")
        p3(df).assert_eq(dag.df([[1]], "a:int"))
        p3(df=df).assert_eq(dag.df([[1]], "a:int"))
        df.p4().assert_eq(dag.df([[1]], "a:int"))
Example #11
def test_multiple_sql_with_reset():
    with FugueSQLWorkflow() as dag:
        a = dag.df([[0], [1]], "a:int")
        dag("b = CREATE [[0],[1]] SCHEMA a:int")
        a = dag.df([[0], [2]], "a:int")
        b = dag.df([[0], [2]], "a:int")
        dag("""
        OUTPUT a, b USING assert_eq
        OUTPUT a, (CREATE[[0], [2]] SCHEMA a: int) USING assert_eq
        """)
Example #12
def test_lazy_use_df():
    df1 = pd.DataFrame([[0]], columns=["a"])
    df2 = pd.DataFrame([[1]], columns=["a"])
    # although df2 is defined as a local variable, it is never converted
    # because it is not used in dag1, so dag1 and dag2 produce identical specs

    dag1 = FugueSQLWorkflow()
    dag1("""PRINT df1""")

    dag2 = FugueSQLWorkflow()
    dag2.df(df1).show()

    assert dag1.spec_uuid() == dag2.spec_uuid()
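
A contrasting sketch (not part of the original test): if the SQL workflow did reference df2, the local variable would be converted and included, so the two specs would no longer match.

dag3 = FugueSQLWorkflow()
dag3("""
PRINT df1
PRINT df2
""")
assert dag3.spec_uuid() != dag1.spec_uuid()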
Example #13
def test_multiple_blocks():
    with FugueSQLWorkflow() as dag:
        a = dag.df([[0], [1]], "a:int")
        c = 1
        dag("""
        OUTPUT a, (CREATE[[0], [1]] SCHEMA a: int) USING assert_eq
        """)
    # dataframe can't pass to another workflow
    with FugueSQLWorkflow() as dag:
        assert "a" in locals()
        with raises(FugueSQLError):
            dag("""
            OUTPUT a, (CREATE[[0], [1]] SCHEMA a: int) USING assert_eq
            """)
    # other local variables are fine
    with FugueSQLWorkflow() as dag:
        a = dag.df([[0], [1]], "a:int")
        dag("""
        OUTPUT a, (CREATE[[0], [{{c}}]] SCHEMA a: int) USING assert_eq
        """)
Example #14
def test_use_param():
    with FugueSQLWorkflow() as dag:
        a = dag.df([[0], [1]], "a:int")
        x = 0
        dag(
            """
        b=CREATE[[{{x}}], [{{y}}]] SCHEMA a: int
        OUTPUT a, b USING assert_eq
        """,
            y=1,
        )
Example #15
def fsql_dask(
    sql: str,
    ctx: Optional[Context] = None,
    register: bool = False,
    fugue_conf: Any = None,
) -> Dict[str, dd.DataFrame]:
    """Fugue SQL utility function that can consume Context directly. Fugue SQL is a language
    extending standard SQL. It makes SQL eligible to describe end to end workflows. It also
    enables you to invoke python extensions in the SQL like language.

    For more, please read
    `Fugue SQl Tutorial <https://fugue-tutorials.readthedocs.io/en/latest/tutorials/fugue_sql/index.html/>`_

    Args:
        sql (:obj:`str`): Fugue SQL statement
        ctx (:class:`dask_sql.Context`): The context to operate on, defaults to None
        register (:obj:`bool`): Whether to register named steps back to the context
          (if provided), defaults to False
        fugue_conf (:obj:`Any`): a dictionary-like object containing Fugue-specific configs

    Example:
        .. code-block:: python

            # schema: *
            def median(df:pd.DataFrame) -> pd.DataFrame:
                df["y"] = df["y"].median()
                return df.head(1)

            # Create a context with tables df1, df2
            c = Context()
            ...
            result = fsql_dask('''
            j = SELECT df1.*, df2.x
                FROM df1 INNER JOIN df2 ON df1.key = df2.key
                PERSIST  # using persist because j will be used twice
            TAKE 5 ROWS PREPARTITION BY x PRESORT key
            PRINT
            TRANSFORM j PREPARTITION BY x USING median
            PRINT
            ''', c, register=True)
            assert "j" in result
            assert "j" in c.tables

    """
    _global, _local = get_caller_global_local_vars()

    dag = FugueSQLWorkflow()
    dfs = {} if ctx is None else {k: dag.df(v.df) for k, v in ctx.tables.items()}
    result = dag._sql(sql, _global, _local, **dfs)
    dag.run(DaskSQLExecutionEngine(conf=fugue_conf))

    result_dfs = {
        k: v.result.native
        for k, v in result.items()
        if isinstance(v, WorkflowDataFrame)
    }
    if register and ctx is not None:
        for k, v in result_dfs.items():
            ctx.create_table(k, v)
    return result_dfs
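
The docstring example exercises register=True but not fugue_conf; a minimal sketch (the Context setup and the table name df1 are placeholders, not taken from this listing) of passing Fugue configs, which the function forwards to DaskSQLExecutionEngine(conf=...):

c = Context()
# ... register a table named df1 on the context ...
result = fsql_dask(
    """
    SELECT * FROM df1 WHERE a > 0
    PRINT
    """,
    c,
    fugue_conf={"fugue.workflow.checkpoint.path": "/tmp/fugue_ck"},
)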
Example #16
def test_jinja_keyword_in_sql():
    with FugueSQLWorkflow(("", "sqlite")) as dag:
        dag("""{% raw -%}
            CREATE [["{%'{%'"]] SCHEMA a:str
            SELECT * WHERE a LIKE '{%'
            PRINT
            {%- endraw %}""")

        df = dag.df([["b"]], "a:str")
        x = "b"
        dag("""
        df2 = SELECT *
        FROM df
        WHERE a = "{{x}}"
        OUTPUT df, df2 USING assert_eq
        """)
Example #17
def test_use_df(tmpdir):
    # df generated inside dag
    with FugueSQLWorkflow() as dag:
        a = dag.df([[0], [1]], "a:int")
        dag("""
        b=CREATE[[0], [1]] SCHEMA a: int
        OUTPUT a, b USING assert_eq
        """)
        dag.sql_vars["b"].assert_eq(a)

    # external non-workflowdataframe
    arr = ArrayDataFrame([[0], [1]], "a:int")
    with FugueSQLWorkflow() as dag:
        dag(
            """
        b=CREATE[[0], [1]] SCHEMA a: int
        OUTPUT a, b USING assert_eq
        """,
            a=arr,
        )
        dag.sql_vars["b"].assert_eq(dag.df([[0], [1]], "a:int"))

    # from yield file
    engine = NativeExecutionEngine(
        conf={"fugue.workflow.checkpoint.path": os.path.join(tmpdir, "ck")})
    with FugueSQLWorkflow(engine) as dag:
        dag("CREATE[[0], [1]] SCHEMA a: int YIELD FILE AS b")
        res = dag.yields["b"]

    with FugueSQLWorkflow(engine) as dag:
        dag(
            """
        b=CREATE[[0], [1]] SCHEMA a: int
        OUTPUT a, b USING assert_eq
        """,
            a=res,
        )

    # from yield dataframe
    engine = NativeExecutionEngine()
    with FugueSQLWorkflow(engine) as dag:
        dag("CREATE[[0], [1]] SCHEMA a: int YIELD DATAFRAME AS b")
        res = dag.yields["b"]

    with FugueSQLWorkflow(engine) as dag:
        dag(
            """
        b=CREATE[[0], [1]] SCHEMA a: int
        OUTPUT a, b USING assert_eq
        """,
            a=res,
        )
Example #18
def test_local_instance_as_extension():
    class _Mock(object):
        # schema: *
        def t(self, df: pd.DataFrame) -> pd.DataFrame:
            return df

        def test(self):
            with FugueSQLWorkflow() as dag:
                dag("""
                a = CREATE [[0],[1]] SCHEMA a:int
                b = TRANSFORM USING self.t
                OUTPUT a,b USING assert_eq
                """)

    m = _Mock()
    m.test()
    with FugueSQLWorkflow() as dag:
        dag("""
        a = CREATE [[0],[1]] SCHEMA a:int
        b = TRANSFORM USING m.t
        OUTPUT a,b USING assert_eq
        """)
Example #19
def test_call_back():
    class CB(object):
        def __init__(self):
            self.n = 0

        def incr(self, n):
            self.n += n
            return self.n

    cb = CB()

    # schema: *
    def t(df: pd.DataFrame, incr: callable) -> pd.DataFrame:
        incr(1)
        return df

    with FugueSQLWorkflow() as dag:
        dag("""
        a = CREATE [[0],[1],[1]] SCHEMA a:int
        TRANSFORM PREPARTITION BY a USING t CALLBACK cb.incr PERSIST
        OUTTRANSFORM a PREPARTITION BY a USING t CALLBACK cb.incr
        """)

    assert 4 == cb.n
Example #20
def dag(self) -> FugueSQLWorkflow:
    return FugueSQLWorkflow(self.engine)
Example #21
def test_function_calls():
    with FugueSQLWorkflow() as dag:
        a = dag.df([[0], [1]], "a:int")
        _eq(dag, a)
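
_eq is a helper defined elsewhere in the test module and not shown in this listing; a plausible sketch, assuming it simply asserts the expected content from inside the workflow:

# hypothetical reconstruction of the _eq helper referenced above
def _eq(dag: FugueSQLWorkflow, a) -> None:
    dag("OUTPUT a, (CREATE [[0],[1]] SCHEMA a:int) USING assert_eq")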
Example #22
def i1(wf: FugueSQLWorkflow) -> WorkflowDataFrame:
    return wf.df([[0]], "a:int")
Example #23
def test_multiple_sql():
    with FugueSQLWorkflow() as dag:
        a = dag.df([[0], [1]], "a:int")
        dag("b = CREATE [[0],[1]] SCHEMA a:int")
        dag("OUTPUT a,b USING assert_eq")