Example #1
0
def fsql_dask(
    sql: str,
    ctx: Optional[Context] = None,
    register: bool = False,
    fugue_conf: Any = None,
) -> Dict[str, dd.DataFrame]:
    """Run a Fugue SQL statement against a dask-sql :class:`Context`.

    Fugue SQL is a superset of standard SQL that can describe end-to-end
    workflows and invoke Python extensions from within the SQL-like language.
    Tables registered on ``ctx`` become available to the statement by name,
    and every named step in the statement is returned as a Dask DataFrame.

    For more, please read the
    `Fugue SQL Tutorial <https://fugue-tutorials.readthedocs.io/en/latest/tutorials/fugue_sql/index.html>`_

    Args:
        sql: (:obj:`str`): Fugue SQL statement
        ctx (:class:`dask_sql.Context`): The context to operate on, defaults to None
        register (:obj:`bool`): Whether to register named steps back to the context
          (if provided), defaults to False
        fugue_conf (:obj:`Any`): a dictionary like object containing Fugue specific configs

    Example:
        .. code-block:: python
            # schema: *
            def median(df:pd.DataFrame) -> pd.DataFrame:
                df["y"] = df["y"].median()
                return df.head(1)

            # Create a context with tables df1, df2
            c = Context()
            ...
            result = fsql_dask('''
            j = SELECT df1.*, df2.x
                FROM df1 INNER JOIN df2 ON df1.key = df2.key
                PERSIST  # using persist because j will be used twice
            TAKE 5 ROWS PREPARTITION BY x PRESORT key
            PRINT
            TRANSFORM j PREPARTITION BY x USING median
            PRINT
            ''', c, register=True)
            assert "j" in result
            assert "j" in c.tables

    """
    # Capture the caller's globals/locals so the SQL can reference Python
    # extensions (e.g. transformers) defined at the call site.
    caller_globals, caller_locals = get_caller_global_local_vars()

    workflow = FugueSQLWorkflow()
    # Expose each table of the context to the workflow under its registered name.
    inputs: Dict[str, Any] = {}
    if ctx is not None:
        for name, table in ctx.tables.items():
            inputs[name] = workflow.df(table.df)
    named_steps = workflow._sql(sql, caller_globals, caller_locals, **inputs)
    workflow.run(DaskSQLExecutionEngine(conf=fugue_conf))

    # Keep only the named steps that produced dataframes; unwrap to native Dask.
    outputs: Dict[str, dd.DataFrame] = {}
    for name, step in named_steps.items():
        if isinstance(step, WorkflowDataFrame):
            outputs[name] = step.result.native

    if register and ctx is not None:
        # Write the results back so later SQL on the same context can reuse them.
        for name, ddf in outputs.items():
            ctx.create_table(name, ddf)
    return outputs
Example #2
0
def test_sql():
    """Run a simple Fugue SQL query on a Dask execution engine named "da"."""

    def _make_engine(conf, **kwargs):
        # Factory called by Fugue when the "da" engine name is requested.
        return DaskExecutionEngine(conf=conf)

    register_execution_engine("da", _make_engine)

    source = dd.from_pandas(pd.DataFrame([[0], [1]], columns=["a"]), npartitions=2)
    workflow = FugueSQLWorkflow()
    workflow(
        """
    SELECT * FROM df WHERE a>0
    PRINT
    """,
        df=source,
    )
    workflow.run("da")
Example #3
0
def test_sql():
    """Run a simple Fugue SQL query on a Spark execution engine named "s"."""
    session = SparkSession.builder.getOrCreate()

    def _make_engine(conf, **kwargs):
        # Factory called by Fugue when the "s" engine name is requested;
        # closes over the shared Spark session created above.
        return SparkExecutionEngine(conf=conf, spark_session=session)

    register_execution_engine("s", _make_engine)

    source = session.createDataFrame(pd.DataFrame([[0], [1]], columns=["a"]))
    workflow = FugueSQLWorkflow()
    workflow(
        """
    SELECT * FROM df WHERE a>0
    PRINT
    """,
        df=source,
    )
    workflow.run("s")