Пример #1
0
 def __init__(
     self,
     code: str,
     rule: str,
     ignore_case: bool = False,
     simple_assign: bool = True,
     ansi_sql: bool = False,
 ):
     """Parse ``code`` with the given parser ``rule`` into a tree and token stream.

     :param code: raw Fugue SQL source
     :param rule: parser rule name used as the entry point
     :param ignore_case: if True, case-correct the code before parsing
         (presumably upper-casing keywords — see ``_to_cased_code``)
     :param simple_assign: parser option forwarded to the tree builder
     :param ansi_sql: parser option forwarded to the tree builder
     :raises FugueSQLSyntaxError: when parsing fails; a casing hint is
         prepended if the failure looks like a lower-case-keyword issue
     """
     self._rule = rule
     self._raw_code = code
     self._raw_lines = code.splitlines()
     if ignore_case:
         self._code, self._tree, self._stream = _to_cased_code(
             code, rule, simple_assign=simple_assign, ansi_sql=ansi_sql)
         return
     self._code = code
     try:
         self._tree, self._stream = _to_tree(
             self._code,
             self._rule,
             False,
             simple_assign=simple_assign,
             ansi_sql=ansi_sql,
         )
     except FugueSQLSyntaxError as e:
         msg = str(e)
         # When the code looks mostly lower case, point the user at the
         # ignore-case config instead of the raw parser error alone.
         if _detect_case_issue(code, _FUGUE_SQL_CASE_ISSUE_THREHOLD):
             hint = (
                 "(FugueSQL requires uppercase characters by default. "
                 f"To ignore casing, turn on {FUGUE_CONF_SQL_IGNORE_CASE})"
             )
             raise FugueSQLSyntaxError(hint + "\n" + msg).with_traceback(
                 None) from None
         raise FugueSQLSyntaxError(msg) from None
Пример #2
0
 def get_df(self, key: str,
            ctx: fp.FugueDataFrameMemberContext) -> WorkflowDataFrame:
     """Resolve variable ``key`` to a workflow dataframe.

     Plain (possibly lazy) dataframes forbid a member context; collections
     require one and are indexed by position or by key.

     :raises FugueSQLSyntaxError: if ``key`` is undefined or ``ctx`` is
         inconsistent with the variable's type
     """
     assert_or_throw(
         key in self.variables,
         lambda: FugueSQLSyntaxError(f"{key} is not defined"),
     )
     value = self.variables[key]
     if isinstance(value, LazyWorkflowDataFrame):
         assert_or_throw(
             ctx is None,
             FugueSQLSyntaxError(
                 "can't specify index or key for dataframe"),
         )
         return value.get_df()  # type: ignore
     if isinstance(value, WorkflowDataFrame):
         assert_or_throw(
             ctx is None,
             FugueSQLSyntaxError(
                 "can't specify index or key for dataframe"),
         )
         return value  # type: ignore
     # Anything else is a dataframe collection: a member selector is mandatory.
     assert_or_throw(
         ctx is not None,
         FugueSQLSyntaxError("must specify index or key for dataframes"),
     )
     if ctx.index is None:
         return value[self.ctxToStr(ctx.key)]  # type: ignore
     return value[int(self.ctxToStr(ctx.index))]
Пример #3
0
 def visitFugueModuleTask(self, ctx: fp.FugueModuleTaskContext) -> None:
     """Instantiate a module from the parsed context and invoke it.

     Collects the module's inputs (explicit ``dfs`` or the last dataframe),
     calls it with the parsed params, and stores the result under the
     assignment name and/or as the latest dataframe.
     """
     data = self.get_dict(ctx, "assign", "dfs", "using", "params")
     sub = _to_module(
         data["using"],
         global_vars=self.global_vars,
         local_vars=self.local_vars,
     )
     varname = None if "assign" not in data else data["assign"][0]
     if varname is not None:
         # An assignment only makes sense if the module produces output.
         assert_or_throw(
             sub.has_single_output or sub.has_multiple_output,
             FugueSQLSyntaxError(
                 "invalid assignment for module without output"),
         )
     if not sub.has_input:
         dfs = WorkflowDataFrames()
     elif "dfs" in data:
         dfs = data["dfs"]
     else:
         # No explicit inputs: default to the most recent dataframe.
         dfs = WorkflowDataFrames(self.last)
     p = {} if "params" not in data else data["params"]
     if sub.has_dfs_input:
         result = sub(dfs, **p)
     elif len(dfs) == 0:
         result = sub(self.workflow, **p)
     elif len(dfs) == 1 or not dfs.has_key:
         result = sub(*list(dfs.values()), **p)
     else:
         result = sub(**dfs, **p)
     if sub.has_single_output or sub.has_multiple_output:
         self.variables[varname] = result
     if sub.has_single_output:
         self._last = result
Пример #4
0
 def visitFugueWildSchema(self, ctx: fp.FugueWildSchemaContext) -> str:
     """Join the schema pairs into a single schema string.

     :raises FugueSQLSyntaxError: if ``*`` appears more than once
     """
     pairs = self.collectChildren(ctx, fp.FugueWildSchemaPairContext)
     schema = ",".join(pairs)
     if schema.count("*") <= 1:
         return schema
     raise FugueSQLSyntaxError(
         f"invalid {schema} * can appear at most once")
Пример #5
0
 def _process_assignable(self, df: WorkflowDataFrame, ctx: Tree):
     """Apply assign / persist / broadcast modifiers from ``ctx`` to ``df``.

     The ``??`` assignment sign requests a checkpoint; it conflicts with a
     plain persist. The final dataframe becomes the latest (``self._last``).
     """
     data = self.get_dict(ctx, "assign", "persist", "broadcast")
     varname, sign = data["assign"] if "assign" in data else (None, None)
     need_checkpoint = sign == "??"
     if "persist" in data:
         is_checkpoint, value = data["persist"]
         if not (need_checkpoint or is_checkpoint):
             df = df.persist(value)
         else:
             # ``??`` plus an explicit PERSIST is contradictory.
             assert_or_throw(
                 is_checkpoint,
                 FugueSQLSyntaxError(
                     "can't persist when checkpoint is specified"),
             )
             df = df.checkpoint(value)
     elif need_checkpoint:
         df = df.checkpoint()
     if "broadcast" in data:
         df = df.broadcast()
     if varname is not None:
         self.variables[varname] = df
     self._last = df
Пример #6
0
 def fsql(self, line: str, cell: str, local_ns: Any = None) -> None:
     """Run the cell as Fugue SQL (notebook magic).

     :param line: the magic line, used to pick the execution engine
     :param cell: the Fugue SQL body
     :param local_ns: the caller's namespace used to resolve variables in
         the SQL and to receive yielded results; may be None

     Yielded dataframes are published back into ``local_ns`` after the run.
     """
     try:
         dag = fugue_sql.fsql(
             "\n" + cell, local_ns, fsql_ignore_case=self._fsql_ignore_case
         )
     except FugueSQLSyntaxError as ex:
         # Re-raise with a clean traceback so the notebook shows only the
         # SQL error message, not the parser internals.
         raise FugueSQLSyntaxError(str(ex)).with_traceback(None) from None
     dag.run(self.get_engine(line, {} if local_ns is None else local_ns))
     # Bug fix: the original indexed local_ns unconditionally, raising
     # TypeError when local_ns is None and the dag produced yields.
     if local_ns is None:
         return
     for k, v in dag.yields.items():
         if isinstance(v, YieldedDataFrame):
             local_ns[k] = v.result  # type: ignore
         else:
             local_ns[k] = v  # type: ignore
Пример #7
0
 def visitTableName(self, ctx: fp.TableNameContext) -> Iterable[Any]:
     """Yield SQL query elements for a table reference.

     Known variables resolve to workflow dataframes; unknown names go
     through the ``on_select_source_not_found`` hook. A valid variable name
     without an explicit alias is aliased to itself.
     """
     table_name = self.ctxToStr(ctx.multipartIdentifier(), delimit="")
     table: Any
     if table_name in self.variables:
         table = self.get_df(table_name, ctx.fugueDataFrameMember())
     else:
         # Member access (index/key) is only meaningful on known variables.
         assert_or_throw(
             ctx.fugueDataFrameMember() is None,
             FugueSQLSyntaxError(
                 "can't specify index or key for dataframe"),
         )
         table = self.hooks.on_select_source_not_found(
             self.workflow, table_name)
     yield table
     yield from self._get_query_elements(ctx.sample())
     if isinstance(table, str):
         yield from self._get_query_elements(ctx.tableAlias())
     elif ctx.tableAlias().strictIdentifier() is not None:
         yield from self._get_query_elements(ctx.tableAlias())
     elif validate_triad_var_name(table_name):
         # No explicit alias: alias the dataframe to its own variable name.
         yield "AS"
         yield table_name
Пример #8
0
 def syntaxError(self, recognizer, offendingSymbol, line, column, msg, e):
     """ANTLR error-listener hook: surface a parse error as FugueSQLSyntaxError,
     including the offending source line for context."""
     source_line = self._lines[line - 1]
     raise FugueSQLSyntaxError(
         f"{msg}\nline {line}: {source_line}\n{offendingSymbol}"
     )
Пример #9
0
def fsql(sql: str,
         *args: Any,
         fsql_ignore_case: bool = False,
         **kwargs: Any) -> FugueSQLWorkflow:
    """Fugue SQL functional interface

    :param sql: the Fugue SQL string (can be a jinja template)
    :param args: variables related to the SQL string
    :param fsql_ignore_case: whether to ignore case when parsing the SQL string,
        defaults to False.
    :param kwargs: variables related to the SQL string
    :return: the translated Fugue workflow

    .. code-block:: python

        # Basic case
        fsql('''
        CREATE [[0]] SCHEMA a:int
        PRINT
        ''').run()

        # With external data sources
        df = pd.DataFrame([[0],[1]], columns=["a"])
        fsql('''
        SELECT * FROM df WHERE a=0
        PRINT
        ''').run()

        # With external variables
        df = pd.DataFrame([[0],[1]], columns=["a"])
        t = 1
        fsql('''
        SELECT * FROM df WHERE a={{t}}
        PRINT
        ''').run()

        # The following is the explicit way to specify variables and dataframes
        # (recommended)
        df = pd.DataFrame([[0],[1]], columns=["a"])
        t = 1
        fsql('''
        SELECT * FROM df WHERE a={{t}}
        PRINT
        ''', df=df, t=t).run()

        # Using extensions
        def dummy(df:pd.DataFrame) -> pd.DataFrame:
            return df

        fsql('''
        CREATE [[0]] SCHEMA a:int
        TRANSFORM USING dummy SCHEMA *
        PRINT
        ''').run()

        # It's recommended to provide full path of the extension inside
        # Fugue SQL, so the SQL definition and execution can be more
        # independent from the extension definition.

        # Run with different execution engines
        sql = '''
        CREATE [[0]] SCHEMA a:int
        TRANSFORM USING dummy SCHEMA *
        PRINT
        '''

        fsql(sql).run(user_defined_spark_session())
        fsql(sql).run(SparkExecutionEngine, {"spark.executor.instances":10})
        fsql(sql).run(DaskExecutionEngine)

        # Passing dataframes between fsql calls
        result = fsql('''
        CREATE [[0]] SCHEMA a:int
        YIELD DATAFRAME AS x

        CREATE [[1]] SCHEMA a:int
        YIELD DATAFRAME AS y
        ''').run(DaskExecutionEngine)

        fsql('''
        SELECT * FROM x
        UNION
        SELECT * FROM y
        UNION
        SELECT * FROM z

        PRINT
        ''', result, z=pd.DataFrame([[2]], columns=["z"])).run()

        # Get framework native dataframes
        result["x"].native  # Dask dataframe
        result["y"].native  # Dask dataframe
        result["x"].as_pandas()  # Pandas dataframe

        # Use lower case fugue sql
        df = pd.DataFrame([[0],[1]], columns=["a"])
        t = 1
        fsql('''
        select * from df where a={{t}}
        print
        ''', df=df, t=t, fsql_ignore_case=True).run()
    """
    # Capture the caller's variables so the SQL can reference them by name.
    # NOTE: this inspects the calling frame, so fsql must call it directly.
    global_vars, local_vars = get_caller_global_local_vars()
    dag = FugueSQLWorkflow(None,
                           {FUGUE_CONF_SQL_IGNORE_CASE: fsql_ignore_case})
    try:
        dag._sql(sql, global_vars, local_vars, *args, **kwargs)
    except FugueSQLSyntaxError as ex:
        # Re-raise with a clean traceback so callers see only the SQL error,
        # not the parser internals.
        raise FugueSQLSyntaxError(str(ex)).with_traceback(None) from None
    return dag