def __init__(
    self,
    code: str,
    rule: str,
    ignore_case: bool = False,
    simple_assign: bool = True,
    ansi_sql: bool = False,
):
    self._rule = rule
    self._raw_code = code
    self._raw_lines = code.splitlines()
    if ignore_case:
        self._code, self._tree, self._stream = _to_cased_code(
            code, rule, simple_assign=simple_assign, ansi_sql=ansi_sql
        )
    else:
        try:
            self._code = code
            self._tree, self._stream = _to_tree(
                self._code,
                self._rule,
                False,
                simple_assign=simple_assign,
                ansi_sql=ansi_sql,
            )
        except FugueSQLSyntaxError as e:
            # Parsing failed: if the code looks mostly lowercase, hint that
            # the ignore-case option may be what the user needs.
            if _detect_case_issue(code, _FUGUE_SQL_CASE_ISSUE_THREHOLD):
                prefix = (
                    "(FugueSQL requires uppercase characters by default. "
                    f"To ignore casing, turn on {FUGUE_CONF_SQL_IGNORE_CASE})"
                )
                msg = prefix + "\n" + str(e)
                raise FugueSQLSyntaxError(msg).with_traceback(None) from None
            else:
                raise FugueSQLSyntaxError(str(e)) from None

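# Illustrative use of the constructor above (a sketch; the `FugueSQL` class
# name and the "fugueLanguage" rule are assumptions about how this parser is
# exposed, not confirmed by this file):
#
#   FugueSQL("select * from df", "fugueLanguage")
#       # raises FugueSQLSyntaxError with a hint about FUGUE_CONF_SQL_IGNORE_CASE
#   FugueSQL("select * from df", "fugueLanguage", ignore_case=True)  # parses
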
def get_df(
    self, key: str, ctx: fp.FugueDataFrameMemberContext
) -> WorkflowDataFrame:
    assert_or_throw(
        key in self.variables,
        lambda: FugueSQLSyntaxError(f"{key} is not defined"),
    )
    if isinstance(self.variables[key], LazyWorkflowDataFrame):
        assert_or_throw(
            ctx is None,
            FugueSQLSyntaxError("can't specify index or key for dataframe"),
        )
        return self.variables[key].get_df()  # type: ignore
    if isinstance(self.variables[key], WorkflowDataFrame):
        assert_or_throw(
            ctx is None,
            FugueSQLSyntaxError("can't specify index or key for dataframe"),
        )
        return self.variables[key]  # type: ignore
    assert_or_throw(
        ctx is not None,
        FugueSQLSyntaxError("must specify index or key for dataframes"),
    )
    if ctx.index is not None:
        return self.variables[key][int(self.ctxToStr(ctx.index))]
    else:
        return self.variables[key][self.ctxToStr(ctx.key)]  # type: ignore

def visitFugueModuleTask(self, ctx: fp.FugueModuleTaskContext) -> None:
    data = self.get_dict(ctx, "assign", "dfs", "using", "params")
    sub = _to_module(
        data["using"],
        global_vars=self.global_vars,
        local_vars=self.local_vars,
    )
    varname = data["assign"][0] if "assign" in data else None
    if varname is not None:
        assert_or_throw(
            sub.has_single_output or sub.has_multiple_output,
            FugueSQLSyntaxError("invalid assignment for module without output"),
        )
    if sub.has_input:
        dfs = data["dfs"] if "dfs" in data else WorkflowDataFrames(self.last)
    else:
        dfs = WorkflowDataFrames()
    p = data["params"] if "params" in data else {}
    # Dispatch on how the module consumes inputs: a WorkflowDataFrames
    # collection, the bare workflow, positional dataframes, or named ones.
    if sub.has_dfs_input:
        result = sub(dfs, **p)
    elif len(dfs) == 0:
        result = sub(self.workflow, **p)
    elif len(dfs) == 1 or not dfs.has_key:
        result = sub(*list(dfs.values()), **p)
    else:
        result = sub(**dfs, **p)
    if sub.has_single_output or sub.has_multiple_output:
        self.variables[varname] = result
    if sub.has_single_output:
        self._last = result

def visitFugueWildSchema(self, ctx: fp.FugueWildSchemaContext) -> str:
    schema = ",".join(self.collectChildren(ctx, fp.FugueWildSchemaPairContext))
    if schema.count("*") > 1:
        raise FugueSQLSyntaxError(
            f"invalid schema {schema}: * can appear at most once"
        )
    return schema

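# For reference: a wild schema joins pair expressions with commas, where "*"
# stands for all input columns, e.g. "*,b:str"; a schema such as "*,b:str,*"
# contains two "*" and is rejected by the check above.
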
def _process_assignable(self, df: WorkflowDataFrame, ctx: Tree):
    data = self.get_dict(ctx, "assign", "persist", "broadcast")
    if "assign" in data:
        varname, sign = data["assign"]
    else:
        varname, sign = None, None
    need_checkpoint = sign == "??"
    if "persist" in data:
        is_checkpoint, value = data["persist"]
        if need_checkpoint or is_checkpoint:
            assert_or_throw(
                is_checkpoint,
                FugueSQLSyntaxError("can't persist when checkpoint is specified"),
            )
            df = df.checkpoint(value)
        else:
            df = df.persist(value)
    elif need_checkpoint:
        df = df.checkpoint()
    if "broadcast" in data:
        df = df.broadcast()
    if varname is not None:
        self.variables[varname] = df
    self._last = df

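# Illustrative Fugue SQL handled by this method (a sketch inferred from the
# logic above; not authoritative syntax):
#
#   a = SELECT * FROM df             -- plain assignment ("=")
#   b ?? SELECT * FROM df            -- "??" requests a checkpoint
#   c = SELECT * FROM df PERSIST     -- persist hint; BROADCAST is analogous
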
def fsql(self, line: str, cell: str, local_ns: Any = None) -> None:
    try:
        dag = fugue_sql.fsql(
            "\n" + cell, local_ns, fsql_ignore_case=self._fsql_ignore_case
        )
    except FugueSQLSyntaxError as ex:
        raise FugueSQLSyntaxError(str(ex)).with_traceback(None) from None
    dag.run(self.get_engine(line, {} if local_ns is None else local_ns))
    for k, v in dag.yields.items():
        if isinstance(v, YieldedDataFrame):
            local_ns[k] = v.result  # type: ignore
        else:
            local_ns[k] = v  # type: ignore

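# Illustrative notebook usage (a sketch; assumes this method is registered as
# the %%fsql cell magic, e.g. through fugue_notebook's setup):
#
#   %%fsql
#   CREATE [[0]] SCHEMA a:int
#   YIELD DATAFRAME AS result
#
# The loop above writes yielded objects back into the notebook namespace, so
# `result` is available to later cells.
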
def visitTableName(self, ctx: fp.TableNameContext) -> Iterable[Any]:
    table_name = self.ctxToStr(ctx.multipartIdentifier(), delimit="")
    if table_name not in self.variables:
        assert_or_throw(
            ctx.fugueDataFrameMember() is None,
            FugueSQLSyntaxError("can't specify index or key for dataframe"),
        )
        table: Any = self.hooks.on_select_source_not_found(
            self.workflow, table_name
        )
    else:
        table = self.get_df(table_name, ctx.fugueDataFrameMember())
    if isinstance(table, str):
        yield table
        yield from self._get_query_elements(ctx.sample())
        yield from self._get_query_elements(ctx.tableAlias())
    else:
        yield table
        yield from self._get_query_elements(ctx.sample())
        if ctx.tableAlias().strictIdentifier() is not None:
            yield from self._get_query_elements(ctx.tableAlias())
        elif validate_triad_var_name(table_name):
            yield "AS"
            yield table_name

def syntaxError(self, recognizer, offendingSymbol, line, column, msg, e):
    # Override of the antlr4 ErrorListener.syntaxError hook: echo the
    # offending source line back so users see where parsing failed.
    raise FugueSQLSyntaxError(
        f"{msg}\nline {line}: {self._lines[line - 1]}\n{offendingSymbol}"
    )

def fsql(
    sql: str, *args: Any, fsql_ignore_case: bool = False, **kwargs: Any
) -> FugueSQLWorkflow:
    """Fugue SQL functional interface

    :param sql: the Fugue SQL string (can be a jinja template)
    :param args: variables related to the SQL string
    :param fsql_ignore_case: whether to ignore case when parsing the SQL string,
        defaults to False
    :param kwargs: variables related to the SQL string
    :return: the translated Fugue workflow

    .. code-block:: python

        # Basic case
        fsql('''
        CREATE [[0]] SCHEMA a:int
        PRINT
        ''').run()

        # With external data sources
        df = pd.DataFrame([[0],[1]], columns=["a"])
        fsql('''
        SELECT * FROM df WHERE a=0
        PRINT
        ''').run()

        # With external variables
        df = pd.DataFrame([[0],[1]], columns=["a"])
        t = 1
        fsql('''
        SELECT * FROM df WHERE a={{t}}
        PRINT
        ''').run()

        # The following is the explicit way to specify variables and dataframes
        # (recommended)
        df = pd.DataFrame([[0],[1]], columns=["a"])
        t = 1
        fsql('''
        SELECT * FROM df WHERE a={{t}}
        PRINT
        ''', df=df, t=t).run()

        # Using extensions
        def dummy(df:pd.DataFrame) -> pd.DataFrame:
            return df

        fsql('''
        CREATE [[0]] SCHEMA a:int
        TRANSFORM USING dummy SCHEMA *
        PRINT
        ''').run()

        # It's recommended to provide the full path of the extension inside
        # Fugue SQL, so the SQL definition and execution can be more
        # independent from the extension definition.

        # Run with different execution engines
        sql = '''
        CREATE [[0]] SCHEMA a:int
        TRANSFORM USING dummy SCHEMA *
        PRINT
        '''

        fsql(sql).run(user_defined_spark_session())
        fsql(sql).run(SparkExecutionEngine, {"spark.executor.instances":10})
        fsql(sql).run(DaskExecutionEngine)

        # Passing dataframes between fsql calls
        result = fsql('''
        CREATE [[0]] SCHEMA a:int
        YIELD DATAFRAME AS x

        CREATE [[1]] SCHEMA a:int
        YIELD DATAFRAME AS y
        ''').run(DaskExecutionEngine)

        fsql('''
        SELECT * FROM x
        UNION
        SELECT * FROM y
        UNION
        SELECT * FROM z

        PRINT
        ''', result, z=pd.DataFrame([[2]], columns=["z"])).run()

        # Get framework native dataframes
        result["x"].native  # Dask dataframe
        result["y"].native  # Dask dataframe
        result["x"].as_pandas()  # Pandas dataframe

        # Use lowercase Fugue SQL
        df = pd.DataFrame([[0],[1]], columns=["a"])
        t = 1
        fsql('''
        select * from df where a={{t}}
        print
        ''', df=df, t=t, fsql_ignore_case=True).run()
    """
    global_vars, local_vars = get_caller_global_local_vars()
    dag = FugueSQLWorkflow(None, {FUGUE_CONF_SQL_IGNORE_CASE: fsql_ignore_case})
    try:
        dag._sql(sql, global_vars, local_vars, *args, **kwargs)
    except FugueSQLSyntaxError as ex:
        raise FugueSQLSyntaxError(str(ex)).with_traceback(None) from None
    return dag