def __init__(self, groupCols: List, parent: DataFrame): self.having = [] self.groupCols = [] computedAliases = [c.alias for c in parent.computedCols] for theCol in groupCols: if isinstance(theCol, str): theRef = None if theCol in computedAliases: theRef = ColRef(theCol, None) else: theRef = ColRef(theCol, self) theCol = theRef elif isinstance(theCol, ColRef): self.updateRef(theCol) elif isinstance(theCol, Expr): pass else: raise ExpressionException( f"invalid grouping column type: {type(theCol)}") self.groupCols.append(theCol) self.aggFunc = [] super().__init__(self.groupCols, parent, GrizzlyGenerator._incrAndGetTupleVar())
def join(self, other, on, how="inner", comp="="): if isinstance(on, list): lOn = None rOn = None from grizzly.expression import ExpressionException if not self.hasColumn(on[0]): raise ExpressionException( f"No such column {on[0]} for join in left hand side") else: lOn = ColRef(on[0], self) if not other.hasColumn(on[1]): raise ExpressionException( f"No such column {on[1]} for join in right hand side") else: rOn = ColRef(on[1], other) on = [lOn, rOn] return Join(self, other, on, how, comp)
def _getSQLFuncName(aggType) -> str: if isinstance(aggType, str): return aggType if isinstance(aggType, AggregateType): if aggType == AggregateType.MEAN: return "avg" return str(aggType)[len("AggregateType."):].lower() # if we get here it's not a string and not a AggType --> error raise ExpressionException( f"invalid function value: {aggType}, expected string or AggregateType, but got {type(aggType)}" )
def __init__(self, by: list, ascending: bool, parent): super().__init__(parent.columns, parent, GrizzlyGenerator._incrAndGetTupleVar()) sortCols = [] for col in by: if isinstance(col, Projection): sortCols.append(self.updateRef(col.columns[0])) elif isinstance(col, str): sortCols.append(ColRef(col, self)) elif isinstance(col, ColRef): sortCols.append(self.updateRef(col)) else: raise ExpressionException( f"unsupported type for oder by value: {type(col)}") self.by = sortCols self.ascending = ascending
def __getitem__(self, key): theType = type(key) if isinstance(key, slice): # used for LIMIT .. OFFSET if key.step is not None: logger.warn( "Step is not supported for slice access on DataFrames") n = key.stop offset = key.start if key.start is not None else None return self.limit(n, offset) elif theType is ColRef: # if in the projection list e.g. "df.a" was given return self.project(key) elif isinstance(key, BoolExpr) or isinstance( key, LogicExpr): # e.g. a filter expression return self.filter(key) elif theType is str: # a single string is given -> project to that column return ColRef(key, self) elif theType is list: projList = [] for e in key: t = type(e) if t is str: projList.append(ColRef(e, self)) elif t is ColRef: c = ColRef(e.colName(), self) projList.append(c) else: raise ExpressionException( f"expected a column name string or column reference, but got {e}" ) return self.project(projList) else: print(f"{key} has type {theType} -- ignoring") return self
def _exprToSQL(self, expr) -> Tuple[List[str], str]: exprSQL = "" pre = [] # right hand side is a string constant if expr is None: exprSQL = "NULL" elif isinstance(expr, str): exprSQL = expr # TODO: currently to handle *, but maybe this should done earlier and be converted into a special ColRef? # we were given a constant elif isinstance(expr, Constant): if isinstance(expr.value, str): exprSQL = f"'{expr.value}'" elif isinstance(expr.value, list): eSQLs = [] for x in expr.value: (ePre, eSQL) = self._exprToSQL(x) eSQLs.append(eSQL) pre += ePre exprSQL = ",".join(eSQLs) exprSQL = f"({exprSQL})" else: exprSQL = f"{expr.value}" # TODO: should LogicExpr be merged into BoolExpr ? elif isinstance(expr, LogicExpr): (lPre, l) = self._exprToSQL(expr.left) (rPre, r) = self._exprToSQL(expr.right) if isinstance(expr.left, LogicExpr): l = f"({l})" if isinstance(expr.right, LogicExpr): r = f"({r})" if expr.operand == LogicOperation.AND: exprSQL = f"{l} and {r}" elif expr.operand == LogicOperation.OR: exprSQL = f"{l} or {r}" elif expr.operand == LogicOperation.NOT: exprSQL = f"not {l}" elif expr.operand == LogicOperation.XOR: exprSQL = f"{l} xor {r}" else: raise ExpressionException( f"unknown logical operation: {expr.operand}") pre = lPre + rPre elif isinstance(expr, BoolExpr): if not expr.right and not (expr.operand == BooleanOperation.EQ or expr.operand == BooleanOperation.NE): raise ExpressionException( "only == and != allowed for comparison with None (NULL)") (lPre, l) = self._exprToSQL(expr.left) (rPre, r) = self._exprToSQL(expr.right) opStr = None if expr.operand == BooleanOperation.EQ: opStr = "=" if expr.right is not None else "is" elif expr.operand == BooleanOperation.NE: opStr = "<>" if expr.right is not None else "is not" elif expr.operand == BooleanOperation.GE: opStr = ">=" elif expr.operand == BooleanOperation.GT: opStr = ">" elif expr.operand == BooleanOperation.LE: opStr = "<=" elif expr.operand == BooleanOperation.LT: opStr = "<" else: raise ExpressionException( f"unknown boolean operation: {expr.operand}") exprSQL = f"{l} {opStr} {r}" pre = lPre + rPre elif isinstance(expr, ArithmExpr): (lPre, l) = self._exprToSQL(expr.left) (rPre, r) = self._exprToSQL(expr.right) if not isinstance(expr.left, ColRef) and not isinstance( expr.left, Constant): l = f"({l})" if not isinstance(expr.right, ColRef) and not isinstance( expr.right, Constant): r = f"({r})" opStr = None if expr.operand == ArithmeticOperation.ADD: opStr = "+" elif expr.operand == ArithmeticOperation.SUB: opStr = "-" elif expr.operand == ArithmeticOperation.MUL: opStr = "*" elif expr.operand == ArithmeticOperation.DIV: opStr = "/" elif expr.operand == ArithmeticOperation.MOD: opStr = "%" exprSQL = f"{l} {opStr} {r}" pre = lPre + rPre elif isinstance(expr, SetExpr): (lPre, l) = self._exprToSQL(expr.left) if isinstance(expr.right, list): (rPre, r) = ([], ",".join([str(x) for x in expr.right])) else: # should be a DF (rPre, r) = self._exprToSQL(expr.right) if not isinstance(expr.left, ColRef) and not isinstance( expr.left, Constant): l = f"({l})" if not isinstance(expr.right, ColRef) and not isinstance( expr.right, Constant): r = f"({r})" opStr = "UNKNOWN" if expr.operand == SetOperation.IN: opStr = "IN" exprSQL = f"{l} {opStr} {r}" pre = lPre + rPre # if the thing to produce is a DataFrame, we probably have a subquery elif isinstance(expr, DataFrame): # if right hand side is a DataFrame, we need to create code first subQry = Query(self) (pre, exprSQL) = subQry._buildFrom(expr) # it's a plain column reference elif isinstance(expr, ColRef): if expr.df and expr.column != "*": exprSQL = f"{expr.df.alias}.{expr.column}" else: exprSQL = expr.column if expr.alias: exprSQL += f" as {expr.alias}" # it's a computed column, the value could be anything elif isinstance(expr, ComputedCol): (pre, exprSQL) = self._exprToSQL(expr.value) if not isinstance(expr.value, FuncCall): exprSQL = f"({exprSQL})" if expr.alias: exprSQL = f"{exprSQL} as {expr.alias}" # it's a function call -> produce CREATE func if necessary and call elif isinstance(expr, FuncCall): (pre, exprSQL) = self._generateFuncCall(expr) # seems to be something we forgot above or unknown to us. raise an exception else: raise ExpressionException(f"don't know how to handle {expr}") return (pre, exprSQL)
def _buildFrom(self, df) -> Tuple[List[str], str, str]: if df is not None: computedCols = [] preCode = [] for x in df.computedCols: (exprPre, exprSQL) = self.generator._exprToSQL(x) preCode += exprPre computedCols.append(exprSQL) computedCols = ",".join(computedCols) if isinstance(df, Table): proj = "*" if computedCols: proj += "," + computedCols return (preCode, f"SELECT {proj} FROM {df.table} {df.alias}") elif isinstance(df, ExternalTable): proj = "*" if computedCols: proj += "," + computedCols tablePre = SQLGenerator._generateCreateExtTable( df, self.generator.templates) qry = f"SELECT {proj} FROM {df.table} {df.alias}" return (preCode + tablePre, qry) elif isinstance(df, Projection): subQry = Query(self.generator) (pre, parentSQL) = subQry._buildFrom(df.parents[0]) prefixed = "*" if df.columns: prefixed = [] for attr in df.columns: (ePre, exprSQL) = self.generator._exprToSQL(attr) pre += ePre prefixed.append(exprSQL) prefixed = ",".join(prefixed) if computedCols: prefixed += "," + computedCols qry = f"SELECT { 'DISTINCT ' if df.doDistinct else ''}{prefixed} FROM ({parentSQL}) {df.alias}" return (preCode + pre, qry) elif isinstance(df, Filter): subQry = Query(self.generator) (pre, parentSQL) = subQry._buildFrom(df.parents[0]) (exprPre, exprStr) = self.generator._exprToSQL(df.expr) proj = "*" if computedCols: proj += "," + computedCols qry = f"SELECT {proj} FROM ({parentSQL}) {df.alias} WHERE {exprStr}" return (preCode + pre + exprPre, qry) elif isinstance(df, Join): lQry = Query(self.generator) (lpre, lparentSQL) = lQry._buildFrom(df.leftParent()) rQry = Query(self.generator) (rpre, rparentSQL) = rQry._buildFrom(df.rightParent()) lAlias = df.leftParent().alias rAlias = df.rightParent().alias if isinstance(df.on, ColRef): (exprPre, onSQL) = self.generator._exprToSQL(df.on) onSQL = f"USING ({onSQL})" preCode += exprPre elif isinstance(df.on, LogicExpr) or isinstance( df.on, BoolExpr): (exprPre, onSQL) = self.generator._exprToSQL(df.on) onSQL = "ON " + onSQL preCode += exprPre elif isinstance(df.on, list): if len(df.on) != 2: raise ExpressionException( "on condition must consist of exacltly two columns" ) (lOnPre, lOn) = self.generator._exprToSQL(df.on[0]) (rOnPre, rOn) = self.generator._exprToSQL(df.on[1]) onSQL = f"ON {lOn} {df.comp} {rOn}" preCode += lOnPre preCode += rOnPre else: onSQL = "" # let the DB figure it out itself proj = "*" if computedCols: proj += "," + computedCols joinSQL = f"SELECT {proj} FROM ({lparentSQL}) {lAlias} {df.how} JOIN ({rparentSQL}) {rAlias} {onSQL}" return (preCode + lpre + rpre, joinSQL) elif isinstance(df, Grouping): subQry = Query(self.generator) (pre, parentSQL) = subQry._buildFrom(df.parents[0]) byCols = [] for attr in df.groupCols: (exprPre, exprSQL) = self.generator._exprToSQL(attr) pre += exprPre byCols.append(exprSQL) by = ",".join(byCols) funcCode = "" for f in df.aggFunc: (fPre, fCode) = self.generator._generateFuncCall(f) pre += fPre funcCode += ", " + fCode groupSQL = f"SELECT {by} {funcCode} FROM ({parentSQL}) {df.alias} GROUP BY {by}" havings = [] if df.having: for h in df.having: (hPre, hSQL) = self.generator._exprToSQL(h) pre += hPre havings.append(hSQL) exprStr = " AND ".join(havings) groupSQL += f" HAVING {exprStr}" #if the computed column is an aggregate over the groups, # it should not be added as an extra query, but rather # merged into this projection list if computedCols: tVar = GrizzlyGenerator._incrAndGetTupleVar() proj = "*," + computedCols groupSQL = f"SELECT {proj} FROM {groupSQL} {tVar}" return (preCode + pre, groupSQL) elif isinstance(df, Limit): subQry = Query(self.generator) (pre, parentSQL) = subQry._buildFrom(df.parents[0]) limitClause = self.generator.templates["limit"].lower() (lPre, limitExpr) = self.generator._exprToSQL(df.limit) pre += lPre limitSQL = None if limitClause == "top": limitSQL = f"SELECT TOP {limitExpr} {df.alias}.* FROM ({parentSQL}) {df.alias}" elif limitClause == "limit": limitSQL = f"SELECT {df.alias}.* FROM ({parentSQL}) {df.alias} LIMIT {limitExpr}" else: raise ValueError( f"Unknown keyword for LIMIT: {limitClause}") if df.offset is not None: (oPre, offsetExpr) = self.generator._exprToSQL(df.offset) pre += oPre limitSQL += f" OFFSET {offsetExpr}" return (preCode + pre, limitSQL) elif isinstance(df, Ordering): subQry = Query(self.generator) (pre, parentSQL) = subQry._buildFrom(df.parents[0]) by = [] for attr in df.by: (exprPre, exprSQL) = self.generator._exprToSQL(attr) pre += exprPre by.append(exprSQL) by = ",".join(by) direction = "ASC" if df.ascending else "DESC" qry = f"SELECT * FROM ({parentSQL}) {df.alias} ORDER BY {by} {direction}" return (preCode + pre, qry) else: raise ValueError(f"unsupported operator {type(df)}") else: return ""