Пример #1
0
    def __init__(self, groupCols: List, parent: DataFrame):
        self.having = []
        self.groupCols = []
        computedAliases = [c.alias for c in parent.computedCols]
        for theCol in groupCols:
            if isinstance(theCol, str):
                theRef = None
                if theCol in computedAliases:
                    theRef = ColRef(theCol, None)
                else:
                    theRef = ColRef(theCol, self)
                theCol = theRef
            elif isinstance(theCol, ColRef):
                self.updateRef(theCol)
            elif isinstance(theCol, Expr):
                pass
            else:
                raise ExpressionException(
                    f"invalid grouping column type: {type(theCol)}")

            self.groupCols.append(theCol)

        self.aggFunc = []

        super().__init__(self.groupCols, parent,
                         GrizzlyGenerator._incrAndGetTupleVar())
Пример #2
0
    def join(self, other, on, how="inner", comp="="):

        if isinstance(on, list):

            lOn = None
            rOn = None
            from grizzly.expression import ExpressionException
            if not self.hasColumn(on[0]):
                raise ExpressionException(
                    f"No such column {on[0]} for join in left hand side")
            else:
                lOn = ColRef(on[0], self)
            if not other.hasColumn(on[1]):
                raise ExpressionException(
                    f"No such column {on[1]} for join in right hand side")
            else:
                rOn = ColRef(on[1], other)

            on = [lOn, rOn]

        return Join(self, other, on, how, comp)
Пример #3
0
    def _getSQLFuncName(aggType) -> str:
        if isinstance(aggType, str):
            return aggType

        if isinstance(aggType, AggregateType):
            if aggType == AggregateType.MEAN:
                return "avg"

            return str(aggType)[len("AggregateType."):].lower()

        # if we get here it's not a string and not a AggType --> error
        raise ExpressionException(
            f"invalid function value: {aggType}, expected string or AggregateType, but got {type(aggType)}"
        )
Пример #4
0
    def __init__(self, by: list, ascending: bool, parent):
        super().__init__(parent.columns, parent,
                         GrizzlyGenerator._incrAndGetTupleVar())

        sortCols = []
        for col in by:
            if isinstance(col, Projection):
                sortCols.append(self.updateRef(col.columns[0]))
            elif isinstance(col, str):
                sortCols.append(ColRef(col, self))
            elif isinstance(col, ColRef):
                sortCols.append(self.updateRef(col))
            else:
                raise ExpressionException(
                    f"unsupported type for oder by value: {type(col)}")

        self.by = sortCols
        self.ascending = ascending
Пример #5
0
    def __getitem__(self, key):
        theType = type(key)

        if isinstance(key, slice):  # used for LIMIT .. OFFSET
            if key.step is not None:
                logger.warn(
                    "Step is not supported for slice access on DataFrames")

            n = key.stop

            offset = key.start if key.start is not None else None
            return self.limit(n, offset)

        elif theType is ColRef:  # if in the projection list e.g. "df.a" was given
            return self.project(key)

        elif isinstance(key, BoolExpr) or isinstance(
                key, LogicExpr):  # e.g. a filter expression
            return self.filter(key)

        elif theType is str:  # a single string is given -> project to that column
            return ColRef(key, self)

        elif theType is list:

            projList = []
            for e in key:
                t = type(e)
                if t is str:
                    projList.append(ColRef(e, self))
                elif t is ColRef:
                    c = ColRef(e.colName(), self)
                    projList.append(c)
                else:
                    raise ExpressionException(
                        f"expected a column name string or column reference, but got {e}"
                    )

            return self.project(projList)
        else:
            print(f"{key} has type {theType} -- ignoring")
            return self
Пример #6
0
    def _exprToSQL(self, expr) -> Tuple[List[str], str]:
        exprSQL = ""
        pre = []

        # right hand side is a string constant
        if expr is None:
            exprSQL = "NULL"

        elif isinstance(expr, str):
            exprSQL = expr  # TODO: currently to handle *, but maybe this should done earlier and be converted into a special ColRef?

        # we were given a constant
        elif isinstance(expr, Constant):
            if isinstance(expr.value, str):
                exprSQL = f"'{expr.value}'"
            elif isinstance(expr.value, list):

                eSQLs = []
                for x in expr.value:
                    (ePre, eSQL) = self._exprToSQL(x)
                    eSQLs.append(eSQL)
                    pre += ePre

                exprSQL = ",".join(eSQLs)
                exprSQL = f"({exprSQL})"

            else:
                exprSQL = f"{expr.value}"

        # TODO: should LogicExpr be merged into BoolExpr ?
        elif isinstance(expr, LogicExpr):

            (lPre, l) = self._exprToSQL(expr.left)
            (rPre, r) = self._exprToSQL(expr.right)

            if isinstance(expr.left, LogicExpr):
                l = f"({l})"
            if isinstance(expr.right, LogicExpr):
                r = f"({r})"

            if expr.operand == LogicOperation.AND:
                exprSQL = f"{l} and {r}"
            elif expr.operand == LogicOperation.OR:
                exprSQL = f"{l} or {r}"
            elif expr.operand == LogicOperation.NOT:
                exprSQL = f"not {l}"
            elif expr.operand == LogicOperation.XOR:
                exprSQL = f"{l} xor {r}"
            else:
                raise ExpressionException(
                    f"unknown logical operation: {expr.operand}")

            pre = lPre + rPre

        elif isinstance(expr, BoolExpr):

            if not expr.right and not (expr.operand == BooleanOperation.EQ
                                       or expr.operand == BooleanOperation.NE):
                raise ExpressionException(
                    "only == and != allowed for comparison with None (NULL)")

            (lPre, l) = self._exprToSQL(expr.left)
            (rPre, r) = self._exprToSQL(expr.right)

            opStr = None
            if expr.operand == BooleanOperation.EQ:
                opStr = "=" if expr.right is not None else "is"
            elif expr.operand == BooleanOperation.NE:
                opStr = "<>" if expr.right is not None else "is not"
            elif expr.operand == BooleanOperation.GE:
                opStr = ">="
            elif expr.operand == BooleanOperation.GT:
                opStr = ">"
            elif expr.operand == BooleanOperation.LE:
                opStr = "<="
            elif expr.operand == BooleanOperation.LT:
                opStr = "<"
            else:
                raise ExpressionException(
                    f"unknown boolean operation: {expr.operand}")

            exprSQL = f"{l} {opStr} {r}"
            pre = lPre + rPre

        elif isinstance(expr, ArithmExpr):
            (lPre, l) = self._exprToSQL(expr.left)
            (rPre, r) = self._exprToSQL(expr.right)

            if not isinstance(expr.left, ColRef) and not isinstance(
                    expr.left, Constant):
                l = f"({l})"
            if not isinstance(expr.right, ColRef) and not isinstance(
                    expr.right, Constant):
                r = f"({r})"

            opStr = None
            if expr.operand == ArithmeticOperation.ADD:
                opStr = "+"
            elif expr.operand == ArithmeticOperation.SUB:
                opStr = "-"
            elif expr.operand == ArithmeticOperation.MUL:
                opStr = "*"
            elif expr.operand == ArithmeticOperation.DIV:
                opStr = "/"
            elif expr.operand == ArithmeticOperation.MOD:
                opStr = "%"

            exprSQL = f"{l} {opStr} {r}"
            pre = lPre + rPre

        elif isinstance(expr, SetExpr):
            (lPre, l) = self._exprToSQL(expr.left)

            if isinstance(expr.right, list):
                (rPre, r) = ([], ",".join([str(x) for x in expr.right]))
            else:  # should be a DF
                (rPre, r) = self._exprToSQL(expr.right)

            if not isinstance(expr.left, ColRef) and not isinstance(
                    expr.left, Constant):
                l = f"({l})"
            if not isinstance(expr.right, ColRef) and not isinstance(
                    expr.right, Constant):
                r = f"({r})"

            opStr = "UNKNOWN"
            if expr.operand == SetOperation.IN:
                opStr = "IN"

            exprSQL = f"{l} {opStr} {r}"
            pre = lPre + rPre

        # if the thing to produce is a DataFrame, we probably have a subquery
        elif isinstance(expr, DataFrame):
            # if right hand side is a DataFrame, we need to create code first
            subQry = Query(self)
            (pre, exprSQL) = subQry._buildFrom(expr)

        # it's a plain column reference
        elif isinstance(expr, ColRef):
            if expr.df and expr.column != "*":
                exprSQL = f"{expr.df.alias}.{expr.column}"
            else:
                exprSQL = expr.column

            if expr.alias:
                exprSQL += f" as {expr.alias}"

        # it's a computed column, the value could be anything
        elif isinstance(expr, ComputedCol):
            (pre, exprSQL) = self._exprToSQL(expr.value)

            if not isinstance(expr.value, FuncCall):
                exprSQL = f"({exprSQL})"

                if expr.alias:
                    exprSQL = f"{exprSQL} as {expr.alias}"

        # it's a function call -> produce CREATE func if necessary and call
        elif isinstance(expr, FuncCall):

            (pre, exprSQL) = self._generateFuncCall(expr)

        # seems to be something we forgot above or unknown to us. raise an exception
        else:
            raise ExpressionException(f"don't know how to handle {expr}")

        return (pre, exprSQL)
Пример #7
0
    def _buildFrom(self, df) -> Tuple[List[str], str, str]:

        if df is not None:

            computedCols = []
            preCode = []

            for x in df.computedCols:
                (exprPre, exprSQL) = self.generator._exprToSQL(x)
                preCode += exprPre
                computedCols.append(exprSQL)

            computedCols = ",".join(computedCols)

            if isinstance(df, Table):
                proj = "*"
                if computedCols:
                    proj += "," + computedCols

                return (preCode, f"SELECT {proj} FROM {df.table} {df.alias}")

            elif isinstance(df, ExternalTable):
                proj = "*"
                if computedCols:
                    proj += "," + computedCols

                tablePre = SQLGenerator._generateCreateExtTable(
                    df, self.generator.templates)
                qry = f"SELECT {proj} FROM {df.table} {df.alias}"

                return (preCode + tablePre, qry)

            elif isinstance(df, Projection):
                subQry = Query(self.generator)
                (pre, parentSQL) = subQry._buildFrom(df.parents[0])

                prefixed = "*"
                if df.columns:
                    prefixed = []

                    for attr in df.columns:
                        (ePre, exprSQL) = self.generator._exprToSQL(attr)

                        pre += ePre
                        prefixed.append(exprSQL)

                    prefixed = ",".join(prefixed)

                if computedCols:
                    prefixed += "," + computedCols

                qry = f"SELECT { 'DISTINCT ' if df.doDistinct else ''}{prefixed} FROM ({parentSQL}) {df.alias}"

                return (preCode + pre, qry)

            elif isinstance(df, Filter):
                subQry = Query(self.generator)
                (pre, parentSQL) = subQry._buildFrom(df.parents[0])

                (exprPre, exprStr) = self.generator._exprToSQL(df.expr)

                proj = "*"
                if computedCols:
                    proj += "," + computedCols

                qry = f"SELECT {proj} FROM ({parentSQL}) {df.alias} WHERE {exprStr}"

                return (preCode + pre + exprPre, qry)

            elif isinstance(df, Join):

                lQry = Query(self.generator)
                (lpre, lparentSQL) = lQry._buildFrom(df.leftParent())

                rQry = Query(self.generator)
                (rpre, rparentSQL) = rQry._buildFrom(df.rightParent())

                lAlias = df.leftParent().alias
                rAlias = df.rightParent().alias

                if isinstance(df.on, ColRef):
                    (exprPre, onSQL) = self.generator._exprToSQL(df.on)
                    onSQL = f"USING ({onSQL})"
                    preCode += exprPre
                elif isinstance(df.on, LogicExpr) or isinstance(
                        df.on, BoolExpr):
                    (exprPre, onSQL) = self.generator._exprToSQL(df.on)
                    onSQL = "ON " + onSQL
                    preCode += exprPre
                elif isinstance(df.on, list):

                    if len(df.on) != 2:
                        raise ExpressionException(
                            "on condition must consist of exacltly two columns"
                        )

                    (lOnPre, lOn) = self.generator._exprToSQL(df.on[0])
                    (rOnPre, rOn) = self.generator._exprToSQL(df.on[1])

                    onSQL = f"ON {lOn} {df.comp} {rOn}"
                    preCode += lOnPre
                    preCode += rOnPre
                else:
                    onSQL = ""  # let the DB figure it out itself

                proj = "*"
                if computedCols:
                    proj += "," + computedCols

                joinSQL = f"SELECT {proj} FROM ({lparentSQL}) {lAlias} {df.how} JOIN ({rparentSQL}) {rAlias} {onSQL}"

                return (preCode + lpre + rpre, joinSQL)

            elif isinstance(df, Grouping):
                subQry = Query(self.generator)
                (pre, parentSQL) = subQry._buildFrom(df.parents[0])

                byCols = []
                for attr in df.groupCols:
                    (exprPre, exprSQL) = self.generator._exprToSQL(attr)
                    pre += exprPre
                    byCols.append(exprSQL)

                by = ",".join(byCols)

                funcCode = ""
                for f in df.aggFunc:
                    (fPre, fCode) = self.generator._generateFuncCall(f)
                    pre += fPre
                    funcCode += ", " + fCode

                groupSQL = f"SELECT {by} {funcCode} FROM ({parentSQL}) {df.alias} GROUP BY {by}"

                havings = []
                if df.having:
                    for h in df.having:
                        (hPre, hSQL) = self.generator._exprToSQL(h)
                        pre += hPre
                        havings.append(hSQL)

                    exprStr = " AND ".join(havings)
                    groupSQL += f" HAVING {exprStr}"

                #if the computed column is an aggregate over the groups,
                # it should not be added as an extra query, but rather
                # merged into this projection list
                if computedCols:
                    tVar = GrizzlyGenerator._incrAndGetTupleVar()
                    proj = "*," + computedCols
                    groupSQL = f"SELECT {proj} FROM {groupSQL} {tVar}"

                return (preCode + pre, groupSQL)

            elif isinstance(df, Limit):
                subQry = Query(self.generator)
                (pre, parentSQL) = subQry._buildFrom(df.parents[0])

                limitClause = self.generator.templates["limit"].lower()

                (lPre, limitExpr) = self.generator._exprToSQL(df.limit)
                pre += lPre

                limitSQL = None
                if limitClause == "top":
                    limitSQL = f"SELECT TOP {limitExpr} {df.alias}.* FROM ({parentSQL}) {df.alias}"
                elif limitClause == "limit":
                    limitSQL = f"SELECT {df.alias}.* FROM ({parentSQL}) {df.alias} LIMIT {limitExpr}"
                else:
                    raise ValueError(
                        f"Unknown keyword for LIMIT: {limitClause}")

                if df.offset is not None:
                    (oPre, offsetExpr) = self.generator._exprToSQL(df.offset)
                    pre += oPre
                    limitSQL += f" OFFSET {offsetExpr}"

                return (preCode + pre, limitSQL)

            elif isinstance(df, Ordering):
                subQry = Query(self.generator)
                (pre, parentSQL) = subQry._buildFrom(df.parents[0])

                by = []
                for attr in df.by:
                    (exprPre, exprSQL) = self.generator._exprToSQL(attr)
                    pre += exprPre
                    by.append(exprSQL)

                by = ",".join(by)
                direction = "ASC" if df.ascending else "DESC"

                qry = f"SELECT * FROM ({parentSQL}) {df.alias} ORDER BY {by} {direction}"

                return (preCode + pre, qry)

            else:
                raise ValueError(f"unsupported operator {type(df)}")

        else:
            return ""