Exemplo n.º 1
0
    def _substitute_alias(cls, expr: QueryBaseType,
                          aliases_map: dict) -> QueryBaseType:
        """
        Recursively replace aliased column references in an expression tree.

        A column whose name is a key of ``aliases_map`` is swapped for the
        mapped value; mapped expressions are copied so the source tree in
        ``aliases_map`` is never shared.  For expressions, the arguments are
        substituted recursively and written back in place.
        """
        if is_column(expr) and expr.get_column_name() in aliases_map:
            target = aliases_map[expr.get_column_name()]
            # Copy expressions to avoid sharing mutable tree nodes.
            return target.copy() if is_expression(target) else target

        if is_expression(expr):
            expr.set_arguments(tuple(
                cls._substitute_alias(arg, aliases_map)
                for arg in expr.arguments))

        return expr
Exemplo n.º 2
0
    def _process_expressions(
            self, expressions: Tuple['QueryBaseType', ...],
            processed_shared_ids: Set[str]) -> Tuple['OperatorArgument', ...]:
        """
        Convert a sequence of expressions into operator arguments.

        Expressions are vectorized via ``_process_expressions_tree``;
        columns and literals are passed through unchanged.

        Parameters
        ----------
        expressions : Tuple['QueryBaseType', ...]
            Expressions to process.
        processed_shared_ids : Set[str]
            Shared expression IDs that have already been processed.

        Returns
        -------
        Tuple[OperatorArgument, ...]
            Tuple of VectorizedExpression, Column or Literal objects.
        """
        return tuple(
            self._process_expressions_tree(
                expr,  # type: ignore
                processed_shared_ids)
            if is_expression(expr)
            else expr  # type: ignore
            for expr in expressions)
Exemplo n.º 3
0
    def _ensure_groupby_select_correctness(
            cls, select_expressions: Tuple[QueryBaseType, ...],
            group_by: Tuple[QueryBaseType, ...]) -> None:
        """
        Ensure structural correctness of the GROUP BY clause.

        Every SELECT expression must either appear in the GROUP BY clause
        or use an aggregate function; literal values are not allowed.

        Raises
        ------
        ParserError
            If a SELECT expression violates the GROUP BY constraints.
        """
        if not group_by:
            return
        # Ends with a period, so messages below append no extra punctuation
        # (previously they added a second "." producing "allowed..").
        usage_msg = ('Only aggregate functions and columns present in the '
                     '"GROUP BY" clause are allowed.')

        group_by_columns = {c for c in group_by if is_column(c)}
        group_by_expressions = {e for e in group_by if is_expression(e)}

        for column in select_expressions:
            if is_column(column) and column not in group_by_columns:
                column = cast(Column, column)
                raise ParserError(
                    f'Column "{column.get_column_name()}" is not part of the '
                    f'"GROUP BY" clause. {usage_msg}')
            elif is_expression(column):
                column = cast(Expression, column)
                # The expression is acceptable if any node of its tree is an
                # aggregate function.
                is_aggr_expr = any(
                    is_aggregate_func(expr.function_name)
                    for expr in flatten_expressions_tree(column))

                if column not in group_by_expressions and not is_aggr_expr:
                    op_name = (column.function_name if column.function_name
                               else column.sql_operator)
                    raise ParserError(
                        f'Operator "{op_name}" is neither aggregate function '
                        f'nor part of the "GROUP BY" clause. {usage_msg}')
            elif is_literal(column):
                column = cast(Literal, column)
                raise ParserError(
                    f'Literal value "{column.value}" is not allowed '
                    f'in the "GROUP BY" mode. {usage_msg}')
Exemplo n.º 4
0
 def _is_aggregate_query(
     cls,
     select_expressions: Tuple[QueryBaseType, ...],
 ) -> bool:
     """
     Return True if any SELECT expression contains an aggregate function.
     """
     return any(
         is_aggregate_func(node.function_name)
         for sel_expr in select_expressions
         if is_expression(sel_expr)
         for node in flatten_expressions_tree(sel_expr))
Exemplo n.º 5
0
 def _ensure_col_exists(cls, expr: QueryBaseType,
                        schema: pa.Schema) -> None:
     """
     Walk the expression tree and validate every referenced column.

     Raises
     ------
     ParserError
         If a referenced column name is absent from ``schema.names``.
     """
     if is_column(expr):
         name = expr.get_column_name()
         if name not in schema.names:
             raise ParserError(f"Column '{name}' is not found.")
     elif is_expression(expr):
         # Recurse into every argument of the expression.
         for argument in expr.arguments:
             cls._ensure_col_exists(argument, schema)
Exemplo n.º 6
0
def flatten_expressions_tree(
        expression: Optional[Expression]) -> Tuple[Expression, ...]:
    """
    Return the expression and all nested sub-expressions as a flat tuple.

    The root comes first, followed by sub-expressions in depth-first order.
    A falsy input yields an empty tuple.
    """
    if not expression:
        return tuple()

    flat = [expression]
    flat.extend(
        node
        for arg in expression.arguments
        if is_expression(arg)
        for node in flatten_expressions_tree(arg))
    return tuple(flat)
Exemplo n.º 7
0
 def has_count_star(self):
     """
     Return True if any SELECT expression is a ``count_star`` call.
     """
     for expr in self.select_expressions:
         if (is_expression(expr)
                 and expr.function_name
                 and expr.function_name.lower() == 'count_star'):
             return True
     # Explicit False instead of the previous implicit None fall-through.
     return False
Exemplo n.º 8
0
    def plan_query(self) -> Operator:
        """
        Create a query execution plan.

        Using a query AST, generate a query plan outlining the operations
        needed to be performed to execute a query.

        Returns
        -------
        Operator
            Query plan.
        """
        self._query = Binder.bind(query=self._query)

        processed_shared_ids: Set[str] = set()

        used_columns = self._query.get_all_used_column_names()
        unused_columns = set(self._schema.names) - used_columns
        skip_table = False
        # Need to test for the cases like count(*),
        # when all columns will get removed.
        project_args = used_columns
        if unused_columns:
            if len(unused_columns) < len(self._schema.names):
                project_args = self._query.get_all_used_columns()
            elif self._query.has_count_star():
                # count(*) needs at least one column to count rows over.
                project_args = [Column(self._schema.names[0])]
            else:
                skip_table = True

        if self._reader:
            current_op = FileReaderOperator(self._reader)
        elif skip_table:
            current_op = EmptyTableReaderOperator()
        else:
            current_op = TableReaderOperator(self._table)

        # Prune unused columns early, unless the table is skipped entirely.
        if unused_columns and not skip_table:
            current_op = ProjectOperator(arguments=project_args,
                                         parent_operator=current_op)

        if self._query.where_condition:
            current_op = self._new_filter_operator(
                filter_expression=self._query.where_condition,
                parent_operator=current_op,
                processed_shared_ids=processed_shared_ids)

        group_by_exprs = self._query.group_by
        if self._query.distinct:
            # DISTINCT is implemented as a GROUP BY over all selected
            # expressions.
            group_by_exprs += self._query.select_expressions

        if self._query.is_aggregate():
            # Expression arguments of aggregate functions are computed by a
            # pre-aggregation projection and then referenced by shared ID as
            # plain columns.
            inner_agg_exprs = []
            for expr in traverse_exprs(self._query.select_expressions):
                if is_aggregate_func(expr.function_name):
                    project_args = []
                    has_inner = False
                    for inner_expr in expr.arguments:
                        if is_expression(inner_expr):
                            if not inner_expr.is_shared():
                                inner_col_id = str(id(inner_expr))
                                inner_expr.set_shared_id(inner_col_id)
                            inner_agg_exprs.append(inner_expr)
                            inner_expr = Column(inner_expr.get_shared_id())
                            has_inner = True
                        project_args.append(inner_expr)
                    if has_inner:
                        expr.set_arguments(tuple(project_args))
            group_by = []
            group_by_col_names = set()
            for expr in group_by_exprs:
                if is_expression(expr):
                    inner_agg_exprs.append(expr)
                    expr = Column(expr.get_shared_id())
                group_by.append(expr)
                group_by_col_names.add(expr.get_column_name())

            if inner_agg_exprs:
                current_op = ProjectOperator(
                    arguments=self._process_expressions(
                        tuple(inner_agg_exprs), processed_shared_ids),
                    parent_operator=current_op,
                    keep_input_table=True)

            proc_sel_exprs = self._process_expressions(
                self._query.get_select_plus_post_agg_cols(),
                processed_shared_ids)

            # Traverse all aggregate functions and if function is an expression
            # argument - replace it with a column object.
            agg_funcs = []

            def substitute(expr_node):
                nonlocal agg_funcs
                inner_agg_funcs = False
                if not is_vect_expression(expr_node):
                    return
                if isinstance(expr_node, AggregateFunction):
                    agg_funcs.append(expr_node)
                    return
                # BUG FIX: start from an empty list rather than a copy of
                # expr_node.arguments. The loop below re-appends every
                # argument, so seeding with a copy duplicated the full
                # argument list whenever the replacement was applied.
                replacement_args = []
                for arg in expr_node.arguments:
                    if (is_vect_expression(arg)
                            and isinstance(arg, AggregateFunction)):
                        agg_funcs.append(arg)
                        replacement_args.append(Column(arg.get_column_name()))
                        inner_agg_funcs = True
                    else:
                        replacement_args.append(arg)
                if inner_agg_funcs:
                    expr_node.arguments = replacement_args

            for expr in proc_sel_exprs:
                traverse_exprs_tree(expr, substitute)

            # Find all groupby columns in the tree of each select expression.
            agg_cols = []

            def find_groupby_cols(expr_node):
                nonlocal agg_cols
                if expr_node.get_column_name() in group_by_col_names:
                    agg_cols.append(expr_node)

            for expr in proc_sel_exprs:
                traverse_exprs_tree(expr, find_groupby_cols)

            current_op = AggregateOperator(
                parent_operator=current_op,
                group_by_columns=self._process_expressions(
                    group_by, processed_shared_ids),
                agg_funcs=agg_funcs,
                agg_cols=agg_cols)

        if self._query.having:
            current_op = self._new_filter_operator(
                filter_expression=self._query.having,
                parent_operator=current_op,
                processed_shared_ids=processed_shared_ids)

        if self._query.order_by:
            current_op = SortOperator(
                self._process_expressions(self._query.order_by,
                                          processed_shared_ids),
                self._query.sort_order, current_op)

        sel_exprs = self._process_expressions(self._query.select_expressions,
                                              processed_shared_ids)
        current_op = ProjectOperator(arguments=sel_exprs,
                                     parent_operator=current_op,
                                     col_names=self._column_names(
                                         self._query.select_expressions))

        if self._query.has_limit():
            current_op = SliceOperator(
                self._query.limit,  # type: ignore
                self._query.offset,
                current_op)

        # Materialize the final result into a concrete table.
        current_op = MaterializeTableOperator(parent_operator=current_op)

        return current_op
Exemplo n.º 9
0
    def _process_expressions_tree(
            self, expr: Expression,
            processed_shared_ids: Set[str]) -> VectorizedExpression:
        """
        Recursively process Expressions tree and create VectorizedExpression
        tree.

        Essentially, transform Expressions into VectorizedExpression(s)
        implementing given Expression.

        Parameters
        ----------
        expr : Expression
            Expression to process.
        processed_shared_ids : Set[str]
            Set of shared expression IDs, that are already processed.

        Returns
        -------
        VectorizedExpression
            VectorizedExpression.
        """
        assert expr
        # A shared expression that was already vectorized elsewhere is
        # referenced by its shared ID as a plain column.
        if (expr.is_shared() and expr.get_shared_id() in processed_shared_ids):
            return Column(expr.get_shared_id())

        arguments: List[OperatorArgument] = []

        # Vectorize nested expressions bottom-up; non-expression arguments
        # (columns, literals) are passed through unchanged.
        for arg in expr.arguments:  # type: Any
            if is_expression(arg):
                arg = self._process_expressions_tree(arg, processed_shared_ids)
            arguments.append(arg)

        if expr.sql_operator in SQL_OPERATOR_FUNCTIONS.keys():
            # Plain SQL operator: map directly to its kernel function.
            func, func_type = SQL_OPERATOR_FUNCTIONS[expr.sql_operator]
            vec_expr = self._new_vectorized_expression(
                kernel=func,
                arguments=arguments,
                func_type=func_type,
                is_binary_func=(expr.sql_operator in BINARY_OPERATORS))

        elif expr.sql_operator in (SQLOperator.LIKE, SQLOperator.NOT_LIKE):
            # LIKE / NOT LIKE get a dedicated function; the flag selects
            # negation.
            vec_expr = LikeFunction(
                tuple(arguments),  # type: ignore
                expr.sql_operator == SQLOperator.NOT_LIKE,
            )
        elif expr.sql_operator == SQLOperator.FUNCTION:
            assert expr.function_name

            function_name = expr.function_name.lower()  # type: str

            if is_aggregate_func(function_name):
                function_name = ensure_numpy_mapping(function_name)

                vec_expr = AggregateFunction(function_name, *arguments)
                # Aggregate results are referenced by shared ID downstream,
                # so ensure one exists.
                if not expr.is_shared():
                    expr.set_shared_id(f'{function_name}_{id(expr)}')
            else:
                # Not an aggregate: resolve as a user-defined function.
                func, func_type = lookup_udf(function_name)
                vec_expr = self._new_vectorized_expression(
                    kernel=func,
                    arguments=arguments,
                    func_type=func_type,
                    is_binary_func=(expr.sql_operator in BINARY_OPERATORS))

        else:
            raise PlannerError(f'Unsupported SQLOperator: {expr.sql_operator}')

        # Record shared expressions so that later references are resolved to
        # columns (see the early-return at the top of this method).
        if expr.is_shared():
            vec_expr.set_shared_id(expr.get_shared_id())
            processed_shared_ids.add(expr.get_shared_id())

        return vec_expr